# Developing a Machine Learning Model

In the other notebooks we've done some exploratory analysis, then developed a data cleaning pipeline to prepare the data for a ML model. Here we will import our training and testing data sets, run them through the pipeline and begin exploring what kind of ML model we can use to predict the median home prices of US Census Bureau districts.

In [6]:
# data tools
import pandas as pd
import numpy as np

In [24]:
""Copy and pasted our pipeline code from PreprocessingPipeline.ipynb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X["total_rooms"]/X["households"]
        bedrooms_per_room = X["total_bedrooms"]/X["total_rooms"]
        population_per_household = X["population"]/X["households"]
        
        return np.c_[X, rooms_per_household, bedrooms_per_room, population_per_household]

# create a pipeline for our numerical attributes only
num_pipeline = Pipeline([
    ('attribs_adder', CombinedAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# combine the numerical pipeline with the categorical one (just one-hot encoding)
num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 
               'population', 'households', 'median_income']
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

In [27]:
# load our data, which we've already split into training and test sets
housing_train = pd.read_pickle("StratifiedTrainingSet.pkl")
housing_test = pd.read_pickle("StratifiedTestSet.pkl")

# load the data cleaning pipeline from the script we wrote
from DataCleaningPipeline import RunPipeline

# seperate labels from features, run data cleaning on the features
# our variable naming conventions: X for features, y for labels
X_train = full_pipeline.fit_transform(housing_train.drop(columns=["median_house_value"],axis=1))
y_train = housing_train["median_house_value"].to_numpy()

# IMPORTANT: don't touch the test sets until we have a model we are confident in
X_test = full_pipeline.fit_transform(housing_test.drop(columns=["median_house_value"],axis=1))
y_test = housing_test["median_house_value"].to_numpy()

# at this point, we have the following features available to us
X_cols = ['longitude','latitude','housing_median_age','total_rooms',
          'total_bedrooms','population','households','median_income',
          'rooms_per_household', 'bedrooms_per_room', 'population_per_household', 
          '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
y_cols = ["meadian_house_value"]

# Training and Validating ML Models

In [18]:
"""
Decision Tree Model
"""
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree.mse)
tree_rmse

ValueError: could not convert string to float: 'longitude'