# Developing a Machine Learning Model

In the other notebooks we did some exploratory analysis and then developed a data cleaning pipeline to prepare the data for an ML model. Here we import our training and test data sets, run them through the pipeline, and begin exploring which kind of ML model we can use to predict the median home prices of US Census Bureau districts.

In [1]:
# data tools
import pandas as pd
import numpy as np

In [2]:
"""
Copied and pasted our pipeline code from PreprocessingPipeline.ipynb
"""
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends three ratio features to the input.

    The input must support lookup by column name for ``total_rooms``,
    ``total_bedrooms``, ``population`` and ``households`` (e.g. a DataFrame).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``.

        ``y`` is accepted (and ignored) so the transformer follows the usual
        scikit-learn ``fit(X, y=None)`` signature and keeps working when a
        pipeline forwards a target to it.
        """
        return self # nothing else to do

    def transform(self, X):
        """Return the columns of ``X`` followed by the three ratio columns.

        The result is a plain array built with ``np.c_``; the added columns
        are, in order: rooms per household, bedrooms per room and population
        per household.
        """
        rooms_per_household = X["total_rooms"]/X["households"]
        bedrooms_per_room = X["total_bedrooms"]/X["total_rooms"]
        population_per_household = X["population"]/X["households"]

        return np.c_[X, rooms_per_household, bedrooms_per_room, population_per_household]

# column groups: the numeric features go through the numeric pipeline,
# the single categorical feature is one-hot encoded
num_attribs = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
]
cat_attribs = ["ocean_proximity"]

# numeric steps, in order: add the ratio features, fill gaps with the
# column median, then standardise
num_pipeline = Pipeline(steps=[
    ('attribs_adder', CombinedAttributesAdder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# full preprocessing: numeric pipeline + one-hot encoding, side by side
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [3]:
# load our data, which we've already split into training and test sets
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by our own notebooks
housing_train = pd.read_pickle("StratifiedTrainingSet.pkl")
housing_test = pd.read_pickle("StratifiedTestSet.pkl")

# separate labels from features, run data cleaning on the features
# our variable naming conventions: X for features, y for labels
X_train = full_pipeline.fit_transform(housing_train.drop(columns=["median_house_value"]))
y_train = housing_train["median_house_value"]

# IMPORTANT: don't touch the test sets until we have a model we are confident in
# The test set is only *transformed* with the pipeline fitted on the training set.
# Re-fitting here would learn the imputer medians, scaler statistics and one-hot
# categories from the test data, making its features inconsistent with training.
X_test = full_pipeline.transform(housing_test.drop(columns=["median_house_value"]))
y_test = housing_test["median_house_value"]

# reintroduce column labels by changing X back into a DataFrame
# order follows the pipeline output: numeric inputs, the ratio features appended
# by CombinedAttributesAdder, then one column per category seen by the encoder
derived_attribs = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
onehot_categories = full_pipeline.named_transformers_["cat"].categories_[0].tolist()
X_cols = list(num_attribs) + derived_attribs + onehot_categories

X_train = pd.DataFrame(X_train, columns=X_cols)
X_test = pd.DataFrame(X_test, columns=X_cols)

In [4]:
# summarise the transformed training features: column names, non-null counts and dtypes
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 16512 non-null  float64
 1   latitude                  16512 non-null  float64
 2   housing_median_age        16512 non-null  float64
 3   total_rooms               16512 non-null  float64
 4   total_bedrooms            16512 non-null  float64
 5   population                16512 non-null  float64
 6   households                16512 non-null  float64
 7   median_income             16512 non-null  float64
 8   rooms_per_household       16512 non-null  float64
 9   bedrooms_per_room         16512 non-null  float64
 10  population_per_household  16512 non-null  float64
 11  <1H OCEAN                 16512 non-null  float64
 12  INLAND                    16512 non-null  float64
 13  ISLAND                    16512 non-null  float64
 14  NEAR B

In [5]:
# At this point we are going to remove any features that we don't intend to train the model with
dumpcolumns = ["longitude","latitude","total_rooms","total_bedrooms","population","households"]
print(dumpcolumns)
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError
X_train = X_train.drop(columns=dumpcolumns, errors="ignore")
X_test = X_test.drop(columns=dumpcolumns, errors="ignore")

X_train.info()

['longitude', 'latitude', 'total_rooms', 'total_bedrooms', 'population', 'households']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   housing_median_age        16512 non-null  float64
 1   median_income             16512 non-null  float64
 2   rooms_per_household       16512 non-null  float64
 3   bedrooms_per_room         16512 non-null  float64
 4   population_per_household  16512 non-null  float64
 5   <1H OCEAN                 16512 non-null  float64
 6   INLAND                    16512 non-null  float64
 7   ISLAND                    16512 non-null  float64
 8   NEAR BAY                  16512 non-null  float64
 9   NEAR OCEAN                16512 non-null  float64
dtypes: float64(10)
memory usage: 1.3 MB


# Training and Validating ML Models

In [6]:
from sklearn.model_selection import cross_val_score
# simple function for initial judgement of these models
def scoremodel(reg_model):
    """Cross-validate ``reg_model`` on the training data and print an RMSE summary.

    Runs 10-fold cross validation against the notebook-level ``X_train`` and
    ``y_train``, then prints the per-fold RMSE values, their mean and their
    standard deviation. Nothing is returned.
    """
    # the scorer yields negated MSE values, so flip the sign before the root
    neg_mse_per_fold = cross_val_score(
        reg_model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    fold_rmse = np.sqrt(-neg_mse_per_fold)

    summary = (
        ("Scores: ", fold_rmse),
        ("Mean: ", fold_rmse.mean()),
        ("Standard Deviation: ", fold_rmse.std()),
    )
    for label, value in summary:
        print(label, value)

In [7]:
"""
Starting simple with a Linear Regression
"""
from sklearn.linear_model import LinearRegression

# baseline model with default settings; scoremodel prints its
# 10-fold cross-validated RMSE on the training set
lin_reg = LinearRegression()
scoremodel(lin_reg)

Scores:  [69642.60584811 71379.12594638 71369.98715096 74555.94013165
 72968.70315288 74831.00002703 68361.14512849 72746.22540469
 75218.09095873 72202.00186093]
Mean:  72327.48256098537
Standard Deviation:  2120.0959388507335


In [8]:
"""
Decision Tree Model
"""
from sklearn.tree import DecisionTreeRegressor
# tree building involves randomness (feature permutation / tie breaking);
# fix the seed so the cross-validation scores are reproducible on re-run
tree_reg = DecisionTreeRegressor(random_state=42)
scoremodel(tree_reg)

Scores:  [82110.43278754 82712.58740762 86490.43903517 84680.54997075
 83549.21695164 86322.63260083 82044.94372132 80694.62721931
 85151.01207758 88838.54193181]
Mean:  84259.49837035524
Standard Deviation:  2372.8003476435833


In [9]:
"""
Random Forest Model
"""
from sklearn.ensemble import RandomForestRegressor
# bootstrapping and feature sampling are random; fix the seed so the
# cross-validation scores are reproducible on re-run
forest_reg = RandomForestRegressor(random_state=42)
scoremodel(forest_reg)

Scores:  [59036.82723391 57820.56650092 60702.32252645 63087.62833502
 60970.86418776 63220.1745083  57520.05471056 57944.6887337
 62916.70059696 60504.40248355]
Mean:  60372.42298171179
Standard Deviation:  2115.784664379974
