<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
import matplotlib.pyplot as plt
import numpy as np 
import os
import pandas as pd 
import pickle
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [28]:
working_dir = os.getcwd()
data_path = working_dir + '/data/'
processed_path = data_path + 'processed/'
model_path = working_dir + "/models/"

In [41]:
#load our training and testing data
X = pd.read_csv(processed_path + 'X.csv', index_col='Id')
y = np.load(processed_path + 'y.npy')
testing_set = pd.read_csv(processed_path + "testing_set.csv", index_col='Id')

In [42]:
X.shape

(1460, 85)

In [43]:
y.shape

(1460,)

In [48]:
testing_set.shape

(1459, 310)

In [49]:
testing_set

<1459x310 sparse matrix of type '<class 'numpy.float64'>'
	with 124003 stored elements in Compressed Sparse Row format>

In [7]:
#split train and test sets, validation set not needed as we will be using RandomSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8,
                                                      test_size=0.2, random_state = 42)

In [47]:
testing_set = preprocessor.transform(testing_set)

In [46]:
X = preprocessor.fit_transform(X)

In [45]:
#NUMERICAL PIPELINE
num_cols = X.select_dtypes(exclude="object").columns
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('quantile_transformer', QuantileTransformer(output_distribution='normal',
                                                 n_quantiles=700,
                                                random_state=42))
])

#CATEGORICAL PIPELINE
cat_cols = X.select_dtypes(include="object").columns
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

#PREPROCESSOR
preprocessor = ColumnTransformer([
    ('numerical', num_transformer, num_cols),
    ('categorical', cat_transformer, cat_cols),
])

In [19]:
#the preprocessor will transform our data by 
#    - filling missing values with the mean
#    - normalizing the distribution
#    - One Hot Encoding categorical features
preprocessor

ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='constant')),
                                                 ('quantile_transformer',
                                                  QuantileTransformer(n_quantiles=700,
                                                                      output_distribution='normal',
                                                                      random_state=42))]),
                                 Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFin...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'Ga

In [38]:
#save our preprocessor in our /models directory
with open(model_path + f'preprocess_pipeline.h5', 'wb') as f:
        pickle.dump(preprocessor, f)

In [165]:
#DEFINE THE BASE MODEL
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', RandomForestRegressor(random_state = 42))])

In [140]:
#define model param grid for our randomized CV search

rand_search_params = {
                         'model__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 400],
                         'model__max_features': ['auto', 'sqrt'],
                         'model__min_samples_leaf': [1, 2, 4, 8, 10],
                         'model__min_samples_split': [2, 5, 10, 12],
                         'model__n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
                    }

In [141]:
#use the random grid search to explore the hyperparemeter space
#we will further tune these hyperparams using a GridSearch
rf_random = RandomizedSearchCV(estimator = rf_model, param_distributions = rand_search_params, 
                               n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
#fit random search using 5 fold cross validation
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numerical',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='constant')),
                                                                                               ('quantile_transformer',
                                                                                                QuantileTransformer(n_quantiles=700,
                                                                                                                    output_distribution='normal',
                                                                                                                    random_state=42))]),
                                             

In [142]:
#print best params
rf_random.best_params_

{'model__n_estimators': 1400,
 'model__min_samples_split': 5,
 'model__min_samples_leaf': 2,
 'model__max_features': 'auto',
 'model__max_depth': 50}

In [176]:
#define a function evaluate our model by computing the  accuracy, root mean square error, and mean absolute error
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    rmse = mean_squared_error(predictions, test_labels, squared = False)
    mae = mean_absolute_error(predictions, test_labels)
    print('Model Performance')
    print('RMSE = {:0.4f}.'.format(rmse))
    print('Mean Absolute Error = {:0.4f}.'.format(mae))
    
    return rmse

In [177]:
#now evaluate the performance of our model on the testing data
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
RMSE = 29259.2885.
Mean Absolute Error = 17491.2912.


In [166]:
#now we will use a GridSearch CV to further tune the best hyperparameters found in the RandomSearch
grid_search_params = { 
                     'model__n_estimators': [1000,1200,1300,1400,1500,1600,1800],
                     'model__min_samples_split': [3,5,7],
                     'model__min_samples_leaf': [2,3,5],
                     'model__max_features': ['auto'],
                     'model__max_depth': [40,45,50,55,60],
                     }

In [167]:
#use grid search to turn hyperparameters further
grid_search = GridSearchCV(estimator = rf_model, param_grid = grid_search_params, 
                          cv = 3, n_jobs = -1, verbose = 2)
#fit grid search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 315 candidates, totalling 945 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='constant')),
                                                                                         ('quantile_transformer',
                                                                                          QuantileTransformer(n_quantiles=700,
                                                                                                              output_distribution='normal',
                                                                                                              random_state=42))]),
                                                                         Index(['MSSubClass', 'LotF

In [178]:
#print best parameters from the grid search
grid_search.best_params_

{'model__max_depth': 40,
 'model__max_features': 'auto',
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 5,
 'model__n_estimators': 1600}

In [179]:
#now we will evaluate the grid search model
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
RMSE = 29253.4480.
Mean Absolute Error = 17460.5198.


In [186]:
#we can see that we had a slight decrease in the RMSE and MAE 
#so we will use this model to make our competition predictions
final_pred = best_grid.predict(testing_set)
#output predictions for Kaggle competition
final_pred = pd.DataFrame({'Id': testing_set.index,
                       'SalePrice': final_pred})
final_pred.to_csv(data_path + 'predictions/random_forest_submission.csv', index=False)

In [188]:
#finally we will save our best randomCV model and our best gridCV model
rand_forest_path = model_path + "random_forest/"
with open(rand_forest_path + f'best_RandomForest.pickle', 'wb') as f:
        pickle.dump(best_grid, f)

This model achieved a RMSE score of 16368.78345 on the Kaggle competition data.
<br> Next we will explore boosting models like XGBoost and ensemble learning.