In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import models as models

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from sklearn.svm import SVR, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import set_config
from joblib import dump, load

In [2]:
df_train_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv')

df = df_train_raw.copy()

In [3]:
df_test_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv')
df_test = df_test_raw.copy()

In [4]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [5]:
# This will use the entire dataset
X= df.drop('SalePrice', axis=1)
y= df['SalePrice']

In [6]:
# set up baseline estimators without hyperparameter optimization

estimator_list = (LinearRegression(n_jobs=-1), 
                  LogisticRegression(n_jobs=-1, max_iter=500_000), 
                  Ridge(max_iter=500_000), 
                  Lasso(max_iter=500_000), 
                  ElasticNet(max_iter=500_000), 
                  KNeighborsRegressor(n_jobs=-1), 
                  SVR())

In [7]:
# set up common naming convention
index = ['LinearRegression()', 'LogisticRegression()', 'Ridge()', 'Lasso()', 'ElasticNet()', 'KNeighborsRegressor()', 'SVR()']

In [8]:
# Set up GridSearchCV Params for all 6 GridSearch instances

In [9]:
estimator_params_true = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1]},
    Ridge():{
        'ridge__alpha': list(range(1,11)),
        'ridge__fit_intercept': [True],
        'ridge__max_iter': [100_000],
    },
    Lasso(): {
        'lasso__alpha': list(range(1,11)),
        'lasso__fit_intercept': [True],
        'lasso__max_iter': [100_000],
    }, 
    ElasticNet():{
        'elasticnet__alpha': list(range(1,11)),
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': np.arange(0.1, 1, 0.2),
        'elasticnet__max_iter': [100_000],
    }, 
    }

In [10]:
# Hyperparameter range for kbest was found through a manual kbestfeatures

In [11]:
estimator_params_true_k = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    },
    Ridge():{
        'ridge__alpha': list(range(1,11)),
        'ridge__fit_intercept': [True],
        'ridge__max_iter': [100_000],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    },
    Lasso(): {
        'lasso__alpha': list(range(1,11)),
        'lasso__fit_intercept': [True],
        'lasso__max_iter': [100_000],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    }, 
    ElasticNet():{
        'elasticnet__alpha': list(range(1,11)),
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': np.arange(0.1, 1, 0.2),
        'elasticnet__max_iter': [100_000],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5))
    }, 
    }

In [12]:
estimator_params_false = {
    LinearRegression(): {
    'linearregression__fit_intercept': [False],
    'linearregression__n_jobs': [-1]},
    Ridge():{
    'ridge__alpha': list(range(1,11)),
    'ridge__fit_intercept': [False],
    'ridge__max_iter': [100_000]}, 
    Lasso(): {
    'lasso__alpha': list(range(1,51,5)),
    'lasso__fit_intercept': [False],
    'lasso__max_iter': [100_000]},
    ElasticNet():{
    'elasticnet__alpha': list(range(1,11)),
    'elasticnet__fit_intercept': [False],
    'elasticnet__l1_ratio': np.arange(0.1, 1, 0.2),
    'elasticnet__max_iter': [100_000]}, 

}

In [13]:
estimator_params_false_k = {
    LinearRegression(): {
    'linearregression__fit_intercept': [False],
    'linearregression__n_jobs': [-1],
    'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
    'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    },
    Ridge():{
    'ridge__alpha': list(range(1,11)),
    'ridge__fit_intercept': [False],
    'ridge__max_iter': [100_000],
    'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
    'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    }, 
    Lasso(): {
    'lasso__alpha': list(range(1,51,5)),
    'lasso__fit_intercept': [False],
    'lasso__max_iter': [100_000],
    'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
    'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    },
    ElasticNet():{
    'elasticnet__alpha': list(range(1,11)),
    'elasticnet__fit_intercept': [False],
    'elasticnet__l1_ratio': np.arange(0.1, 1, 0.2),
    'elasticnet__max_iter': [100_000],
    'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
    'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    }, 

}

In [14]:
estimator_params_other = {
    KNeighborsRegressor():{
        'kneighborsregressor__n_neighbors': list(range(5, 31,2)),
        'kneighborsregressor__leaf_size': list(range(25, 45,2)),
        'kneighborsregressor__n_jobs': [-1]}, 
    SVR():{
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
}}

In [15]:
estimator_params_other_k = {
    KNeighborsRegressor():{
        'kneighborsregressor__n_neighbors': list(range(25, 45,2)),
        'kneighborsregressor__leaf_size': list(range(25, 45,2)),
        'kneighborsregressor__n_jobs': [-1],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
    }, 
    SVR():{
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
        'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)), 
        'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5)),
}}

In [16]:
# Create Custom Preprocessing Module to include KBest
# This is the version which we will train a baseline with kfeatures
# There is a basic version of preprocessing inside class ModelSelection which does not handle feature selection

numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer()),
    ('num_scaler', StandardScaler()),
    ('num_kbest', SelectKBest())])

categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore')),
    ('cat_scaler', StandardScaler(with_mean=False)),
    ('cat_kbest', SelectKBest(score_func=chi2))])


preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [17]:
# Instantiate Grid and Pipe CV Class Object
ms = models.ModelSelection(X, y)

In [18]:
# Set up Baseline Pipeline
pipe_list = ms.make_pipe(estimator_list)

In [19]:
# Display Pipeline to verify creation
set_config(display='diagram')
pipe_list[0]

In [20]:
# Evaluate Baseline Pipeline

In [21]:
fitted_pipe_list, fitted_pipe_objects = ms.evaluate_pipes(pipe_list=pipe_list)

In [22]:
# Create Results Dataframe, Calculate RSME as benchmark for Kaggle

In [23]:
df_results = pd.DataFrame(fitted_pipe_list, index=index, columns=['Baseline Test Score'])

In [24]:
rsme_list = ms.calculate_rsme(fitted_pipe_objects)

In [25]:
rsme_list = (pd.DataFrame(rsme_list, columns=['rsme'], index=index))

In [26]:
df_results = df_results.join(rsme_list)

In [27]:
df_results

Unnamed: 0,Baseline Test Score,rsme
LinearRegression(),0.743534,40023.587103
LogisticRegression(),0.017544,46334.515213
Ridge(),0.748093,39666.22496
Lasso(),0.744031,39984.777598
ElasticNet(),0.814004,34084.183871
KNeighborsRegressor(),0.758171,38864.71112
SVR(),-0.051186,81029.055916


In [28]:
# Instantiate Baseline with KBest Features
pipe_list_wfs = ms.make_pipe(preprocessing=preprocessing, estimator_list=estimator_list)

In [29]:
# Display Baseline Pipeline with KBest Features
pipe_list_wfs[0]

In [30]:
# Evaluate
fitted_pipe_list_wfs, fitted_pipe_objects_wfs = ms.evaluate_pipes(pipe_list=pipe_list_wfs)

In [31]:
fitted_pipe_list_wfs

[-3.787655638031341e+26,
 0.011695906432748537,
 0.7061440577342226,
 0.7059646622194216,
 0.7124669160037669,
 0.8592406946881062,
 -0.047637674961855]

In [32]:
# Calculate RSME, add to results Dataframe

In [33]:
df_results_wfs = pd.DataFrame(fitted_pipe_list_wfs, index=index, columns=['Baseline w/ KBest'])

In [34]:
rsme_list_wfs = ms.calculate_rsme(fitted_pipe_objects_wfs)

In [35]:
rsme_list_wfs = (pd.DataFrame(rsme_list_wfs, columns=['rsme_kbest'], index=index))

In [36]:
df_results_wfs = df_results_wfs.join(rsme_list_wfs)

In [37]:
df_results = df_results.join(df_results_wfs)

In [38]:
df_results

Unnamed: 0,Baseline Test Score,rsme,Baseline w/ KBest,rsme_kbest
LinearRegression(),0.743534,40023.587103,-3.787656e+26,1.538106e+18
LogisticRegression(),0.017544,46334.515213,0.01169591,43618.38
Ridge(),0.748093,39666.22496,0.7061441,42841.84
Lasso(),0.744031,39984.777598,0.7059647,42854.92
ElasticNet(),0.814004,34084.183871,0.7124669,42378.42
KNeighborsRegressor(),0.758171,38864.71112,0.8592407,29651.01
SVR(),-0.051186,81029.055916,-0.04763767,80892.16


In [39]:
# Optional. save current Dataframe to csv for reports
#df_results.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/notebooks/outputted work/baseline-924.csv')

In [40]:
# Instantiate 6 Gridsear`ch CV for model optimization

In [41]:
pipe_list_true, grid_list_true = ms.make_grid_search(list(estimator_params_true.keys()), 
                                                     list(estimator_params_true.values()))

In [42]:
pipe_list_true_k, grid_list_true_k = ms.make_grid_search_wfs(preprocessing=preprocessing, 
                                                         estimator_list=list(estimator_params_true_k.keys()), 
                                                         params=list(estimator_params_true_k.values()))

In [43]:
pipe_list_false, grid_list_false = ms.make_grid_search(list(estimator_params_false.keys()), 
                                                       list(estimator_params_false.values()))

In [44]:
pipe_list_false_k, grid_list_false_k = ms.make_grid_search_wfs(preprocessing=preprocessing, 
                                                           estimator_list=list(estimator_params_false_k.keys()), 
                                                           params=list(estimator_params_false_k.values()))

In [45]:
pipe_list_other, grid_list_other = ms.make_grid_search(list(estimator_params_other.keys()), 
                                                       list(estimator_params_other.values()))

In [46]:
pipe_list_other_k, grid_list_other_k = ms.make_grid_search_wfs(preprocessing=preprocessing, 
                                                           estimator_list=list(estimator_params_other_k.keys()), 
                                                           params=list(estimator_params_other_k.values()))

In [47]:
# Display 6 GridSearchCV objects to verify it was created correctly

In [48]:
grid_list_true[0]

In [49]:
grid_list_true_k[0]

In [50]:
grid_list_false[0]

In [51]:
grid_list_false_k[0]

In [52]:
grid_list_other[0]

In [53]:
grid_list_other_k[0]

In [54]:
# Evaluate all 6 models. This section will take a while
# Note, when I ran this section, it took about 10 hours on an AMD FX8320e with multiprocessing turned on

In [55]:
grid_fit_scores, grid_fit_objects = ms.evaluate_grid_search(grid_list_true)

In [56]:
grid_fit_scores

[0.7435337489033035,
 0.7525954939567431,
 0.7483895532027648,
 0.7824933657185111]

In [57]:
grid_fit_scores_k, grid_fit_objects_k = ms.evaluate_grid_search(grid_list_true_k)

In [58]:
grid_fit_scores_k

[-2.317916202081439e+23,
 0.6827763148880717,
 0.6788585222043683,
 0.708124394175332]

In [59]:
grid_fit_scores_f, grid_fit_objects_f = ms.evaluate_grid_search(grid_list_false)

In [60]:
grid_fit_scores_f

[0.7462170781641819,
 0.7990646376038946,
 0.7570436152718114,
 0.7874192603057474]

In [61]:
grid_fit_scores_f_k, grid_fit_objects_f_k = ms.evaluate_grid_search(grid_list_false_k)

In [62]:
grid_fit_scores_f_k

[-2.0992380627915587e+25,
 -4.622090821587138,
 -4.694550789019159,
 -4.351671737924874]

In [63]:
grid_fit_scores_o, grid_fit_objects_o = ms.evaluate_grid_search(grid_list_other)

In [64]:
grid_fit_scores_o

[0.7508672035569244, -0.04485611005249379]

In [65]:
# I have given up trying to estimate this portion. This took over 24 hours and never converged
#grid_fit_scores_o_k, grid_fit_objects_o_k = ms.evaluate_grid_search(grid_list_other_k)

In [66]:
#grid_fit_scores_o_k

In [67]:
# Create Individual Dataframes for each GridSearchCV Object, calculate RSME for each

In [68]:
w_intercepts = pd.DataFrame(grid_fit_scores, 
                            index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split(), 
                            columns=['w/ Intercept Score'])

In [71]:
rsme_intercepts = ms.calculate_rsme(grid_fit_objects)

In [72]:
rsme_intercepts = (pd.DataFrame(rsme_intercepts, columns=['rsme_i'], 
                                index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split()))

In [73]:
w_intercepts_k = pd.DataFrame(grid_fit_scores_k, 
                            index=str(list(estimator_params_true_k.keys()))[1:-1].replace(',','').split(), 
                            columns=['w/ Intercept Score, KBest'])

In [76]:
rsme_intercepts_k = ms.calculate_rsme(preprocessing=preprocessing, fitted_pipe_objects=grid_fit_objects_k)

In [77]:
rsme_intercepts_k = (pd.DataFrame(rsme_intercepts_k, columns=['rsme_i_kbest'], 
                                index=str(list(estimator_params_true_k.keys()))[1:-1].replace(',','').split()))

In [78]:
w_intercepts_k

Unnamed: 0,"w/ Intercept Score, KBest"
LinearRegression(),-2.317916e+23
Ridge(),0.6827763
Lasso(),0.6788585
ElasticNet(),0.7081244


In [79]:
rsme_intercepts_k

Unnamed: 0,rsme_i_kbest
LinearRegression(),3.804957e+16
Ridge(),44512.68
Lasso(),44786.7
ElasticNet(),42697.24


In [80]:
n_intercepts = pd.DataFrame(grid_fit_scores_f, 
                            index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split(), 
                            columns=['n/ Intercept Score'])

In [82]:
rsme_n_intercepts = ms.calculate_rsme(grid_fit_objects_f)

In [83]:
rsme_n_intercepts = (pd.DataFrame(rsme_n_intercepts,
                                  columns=['rsme_n_i'], index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split()))

In [84]:
n_intercepts_k = pd.DataFrame(grid_fit_scores_f_k, 
                            index=str(list(estimator_params_false_k.keys()))[1:-1].replace(',','').split(), 
                            columns=['n/ Intercept Score w/ KBest'])

In [87]:
rsme_n_intercepts_k = ms.calculate_rsme(preprocessing=preprocessing, fitted_pipe_objects=grid_fit_objects_f_k)

In [88]:
rsme_n_intercepts_k = (pd.DataFrame(rsme_n_intercepts_k, 
                                  columns=['rsme_n_i_kbest'], index=str(list(estimator_params_false_k.keys()))[1:-1].replace(',','').split()))

In [89]:
other = pd.DataFrame(grid_fit_scores_o, 
                     index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split(), 
                     columns=['other Score'])

In [91]:
rsme_other = ms.calculate_rsme(grid_fit_objects_o)

In [92]:
rsme_other = (pd.DataFrame(rsme_other, 
                           columns=['rsme_o'], 
                           index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split()))

In [93]:
# Remove this portion from knn with feature selection

In [94]:
#other_k = pd.DataFrame(grid_fit_scores_o_k, 
#                     index=str(list(estimator_params_other_k.keys()))[1:-1].replace(',','').split(), 
#                     columns=['other Score w/ kbest'])

In [95]:
#rsme_other_k = ms.calculate_rsme(preprocessing, grid_fit_objects_o_k)

In [96]:
#rsme_other_k = (pd.DataFrame(rsme_other_k, 
#                           columns=['rsme_o_k'], 
#                           index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split()))

In [97]:
# Add DataFrames from 6 models to df_results

In [98]:
df_results = df_results.join(w_intercepts)

In [99]:
df_results = df_results.join(rsme_intercepts)

In [100]:
df_results = df_results.join(w_intercepts_k)

In [101]:
df_results = df_results.join(rsme_intercepts_k)

In [102]:
df_results = df_results.join(n_intercepts)

In [103]:
df_results = df_results.join(rsme_n_intercepts)

In [104]:
f_results = df_results.join(n_intercepts_k)

In [105]:
df_results = df_results.join(rsme_n_intercepts_k)

In [106]:
df_results = df_results.join(other)

In [107]:
df_results = df_results.join(rsme_other)

In [108]:
#df_results = df_results.join(other_k)

In [109]:
#df_results = df_results.join(rsme_other_k)

In [110]:
# let's see what's the best by looking at the lowest RSME score. Our goal is to find a model with the lowest RSME so we can do a deep dive
# We can use joblib to save the entire state of our model and load it into another Jupyter Notebook

In [111]:
df_results

Unnamed: 0,Baseline Test Score,rsme,Baseline w/ KBest,rsme_kbest,w/ Intercept Score,rsme_i,"w/ Intercept Score, KBest",rsme_i_kbest,n/ Intercept Score,rsme_n_i,rsme_n_i_kbest,other Score,rsme_o
LinearRegression(),0.743534,40023.587103,-3.787656e+26,1.538106e+18,0.743534,40023.587103,-2.317916e+23,3.804957e+16,0.746217,39813.659178,3.621027e+17,,
LogisticRegression(),0.017544,46334.515213,0.01169591,43618.38,,,,,,,,,
Ridge(),0.748093,39666.22496,0.7061441,42841.84,0.752595,39310.149949,0.6827763,44512.68,0.799065,35426.568292,187391.5,,
Lasso(),0.744031,39984.777598,0.7059647,42854.92,0.74839,39642.883142,0.6788585,44786.7,0.757044,38955.165705,188595.2,,
ElasticNet(),0.814004,34084.183871,0.7124669,42378.42,0.782493,36858.45712,0.7081244,42697.24,0.787419,36438.69834,182829.2,,
KNeighborsRegressor(),0.758171,38864.71112,0.8592407,29651.01,,,,,,,,0.750867,39447.215191
SVR(),-0.051186,81029.055916,-0.04763767,80892.16,,,,,,,,-0.044856,80784.70458


In [112]:
grid_fit_objects_k[0].best_score_

-2.6290946154685833e+23

In [113]:
#save = dump(grid_fit_objects_f[2], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/lasso-rsme-22887.joblib')

In [114]:
grid_fit_objects_f[2].best_params_

{'lasso__alpha': 46, 'lasso__fit_intercept': False, 'lasso__max_iter': 100000}