In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load

from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR, LinearSVC

import models as models

In [3]:
# Load the raw training data, then work on a copy so the raw frame stays intact.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_csv_path = '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv'
df_train_raw = pd.read_csv(train_csv_path)
df = df_train_raw.copy()

In [4]:
# Load the raw test data and keep a separate working copy.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_csv_path = '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv'
df_test_raw = pd.read_csv(test_csv_path)
df_test = df_test_raw.copy()

In [5]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
# Use the entire training frame: the target is SalePrice, the features are
# every remaining column.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [7]:
# Baseline estimators, all with default hyperparameters (no tuning).
# The iterative solvers get a generous iteration cap.
# NOTE(review): LogisticRegression is a classifier, but the target used in this
# notebook is SalePrice — confirm that including it in the baseline is intended.
baseline_max_iter = 500_000

estimator_list = (
    LinearRegression(n_jobs=-1),
    LogisticRegression(n_jobs=-1, max_iter=baseline_max_iter),
    Ridge(max_iter=baseline_max_iter),
    Lasso(max_iter=baseline_max_iter),
    ElasticNet(max_iter=baseline_max_iter),
    KNeighborsRegressor(n_jobs=-1),
    SVR(),
)

In [8]:
# Common naming convention for result tables: "<ClassName>()" per estimator.
# The labels are derived from estimator_list so they cannot drift out of sync
# (in count or order) with the estimators that are actually fitted.
index = [f'{type(estimator).__name__}()' for estimator in estimator_list]

In [9]:
# Set up GridSearchCV Params for all 6 GridSearch instances

In [10]:
# Grid-search space for the linear models with the intercept enabled and no
# KBest feature selection. Keys are unfitted estimators; values are their
# parameter grids (names prefixed with the lower-cased class name, presumably
# the pipeline step name — confirm against models.ModelSelection).
alphas = list(range(1, 11))          # 1, 2, ..., 10
l1_ratios = np.arange(0.1, 1, 0.1)   # 0.1, 0.2, ..., 0.9
solver_iters = [100_000]

estimator_params_true = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1],
    },
    Ridge(): {
        'ridge__alpha': alphas,
        'ridge__fit_intercept': [True],
        'ridge__max_iter': solver_iters,
    },
    Lasso(): {
        'lasso__alpha': alphas,
        'lasso__fit_intercept': [True],
        'lasso__max_iter': solver_iters,
    },
    ElasticNet(): {
        'elasticnet__alpha': alphas,
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': l1_ratios,
        'elasticnet__max_iter': solver_iters,
    },
}

In [11]:
# The k ranges for the KBest steps below were found through a manual KBest feature-selection run

In [12]:
# Grid-search space for the linear models with the intercept enabled, plus the
# number of features kept by the numeric and categorical KBest steps of the
# custom `preprocessing` ColumnTransformer.
alphas = list(range(1, 11))          # 1, 2, ..., 10
l1_ratios = np.arange(0.1, 1, 0.1)   # 0.1, 0.2, ..., 0.9
solver_iters = [100_000]
kbest_grid_true = {
    'columntransformer__num__num_kbest__k': list(range(10, 20, 2)),
    'columntransformer__cat__cat_kbest__k': list(range(10, 25, 2)),
}

estimator_params_true_k = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1],
        **kbest_grid_true,
    },
    Ridge(): {
        'ridge__alpha': alphas,
        'ridge__fit_intercept': [True],
        'ridge__max_iter': solver_iters,
        **kbest_grid_true,
    },
    Lasso(): {
        'lasso__alpha': alphas,
        'lasso__fit_intercept': [True],
        'lasso__max_iter': solver_iters,
        **kbest_grid_true,
    },
    ElasticNet(): {
        'elasticnet__alpha': alphas,
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': l1_ratios,
        'elasticnet__max_iter': solver_iters,
        **kbest_grid_true,
    },
}

In [13]:
# Grid-search space for the linear models with the intercept disabled and no
# KBest feature selection.
# NOTE(review): Lasso searches odd alphas 1..49 here, whereas the KBest variant
# of this grid searches every integer 1..50 — confirm the difference is intended.
alphas = list(range(1, 11))              # 1, 2, ..., 10
lasso_alphas = list(range(1, 51, 2))     # 1, 3, ..., 49
l1_ratios = np.arange(0.1, 1, 0.1)       # 0.1, 0.2, ..., 0.9
solver_iters = [100_000]

estimator_params_false = {
    LinearRegression(): {
        'linearregression__fit_intercept': [False],
        'linearregression__n_jobs': [-1],
    },
    Ridge(): {
        'ridge__alpha': alphas,
        'ridge__fit_intercept': [False],
        'ridge__max_iter': solver_iters,
    },
    Lasso(): {
        'lasso__alpha': lasso_alphas,
        'lasso__fit_intercept': [False],
        'lasso__max_iter': solver_iters,
    },
    ElasticNet(): {
        'elasticnet__alpha': alphas,
        'elasticnet__fit_intercept': [False],
        'elasticnet__l1_ratio': l1_ratios,
        'elasticnet__max_iter': solver_iters,
    },
}

In [14]:
# Grid-search space for the linear models with the intercept disabled, plus the
# number of features kept by the numeric and categorical KBest steps.
# NOTE(review): Lasso searches every integer alpha 1..50 here, whereas the
# non-KBest variant searches only odd values 1..49 — confirm this is intended.
alphas = list(range(1, 11))          # 1, 2, ..., 10
lasso_alphas = list(range(1, 51))    # 1, 2, ..., 50
l1_ratios = np.arange(0.1, 1, 0.1)   # 0.1, 0.2, ..., 0.9
solver_iters = [100_000]
kbest_grid_false = {
    'columntransformer__num__num_kbest__k': list(range(10, 20, 2)),
    'columntransformer__cat__cat_kbest__k': list(range(10, 25, 2)),
}

estimator_params_false_k = {
    LinearRegression(): {
        'linearregression__fit_intercept': [False],
        'linearregression__n_jobs': [-1],
        **kbest_grid_false,
    },
    Ridge(): {
        'ridge__alpha': alphas,
        'ridge__fit_intercept': [False],
        'ridge__max_iter': solver_iters,
        **kbest_grid_false,
    },
    Lasso(): {
        'lasso__alpha': lasso_alphas,
        'lasso__fit_intercept': [False],
        'lasso__max_iter': solver_iters,
        **kbest_grid_false,
    },
    ElasticNet(): {
        'elasticnet__alpha': alphas,
        'elasticnet__fit_intercept': [False],
        'elasticnet__l1_ratio': l1_ratios,
        'elasticnet__max_iter': solver_iters,
        **kbest_grid_false,
    },
}

In [15]:
# Grid-search space for the non-linear models (KNN and SVR), without KBest
# feature selection.
knn_neighbors = list(range(5, 31, 2))     # 5, 7, ..., 29
knn_leaf_sizes = list(range(25, 45, 2))   # 25, 27, ..., 43

estimator_params_other = {
    KNeighborsRegressor(): {
        'kneighborsregressor__n_neighbors': knn_neighbors,
        'kneighborsregressor__leaf_size': knn_leaf_sizes,
        'kneighborsregressor__n_jobs': [-1],
    },
    SVR(): {
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
    },
}

In [16]:
# Grid-search space for the non-linear models (KNN and SVR), plus the number of
# features kept by the numeric and categorical KBest steps.
# NOTE(review): n_neighbors uses the same 25..43 range as leaf_size here, while
# the non-KBest grid searches 5..29 — looks like a possible copy/paste; confirm.
knn_neighbors = list(range(25, 45, 2))    # 25, 27, ..., 43
knn_leaf_sizes = list(range(25, 45, 2))   # 25, 27, ..., 43
kbest_grid_other = {
    'columntransformer__num__num_kbest__k': list(range(10, 20, 2)),
    'columntransformer__cat__cat_kbest__k': list(range(10, 25, 2)),
}

estimator_params_other_k = {
    KNeighborsRegressor(): {
        'kneighborsregressor__n_neighbors': knn_neighbors,
        'kneighborsregressor__leaf_size': knn_leaf_sizes,
        'kneighborsregressor__n_jobs': [-1],
        **kbest_grid_other,
    },
    SVR(): {
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
        **kbest_grid_other,
    },
}

In [17]:
# Custom preprocessing that adds univariate feature selection (KBest).
# This is the version used to train the baseline and grid searches "with KBest".
# A basic preprocessing without feature selection lives inside class ModelSelection.
#
# The target (SalePrice) is continuous, so both KBest steps score features with
# f_regression. The previous scorers (SelectKBest's default f_classif, and chi2)
# are classification tests that treat every distinct price as a class label.
# Step names are unchanged, so the existing '..._kbest__k' grid keys still apply.

# Numeric columns: impute, standardise, keep the k best-scoring features.
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer()),
    ('num_scaler', StandardScaler()),
    ('num_kbest', SelectKBest(score_func=f_regression))])

# Object columns: fill missing values with the literal 'Other', one-hot encode
# (categories unseen during fit are ignored), scale without centring, then keep
# the k best-scoring dummy columns.
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore')),
    ('cat_scaler', StandardScaler(with_mean=False)),
    ('cat_kbest', SelectKBest(score_func=f_regression))])


# Route columns to the two sub-pipelines by dtype.
preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [18]:
# Instantiate Grid and Pipe CV Class Object
# ModelSelection comes from the project-local `models` module (not shown in this
# notebook); it receives the full feature frame X and the SalePrice target y.
ms = models.ModelSelection(X, y)

In [19]:
# Set up Baseline Pipeline
# No `preprocessing` argument is passed here, so presumably ModelSelection's own
# basic preprocessing is used — confirm against models.ModelSelection.make_pipe.
pipe_list = ms.make_pipe(estimator_list)

In [20]:
# Display Pipeline to verify creation
# set_config is a global scikit-learn setting: estimators displayed from here on
# render as diagrams. Only the first pipeline is shown as a spot check.
set_config(display='diagram')
pipe_list[0]

In [21]:
# Evaluate Baseline Pipeline

In [None]:
# Fit and score every baseline pipeline. The first returned value is used below
# as the per-estimator score column, the second is passed to ms.calculate_rsme.
fitted_pipe_list, fitted_pipe_objects = ms.evaluate_pipes(pipe_list=pipe_list)

In [None]:
# Create the results DataFrame and calculate RMSE as the benchmark for Kaggle

In [None]:
# Results table: one row per baseline estimator, labelled by the shared `index`.
df_results = pd.DataFrame(
    data=fitted_pipe_list,
    columns=['Baseline Test Score'],
    index=index,
)

In [None]:
# Error metric for each fitted baseline pipeline.
# NOTE(review): calculate_rsme is called with one argument here but with two
# (preprocessing, fitted objects) in the grid-search cells further down — confirm
# which call form models.ModelSelection actually supports.
rsme_list = ms.calculate_rsme(fitted_pipe_objects)

In [None]:
# Wrap the baseline error values in a single-column frame keyed by estimator name.
rsme_list = pd.DataFrame(
    data=rsme_list,
    index=index,
    columns=['rsme'],
)

In [None]:
# Add the 'rsme' column next to the baseline scores (joined on estimator name).
# Not idempotent: re-running this cell attempts to add the same column again.
df_results = df_results.join(rsme_list)

In [None]:
df_results

In [None]:
# Instantiate Baseline with KBest Features
# Same estimators as the plain baseline, but built on the custom `preprocessing`
# ColumnTransformer that contains the KBest feature-selection steps.
pipe_list_wfs = ms.make_pipe(preprocessing=preprocessing, estimator_list=estimator_list)

In [None]:
# Display Baseline Pipeline with KBest Features
pipe_list_wfs[0]

In [None]:
# Evaluate
# Fit and score the KBest baseline pipelines; returns the scores and the fitted
# objects, mirroring the plain-baseline evaluation above.
fitted_pipe_list_wfs, fitted_pipe_objects_wfs = ms.evaluate_pipes(pipe_list=pipe_list_wfs)

In [None]:
fitted_pipe_list_wfs

In [None]:
# Calculate RMSE and add it to the results DataFrame

In [None]:
# KBest-baseline scores, one row per estimator, labelled by the shared `index`.
df_results_wfs = pd.DataFrame(
    data=fitted_pipe_list_wfs,
    columns=['Baseline w/ KBest'],
    index=index,
)

In [None]:
# Error metric for each fitted KBest-baseline pipeline.
# NOTE(review): single-argument call form; the grid-search cells further down
# pass (preprocessing, fitted objects) — confirm against models.ModelSelection.
rsme_list_wfs = ms.calculate_rsme(fitted_pipe_objects_wfs)

In [None]:
# Wrap the KBest-baseline error values in a single-column frame keyed by estimator name.
rsme_list_wfs = pd.DataFrame(
    data=rsme_list_wfs,
    index=index,
    columns=['rsme_kbest'],
)

In [None]:
# Put the 'rsme_kbest' column next to the 'Baseline w/ KBest' scores.
df_results_wfs = df_results_wfs.join(rsme_list_wfs)

In [None]:
# Merge the KBest-baseline columns into the main results table (same index labels).
df_results = df_results.join(df_results_wfs)

In [None]:
df_results

In [None]:
# Optional. save current Dataframe to csv for reports
#df_results.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/notebooks/outputted work/baseline-924.csv')

In [None]:
# Instantiate the 6 GridSearchCV objects used for model optimization

In [None]:
# Intercept=True grids with the default preprocessing. Estimators (dict keys)
# and their param grids (dict values) are passed as two lists in matching order.
pipe_list_true, grid_list_true = ms.make_grid_search(
    list(estimator_params_true),
    list(estimator_params_true.values()),
)

In [None]:
# Intercept=True grids built on the custom KBest `preprocessing`.
pipe_list_true_k, grid_list_true_k = ms.make_grid_search_wfs(
    preprocessing=preprocessing,
    estimator_list=list(estimator_params_true_k),
    params=list(estimator_params_true_k.values()),
)

In [None]:
# Intercept=False grids with the default preprocessing.
pipe_list_false, grid_list_false = ms.make_grid_search(
    list(estimator_params_false),
    list(estimator_params_false.values()),
)

In [None]:
# Intercept=False grids built on the custom KBest `preprocessing`.
pipe_list_false_k, grid_list_false_k = ms.make_grid_search_wfs(
    preprocessing=preprocessing,
    estimator_list=list(estimator_params_false_k),
    params=list(estimator_params_false_k.values()),
)

In [None]:
# KNN / SVR grids with the default preprocessing.
pipe_list_other, grid_list_other = ms.make_grid_search(
    list(estimator_params_other),
    list(estimator_params_other.values()),
)

In [None]:
# KNN / SVR grids built on the custom KBest `preprocessing`.
pipe_list_other_k, grid_list_other_k = ms.make_grid_search_wfs(
    preprocessing=preprocessing,
    estimator_list=list(estimator_params_other_k),
    params=list(estimator_params_other_k.values()),
)

In [None]:
# Display 6 GridSearchCV objects to verify it was created correctly

In [None]:
grid_list_true[0]

In [None]:
grid_list_true_k[0]

In [None]:
grid_list_false[0]

In [None]:
grid_list_false_k[0]

In [None]:
grid_list_other[0]

In [None]:
grid_list_other_k[0]

In [None]:
# Evaluate all 6 models. This section will take a while
# Note, when I ran this section, it took about 10 hours on an AMD FX8320e with multiprocessing turned on

In [None]:
# Run the intercept=True grid searches (default preprocessing).
# Returns the scores plus the fitted search objects used for the error metric below.
grid_fit_scores, grid_fit_objects = ms.evaluate_grid_search(grid_list_true)

In [None]:
grid_fit_scores

In [None]:
# Run the intercept=True grid searches that include KBest feature selection.
grid_fit_scores_k, grid_fit_objects_k = ms.evaluate_grid_search(grid_list_true_k)

In [None]:
grid_fit_scores_k

In [None]:
# Run the intercept=False grid searches (default preprocessing).
grid_fit_scores_f, grid_fit_objects_f = ms.evaluate_grid_search(grid_list_false)

In [None]:
grid_fit_scores_f

In [None]:
# Run the intercept=False grid searches that include KBest feature selection.
grid_fit_scores_f_k, grid_fit_objects_f_k = ms.evaluate_grid_search(grid_list_false_k)

In [None]:
grid_fit_scores_f_k

In [None]:
# Run the KNN / SVR grid searches (default preprocessing).
grid_fit_scores_o, grid_fit_objects_o = ms.evaluate_grid_search(grid_list_other)

In [None]:
grid_fit_scores_o

In [None]:
# I have given up trying to estimate this portion. This took over 24 hours and never converged
#grid_fit_scores_o_k, grid_fit_objects_o_k = ms.evaluate_grid_search(grid_list_other_k)

In [None]:
#grid_fit_scores_o_k

In [None]:
# Create an individual DataFrame for each GridSearchCV result and calculate the RMSE for each

In [None]:
# Scores of the intercept=True grid searches, one row per estimator.
# Row labels are "<ClassName>()" built from each estimator's class name so they
# match df_results' index. This replaces parsing the printed repr of the key
# list, which breaks as soon as a repr contains parameters (spaces/commas).
w_intercepts = pd.DataFrame(grid_fit_scores,
                            index=[f'{type(est).__name__}()' for est in estimator_params_true],
                            columns=['w/ Intercept Score'])

In [None]:
# Error metric for the intercept=True grid searches.
# NOTE(review): two-argument call (preprocessing, fitted objects); the baseline
# cells call calculate_rsme with the fitted objects only — confirm which form
# models.ModelSelection supports.
rsme_intercepts = ms.calculate_rsme(ms.preprocessing, grid_fit_objects)

In [None]:
# Wrap the intercept=True error values in a frame keyed by "<ClassName>()".
# Labels come from the estimators' class names rather than from parsing the
# printed repr of the key list, which breaks once a repr contains parameters.
rsme_intercepts = pd.DataFrame(rsme_intercepts, columns=['rsme_i'],
                               index=[f'{type(est).__name__}()' for est in estimator_params_true])

In [None]:
# Scores of the intercept=True + KBest grid searches, one row per estimator.
# Row labels are "<ClassName>()" built from each estimator's class name (instead
# of parsing the printed repr of the key list) so they match df_results' index.
w_intercepts_k = pd.DataFrame(grid_fit_scores_k,
                              index=[f'{type(est).__name__}()' for est in estimator_params_true_k],
                              columns=['w/ Intercept Score, KBest'])

In [None]:
# Error metric for the intercept=True + KBest grid searches; passes the custom
# `preprocessing` defined in this notebook rather than ms.preprocessing.
# NOTE(review): two-argument call form — the baseline cells pass only the fitted
# objects; confirm against models.ModelSelection.
rsme_intercepts_k = ms.calculate_rsme(preprocessing, grid_fit_objects_k)

In [None]:
# Wrap the intercept=True + KBest error values in a frame keyed by "<ClassName>()".
# Labels come from the estimators' class names rather than from parsing the
# printed repr of the key list, which breaks once a repr contains parameters.
rsme_intercepts_k = pd.DataFrame(rsme_intercepts_k, columns=['rsme_i_kbest'],
                                 index=[f'{type(est).__name__}()' for est in estimator_params_true_k])

In [None]:
w_intercepts_k

In [None]:
rsme_intercepts_k

In [None]:
# Scores of the intercept=False grid searches, one row per estimator.
# Row labels are "<ClassName>()" built from each estimator's class name (instead
# of parsing the printed repr of the key list) so they match df_results' index.
n_intercepts = pd.DataFrame(grid_fit_scores_f,
                            index=[f'{type(est).__name__}()' for est in estimator_params_false],
                            columns=['n/ Intercept Score'])

In [None]:
# Error metric for the intercept=False grid searches.
# NOTE(review): two-argument call form — the baseline cells pass only the fitted
# objects; confirm against models.ModelSelection.
rsme_n_intercepts = ms.calculate_rsme(ms.preprocessing, grid_fit_objects_f)

In [None]:
# Wrap the intercept=False error values in a frame keyed by "<ClassName>()".
# Labels come from the estimators' class names rather than from parsing the
# printed repr of the key list, which breaks once a repr contains parameters.
rsme_n_intercepts = pd.DataFrame(rsme_n_intercepts,
                                 columns=['rsme_n_i'],
                                 index=[f'{type(est).__name__}()' for est in estimator_params_false])

In [None]:
# Scores of the intercept=False + KBest grid searches, one row per estimator.
# Row labels are "<ClassName>()" built from each estimator's class name (instead
# of parsing the printed repr of the key list) so they match df_results' index.
n_intercepts_k = pd.DataFrame(grid_fit_scores_f_k,
                              index=[f'{type(est).__name__}()' for est in estimator_params_false_k],
                              columns=['n/ Intercept Score w/ KBest'])

In [None]:
# Error metric for the intercept=False + KBest grid searches; passes the custom
# `preprocessing` defined in this notebook rather than ms.preprocessing.
# NOTE(review): two-argument call form — the baseline cells pass only the fitted
# objects; confirm against models.ModelSelection.
rsme_n_intercepts_k = ms.calculate_rsme(preprocessing, grid_fit_objects_f_k)

In [None]:
# Wrap the intercept=False + KBest error values in a frame keyed by "<ClassName>()".
# Labels come from the estimators' class names rather than from parsing the
# printed repr of the key list, which breaks once a repr contains parameters.
rsme_n_intercepts_k = pd.DataFrame(rsme_n_intercepts_k,
                                   columns=['rsme_n_i_kbest'],
                                   index=[f'{type(est).__name__}()' for est in estimator_params_false_k])

In [None]:
# Scores of the KNN / SVR grid searches, one row per estimator.
# Row labels are "<ClassName>()" built from each estimator's class name (instead
# of parsing the printed repr of the key list) so they match df_results' index.
other = pd.DataFrame(grid_fit_scores_o,
                     index=[f'{type(est).__name__}()' for est in estimator_params_other],
                     columns=['other Score'])

In [None]:
# Error metric for the KNN / SVR grid searches.
# NOTE(review): two-argument call form — the baseline cells pass only the fitted
# objects; confirm against models.ModelSelection.
rsme_other = ms.calculate_rsme(ms.preprocessing, grid_fit_objects_o)

In [None]:
# Wrap the KNN / SVR error values in a frame keyed by "<ClassName>()".
# Labels come from the estimators' class names rather than from parsing the
# printed repr of the key list, which breaks once a repr contains parameters.
rsme_other = pd.DataFrame(rsme_other,
                          columns=['rsme_o'],
                          index=[f'{type(est).__name__}()' for est in estimator_params_other])

In [None]:
# The KNN/SVR grid search with feature selection never finished, so this portion is commented out

In [None]:
#other_k = pd.DataFrame(grid_fit_scores_o_k, 
#                     index=str(list(estimator_params_other_k.keys()))[1:-1].replace(',','').split(), 
#                     columns=['other Score w/ kbest'])

In [None]:
#rsme_other_k = ms.calculate_rsme(preprocessing, grid_fit_objects_o_k)

In [None]:
#rsme_other_k = (pd.DataFrame(rsme_other_k, 
#                           columns=['rsme_o_k'], 
#                           index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split()))

In [None]:
# Add DataFrames from 6 models to df_results

In [None]:
# Add the 'w/ Intercept Score' column; estimators that were not part of this
# grid search have no matching row label and end up as NaN.
df_results = df_results.join(w_intercepts)

In [None]:
# Add the 'rsme_i' column (error metric of the intercept=True grid searches).
df_results = df_results.join(rsme_intercepts)

In [None]:
# Add the 'w/ Intercept Score, KBest' column.
df_results = df_results.join(w_intercepts_k)

In [None]:
# Add the 'rsme_i_kbest' column (error metric of the intercept=True + KBest grid searches).
df_results = df_results.join(rsme_intercepts_k)

In [None]:
# Add the 'n/ Intercept Score' column.
df_results = df_results.join(n_intercepts)

In [None]:
# Add the 'rsme_n_i' column (error metric of the intercept=False grid searches).
df_results = df_results.join(rsme_n_intercepts)

In [None]:
# Add the 'n/ Intercept Score w/ KBest' column to the results table.
# The result must be bound back to df_results: it was previously assigned to a
# stray name (`f_results`), so this column never reached the final table.
df_results = df_results.join(n_intercepts_k)

In [None]:
# Add the 'rsme_n_i_kbest' column (error metric of the intercept=False + KBest grid searches).
df_results = df_results.join(rsme_n_intercepts_k)

In [None]:
# Add the 'other Score' column (KNN / SVR grid searches).
df_results = df_results.join(other)

In [None]:
# Add the 'rsme_o' column (error metric of the KNN / SVR grid searches).
df_results = df_results.join(rsme_other)

In [None]:
#df_results = df_results.join(other_k)

In [None]:
#df_results = df_results.join(rsme_other_k)

In [None]:
# Let's find the best model by looking for the lowest RMSE score. Our goal is to find the model with the lowest RMSE so we can do a deep dive on it.
# We can use joblib to save the entire state of that model and load it into another Jupyter Notebook.

In [None]:
df_results

In [None]:
grid_fit_objects_k[0].best_score_

In [None]:
#save = dump(grid_fit_objects_f[2], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/lasso-rsme-22887.joblib')

In [None]:
grid_fit_objects_f[2].best_params_