In [75]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from sklearn.svm import SVR, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.metrics import mean_squared_error

from sklearn import set_config
from joblib import dump, load

In [2]:
df_train_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv')

df = df_train_raw.copy()

In [79]:
df_test_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv')
df_test = df_test_raw.copy()

In [3]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
df_dummies = pd.get_dummies(df)

In [5]:
df_dummies.corr()['SalePrice'].sort_values(ascending=False).head(10)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Bsmt Qual_Ex       0.586497
Year Built         0.571849
Kitchen Qual_Ex    0.551284
Name: SalePrice, dtype: float64

In [6]:
#X = df[['Overall Qual', 'Gr Liv Area', 'Total Bsmt SF']]
#y = df['SalePrice']

In [7]:
# This will use the entire dataset
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [8]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#numeric_features = list(X.loc[:, X.dtypes == object].columns)
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', StandardScaler())])

#cat_features = list(X.loc[:, X.dtypes != object].columns)
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [10]:
estimator_list = (LinearRegression(n_jobs=-1), 
                  LogisticRegression(n_jobs=-1, max_iter=100_000), 
                  Ridge(max_iter=100_000), 
                  Lasso(max_iter=100_000), 
                  ElasticNet(max_iter=100_000), 
                  KNeighborsRegressor(n_jobs=-1), 
                  SVR())

In [11]:
index = ['LinearRegression()', 'LogisticRegression()', 'Ridge()', 'Lasso()', 'ElasticNet()', 'KNeighborsRegressor()', 'SVR()']

In [38]:
estimator_params_true = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1]},
    Ridge():{
        'ridge__alpha': list(range(1,11)),
        'ridge__fit_intercept': [True],
        'ridge__max_iter': [100_000],
    },
    Lasso(): {
        'lasso__alpha': list(range(1,11)),
        'lasso__fit_intercept': [True],
        'lasso__max_iter': [100_000],
    }, 
    ElasticNet():{
        'elasticnet__alpha': list(range(1,11)),
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': np.arange(0.1, 1, 0.1),
        'elasticnet__max_iter': [100_000],
    }, 
    }

In [39]:
estimator_params_false = {
    LinearRegression(): {
    'linearregression__fit_intercept': [False],
    'linearregression__n_jobs': [-1]},
    Ridge():{
    'ridge__alpha': list(range(1,11)),
    'ridge__fit_intercept': [False],
    'ridge__max_iter': [100_000]}, 
    Lasso(): {
    'lasso__alpha': list(range(1,51)),
    'lasso__fit_intercept': [False],
    'lasso__max_iter': [100_000]},
    ElasticNet():{
    'elasticnet__alpha': list(range(1,11)),
    'elasticnet__fit_intercept': [False],
    'elasticnet__l1_ratio': np.arange(0.1, 1, 0.1),
    'elasticnet__max_iter': [100_000]}, 

}

In [100]:
estimator_params_other = {
    KNeighborsRegressor():{
        'kneighborsregressor__n_neighbors': list(range(1, 51)),
        'kneighborsregressor__leaf_size': list(range(1, 51)),
        'kneighborsregressor__n_jobs': [-1]}, 
    SVR():{
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
}}

In [101]:
class ModelSelection:
    '''
    This holds several functions
    '''
    
    def __init__(self, X, y):
        '''
        This creates the X_train, X_test, y_train, y_test arrays
        '''
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    def make_pipe(self, preprocessing, estimator_list):
        '''
        This takes in a list of estimators and a preprocessing pipe object
        It outputs a list of pipes with a preprocessing object and an estimator object
        '''
        pipe_list = []
        for estimator in estimator_list:
            pipe_list.append(make_pipeline(preprocessing, estimator))
        
        return pipe_list
        
    
    def evaluate_pipes(self, pipe_list):
        '''
        This evaluates each pipe object in a list and returns back a list of scores and best params
        It takes in two X and Y, uses train_test_split to separate into 4 different arrays
        Then it evaluates each model on a test set and measures the accuracy
        The function returns the list of scores
        '''
        
        scores = []
        objects = []
        for pipe in pipe_list:
            pipe_object = pipe.fit(self.X_train,self.y_train)
            scores.append(pipe_object.score(self.X_test, self.y_test))
            objects.append(pipe_object)
        return scores, objects
    
    def calculate_rsme(self, preprocessing, fitted_pipe_objects):
        '''
        This takes in a list of fitted pipe objects, self variables
        find residuials
        
        '''
        list_rsme = []
        #processed_x_test = preprocessing.transform(self.X_test)
        for pipe in fitted_pipe_objects:
            preds = pipe.predict(self.X_test)
            rsme = mean_squared_error(self.y_test, preds, squared=False)
            list_rsme.append(rsme)
            
        return list_rsme
            
    def make_grid_search(self, preprocessing, estimator_list, params):
        '''
        This is a function that takes in a preprocessing pipeline, a list of estimator objects and params
        This returns a list of fitted GridSearchCV Objects that can then be iterated through
        '''
        
        pipe_array = self.make_pipe(preprocessing, estimator_list) 
        grid_array = []
        
        for pipe_object, param in list(zip(pipe_array, params)):
            grid_array.append(
                GridSearchCV(
                    estimator=pipe_object, 
                    param_grid=param,
                    n_jobs=-1))

        return pipe_array, grid_array
    
    def evaluate_grid_search(self, grid_list):
        '''
        Takes in a list of grid objects
        Trains them on initial X and Y train test split
        evaluates on test dataset
        outputs a list of accuracy scores, list of fitted grid objects
        '''
        scores = []
        objects = []
        
        for grid in grid_list:
            grid_object = grid.fit(self.X_train, self.y_train)
            scores.append(grid.score(self.X_test, self.y_test))
            objects.append(grid_object)
        
        return scores, objects

In [102]:
# Instantiate class
ms = ModelSelection(X, y)

In [17]:
pipe_list = ms.make_pipe(preprocessing, estimator_list)

In [18]:
set_config(display='diagram')
#pipe_list[0]

In [19]:
fitted_pipe_list, fitted_pipe_objects = ms.evaluate_pipes(pipe_list=pipe_list)

In [20]:
df_results = pd.DataFrame(fitted_pipe_list, index=index, columns=['Baseline Test Score'])

In [21]:
rsme_list = ms.calculate_rsme(preprocessing, fitted_pipe_objects)

In [24]:
rsme_list = (pd.DataFrame(rsme_list, columns=['rsme'], index=index))

In [25]:
df_results = df_results.join(rsme_list)

In [27]:
#df_results.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/baseline.csv')

In [42]:
pipe_list_true, grid_list_true = ms.make_grid_search(preprocessing, list(estimator_params_true.keys()), list(estimator_params_true.values()))

In [43]:
pipe_list_false, grid_list_false = ms.make_grid_search(preprocessing, list(estimator_params_false.keys()), list(estimator_params_false.values()))

In [103]:
pipe_list_other, grid_list_other = ms.make_grid_search(preprocessing, list(estimator_params_other.keys()), list(estimator_params_other.values()))

In [45]:
grid_fit_scores, grid_fit_objects = ms.evaluate_grid_search(grid_list_true)

In [46]:
grid_fit_scores

[0.9058126620778286,
 0.9143666369230723,
 0.9220971038346236,
 0.9071318657493225]

In [47]:
grid_fit_scores_f, grid_fit_objects_f = ms.evaluate_grid_search(grid_list_false)

In [48]:
grid_fit_scores_f

[0.9057916045702099,
 0.9120538765826537,
 0.9210485938777743,
 0.9042972068268573]

In [104]:
grid_fit_scores_o, grid_fit_objects_o = ms.evaluate_grid_search(grid_list_other)

In [50]:
grid_fit_scores_o

[0.8805638295039733, -0.05986265437585714]

In [51]:
w_intercepts = pd.DataFrame(grid_fit_scores, index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split(), columns=['w/ Intercept Score'])

In [52]:
rsme_intercepts = ms.calculate_rsme(preprocessing, grid_fit_objects)

In [53]:
rsme_intercepts = (pd.DataFrame(rsme_intercepts, columns=['rsme_i'], index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split()))

In [54]:
n_intercepts = pd.DataFrame(grid_fit_scores_f, index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split(), columns=['n/ Intercept Score'])

In [55]:
rsme_n_intercepts = ms.calculate_rsme(preprocessing, grid_fit_objects_f)

In [56]:
rsme_n_intercepts = (pd.DataFrame(rsme_n_intercepts, columns=['rsme_n_i'], index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split()))

In [105]:
other = pd.DataFrame(grid_fit_scores_o, index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split(), columns=['other Score'])

In [106]:
rsme_other = ms.calculate_rsme(preprocessing, grid_fit_objects_o)

In [107]:
rsme_other = (pd.DataFrame(rsme_other, columns=['rsme_o'], index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split()))

In [60]:
df_results = df_results.join(w_intercepts)

In [61]:
df_results = df_results.join(rsme_intercepts)

In [62]:
df_results = df_results.join(n_intercepts)

In [63]:
df_results = df_results.join(rsme_n_intercepts)

In [64]:
df_results = df_results.join(other)

In [65]:
df_results = df_results.join(rsme_other)

In [69]:
#df_results.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/whole-dataset-model-comparison.csv')

In [67]:
df_results

Unnamed: 0,Baseline Test Score,rsme,w/ Intercept Score,rsme_i,n/ Intercept Score,rsme_n_i,other Score,rsme_o
LinearRegression(),0.905813,24998.404132,0.905813,24998.404132,0.905792,25001.198428,,
LogisticRegression(),0.013645,44278.091201,,,,,,
Ridge(),0.917541,23390.271443,0.914367,23836.227607,0.912054,24155.964018,,
Lasso(),0.914111,23871.784549,0.922097,22734.888329,0.921049,22887.373564,,
ElasticNet(),0.883311,27824.6956,0.907132,24822.720877,0.904297,25198.71126,,
KNeighborsRegressor(),0.875086,28788.655751,,,,,0.880564,28150.360351
SVR(),-0.073246,84385.063471,,,,,-0.059863,83857.28904


In [70]:
grid_fit_objects_o[0].best_params_

{'kneighborsregressor__leaf_size': 20,
 'kneighborsregressor__n_jobs': -1,
 'kneighborsregressor__n_neighbors': 7}

In [76]:
#save = dump(grid_fit_objects_f[2], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/lasso-rsme-22887.joblib')

In [91]:
predictions = grid_fit_objects_f[2].predict(df_test)

In [92]:
predictions = pd.DataFrame(predictions)

In [93]:
predictions = predictions.rename({0:'SalePrice'}, axis=1)

In [94]:
predictions = predictions.join(df_test['Id'])

In [95]:
predictions = predictions[['Id', 'SalePrice']]

In [98]:
predictions.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/920-lasso-predictions.csv', index=False)

In [112]:
grid_fit_objects_f[2].best_params_

{'lasso__alpha': 10, 'lasso__fit_intercept': False, 'lasso__max_iter': 100000}