In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.metrics import mean_squared_error

from sklearn import set_config
from joblib import dump, load

In [4]:
df_train_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv')

df = df_train_raw.copy()

In [5]:
df_test_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv')
df_test = df_test_raw.copy()

In [6]:
# This will use the entire dataset
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [7]:
Lasso()

Lasso()

In [8]:
params = {
    'lasso__alpha': np.arange(0.1,30,0.1),
    'lasso__fit_intercept': [False],
    'lasso__max_iter': [-1]
},


In [9]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
#numeric_features = list(X.loc[:, X.dtypes == object].columns)
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('num_scaler', StandardScaler())])

#cat_features = list(X.loc[:, X.dtypes != object].columns)
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [10]:
class ModelSelection:
    '''
    This holds several functions
    '''
    
    def __init__(self, X, y):
        '''
        This creates the X_train, X_test, y_train, y_test arrays
        '''
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    def make_pipe(self, preprocessing, estimator_list):
        '''
        This takes in a list of estimators and a preprocessing pipe object
        It outputs a list of pipes with a preprocessing object and an estimator object
        '''
        pipe_list = []
        for estimator in estimator_list:
            pipe_list.append(make_pipeline(preprocessing, estimator))
        
        return pipe_list
        
    
    def evaluate_pipes(self, pipe_list):
        '''
        This evaluates each pipe object in a list and returns back a list of scores and best params
        It takes in two X and Y, uses train_test_split to separate into 4 different arrays
        Then it evaluates each model on a test set and measures the accuracy
        The function returns the list of scores
        '''
        
        scores = []
        objects = []
        for pipe in pipe_list:
            pipe_object = pipe.fit(self.X_train,self.y_train)
            scores.append(pipe_object.score(self.X_test, self.y_test))
            objects.append(pipe_object)
        return scores, objects
    
    def calculate_rsme(self, preprocessing, fitted_pipe_objects):
        '''
        This takes in a list of fitted pipe objects, self variables
        find residuials
        
        '''
        list_rsme = []
        #processed_x_test = preprocessing.transform(self.X_test)
        for pipe in fitted_pipe_objects:
            preds = pipe.predict(self.X_test)
            rsme = mean_squared_error(self.y_test, preds, squared=False)
            list_rsme.append(rsme)
            
        return list_rsme
            
    def make_grid_search(self, preprocessing, estimator_list, params):
        '''
        This is a function that takes in a preprocessing pipeline, a list of estimator objects and params
        This returns a list of fitted GridSearchCV Objects that can then be iterated through
        '''
        
        pipe_array = self.make_pipe(preprocessing, estimator_list) 
        grid_array = []
        
        for pipe_object, param in list(zip(pipe_array, params)):
            grid_array.append(
                GridSearchCV(
                    estimator=pipe_object, 
                    param_grid=param,
                    n_jobs=-1))

        return pipe_array, grid_array
    
    def evaluate_grid_search(self, grid_list):
        '''
        Takes in a list of grid objects
        Trains them on initial X and Y train test split
        evaluates on test dataset
        outputs a list of accuracy scores, list of fitted grid objects
        '''
        scores = []
        objects = []
        
        for grid in grid_list:
            grid_object = grid.fit(self.X_train, self.y_train)
            scores.append(grid.score(self.X_test, self.y_test))
            objects.append(grid_object)
        
        return scores, objects

In [11]:
# Instantiate class
ms = ModelSelection(X, y)

In [12]:
pipe = make_pipeline(preprocessing, Lasso())

In [13]:
gs = GridSearchCV(pipe, params, n_jobs=-1)

In [14]:
scores, objects = ms.evaluate_grid_search([gs])

In [15]:
scores


[0.9092953872829259]

In [25]:
objects[0].best_params_

{'lasso__alpha': 29.900000000000002,
 'lasso__fit_intercept': False,
 'lasso__max_iter': -1}

In [26]:
dump(objects[0], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/overfit-')

['/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/overfit-']