In [54]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import models as model

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, chi2

from sklearn import set_config
from joblib import dump, load

In [55]:
df_test_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv')
df_test = df_test_raw.copy()

In [56]:
df_train_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv')

df = df_train_raw.copy()

In [57]:
# This will use the entire dataset
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [None]:
Lasso()

In [142]:
params = {
    'lasso__alpha': [100,150,200],
    'lasso__fit_intercept': [True],
    'lasso__normalize': [True],
    'lasso__max_iter': [-1],
    'columntransformer__num__num_kbest__k': [35],#list(range(10, 31,5)),
    'columntransformer__cat__cat_kbest__k': [30],#list(range(10, 31,5))
},


In [143]:
# Create Custom Preprocessing Module to include KBest
# This is the version which we will train a baseline with kfeatures
# There is a basic version of preprocessing inside class ModelSelection which does not handle feature selection

numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer()),
    ('num_scaler', StandardScaler()),
    ('num_kbest', SelectKBest())])

categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('cat_onehot', OneHotEncoder(handle_unknown='ignore')),
    ('cat_scaler', StandardScaler(with_mean=False)),
    ('cat_kbest', SelectKBest(score_func=chi2))])


preprocessing = ColumnTransformer(transformers=[
    ('num', numeric_transformer, make_column_selector(dtype_include=np.number)),
    ('cat', categorical_transformer, make_column_selector(dtype_include='object'))
])

In [144]:
# Instantiate class
ms = model.ModelSelection(X, y)

In [145]:
pipe = make_pipeline(preprocessing, Lasso())

In [146]:
gs = GridSearchCV(pipe, params, n_jobs=-1)

In [147]:
set_config(display='diagram')
gs

In [148]:
scores, objects = ms.evaluate_grid_search([gs])

In [149]:
scores

[0.8320765146765675]

In [150]:
objects[0].best_params_

{'columntransformer__cat__cat_kbest__k': 30,
 'columntransformer__num__num_kbest__k': 35,
 'lasso__alpha': 100,
 'lasso__fit_intercept': True,
 'lasso__max_iter': -1,
 'lasso__normalize': True}

In [151]:
#dump(objects[0], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/overfit-')

In [152]:
ms.calculate_rsme(objects)

[31091.306493114662]

In [49]:
def predictions(object_fitted, test_data, to_file=False):
    '''
    Output: Model Predictions

    This takes in a single fitted object like a fitted pipeline or a fitted gridsearchcv object
    The output format is in the Kaggle Required documents.

    See the competition here: https://www.kaggle.com/c/ga-dsir-824-project-2-regression-challenge/leaderboard#score

    If to_file=True, it will output the Dataframe as a csv, ignoring the index and 
    place the csv file into your current working directory
    '''
    predictions = object_fitted.predict(test_data)
    predictions = pd.DataFrame(predictions)
    predictions = predictions.rename({0:'SalePrice'}, axis=1)
    predictions = predictions.join(test_data['Id'])
    predictions = predictions[['Id', 'SalePrice']]

    if to_file == True:
        predictions.to_csv('predictions.csv', index=False)
        return predictions
    else:
        return predictions

In [50]:
predict = predictions(objects[0], df_test)

In [53]:
predict.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/other/predictions/925-lasso-kbest', index=False)