In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import models as models

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from sklearn.svm import SVR, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold, SelectKBest, GenericUnivariateSelect, RFE, SelectFromModel

from sklearn import set_config
from joblib import dump, load

In [3]:
df_train_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/train.csv')

df = df_train_raw.copy()

In [4]:
df_test_raw = pd.read_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/datasets/test.csv')
df_test = df_test_raw.copy()

In [5]:
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
df_dummies = pd.get_dummies(df)

In [7]:
df_dummies.corr()['SalePrice'].sort_values(ascending=False).head(10)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650270
Garage Cars        0.648220
Total Bsmt SF      0.628925
1st Flr SF         0.618486
Bsmt Qual_Ex       0.586497
Year Built         0.571849
Kitchen Qual_Ex    0.551284
Name: SalePrice, dtype: float64

In [8]:
#X = df[['Overall Qual', 'Gr Liv Area', 'Total Bsmt SF']]
#y = df['SalePrice']

In [9]:
# This will use the entire dataset
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [10]:
#X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
estimator_list = (LinearRegression(n_jobs=-1), 
                  LogisticRegression(n_jobs=-1, max_iter=500_000), 
                  Ridge(max_iter=500_000), 
                  Lasso(max_iter=500_000), 
                  ElasticNet(max_iter=500_000), 
                  KNeighborsRegressor(n_jobs=-1), 
                  SVR())

In [12]:
index = ['LinearRegression()', 'LogisticRegression()', 'Ridge()', 'Lasso()', 'ElasticNet()', 'KNeighborsRegressor()', 'SVR()']

In [13]:
estimator_params_true = {
    LinearRegression(): {
        'linearregression__fit_intercept': [True],
        'linearregression__n_jobs': [-1]},
    Ridge():{
        'ridge__alpha': list(range(1,11)),
        'ridge__fit_intercept': [True],
        'ridge__max_iter': [100_000],
    },
    Lasso(): {
        'lasso__alpha': list(range(1,11)),
        'lasso__fit_intercept': [True],
        'lasso__max_iter': [100_000],
    }, 
    ElasticNet():{
        'elasticnet__alpha': list(range(1,11)),
        'elasticnet__fit_intercept': [True],
        'elasticnet__l1_ratio': np.arange(0.1, 1, 0.1),
        'elasticnet__max_iter': [100_000],
    }, 
    }

In [14]:
estimator_params_false = {
    LinearRegression(): {
    'linearregression__fit_intercept': [False],
    'linearregression__n_jobs': [-1]},
    Ridge():{
    'ridge__alpha': list(range(1,11)),
    'ridge__fit_intercept': [False],
    'ridge__max_iter': [100_000]}, 
    Lasso(): {
    'lasso__alpha': list(range(1,51)),
    'lasso__fit_intercept': [False],
    'lasso__max_iter': [100_000]},
    ElasticNet():{
    'elasticnet__alpha': list(range(1,11)),
    'elasticnet__fit_intercept': [False],
    'elasticnet__l1_ratio': np.arange(0.1, 1, 0.1),
    'elasticnet__max_iter': [100_000]}, 

}

In [15]:
estimator_params_other = {
    KNeighborsRegressor():{
        'kneighborsregressor__n_neighbors': list(range(1, 51)),
        'kneighborsregressor__leaf_size': list(range(1, 51)),
        'kneighborsregressor__n_jobs': [-1]}, 
    SVR():{
        'svr__C': [0.001, 0.01, 0.1, 1, 10],
        'svr__gamma': [0.001, 0.01, 0.1, 1],
}}

In [16]:
feature_selection = [
    SelectKBest(score_func=chi2, k=)
]

# next step add RFE, SelectFromModel(), GenericUnivariateSelect()

In [17]:
# Instantiate class
ms = models.ModelSelection(X, y)

In [18]:
pipe_list = ms.make_pipe(ms.preprocessing, estimator_list)

In [19]:
set_config(display='diagram')
pipe_list[0]

In [20]:
fitted_pipe_list, fitted_pipe_objects = ms.evaluate_pipes(pipe_list=pipe_list)

In [21]:
df_results = pd.DataFrame(fitted_pipe_list, index=index, columns=['Baseline Test Score'])

In [22]:
rsme_list = ms.calculate_rsme(ms.preprocessing, fitted_pipe_objects)

In [23]:
rsme_list = (pd.DataFrame(rsme_list, columns=['rsme'], index=index))

In [24]:
df_results = df_results.join(rsme_list)

In [25]:
df_results

Unnamed: 0,Baseline Test Score,rsme
LinearRegression(),0.804687,34042.070001
LogisticRegression(),0.019493,43806.061479
Ridge(),0.805147,34001.92593
Lasso(),0.804431,34064.38691
ElasticNet(),0.863694,28438.54896
KNeighborsRegressor(),0.822713,32433.135906
SVR(),-0.047366,78831.428462


In [26]:
pipe_list_wfs = ms.make_pipe_wfs(ms.preprocessing, feature_selection, estimator_list)

In [27]:
pipe_list_wfs[0]

In [28]:
fitted_pipe_list_wfs, fitted_pipe_objects_wfs = ms.evaluate_pipes(pipe_list=pipe_list_wfs)

  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


In [29]:
fitted_pipe_list_wfs

[0.0010275804373232056,
 0.005847953216374269,
 0.0010358080659581548,
 0.0010280797276505105,
 0.002860068093443635,
 -0.3468761006260599,
 -0.04769438351143762]

In [None]:
df_results_wfs = pd.DataFrame(fitted_pipe_list_wfs, index=index, columns=['BScore with FeatSel'])

In [None]:
rsme_list_wfs = ms.calculate_rsme(ms.preprocessing, fitted_pipe_objects_wfs)

In [None]:
rsme_list_wfs = (pd.DataFrame(rsme_list_wfs, columns=['rsme_wfs'], index=index))

In [None]:
df_results_wfs = df_results_wfs.join(rsme_list_wfs)

In [None]:
df_results = df_results.join(df_results_wfs)

In [None]:
df_results

In [None]:
#df_results.to_csv('/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/baseline.csv')

In [None]:
pipe_list_true, grid_list_true = ms.make_grid_search(preprocessing, list(estimator_params_true.keys()), list(estimator_params_true.values()))

In [None]:
pipe_list_false, grid_list_false = ms.make_grid_search(preprocessing, list(estimator_params_false.keys()), list(estimator_params_false.values()))

In [None]:
pipe_list_other, grid_list_other = ms.make_grid_search(preprocessing, list(estimator_params_other.keys()), list(estimator_params_other.values()))

In [None]:
grid_fit_scores, grid_fit_objects = ms.evaluate_grid_search(grid_list_true)

In [None]:
grid_fit_scores

In [None]:
grid_fit_scores_wfs, grid_fit_objects_wfs

In [None]:
grid_fit_scores_f, grid_fit_objects_f = ms.evaluate_grid_search(grid_list_false)

In [None]:
grid_fit_scores_f

In [None]:
grid_fit_scores_wfs, grid_fit_objects_wfs

In [None]:
grid_fit_scores_o, grid_fit_objects_o = ms.evaluate_grid_search(grid_list_other)

In [None]:
grid_fit_scores_o

In [None]:
grid_fit_scores_wfs, grid_fit_objects_wfs

In [None]:
w_intercepts = pd.DataFrame(grid_fit_scores, index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split(), columns=['w/ Intercept Score'])

In [None]:
rsme_intercepts = ms.calculate_rsme(preprocessing, grid_fit_objects)

In [None]:
rsme_intercepts = (pd.DataFrame(rsme_intercepts, columns=['rsme_i'], index=str(list(estimator_params_true.keys()))[1:-1].replace(',','').split()))

In [None]:
n_intercepts = pd.DataFrame(grid_fit_scores_f, index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split(), columns=['n/ Intercept Score'])

In [None]:
rsme_n_intercepts = ms.calculate_rsme(preprocessing, grid_fit_objects_f)

In [None]:
rsme_n_intercepts = (pd.DataFrame(rsme_n_intercepts, columns=['rsme_n_i'], index=str(list(estimator_params_false.keys()))[1:-1].replace(',','').split()))

In [None]:
other = pd.DataFrame(grid_fit_scores_o, index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split(), columns=['other Score'])

In [None]:
rsme_other = ms.calculate_rsme(preprocessing, grid_fit_objects_o)

In [None]:
rsme_other = (pd.DataFrame(rsme_other, columns=['rsme_o'], index=str(list(estimator_params_other.keys()))[1:-1].replace(',','').split()))

In [None]:
df_results = df_results.join(w_intercepts)

In [None]:
df_results = df_results.join(rsme_intercepts)

In [None]:
df_results = df_results.join(n_intercepts)

In [None]:
df_results = df_results.join(rsme_n_intercepts)

In [None]:
df_results = df_results.join(other)

In [None]:
df_results = df_results.join(rsme_other)

In [None]:
df_results

In [None]:
grid_fit_objects_o[0].best_params_

In [None]:
#save = dump(grid_fit_objects_f[2], '/home/blue/general-assembly/dsir-824/submissions/projects/project-2-master/outputted work/lasso-rsme-22887.joblib')

In [None]:
grid_fit_objects_f[2].best_params_