In [7]:
import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
import statsmodels as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn in this notebook.
plt.style.use('fivethirtyeight')

# Ask the inline backend for 'retina' (high-resolution) figures and draw plots inline.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# Raise pandas' display limits so wide/long frames are shown rather than truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
# NOTE(review): machine-specific absolute path. `corr` is a project-local helper module
# (its `reg_metrics` function is used further down); presumably it lives in this
# directory -- confirm, and consider making the location configurable.
sys.path.append('/Users/ganeshsivam/Mods')
import corr

#### Model testing & tuning

Create a function that builds and tests different linear regression models and returns the one that fares best. The function takes the data produced by the previous notebooks and runs Linear Regression, Lasso Regression & Ridge Regression, using GridSearchCV to choose the best hyperparameters. The model with the best metrics is returned, the results are stored in a dataframe, and a Kaggle submission is created for that particular trial. We will use this function to test different versions of our feature engineering as well as to implement Recursive Feature Elimination.

In [4]:
def get_mod(test_name, df,fe=True,cols = None,train_file_path=None,test_file_path=None,sub_name=None,random_state=None):
    """Fit Linear, Lasso and Ridge regressions and return the best one by hold-out MAE.

    The training data is split 80/20 (fixed ``random_state=42``), standardised with a
    scaler fitted on the training part only, and three models are fitted. Lasso and
    Ridge each get their ``alpha`` from a two-stage 5-fold grid search: a coarse grid,
    then random draws from a normal distribution centred on the coarse optimum
    (standard deviation 5% of it). The model whose row comes first after sorting the
    metrics table by "Mean Absolute Error" (ascending) is returned.

    Parameters
    ----------
    test_name : str
        Label stored in the first column of the summary row appended to ``df``.
    df : pandas.DataFrame
        Running results table. It must have exactly five columns, because the summary
        row (test name, model, alpha, R-squared, MAE) is built with ``df.columns``.
        The caller's frame is not modified; an extended copy is returned.
    fe : bool, default True
        Used only when ``train_file_path`` is falsy. ``False`` loads
        ``./data/train_bef_FE.csv`` / ``./data/test_bef_FE.csv``; otherwise
        ``./data/train_final.csv`` / ``./data/test_final.csv`` are loaded.
    cols : list of str, optional
        If given (and non-empty), restrict both train and test features to these columns.
    train_file_path, test_file_path : str, optional
        Explicit CSV paths; when ``train_file_path`` is truthy both are read and
        ``fe`` is ignored.
    sub_name : str, optional
        If truthy, ``create_sub`` is called to write ``./Submissions/<sub_name>.csv``
        from the best model's predictions on the scaled test set.
    random_state : int, optional
        Seed for the random alpha-refinement draws. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    d : pandas.DataFrame
        One row per model ("Linear Regression", "Lasso", "ridge"), sorted by
        "Mean Absolute Error", holding the ``corr.reg_metrics`` output plus
        "Model Train Score", "Model Test Score" and (Lasso/Ridge only) "Alpha".
    mod : fitted estimator
        The model in the first row of ``d``.
    scaler : StandardScaler
        The scaler fitted on the training split.
    df : pandas.DataFrame
        ``df`` with the summary row for this run appended and the index reset.
    """
    from sklearn.model_selection import GridSearchCV

    # Make the project-local `corr` helper importable without growing sys.path by one
    # duplicate entry on every call.
    mods_dir = '/Users/ganeshsivam/Mods'
    if mods_dir not in sys.path:
        sys.path.append(mods_dir)
    import corr

    if train_file_path:
        train = pd.read_csv(train_file_path)
        test = pd.read_csv(test_file_path)
    elif fe == False:  # equality (not identity) kept so existing callers behave the same
        train = pd.read_csv("./data/train_bef_FE.csv")
        test = pd.read_csv("./data/test_bef_FE.csv")
    else:
        train = pd.read_csv('./data/train_final.csv')
        test = pd.read_csv('./data/test_final.csv')

    X = train[[c for c in train.columns if c != "SalePrice"]]
    if cols:
        X = X[cols]
        test = test[cols]
    y = train['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X.values,y.values,test_size=0.2,random_state=42)

    # Fit the scaler on the training split only, then reuse it everywhere else.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # NOTE(review): the test frame is scaled positionally; this assumes its columns are
    # in the same order as the training features -- confirm for custom file paths.
    test = test[[c for c in test.columns if c != "SalePrice"]]
    test = scaler.transform(test.values)

    # Source of the refinement draws: the global NumPy state unless a seed was given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def _tune_alpha(estimator, coarse_alphas, n_fine):
        """Two-stage 5-fold grid search for `alpha`: coarse grid, then random draws around its optimum."""
        search = GridSearchCV(estimator,{'alpha':coarse_alphas},cv=5,n_jobs=-1).fit(X_train,y_train)
        centre = search.best_params_['alpha']
        fine_alphas = rng.normal(loc=centre, scale=centre*0.05, size=n_fine)
        search = GridSearchCV(estimator,{'alpha':fine_alphas},cv=5,n_jobs=-1).fit(X_train,y_train)
        return search.best_params_['alpha']

    def _evaluate(model, label, alpha=None):
        """Return a one-row metrics frame for a fitted `model`, indexed by `label`."""
        row = corr.reg_metrics(model.predict(X_test),y_test).transpose()
        row.index = [label]
        row.loc[label,"Model Train Score"] = model.score(X_train,y_train)
        row.loc[label,"Model Test Score"] = model.score(X_test,y_test)
        if alpha is not None:
            row.loc[label,"Alpha"] = alpha
        return row

    lr = LinearRegression().fit(X_train,y_train)

    lasso_alpha = _tune_alpha(Lasso(tol=0.0001), np.linspace(100,2000,10), 10)
    ls = Lasso(alpha=lasso_alpha).fit(X_train,y_train)

    ridge_alpha = _tune_alpha(Ridge(), np.logspace(0.1,3,100), 50)
    ri = Ridge(alpha=ridge_alpha).fit(X_train,y_train)

    fitted = {"Linear Regression": lr, "Lasso": ls, "ridge": ri}

    # DataFrame.append was removed in pandas 2.0; build the table with a single concat.
    # sort=True orders the union of columns; Linear Regression gets NaN for "Alpha".
    d = pd.concat(
        [
            _evaluate(lr, "Linear Regression"),
            _evaluate(ls, "Lasso", lasso_alpha),
            _evaluate(ri, "ridge", ridge_alpha),
        ],
        sort=True,
    )
    d = d.sort_values("Mean Absolute Error")
    best_mod = d.index[0]
    mod = fitted[best_mod]

    # "R_squared" and "Mean Absolute Error" are columns expected from corr.reg_metrics.
    summary = pd.DataFrame(
        [[test_name,best_mod,d.loc[best_mod,"Alpha"],d.loc[best_mod,"R_squared"],d.loc[best_mod,"Mean Absolute Error"]]],
        columns=df.columns,
    )
    df = pd.concat([df, summary], sort=False, ignore_index=True)

    if sub_name:
        create_sub(mod,test,name=sub_name)
    return d,mod,scaler,df

def create_sub(mod, test, name=""):
    """Write a Kaggle submission file from a fitted model's predictions.

    Predicts on ``test`` with ``mod``, pairs the predictions with the "Id" column
    read from ``./data/test_id.csv`` and saves the two columns ("Id", "SalePrice")
    to ``./Submissions/<name>.csv`` without the index.
    """
    predictions = mod.predict(test)
    id_frame = pd.read_csv("./data/test_id.csv")[["Id"]]
    submission = id_frame.assign(SalePrice=predictions)
    submission.to_csv('./Submissions/{}.csv'.format(name), index=False)

### Create a dataframe to store our results

In [6]:
# Empty results table; each get_mod call returns it with one more summary row.
results_df = pd.DataFrame(
    columns=[
        "Test Name",
        "Model used",
        "Alpha",
        "R-squared",
        "Mean Absolute Error",
    ]
)

NameError: name 'pd' is not defined

### Test our data before any feature engineering was implemented

In [5]:
# Baseline: fe=False makes get_mod load the *_bef_FE.csv files; sub_name writes the
# "NFE" submission file.
df_no_fe, mod_no_fe, scaler, results_df = get_mod(
    test_name="No Feature Engineering",
    df=results_df,
    fe=False,
    sub_name="NFE",
)

NameError: name 'results_df' is not defined

In [31]:
# Show the results table after the baseline run.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293


### Test after our feature engineering

In [32]:
# First feature-engineering pass: explicit file paths are given, so get_mod reads the
# *_nopoly.csv files and ignores `fe`.
df_1_fe, mod_1_fe, scaler, results_df = get_mod(
    test_name="1st Feature Engineering",
    df=results_df,
    fe=True,
    train_file_path="./data/train_nopoly.csv",
    test_file_path="./data/test_nopoly.csv",
    sub_name="1FE",
)



In [33]:
# Show the results table after the first feature-engineering run.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293
1,1st Feature Engineering,ridge,289.487,0.9025,15991.0874


### Our Mean Absolute Error has improved slightly after implementing Feature Engineering, while the R-squared value is essentially unchanged

#### Made some changes to the feature engineering: added polynomial features for the features with the strongest correlation to the target. Let's see if this improves our scores

In [34]:
# Second feature-engineering pass: with fe=True and no explicit paths, get_mod loads
# train_final.csv / test_final.csv.
df_2_fe, mod_2_fe, scaler, results_df = get_mod(
    test_name="2nd Feature Engineering",
    df=results_df,
    fe=True,
    sub_name="2FE",
)

In [35]:
# Show the results table after the second feature-engineering run.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293
1,1st Feature Engineering,ridge,289.487,0.9025,15991.0874
2,2nd Feature Engineering,ridge,321.3835,0.9233,14407.5733


### Try implementing scikit-learn's Recursive Feature Elimination (RFE) to remove some features. The current model has 309 features; we will reduce this to 200 and see the effect on R-squared & Mean Absolute Error

In [36]:
def rfe_cols(n, estimator=None):
    """Return the names of the `n` features kept by Recursive Feature Elimination.

    Reads ``./data/train_final.csv``, makes the same 80/20 split used by ``get_mod``
    (``random_state=42``) and fits ``RFE`` on the training part, removing one feature
    per step until `n` remain.

    Parameters
    ----------
    n : int
        Number of features to keep.
    estimator : estimator, optional
        Model whose coefficients drive the elimination. Defaults to the notebook
        global ``mod_2_fe`` (the previous behaviour), so the
        "2nd Feature Engineering" cell must have been run first in that case.

    Returns
    -------
    list of str
        Column names of ``train_final.csv`` (excluding "SalePrice") selected by RFE.
    """
    import warnings
    from sklearn.feature_selection import RFE

    if estimator is None:
        estimator = mod_2_fe

    train = pd.read_csv('./data/train_final.csv')
    X = train[[c for c in train.columns if c != "SalePrice"]]
    y = train['SalePrice']
    X_train, _, y_train, _ = train_test_split(X.values,y.values,test_size=0.2,random_state=42)

    # NOTE(review): RFE is fitted on unscaled features here, whereas get_mod trains on
    # standardised ones -- confirm this is intended, since the elimination relies on
    # the estimator's fitted coefficients.
    # n_features_to_select is keyword-only in current scikit-learn releases.
    selector = RFE(estimator, n_features_to_select=n, step=1)
    # Silence warnings only while fitting instead of for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        selector = selector.fit(X_train,y_train)
    return list(X.columns[selector.support_])


In [37]:
# Keep the 200 features selected by RFE, then rerun the model comparison on them.
c = rfe_cols(200)
df_RFE, mod_RFE, scaler, results_df = get_mod(
    test_name="Recursive Feaeture Elimination - 200",
    df=results_df,
    fe=True,
    cols=c,
    sub_name="RFE",
)

In [39]:
# Show the results table after the 200-feature RFE run.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293
1,1st Feature Engineering,ridge,289.487,0.9025,15991.0874
2,2nd Feature Engineering,ridge,321.3835,0.9233,14407.5733
3,Recursive Feaeture Elimination - 200,ridge,9.1011,0.8805,16587.5144


### Try 150 & 250 variables too

In [3]:
# Keep the 150 features selected by RFE, then rerun the model comparison on them.
c = rfe_cols(150)
df_RFE150, mod_RFE150, scaler, results_df = get_mod(
    test_name="Recursive Feaeture Elimination - 150",
    df=results_df,
    fe=True,
    cols=c,
    sub_name="RFE2",
)




NameError: name 'rfe_cols' is not defined

In [42]:
# Keep the 250 features selected by RFE, then rerun the model comparison on them.
c = rfe_cols(250)
df_RFE250, mod_RFE250, scaler, results_df = get_mod(
    test_name="Recursive Feaeture Elimination - 250",
    df=results_df,
    fe=True,
    cols=c,
    sub_name="RFE3",
)


In [43]:
# Show the results table after all three RFE runs.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293
1,1st Feature Engineering,ridge,289.487,0.9025,15991.0874
2,2nd Feature Engineering,ridge,321.3835,0.9233,14407.5733
3,Recursive Feaeture Elimination - 200,ridge,9.1011,0.8805,16587.5144
4,Recursive Feaeture Elimination - 150,ridge,1.0797,0.8846,16292.8619
5,Recursive Feaeture Elimination - 250,ridge,3.5818,0.9132,15178.3269


### Add Kaggle submission results to our results dataframe

In [44]:
# Add the two Kaggle score columns, blank until the scores are entered by hand.
for score_column in ('Public Score', 'Private Score'):
    results_df[score_column] = ""

In [45]:
# Kaggle scores for row 0 ("No Feature Engineering").
results_df.loc[0, ["Public Score", "Private Score"]] = [29427, 30164]

In [46]:
# Kaggle scores for row 1 ("1st Feature Engineering").
results_df.loc[1, ["Public Score", "Private Score"]] = [29207, 30160]

In [47]:
# Kaggle scores for row 2 ("2nd Feature Engineering").
results_df.loc[2, ["Public Score", "Private Score"]] = [23529, 32005]

In [48]:
# Kaggle scores for row 2 ("2nd Feature Engineering"); same values as the cell for
# row 2 directly above, so re-running this is a no-op.
results_df.loc[2, ["Public Score", "Private Score"]] = [23529, 32005]

In [49]:
# Kaggle scores for row 3 (RFE, 200 features).
results_df.loc[3, ["Public Score", "Private Score"]] = [28388, 27568]

In [50]:
# Kaggle scores for row 4 (RFE, 150 features).
results_df.loc[4, ["Public Score", "Private Score"]] = [28395, 27219]

In [51]:
# Kaggle scores for row 5 (RFE, 250 features).
results_df.loc[5, ["Public Score", "Private Score"]] = [25830, 28446]

In [52]:
# Show the results table with the Kaggle public/private scores filled in.
results_df

Unnamed: 0,Test Name,Model used,Alpha,R-squared,Mean Absolute Error,Public Score,Private Score
0,No Feature Engineering,ridge,250.464,0.9025,16071.1293,29427,30164
1,1st Feature Engineering,ridge,289.487,0.9025,15991.0874,29207,30160
2,2nd Feature Engineering,ridge,321.3835,0.9233,14407.5733,23529,32005
3,Recursive Feaeture Elimination - 200,ridge,9.1011,0.8805,16587.5144,28388,27568
4,Recursive Feaeture Elimination - 150,ridge,1.0797,0.8846,16292.8619,28395,27219
5,Recursive Feaeture Elimination - 250,ridge,3.5818,0.9132,15178.3269,25830,28446


In [186]:
# Re-fit the chosen configuration (RFE with 150 features) and keep its model and scaler.
# The summary row goes into final_model_df; previously it was assigned to results_df,
# which replaced the accumulated comparison table (Kaggle scores included) with a
# single-row frame.
final_model_df = pd.DataFrame(
    columns=["Test Name", "Model used", "Alpha", "R-squared", "Mean Absolute Error"]
)
df_RFE150, final_mod, final_scaler, final_model_df = get_mod(
    test_name="Recursive Feaeture Elimination150",
    df=final_model_df,
    fe=True,
    cols=rfe_cols(150),
    sub_name="RFE2",
)


The Kaggle scores are pretty good. The best model for the private score places in the top 10, and the best model for the public score places in the top 30. The model that scores best on the public leaderboard fares the worst on the private one. According to Kaggle, the public score is based on 30% of the test data while the private score is based on the other 70%.

#### The Recursive Feature Elimination model which reduced the no. of features to 150 fared the best in the Private score category on Kaggle. Strangely, this model had the 2nd worst R-squared & Mean Absolute Error scores (2nd lowest R-squared, 2nd highest Mean Absolute Error) of all the models I created

#### Every iteration returned Ridge Regression as the best model for predicting sales prices

In [2]:
# Inspect the estimator returned by get_mod for the 150-feature RFE run.
mod_RFE150

NameError: name 'mod_RFE150' is not defined