In [126]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [163]:
one_hot_df = pd.read_csv('./Data/one_hot_df.csv')
label_df = pd.read_csv('./Data/label_df.csv')
y = one_hot_df['SalePrice']
X = one_hot_df.drop(['SalePrice', 'Unnamed: 0'], axis =1)
label_X = label_df.drop(['SalePrice', 'Unnamed: 0'], axis =1)

LotArea          float64
OverallQual      float64
OverallCond      float64
YearBuilt        float64
YearRemodAdd     float64
MasVnrArea       float64
BsmtFinSF1       float64
BsmtUnfSF        float64
TotalBsmtSF      float64
1stFlrSF         float64
2ndFlrSF         float64
GrLivArea        float64
BsmtFullBath     float64
FullBath         float64
HalfBath         float64
BedroomAbvGr     float64
KitchenAbvGr     float64
TotRmsAbvGrd     float64
Fireplaces       float64
GarageCars       float64
GarageArea       float64
WoodDeckSF       float64
OpenPorchSF      float64
EnclosedPorch    float64
ScreenPorch      float64
MSSubClass         int64
MSZoning           int64
LotShape           int64
LandContour        int64
LotConfig          int64
Neighborhood       int64
Condition1         int64
BldgType           int64
HouseStyle         int64
RoofStyle          int64
Exterior1st        int64
Exterior2nd        int64
MasVnrType         int64
ExterQual          int64
ExterCond          int64


## Linear Regression

In [129]:
lr_model = LinearRegression()
metrics = cross_validate(lr_model, X,y,cv = 5, scoring = ('neg_root_mean_squared_error'))
lr_cv = -metrics['test_score'].mean()
print('CV PREDICTION ERROR FOR MULTIVARIATE LR')
print(lr_cv)

CV PREDICTION ERROR FOR MULTIVARIATE LR
2656881023.263894


In [88]:
lr_model.fit(X,y)
lr_model.score(X,y)

0.9258276520543647

The linear regression model gives a very high R squared of 0.926, but this is expected because we have included almost all features inside our regression model. Looking for adjusted r squared metric. We have 272 predictor variables in our model. 

The CV prediction error we are getting is abnormally high. We need to investigate this. Does not make sense, because it is a log(sales price) the RMSE cannot go into the millions

## Ridge Regression

In [62]:
param_list = []
start = 10
for i in range(30):
    param_list.append(start)
    start = start + 0.25

parameters = {'alpha': param_list} 
rr = Ridge()
metrics = cross_validate(rr, X, y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR UNTUNED RR')
print(-metrics['test_score'].mean())

CV PREDICTION ERROR FOR UNTUNED RR
0.14151972402903668


Tuning the ridge regression model

In [50]:
tuned_rr = GridSearchCV(rr, parameters, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_rr.fit(X,y)
print('CV PREDICTION ERROR FOR TUNED RR')
print(-tuned_rr.best_score_)
print(tuned_rr.best_params_)

CV PREDICTION ERROR FOR TUNED RR
0.1363310366409711
{'alpha': 14.75}


We find the value of alpha that gives us the lowest cross validation prediction error. The alpha 

## Lasso Regression

In [9]:
param_list = []
start = 0.0001
for i in range(30):
    param_list.append(start)
    start = start + 0.0001
parameters2 = {'alpha': param_list}
lasso = Lasso()
lasso.fit(X,y)
metrics = cross_validate(lasso, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('SCORE FOR UNTUNED LASSO')
print(-metrics['test_score'].mean())

SCORE FOR UNTUNED LASSO
0.3994558059361331


Tuning the lasso regression model

In [10]:
tuned_lasso = GridSearchCV(lasso, parameters2, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_lasso.fit(X,y)
best_param = tuned_lasso.best_params_['alpha']
lasso_tuned = Lasso(alpha = best_param)
lasso_tuned.fit(X,y)
metrics = cross_validate(lasso_tuned, X,y, cv = 5, scoring = ('neg_root_mean_squared_error'))


Analysis of tuned lasso regression model

In [11]:
print('BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:')
print(best_param)
print(-tuned_lasso.best_score_)
print()
model_coefs = list(lasso_tuned.coef_)

good_features = []
for i in range(len(model_coefs)):
    if model_coefs[i]> 0:
        good_features.append(list(X.columns)[i])

print('\n'+'FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO')
print(good_features)

BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:
0.0005
0.13375088484596281


FEATURES WITH NON ZERO COEFFICIENTS FOR LASSO
['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'MSSubClass_20', 'MSSubClass_50', 'MSZoning_RL', 'LotShape_IR2', 'LotConfig_Corner', 'LotConfig_CulDSac', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Condition1_Norm', 'Condition1_RRAn', 'BldgType_1Fam', 'Exterior1st_BrkFace', 'Exterior1st_MetalSd', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_PConc', 'BsmtQual_Ex', 'BsmtCond_TA', 'BsmtExposure_Gd', 'BsmtFinType1_ALQ', 'BsmtFinType1_GLQ

In [146]:
for i in good_features:
    if '_' in i:
        print(i, end = ', ')

MSSubClass_20, MSSubClass_50, MSZoning_RL, LotShape_IR2, LotConfig_Corner, LotConfig_CulDSac, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_Crawfor, Neighborhood_NoRidge, Neighborhood_NridgHt, Neighborhood_Somerst, Neighborhood_StoneBr, Condition1_Norm, Condition1_RRAn, BldgType_1Fam, Exterior1st_BrkFace, Exterior1st_MetalSd, Exterior1st_VinylSd, Exterior2nd_VinylSd, Exterior2nd_Wd Sdng, ExterQual_Gd, ExterCond_TA, Foundation_PConc, BsmtQual_Ex, BsmtCond_TA, BsmtExposure_Gd, BsmtFinType1_ALQ, BsmtFinType1_GLQ, BsmtFinType2_Unf, Heating_GasW, HeatingQC_Ex, HeatingQC_Gd, Electrical_FuseA, KitchenQual_Ex, Functional_Typ, GarageType_Attchd, GarageType_Detchd, GarageQual_Gd, GarageCond_TA, PavedDrive_Y, MoSold_5, MoSold_6, MoSold_7, SaleCondition_Normal, 

We perform feature selection based on the coefficients of our lasso regression model, narrowing our features down from 273 to 66

The value of alpha that gives us the lowest CV prediction error is 0.0004. This alpha value is very low: when alpha is low it means that the result (model coefficients) become similar to that of the linear regression model. Because the penalty that is placed on the coefficients in the optimisation function is lower

Interesting selection of features: Based on the one hot encoding, not all were important. For example, for months, if variable was in month 5,6,7 seemed to have an impact. This incentivizes us to think that perhaps doing some feature engineering might improve the results. Change the classes found within some of the features which are categorical variables.

## Reworking based on feature selection

In [147]:
new_feature_set = X[good_features]
lr_model_selected = LinearRegression()
metrics = cross_validate(lr_model_selected, new_feature_set, y, cv = 5,scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR NEW FEATURE SET MULTIVARIATE LR')
-metrics['test_score'].mean()

CV PREDICTION ERROR FOR NEW FEATURE SET MULTIVARIATE LR


0.1363506795272889

Running a linear regression model based on feature selection with our lasso regression model helps us derive a significantly lower CV prediction error at 0.136350

In [149]:
param_list = []
start = 4
for i in range(30):
    param_list.append(start)
    start = start + 0.1
#     start = start +

parameters = {'alpha': param_list} 
rr = Ridge()
metrics = cross_validate(rr, X[good_features], y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('CV PREDICTION ERROR FOR UNTUNED RR')
print(-metrics['test_score'].mean())

tuned_rr = GridSearchCV(rr, parameters, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_rr.fit(X[good_features],y)
print('CV PREDICTION ERROR FOR TUNED RR')
print(-tuned_rr.best_score_)
print(tuned_rr.best_params_)

CV PREDICTION ERROR FOR UNTUNED RR
0.13615168713871634
CV PREDICTION ERROR FOR TUNED RR
0.1358895272237749
{'alpha': 5.199999999999996}


We see little difference between our original ridge regression model and the new ridge regression model. CV prediction model is very similar, slight improvement when we incorporate feature selection

In [155]:
param_list = []
start = 0.0001
for i in range(30):
    param_list.append(start)
    start = start + 0.0001
parameters2 = {'alpha': param_list}
lasso = Lasso()
lasso.fit(new_feature_set,y)
metrics = cross_validate(lasso, new_feature_set,y, cv = 5, scoring = ('neg_root_mean_squared_error'))
print('SCORE FOR UNTUNED LASSO')
print(-metrics['test_score'].mean())

SCORE FOR UNTUNED LASSO
0.3992282792085989


In [153]:
tuned_lasso = GridSearchCV(lasso, parameters2, scoring = 'neg_root_mean_squared_error', cv = 5)
tuned_lasso.fit(new_feature_set,y)
best_param = tuned_lasso.best_params_['alpha']
lasso_tuned = Lasso(alpha = best_param)
lasso_tuned.fit(X,y)
metrics = cross_validate(lasso_tuned, new_feature_set,y, cv = 5, scoring = ('neg_root_mean_squared_error'))


In [156]:
print('BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:')
print(best_param)
print(-tuned_lasso.best_score_)
print()
model_coefs = list(lasso_tuned.coef_)

BASED ON CV, WE FOUND OPTIMAL ALPHA TO BE:
0.0002
0.1361328789193335



We found that the CV prediction error does not change after we include feature selection. This is because of the mechanism of the lasso regression, whereby insignificant variables are already shrunk towards 0

## Random Forest

In [158]:
rf = RandomForestRegressor(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1, oob_score = True, bootstrap = True)
rf.fit(label_X, y)
print(rf.get_params())
print('\n This is the oob score: ', rf.oob_score_)
rf_features = []

for name, score in zip(list(label_X.columns),rf.feature_importances_):
    if score > 0:
#         print(name,score)
        rf_features.append((name,score))


{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': 16, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': True, 'random_state': None, 'verbose': 0, 'warm_start': False}

 This is the oob score:  0.8121027361263089


In [159]:
rf_features.sort(key = lambda x:x[1], reverse = True)
rf_features

[('OverallQual', 0.665821012926399),
 ('GrLivArea', 0.11319715527923772),
 ('GarageCars', 0.051139507510936266),
 ('TotalBsmtSF', 0.042138666143041875),
 ('GarageArea', 0.0225711109943079),
 ('1stFlrSF', 0.01677671817892325),
 ('CentralAir', 0.013689246078000417),
 ('BsmtFinSF1', 0.012097073784310838),
 ('YearBuilt', 0.011694318905487168),
 ('GarageType', 0.010332965288313966),
 ('MSZoning', 0.006169996959914037),
 ('OverallCond', 0.005829875773830595),
 ('Fireplaces', 0.00478158055565348),
 ('LotArea', 0.0032406192086300635),
 ('2ndFlrSF', 0.002549869963349421),
 ('YearRemodAdd', 0.001999980907294007),
 ('BsmtQual', 0.0016517997813276057),
 ('KitchenAbvGr', 0.0013285668278279449),
 ('FullBath', 0.0011972895141757205),
 ('KitchenQual', 0.00102208167417436),
 ('Neighborhood', 0.0009461010438868233),
 ('PavedDrive', 0.0008789794047336572),
 ('LotShape', 0.0008760005713518509),
 ('GarageQual', 0.0007916029271450408),
 ('ExterQual', 0.0006951435314089695),
 ('ExterCond', 0.0006597847371302

In [120]:
print('LASSO REGRESSION FEATURES SELECTED: \n', good_features)
print()
print('FEATURES USED TO SPLIT RF: \n', rf_features)

LASSO REGRESSION FEATURES SELECTED: 
 ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'MSSubClass_20', 'MSSubClass_50', 'MSZoning_RL', 'LotShape_IR2', 'LotConfig_Corner', 'LotConfig_CulDSac', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_Crawfor', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Condition1_Norm', 'Condition1_RRAn', 'BldgType_1Fam', 'Exterior1st_BrkFace', 'Exterior1st_MetalSd', 'Exterior1st_VinylSd', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_PConc', 'BsmtQual_Ex', 'BsmtCond_TA', 'BsmtExposure_Gd', 'BsmtFinType1_ALQ', 'BsmtFinType1_GLQ', 'BsmtFinType2_Unf', 'Heating_GasW', 'HeatingQC_Ex', 'HeatingQC_Gd', 'Electric

Running our analysis we can see very similar features being used to split our decision trees, and the features selected by the LASSO regression model

## Tuning of RF model

Using the OOB score to evaluate the random forest model gives us an extremely high OOB r^2 score. 
Based on the model, we also see that many features have very low importance, and this incentivizes us to revisit the variable selection methodology that we adopted earlier.

See if these features are similar to the features that selected based on the lasso regression model

We try hyperparameter search using random hyperparameter grid, so that we can find optimum hyperparamters for our random forest model

In [98]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num =10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num =11)]
max_depth.append(None)
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [100]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv =5, verbose = 2, random_state = 42, n_jobs = -1, scoring = 'neg_root_mean_squared_error')
rf_random.fit(label_X, y)
print('THIS IS THE BEST SCORE')
print(-rf_random.best_score_)
print('THIS IS THE BEST PARAMS')
print(rf_random.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  3.6min finished


THIS IS THE BEST SCORE
0.13586182777048264
THIS IS THE BEST PARAMS
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}


We narrowed down the scope of the analysis. We can now adopt grid search with cross validation to do a more in depth search of the optimum hyperparameters

Using our paramaters, we derived CV prediction error of 0.13586, based on 5 fold CV.

In [91]:

param_grid = {
    'max_depth': [20,25,15],
    'min_samples_leaf': [1],
    'min_samples_split': [2],
    'max_features' : ['sqrt'],
    'n_estimators': [800,850, 750]
#     'bootstrap': [False]
}


In [160]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 5, n_jobs= -1, verbose = 2, scoring = 'neg_root_mean_squared_error')
grid_search.fit(X,y)
# print(grid_search.best_params_)
print('SCORE FOR RF')
print(-grid_search.best_score_)

NameError: name 'param_grid' is not defined

## Other concerns and questions

1. Incorporating variable selection into our model 
2. On the internet, everybody is approaching with validation set approach so that they can check whether there is overfitting. How can we check for overfitting when we use a random forest model?
3. Why do people still use the validation set approach when using cross validation is supposed to help with this? You check across the different parameters to see which set of parameters gives you the lowest CV error