## Import Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import  LassoCV, RidgeCV,ElasticNet
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import  RFE
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
X_top = pd.read_csv('../datasets/X_top.csv',index_col='Id') 
y = pd.read_csv('../datasets/y.csv',index_col='Id') 


In [3]:
X_top

Unnamed: 0_level_0,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin SF 1,Total Bsmt SF,Heating QC,...,house age,reno newness,MS SubClass_SC60,MS Zoning_RM,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_StoneBr,Year Built_2008,Roof Style_Gable,Roof Style_Hip
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,13517,3,6,289.0,4,3,0,533.0,725.0,5,...,34,5,1,0,0,0,0,0,1,0
544,11492,3,7,132.0,4,4,0,637.0,913.0,5,...,13,12,1,0,0,0,0,0,1,0
153,7922,4,5,0.0,3,3,0,731.0,1057.0,3,...,57,3,0,0,0,0,0,0,1,0
318,9802,4,5,0.0,3,4,0,0.0,384.0,4,...,4,3,1,0,0,0,0,0,1,0
255,14235,3,6,0.0,3,2,0,0.0,676.0,3,...,110,17,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,11449,3,8,0.0,4,4,2,1011.0,1884.0,5,...,1,1,0,0,0,0,0,0,1,0
785,12342,3,4,0.0,3,3,0,262.0,861.0,5,...,69,59,0,0,0,0,0,0,1,0
916,7558,4,6,0.0,3,3,0,0.0,896.0,4,...,81,59,0,0,0,0,0,0,1,0
639,10400,4,4,0.0,3,3,0,155.0,1200.0,3,...,53,53,0,0,0,0,0,0,1,0


In [4]:
y= y["SalePrice"]

In [5]:
X_top_train, X_top_test, y_top_train, y_top_test = train_test_split(X_top,y,test_size = 0.2, random_state=42)

In [6]:
ss = StandardScaler()
ss.fit(X_top_train)

X_top_train_ss = ss.transform(X_top_train)
X_top_test_ss = ss.transform(X_top_test)

## Lasso Model: Pipeline, Tuning and Evaluation

In [7]:
# Instantiate pipeline object.

pipe_l = Pipeline([('ss', StandardScaler()), ('lasso', LassoCV())])

In [8]:
# Define dictionary of hyperparameters.

pipe_l_params = {'ss__with_mean': [True, False], 
                'ss__with_std': [True, False],
                'lasso__n_alphas': [200],
                 'lasso__cv' : [3,5,10],
                'lasso__normalize': [True, False]}

In [9]:
# Instantiate our GridSearchCV object.

gs_l = GridSearchCV(pipe_l, # What is the model we want to fit?
                               pipe_l_params, # What is the dictionary of hyperparameters?
                                 cv = 10,# What number of folds in CV will we use?
                                verbose=1)

In [10]:
gs_l.fit(X_top_train_ss, y_top_train);

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  1.1min finished


In [11]:
gs_l.score(X_top_train_ss, y_top_train)

0.8912272830399843

In [12]:
gs_l.score(X_top_test_ss, y_top_test)

0.8849294707228798

In [13]:
print('RMSE:', np.sqrt(mean_squared_error(y_top_test, gs_l.predict(X_top_test_ss))))

RMSE: 26199.13320425627


## Ridge Model : Pipeline, Tuning and Evaluation

In [14]:
# Instantiate pipeline object.

pipe_r = Pipeline([('ss', StandardScaler()), ('ridge', RidgeCV())])

In [15]:
# Define dictionary of hyperparameters.

pipe_r_params = {'ss__with_mean': [True, False], 
                'ss__with_std': [True, False],

                 'ridge__cv' : [3,5,10],
                'ridge__normalize': [True, False]}

In [16]:
# Instantiate our GridSearchCV object.

gs_r = GridSearchCV(pipe_r, pipe_r_params, cv = 10, verbose=1)

In [17]:
gs_r.fit(X_top_train_ss, y_top_train);

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   12.7s finished


In [18]:
gs_r.score(X_top_train_ss, y_top_train)

0.8913843400271821

In [19]:
gs_r.score(X_top_test_ss, y_top_test)

0.8854439515653612

In [20]:
print('RMSE:', np.sqrt(mean_squared_error(y_top_test, gs_r.predict(X_top_test_ss))))

RMSE: 26140.499372917016


## ElasticNet Model: Pipeline, Tuning and Evaluation

In [21]:
# Instantiate pipeline object.
pipe_e = Pipeline([('ss', StandardScaler()), ('enet', ElasticNet())])



In [22]:
# Define dictionary of hyperparameters.

pipe_e_params = {'ss__with_mean': [True, False], 
                'ss__with_std': [True, False],
                'enet__alpha': np.arange(0.01, 1.0, 0.05),
                'enet__l1_ratio': np.linspace(0.01, 1.0, 25),
                 'enet__normalize': [True, False],
                }



In [23]:
# Instantiate our GridSearchCV object.

gs_e = GridSearchCV(pipe_e, # What is the model we want to fit?
                               pipe_e_params, # What is the dictionary of hyperparameters?
                                 cv = 10,# What number of folds in CV will we use?
                                verbose=1)

In [24]:
gs_e.fit(X_top_train_ss, y_top_train);

Fitting 10 folds for each of 4000 candidates, totalling 40000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 40000 out of 40000 | elapsed:  3.8min finished


In [25]:
gs_e.score(X_top_train_ss, y_top_train)

0.8910928842580639

In [26]:
gs_e.score(X_top_test_ss, y_top_test)

0.8866403045116943

In [27]:
print('RMSE:', np.sqrt(mean_squared_error(y_top_test, gs_e.predict(X_top_test_ss))))

RMSE: 26003.64347264249


## Model Evaluation

Lasso, Ridge and ElasticNet models all performed better than the baseline, Linear Regressio model.

All 3 models performed slightly better on the training sets than on the test sets which indicates slight overfitting.

ElasticNet model had the highest R2 score out of the 3 models and the lowest root mean square error.

The ElasticNet model is selected to be the production model