In [1]:
#Import basic EDA and Visualisation libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
housing_train = pd.read_csv('datasets/Train_Cleaned.csv')

In [3]:
housing_train.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin Type 1,...,Exterior 1st_Stucco,Exterior 1st_VinylSd,Exterior 1st_Wd Sdng,Exterior 1st_WdShing,Roof Style_Flat,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed
0,109,68.0,13517,2,6,289.0,3,3,1,6,...,0,0,0,0,0,1,0,0,0,0
1,544,43.0,11492,2,7,132.0,3,4,1,6,...,0,1,0,0,0,1,0,0,0,0
2,153,68.0,7922,3,5,0.0,2,3,1,6,...,0,1,0,0,0,1,0,0,0,0
3,318,73.0,9802,3,5,0.0,2,4,1,1,...,0,1,0,0,0,1,0,0,0,0
4,255,82.0,14235,2,6,0.0,2,2,1,1,...,0,0,1,0,0,1,0,0,0,0


In [4]:
housing_train.set_index('Id', inplace = True)

In [5]:
housing_train.shape

(1969, 178)

In [6]:
housing_test = pd.read_csv('datasets/Test_Cleaned.csv')
housing_test.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin Type 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,69.0,9142,3,6,0.0,2,2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2718,69.6,9662,2,5,0.0,2,4,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2414,58.0,17104,2,7,0.0,3,4,3,6,...,0,0,0,0,0,0,1,0,0,0
3,1989,60.0,8520,3,5,0.0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,1
4,625,69.6,9500,2,6,247.0,2,4,1,4,...,0,0,0,0,0,0,0,0,0,1


In [7]:
housing_test.set_index('Id', inplace = True)
housing_test.shape

(879, 177)

In [8]:
test_features = housing_test.columns.to_list()
test_features

['Lot Frontage',
 'Lot Area',
 'Lot Shape',
 'Overall Qual',
 'Mas Vnr Area',
 'Exter Qual',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'Total Bsmt SF',
 'Heating QC',
 '1st Flr SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Kitchen Qual',
 'TotRms AbvGrd',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Finish',
 'Garage Cars',
 'Garage Area',
 'Garage Qual',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 'Screen Porch',
 'Fence',
 'Mo Sold',
 'Garage Age',
 'House Age',
 'House Remod Yrs',
 'Bldg Type_1Fam',
 'Bldg Type_2fmCon',
 'Bldg Type_Duplex',
 'Bldg Type_Twnhs',
 'Bldg Type_TwnhsE',
 'Condition 1_Artery',
 'Condition 1_Feedr',
 'Condition 1_Norm',
 'Condition 1_PosA',
 'Condition 1_PosN',
 'Condition 1_RRAe',
 'Condition 1_RRAn',
 'Condition 1_RRNe',
 'Condition 1_RRNn',
 'Exterior 1st_AsbShng',
 'Exterior 1st_AsphShn',
 'Exterior 1st_BrkComm',
 'Exterior 1st_BrkFace',
 'Ex

In [9]:
train_features = housing_train.columns.to_list()
train_features

['Lot Frontage',
 'Lot Area',
 'Lot Shape',
 'Overall Qual',
 'Mas Vnr Area',
 'Exter Qual',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin Type 1',
 'BsmtFin SF 1',
 'Total Bsmt SF',
 'Heating QC',
 '1st Flr SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'Kitchen Qual',
 'TotRms AbvGrd',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Finish',
 'Garage Cars',
 'Garage Area',
 'Garage Qual',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 'Screen Porch',
 'Fence',
 'Mo Sold',
 'SalePrice',
 'Garage Age',
 'House Age',
 'House Remod Yrs',
 'Bldg Type_1Fam',
 'Bldg Type_2fmCon',
 'Bldg Type_Duplex',
 'Bldg Type_Twnhs',
 'Bldg Type_TwnhsE',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_Greens',
 'Neighborhood_GrnHill',
 'Neighbo

In [10]:
len(train_features)

178

In [11]:
len(test_features)

177

In [12]:
common_features = set(test_features).intersection(set(train_features))
common_features

{'1st Flr SF',
 'Bedroom AbvGr',
 'Bldg Type_1Fam',
 'Bldg Type_2fmCon',
 'Bldg Type_Duplex',
 'Bldg Type_Twnhs',
 'Bldg Type_TwnhsE',
 'Bsmt Exposure',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Bsmt Qual',
 'BsmtFin SF 1',
 'BsmtFin Type 1',
 'Condition 1_Artery',
 'Condition 1_Feedr',
 'Condition 1_Norm',
 'Condition 1_PosA',
 'Condition 1_PosN',
 'Condition 1_RRAe',
 'Condition 1_RRAn',
 'Condition 1_RRNe',
 'Condition 1_RRNn',
 'Enclosed Porch',
 'Exter Qual',
 'Exterior 1st_AsbShng',
 'Exterior 1st_AsphShn',
 'Exterior 1st_BrkComm',
 'Exterior 1st_BrkFace',
 'Exterior 1st_CemntBd',
 'Exterior 1st_HdBoard',
 'Exterior 1st_MetalSd',
 'Exterior 1st_Plywood',
 'Exterior 1st_Stucco',
 'Exterior 1st_VinylSd',
 'Exterior 1st_Wd Sdng',
 'Exterior 1st_WdShing',
 'Exterior 2nd_AsbShng',
 'Exterior 2nd_AsphShn',
 'Exterior 2nd_Brk Cmn',
 'Exterior 2nd_BrkFace',
 'Exterior 2nd_CBlock',
 'Exterior 2nd_CmentBd',
 'Exterior 2nd_HdBoard',
 'Exterior 2nd_ImStucc',
 'Exterior 2nd_MetalSd',
 'Exterior

In [13]:
len(common_features)

170

In [14]:
common_features = list(common_features)

In [15]:
X = housing_train[common_features]
X.shape

(1969, 170)

In [16]:
y = housing_train['SalePrice']
y.shape

(1969,)

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [18]:
X_train.shape

(1476, 170)

In [19]:
X_val.shape

(493, 170)

In [20]:
ss = StandardScaler()
ss.fit(X_train)

X_train_scaled = ss.transform(X_train)
X_val_scaled = ss.transform(X_val)

In [21]:
X_train_scaled.shape

(1476, 170)

In [22]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

LinearRegression()

In [23]:
lr_cv_scores = cross_val_score(lr, X_train_scaled, y_train, cv = 10)
lr_cv_scores.mean()

-4.9538109606398114e+22

In [24]:
ridge = RidgeCV(alphas = np.logspace(0, 5, 1000))
ridge.fit(X_train_scaled, y_train)
ridge.alpha_

391.3745601980384

In [25]:
ridge_cv_scores = cross_val_score(ridge, X_train_scaled, y_train, cv = 5)
ridge_cv_scores.mean()

0.839363164749256

In [35]:
lasso = LassoCV(n_alphas = 200)
lasso.fit(X_train_scaled, y_train)

LassoCV(n_alphas=200)

In [36]:
lasso_cv_scores = cross_val_score(lasso, X_train_scaled, y_train, cv = 5)
lasso_cv_scores.mean()

0.840585853584703

In [37]:
len(lasso.coef_)

170

In [38]:
lasso_coef = pd.Series(lasso.coef_, index = X.columns).sort_values(ascending = False).to_frame(name = 'Coef')

In [50]:
lasso_coef.tail()

Unnamed: 0,Coef
Roof Style_Mansard,-1123.203958
Exterior 1st_Stucco,-1181.697856
Land Contour_Bnk,-1942.958593
House Remod Yrs,-2342.348242
Neighborhood_Edwards,-2440.17765


In [51]:
lasso_coef.head(25)

Unnamed: 0,Coef
Gr Liv Area,15569.638946
Overall Qual,13999.544956
Neighborhood_NridgHt,10389.454231
Neighborhood_StoneBr,7435.131918
Neighborhood_NoRidge,6733.185481
Garage Cars,6125.419284
Exter Qual,5821.874861
Bsmt Exposure,5774.984453
Kitchen Qual,5653.038817
Bldg Type_1Fam,5316.001884


In [52]:
coef_cond = ((lasso_coef['Coef'] > 2000) | (lasso_coef['Coef'] < (-1000)))

In [48]:
significant_features = lasso_coef[coef_cond].index.to_list()
significant_features

['Gr Liv Area',
 'Overall Qual',
 'Neighborhood_NridgHt',
 'Neighborhood_StoneBr',
 'Neighborhood_NoRidge',
 'Garage Cars',
 'Exter Qual',
 'Bsmt Exposure',
 'Kitchen Qual',
 'Bldg Type_1Fam',
 'Bsmt Qual',
 'Screen Porch',
 'BsmtFin Type 1',
 'Sale Type_New',
 'Bsmt Full Bath',
 'Roof Style_Hip',
 'Lot Area',
 'TotRms AbvGrd',
 'Fireplace Qu',
 'Neighborhood_Crawfor',
 'MS SubClass_20',
 'Mas Vnr Area',
 'Exterior 1st_BrkFace',
 'Full Bath',
 'House Style_SLvl',
 'Bldg Type_Twnhs',
 'Exterior 2nd_Stucco',
 'Roof Style_Mansard',
 'Exterior 1st_Stucco',
 'Land Contour_Bnk',
 'House Remod Yrs',
 'Neighborhood_Edwards']

In [53]:
len(significant_features)

32