In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

%matplotlib inline

In [2]:
housing = pd.read_csv('datasets/train.csv', index_col='Id') 

In [3]:
housing_test=pd.read_csv('datasets/test.csv',index_col='Id')

In [4]:
housing.columns

Index(['PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish'

In [None]:
#housing =pd.get_dummies(housing, columns =["Neighborhood"], drop_first = True)

In [5]:
housing.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,130500
544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2009,WD,220000
153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,1,2010,WD,109000
318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,174000
255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,138500


##### Conversion of NA values

In [6]:
housing.loc[:, "Alley"] = housing.loc[:, "Alley"].fillna("None")

housing.loc[:, "BedroomA bvGr"] = housing.loc[:, "Bedroom AbvGr"].fillna(0)

housing.loc[:, "Bsmt Qual"] = housing.loc[:, "Bsmt Qual"].fillna("No")
housing.loc[:, "Bsmt Cond"] = housing.loc[:, "Bsmt Cond"].fillna("No")
housing.loc[:, "Bsmt Exposure"] = housing.loc[:, "Bsmt Exposure"].fillna("No")
housing.loc[:, "BsmtFin Type 1"] = housing.loc[:, "BsmtFin Type 1"].fillna("No")
housing.loc[:, "BsmtFin Type 2"] = housing.loc[:, "BsmtFin Type 2"].fillna("No")
housing.loc[:, "Bsmt Full Bath"] = housing.loc[:, "Bsmt Full Bath"].fillna(0)
housing.loc[:, "Bsmt Half Bath"] = housing.loc[:, "Bsmt Half Bath"].fillna(0)
housing.loc[:, "Bsmt Unf SF"] = housing.loc[:, "Bsmt Unf SF"].fillna(0)

housing.loc[:, "Central Air"] = housing.loc[:, "Central Air"].fillna("N")

housing.loc[:, "Condition 1"] = housing.loc[:, "Condition 1"].fillna("Norm")
housing.loc[:, "Condition 2"] = housing.loc[:, "Condition 2"].fillna("Norm")

housing.loc[:, "Enclosed Porch"] = housing.loc[:, "Enclosed Porch"].fillna(0)

housing.loc[:, "Exter Cond"] = housing.loc[:, "Exter Cond"].fillna("TA")
housing.loc[:, "Exter Qual"] = housing.loc[:, "Exter Qual"].fillna("TA")

housing.loc[:, "Fence"] = housing.loc[:, "Fence"].fillna("No")

housing.loc[:, "Fireplace Qu"] = housing.loc[:, "Fireplace Qu"].fillna("No")
housing.loc[:, "Fireplaces"] = housing.loc[:, "Fireplaces"].fillna(0)

housing.loc[:, "Functional"] = housing.loc[:, "Functional"].fillna("Typ")

housing.loc[:, "Garage Type"] = housing.loc[:, "Garage Type"].fillna("No")
housing.loc[:, "Garage Finish"] = housing.loc[:, "Garage Finish"].fillna("No")
housing.loc[:, "Garage Qual"] = housing.loc[:, "Garage Qual"].fillna("No")
housing.loc[:, "Garage Cond"] = housing.loc[:, "Garage Cond"].fillna("No")
housing.loc[:, "Garage Area"] = housing.loc[:, "Garage Area"].fillna(0)
housing.loc[:, "Garage Cars"] = housing.loc[:, "Garage Cars"].fillna(0)

housing.loc[:, "Half Bath"] = housing.loc[:, "Half Bath"].fillna(0)

housing.loc[:, "Heating QC"] = housing.loc[:, "Heating QC"].fillna("TA")

housing.loc[:, "Kitchen AbvGr"] = housing.loc[:, "Kitchen AbvGr"].fillna(0)

housing.loc[:, "Kitchen Qual"] = housing.loc[:, "Kitchen Qual"].fillna("TA")

housing.loc[:, "Lot Frontage"] = housing.loc[:, "Lot Frontage"].fillna(0)

housing.loc[:, "Lot Shape"] = housing.loc[:, "Lot Shape"].fillna("Reg")

housing.loc[:, "Mas Vnr Type"] = housing.loc[:, "Mas Vnr Type"].fillna("None")
housing.loc[:, "Mas Vnr Area"] = housing.loc[:, "Mas Vnr Area"].fillna(0)

housing.loc[:, "Misc Feature"] = housing.loc[:, "Misc Feature"].fillna("No")
housing.loc[:, "Misc Val"] = housing.loc[:, "Misc Val"].fillna(0)

housing.loc[:, "Open Porch SF"] = housing.loc[:, "Open Porch SF"].fillna(0)

housing.loc[:, "Paved Drive"] = housing.loc[:, "Paved Drive"].fillna("N")

housing.loc[:, "Pool QC"] = housing.loc[:, "Pool QC"].fillna("No")
housing.loc[:, "Pool Area"] = housing.loc[:, "Pool Area"].fillna(0)

housing.loc[:, "Overall Cond"] = housing.loc[:, "Overall Cond"].fillna("Normal")

housing.loc[:, "Screen Porch"] = housing.loc[:, "Screen Porch"].fillna(0)

housing.loc[:, "TotRms AbvGrd"] = housing.loc[:, "TotRms AbvGrd"].fillna(0)

housing.loc[:, "Utilities"] = housing.loc[:, "Utilities"].fillna("AllPub")

housing.loc[:, "Total Bsmt SF"] = housing.loc[:, "Total Bsmt SF"].fillna(0)
housing.loc[:, "BsmtFin SF 2"] = housing.loc[:, "BsmtFin SF 2"].fillna(0)
housing.loc[:, "BsmtFin SF 1"] = housing.loc[:, "BsmtFin SF 1"].fillna(0)

In [22]:

housing_test.loc[:, "Alley"] = housing_test.loc[:, "Alley"].fillna("None")

housing_test.loc[:, "BedroomA bvGr"] = housing_test.loc[:, "Bedroom AbvGr"].fillna(0)

housing_test.loc[:, "Bsmt Qual"] = housing_test.loc[:, "Bsmt Qual"].fillna("No")
housing_test.loc[:, "Bsmt Cond"] = housing_test.loc[:, "Bsmt Cond"].fillna("No")
housing_test.loc[:, "Bsmt Exposure"] = housing_test.loc[:, "Bsmt Exposure"].fillna("No")
housing_test.loc[:, "BsmtFin Type 1"] = housing_test.loc[:, "BsmtFin Type 1"].fillna("No")
housing_test.loc[:, "BsmtFin Type 2"] = housing_test.loc[:, "BsmtFin Type 2"].fillna("No")
housing_test.loc[:, "Bsmt Full Bath"] = housing_test.loc[:, "Bsmt Full Bath"].fillna(0)
housing_test.loc[:, "Bsmt Half Bath"] = housing_test.loc[:, "Bsmt Half Bath"].fillna(0)
housing_test.loc[:, "Bsmt Unf SF"] = housing_test.loc[:, "Bsmt Unf SF"].fillna(0)

housing_test.loc[:, "Central Air"] = housing_test.loc[:, "Central Air"].fillna("N")

housing_test.loc[:, "Condition 1"] = housing_test.loc[:, "Condition 1"].fillna("Norm")
housing_test.loc[:, "Condition 2"] = housing_test.loc[:, "Condition 2"].fillna("Norm")

housing_test.loc[:, "Enclosed Porch"] = housing_test.loc[:, "Enclosed Porch"].fillna(0)

housing_test.loc[:, "Exter Cond"] = housing_test.loc[:, "Exter Cond"].fillna("TA")
housing_test.loc[:, "Exter Qual"] = housing_test.loc[:, "Exter Qual"].fillna("TA")

housing_test.loc[:, "Fence"] = housing_test.loc[:, "Fence"].fillna("No")

housing_test.loc[:, "Fireplace Qu"] = housing_test.loc[:, "Fireplace Qu"].fillna("No")
housing_test.loc[:, "Fireplaces"] = housing_test.loc[:, "Fireplaces"].fillna(0)

housing_test.loc[:, "Functional"] = housing_test.loc[:, "Functional"].fillna("Typ")

housing_test.loc[:, "Garage Type"] = housing_test.loc[:, "Garage Type"].fillna("No")
housing_test.loc[:, "Garage Finish"] = housing_test.loc[:, "Garage Finish"].fillna("No")
housing_test.loc[:, "Garage Qual"] = housing_test.loc[:, "Garage Qual"].fillna("No")
housing_test.loc[:, "Garage Cond"] = housing_test.loc[:, "Garage Cond"].fillna("No")
housing_test.loc[:, "Garage Area"] = housing_test.loc[:, "Garage Area"].fillna(0)
housing_test.loc[:, "Garage Cars"] = housing_test.loc[:, "Garage Cars"].fillna(0)

housing_test.loc[:, "Half Bath"] = housing_test.loc[:, "Half Bath"].fillna(0)

housing_test.loc[:, "Heating QC"] = housing_test.loc[:, "Heating QC"].fillna("TA")

housing_test.loc[:, "Kitchen AbvGr"] = housing_test.loc[:, "Kitchen AbvGr"].fillna(0)

housing_test.loc[:, "Kitchen Qual"] = housing_test.loc[:, "Kitchen Qual"].fillna("TA")

housing_test.loc[:, "Lot Frontage"] = housing_test.loc[:, "Lot Frontage"].fillna(0)

housing_test.loc[:, "Lot Shape"] = housing_test.loc[:, "Lot Shape"].fillna("Reg")

housing_test.loc[:, "Mas Vnr Type"] = housing_test.loc[:, "Mas Vnr Type"].fillna("None")
housing_test.loc[:, "Mas Vnr Area"] = housing_test.loc[:, "Mas Vnr Area"].fillna(0)

housing_test.loc[:, "Misc Feature"] = housing_test.loc[:, "Misc Feature"].fillna("No")
housing_test.loc[:, "Misc Val"] = housing_test.loc[:, "Misc Val"].fillna(0)

housing_test.loc[:, "Open Porch SF"] = housing_test.loc[:, "Open Porch SF"].fillna(0)

housing_test.loc[:, "Paved Drive"] = housing_test.loc[:, "Paved Drive"].fillna("N")

housing_test.loc[:, "Pool QC"] = housing_test.loc[:, "Pool QC"].fillna("No")
housing_test.loc[:, "Pool Area"] = housing_test.loc[:, "Pool Area"].fillna(0)

housing_test.loc[:, "Overall Cond"] = housing_test.loc[:, "Overall Cond"].fillna("Normal")

housing_test.loc[:, "Screen Porch"] = housing_test.loc[:, "Screen Porch"].fillna(0)

housing_test.loc[:, "TotRms AbvGrd"] = housing_test.loc[:, "TotRms AbvGrd"].fillna(0)

housing_test.loc[:, "Utilities"] = housing_test.loc[:, "Utilities"].fillna("AllPub")

housing_test.loc[:, "Total Bsmt SF"] = housing_test.loc[:, "Total Bsmt SF"].fillna(0)
housing_test.loc[:, "BsmtFin SF 2"] = housing_test.loc[:, "BsmtFin SF 2"].fillna(0)
housing_test.loc[:, "BsmtFin SF 1"] = housing_test.loc[:, "BsmtFin SF 1"].fillna(0)

#### Conversion of Garage Yr Built NA values to corresponding Year Built, assumption garage built same time as house

In [7]:
housing['Garage Yr Blt'] = housing.apply(lambda row: row['Year Built'] 
                                     if np.isnan(row['Garage Yr Blt']) else row['Garage Yr Blt'],axis =1)

In [None]:
housing_test['Garage Yr Blt'] = housing_test.apply(lambda row: row['Year Built'] 
                                     if np.isnan(row['Garage Yr Blt']) else row['Garage Yr Blt'],axis =1)

#### Encoding ordinal variables

In [23]:
housing_test = housing_test.replace({"MS SubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 
                                       50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 
                                       80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 
                                       150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"},
                       "Mo Sold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun",
                                   7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"}
                      })

In [24]:
housing_test = housing_test.replace({"Alley" : {"Grvl" : 1, "Pave" : 2},
                       "Bsmt Cond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Bsmt Exposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                       "BsmtFin Type1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFin Type2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "Bsmt Qual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "Exter Cond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Exter Qual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Fireplace Qu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, 
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                       "Garage Cond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Garage Qual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Heating QC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Kitchen Qual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Land Slope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                       "Lot Shape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                       "Paved Drive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "Pool QC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 1, "Pave" : 2},
                       "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}},
                     )

In [8]:
housing = housing.replace({"MS SubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 
                                       50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 
                                       80 : "SC80", 85 : "SC85", 90 : "SC90", 120 : "SC120", 
                                       150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"},
                       "Mo Sold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun",
                                   7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"}
                      })

In [9]:
housing = housing.replace({"Alley" : {"Grvl" : 1, "Pave" : 2},
                       "Bsmt Cond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Bsmt Exposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                       "BsmtFin Type1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFin Type2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 
                                         "ALQ" : 5, "GLQ" : 6},
                       "Bsmt Qual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "Exter Cond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Exter Qual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "Fireplace Qu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, 
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                       "Garage Cond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Garage Qual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Heating QC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Kitchen Qual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Land Slope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                       "Lot Shape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                       "Paved Drive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "Pool QC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 1, "Pave" : 2},
                       "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}},
                     )

##### Converting incorrect categorical values

In [10]:
housing[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']] = housing[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']].astype(str) 


In [25]:
housing_test[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']] = housing_test[['PID','Year Remod/Add','Year Built','Garage Yr Blt','Yr Sold']].astype(str)

BedroomA bvGr    0
Heating          0
Roof Style       0
Roof Matl        0
Exterior 1st     0
                ..
TotRms AbvGrd    0
Functional       0
Fireplaces       0
Fireplace Qu     0
PID              0
Length: 81, dtype: int64

In [None]:
categorical_features = housing.select_dtypes(include = ["object"]).columns
categorical_features

#### dropping PID as it does not give any impactful info

In [12]:
housing.drop(['PID'] , axis =1, inplace = True)

In [None]:
housing_test.drop(['PID'] , axis =1, inplace = True)

#### Getting rid of outliers

In [13]:
Q1 = housing.quantile(0.25)
Q3 = housing.quantile(0.75)
IQR = Q3 - Q1

In [14]:
housing= housing[~((housing < (Q1 - 1.5 * IQR)) |(housing > (Q3 + 1.5 * IQR))).any(axis=1)]

In [None]:
categorical_features = housing.select_dtypes(include = ["object"]).columns
categorical_features

In [None]:
#Q1_test = housing_test.quantile(0.25)
#Q3_test = housing_test.quantile(0.75)
#IQR_test = Q3_test - Q1_test

In [None]:
#housing_test= housing_test[~((housing_test < (Q1_test - 1.5 * IQR_test)) |(housing_test > (Q3_test + 1.5 * IQR_test))).any(axis=1)]

#### One hot encoding of categorical date

In [15]:
housing =pd.get_dummies(housing, columns =['MS SubClass', 'MS Zoning', 'Alley', 'Land Contour', 'Lot Config',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Foundation', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Central Air', 'Electrical', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Fence', 'Misc Feature', 'Mo Sold', 'Yr Sold',
       'Sale Type'], drop_first = True)

In [26]:
housing_test =pd.get_dummies(housing_test, columns =['MS SubClass', 'MS Zoning', 'Alley', 'Land Contour', 'Lot Config',
       'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Foundation', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
       'Central Air', 'Electrical', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Fence', 'Misc Feature', 'Mo Sold', 'Yr Sold',
       'Sale Type'], drop_first = True)

In [None]:
#housing_test =pd.get_dummies(housing_test, columns =["Neighborhood"], drop_first = True)

In [None]:
categorical = housing.select_dtypes(include = ["object"]).columns

In [None]:
categorical_features

#### Finding and dropping variables that do not have high correlations to target variable "SalePrice"

In [16]:
cor =housing.corr()

In [17]:
cor_target = abs(cor['SalePrice'])

In [18]:
cor_target.sort_values(ascending = False).head(20)

SalePrice               1.000000
Overall Qual            0.811975
Gr Liv Area             0.747290
Garage Cars             0.722162
Kitchen Qual            0.711311
Garage Area             0.705318
Bsmt Qual               0.684001
Exter Qual              0.674655
Full Bath               0.626463
Foundation_PConc        0.602849
Foundation_CBlock       0.565617
TotRms AbvGrd           0.556682
Heating QC              0.534189
Garage Finish_Unf       0.530771
Total Bsmt SF           0.530662
1st Flr SF              0.525249
Exterior 1st_VinylSd    0.451722
Exterior 2nd_VinylSd    0.451636
BsmtFin Type 1_GLQ      0.445192
Fireplace Qu            0.443212
Name: SalePrice, dtype: float64

In [19]:
drop_columns = pd.DataFrame([cor_target[cor_target<0.5]])


In [20]:
housing =housing.drop(columns =drop_columns)

In [None]:
#cor_target.sort_values(ascending = False).head(20)

In [None]:
housing.shape

In [None]:
#housing.corr()
#cor_target = abs(housing[housing.columns[1:]].corr()['SalePrice'][:].sort_values(ascending = False))
#cor_target = abs(housing.corr("SalePrice"))
#Selecting highly correlated features
#cor_target

#### Remove columns that are not found in either the training data or submission data

In [21]:
features = [col for col in housing._get_numeric_data().columns if col != 'SalePrice']
X = housing[features]
y = housing['SalePrice']

In [27]:
features_test = [col for col in housing_test._get_numeric_data().columns if col != 'SalePrice']
X_sub = housing_test[features_test]


In [28]:
missing_features = list(set(X.columns.values)-set(X_sub.columns.values))

In [29]:
X = X.drop(missing_features, axis =1)

In [30]:
missing_features = list(set(X_sub.columns.values)-set(X.columns.values))
X_sub= X_sub.drop(missing_features, axis =1)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [32]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [33]:
lr = LinearRegression()

In [34]:
lasso = LassoCV(n_alphas=200)

In [35]:
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [36]:
lr_scores = cross_val_score(lr, X_train, y_train, cv=5)
lr_scores.mean()

0.8786783322973847

In [37]:
lasso_scores = cross_val_score(lasso, X_train, y_train, cv=3)
lasso_scores.mean()

0.8772360798733349

In [38]:
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=3)
ridge_scores.mean()

0.8771809689123883

In [39]:
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=200, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [40]:
lasso.score(X_test, y_test)

0.84444385269396

In [41]:
lasso_scores.mean()

0.8772360798733349

#### Prediction

In [None]:
X.shape

In [None]:
X_sub.shape

In [42]:
pred = lasso.predict(X_test)

In [43]:
pred.shape

(162,)

In [44]:
r2_score(y_test, pred)

0.84444385269396

In [45]:
mse =mean_squared_error(y_test, pred)

In [46]:
rmse = np.sqrt(mse)

In [47]:
rmse

18910.392121751065

In [48]:
pred_test = lasso.predict(X_sub)

In [49]:
pred.shape

(162,)

#### Creating submission CSV

In [50]:
submission = pd.DataFrame()

In [51]:
X_sub.shape

(879, 34)

In [52]:
housing_test.shape

(879, 487)

In [53]:
submission=pd.DataFrame(pred_test,columns=["SalePrice"],index = housing_test.index)

In [None]:
submission.head()

In [54]:
submission.to_csv("submission.csv")