
# 1. Read Training CSV file

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('training_set.csv' , na_values=['NA',''] , keep_default_na=False)
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# 2. Check missing values

In [4]:
m = df.isna().sum()
m

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [5]:
m[m>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
X = df.drop(columns=['Id','SalePrice'])
Y = df[['SalePrice']]

# 3. Decide Strategy for Con, Cat features , Cat most_frequent, constant



In [8]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [9]:
X

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [10]:
Y

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


# 5. Sklearn Pipeline  1. Feature selection (Ordinal Encode) 

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder , StandardScaler
from sklearn.compose import ColumnTransformer

In [12]:
num_pipe1 = Pipeline(steps=[('impute' , SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])

In [13]:
cat_pipe1 = Pipeline(steps=[('impute' , SimpleImputer(strategy='constant',fill_value='Not_Avail')),
                            ('ordinal', OrdinalEncoder())])

In [14]:
pre1 = ColumnTransformer([('num' ,num_pipe1, con),
                          ('cat', cat_pipe1 , cat)]).set_output(transform='pandas')

In [15]:
Xpre = pre1.fit_transform(X)
Xpre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
lr = LinearRegression()
sel = SequentialFeatureSelector(estimator=lr , n_features_to_select='auto',direction='backward')
sel.fit_transform(Xpre , Y)
sel_cols = sel.get_feature_names_out()
sel_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__YearRemodAdd',
       'num__MasVnrArea', 'num__BsmtFinSF1', 'num__BsmtFinSF2',
       'num__BsmtUnfSF', 'num__TotalBsmtSF', 'num__GrLivArea',
       'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__OpenPorchSF',
       'num__EnclosedPorch', 'num__ScreenPorch', 'num__PoolArea',
       'num__YrSold', 'cat__LandContour', 'cat__Neighborhood',
       'cat__HouseStyle', 'cat__RoofMatl', 'cat__Exterior1st',
       'cat__MasVnrType', 'cat__ExterQual', 'cat__BsmtQual',
       'cat__BsmtCond', 'cat__BsmtExposure', 'cat__HeatingQC',
       'cat__KitchenQual', 'cat__Functional', 'cat__FireplaceQu',
       'cat__GarageQual', 'cat__Fence', 'cat__MiscFeature',
       'cat__SaleCondition'], dtype=object)

In [17]:
len(sel_cols)

40

In [18]:
(sel_cols)[0]

'num__MSSubClass'

In [19]:
sel_cols[0].split('__')

['num', 'MSSubClass']

In [20]:
sel_cols[0].split('__')[1]

'MSSubClass'

In [21]:
imp_cols = []
for i in sel_cols:
    s = i.split('__')[1]
    imp_cols.append(s)

In [22]:
imp_cols

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'GrLivArea',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'LandContour',
 'Neighborhood',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageQual',
 'Fence',
 'MiscFeature',
 'SaleCondition']

In [23]:
len(imp_cols)

40

In [24]:
Xsel = X[imp_cols]
Xsel.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageQual,Fence,MiscFeature,SaleCondition
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,TA,No,Ex,Gd,Typ,,TA,,,Normal
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,TA,Gd,Ex,TA,Typ,TA,TA,,,Normal
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,TA,Mn,Ex,Gd,Typ,TA,TA,,,Normal
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,Gd,No,Gd,Gd,Typ,Gd,TA,,,Abnorml
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,TA,Av,Ex,Gd,Typ,TA,TA,,,Normal


# 5. Sklearn Pipeline   2. Final Pipeline(OneHotEncoder)

In [25]:
cat_sel = list(Xsel.columns[Xsel.dtypes=='object'])
con_sel = list(Xsel.columns[Xsel.dtypes!='object'])

In [26]:
cat_sel

['LandContour',
 'Neighborhood',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageQual',
 'Fence',
 'MiscFeature',
 'SaleCondition']

In [27]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
num_pipe2 = Pipeline(steps=[('impute',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [30]:
cat_pipe2 = Pipeline(steps=[('impute',SimpleImputer(strategy='constant' , fill_value='Not_Avail')),
                            ('ohe',OneHotEncoder(handle_unknown='ignore' , sparse_output=False))])

In [31]:
pre2 = ColumnTransformer([('num' , num_pipe2 , con_sel),
                          ('cat' , cat_pipe2 , cat_sel)]).set_output(transform='pandas')

In [32]:
pre2

In [33]:
Xsel_pre = pre2 . fit_transform(Xsel)
Xsel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,...,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,-0.944591,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,-0.641228,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,-0.301643,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,-0.06167,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,-0.174865,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# 6. Train Test Split

### 20% Test (Unseen to model)

In [34]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(Xsel_pre , Y , test_size=0.2 ,random_state= 21)


In [35]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,...,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
710,-0.636078,-0.640101,-2.241782,0.381743,-1.201217,0.878668,-0.57441,-0.973018,-0.288653,-0.672923,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1098,-0.163109,-0.452686,-1.518467,0.381743,-1.168096,-1.689368,-0.57441,0.500854,-0.288653,-1.284176,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1286,-0.872563,-0.072844,-0.071836,-0.5172,-0.273836,-1.059473,1.924104,0.274948,0.213629,0.250749,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
992,0.073375,-0.075851,-0.071836,2.179628,-0.240715,0.394133,1.30917,0.20257,0.436865,-0.901577,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
631,1.492282,-0.593999,1.374795,-0.5172,1.150356,1.024029,0.023903,-0.92038,-0.288653,2.179592,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [36]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [37]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,...,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
880,-0.872563,-0.350058,-0.795151,-0.5172,1.117235,1.024029,-0.57441,1.176379,-0.288653,-1.035147,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
605,0.073375,0.309002,0.651479,0.381743,-0.207594,0.248772,0.40062,0.022723,-0.288653,-0.573311,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1166,-0.872563,-0.004192,1.374795,-0.5172,1.216598,1.120936,-0.175535,-0.973018,-0.288653,2.550871,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
216,-0.872563,-0.207142,0.651479,-0.5172,1.084115,0.927122,0.899214,1.101808,-0.288653,-0.174865,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
970,-0.163109,0.02838,-1.518467,-1.416142,-0.737526,-1.689368,-0.57441,-0.973018,-0.288653,0.345832,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [38]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [39]:
xtrain.shape

(1168, 150)

In [40]:
ytrain.shape

(1168, 1)

# 7. Final Model Building (Rigde/Lasso) 

In [41]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

In [42]:
model.score(xtrain ,ytrain )

0.9271440349953535

In [43]:
model.score(xtest,ytest)

0.816383962226541

In [44]:
from sklearn.linear_model import Ridge
model2=Ridge(alpha=1)
model2.fit(xtrain,ytrain)

In [45]:
model2.score(xtrain,ytrain)

0.9123591172292143

In [46]:
model2.score(xtest,ytest)

0.8367143710609645

In [47]:
from sklearn.linear_model import Lasso
model3 = Lasso(alpha=2)
model3.fit(xtrain,ytrain)

In [48]:
model3.score(xtrain,ytrain)

0.9271173836650964

In [49]:
model3.score(xtest,ytest)

0.8178174463710117


# 8. Apply Ridge Lasso with GridSearchCV

In [50]:
import numpy as np
params = {'alpha' :np.arange(start=1 , stop=1200 , step=5.0)}
params

{'alpha': array([1.000e+00, 6.000e+00, 1.100e+01, 1.600e+01, 2.100e+01, 2.600e+01,
        3.100e+01, 3.600e+01, 4.100e+01, 4.600e+01, 5.100e+01, 5.600e+01,
        6.100e+01, 6.600e+01, 7.100e+01, 7.600e+01, 8.100e+01, 8.600e+01,
        9.100e+01, 9.600e+01, 1.010e+02, 1.060e+02, 1.110e+02, 1.160e+02,
        1.210e+02, 1.260e+02, 1.310e+02, 1.360e+02, 1.410e+02, 1.460e+02,
        1.510e+02, 1.560e+02, 1.610e+02, 1.660e+02, 1.710e+02, 1.760e+02,
        1.810e+02, 1.860e+02, 1.910e+02, 1.960e+02, 2.010e+02, 2.060e+02,
        2.110e+02, 2.160e+02, 2.210e+02, 2.260e+02, 2.310e+02, 2.360e+02,
        2.410e+02, 2.460e+02, 2.510e+02, 2.560e+02, 2.610e+02, 2.660e+02,
        2.710e+02, 2.760e+02, 2.810e+02, 2.860e+02, 2.910e+02, 2.960e+02,
        3.010e+02, 3.060e+02, 3.110e+02, 3.160e+02, 3.210e+02, 3.260e+02,
        3.310e+02, 3.360e+02, 3.410e+02, 3.460e+02, 3.510e+02, 3.560e+02,
        3.610e+02, 3.660e+02, 3.710e+02, 3.760e+02, 3.810e+02, 3.860e+02,
        3.910e+02, 3.960e+02,

In [51]:
# Tuning for ridge
from sklearn.model_selection import GridSearchCV
rr = Ridge()
gscv1=GridSearchCV(estimator = rr, param_grid = params , cv=5, scoring='neg_mean_squared_error')
gscv1.fit(xtrain , ytrain)

In [52]:
gscv1.best_params_

{'alpha': 21.0}

In [53]:
best_ridge = gscv1.best_estimator_
best_ridge

In [54]:
gscv1.best_score_

-1066797111.0931041

In [55]:
best_ridge.score(xtrain,ytrain)

0.8843516635664743

In [56]:
best_ridge.score(xtest,ytest)

0.8328182723759818

In [57]:
# tuning for lasso
ls = Lasso()
gscv2 = GridSearchCV(estimator=ls , param_grid=params , cv = 5 , scoring='neg_mean_squared_error')
gscv2.fit(xtrain,ytrain)

In [58]:
gscv2.best_params_

{'alpha': 121.0}

In [59]:
best_lasso = gscv2.best_estimator_
best_lasso

In [60]:
gscv2.best_score_


-975944698.1359575

In [61]:
best_lasso.score(xtrain,ytrain)

0.9158154049486711

In [62]:
best_lasso.score(xtest,ytest)

0.827166615960984

# Cross validation get r2 scores

In [63]:
from sklearn.model_selection import cross_val_score
scores_ridge = cross_val_score(best_ridge , xtrain , ytrain , cv=5 , scoring='r2')
scores_ridge

array([0.59551371, 0.85006156, 0.90309658, 0.8786772 , 0.92521328])

In [64]:
scores_ridge.mean()

0.8305124635036718

In [65]:
scores_lasso = cross_val_score(best_lasso , xtrain , ytrain , cv = 5, scoring='r2')
scores_lasso


array([0.61403945, 0.87282649, 0.91337479, 0.89131803, 0.9342882 ])

In [66]:
scores_lasso.mean()

0.8451693901358244

# From above model choose Best Ridge model as it has higest test r2 score of 0.8308

# 9. Model Predction for train, test split

In [67]:
p1 =model.predict(xtrain)
p1

array([[ 53056.],
       [121792.],
       [154576.],
       ...,
       [131368.],
       [143392.],
       [120496.]])

In [68]:
p2 = model.predict(xtest)
p2[0:5]

array([[181152.],
       [202144.],
       [231488.],
       [216176.],
       [ 78008.]])

# 10. Evaluate model with MSE, RMSE, MAE, R2 

In [69]:
model.score(xtrain,ytrain)

0.9271440349953535

In [70]:
model.score(xtest,ytest)

0.816383962226541

In [71]:
def evaluate_model(model, x, y):
    # Predict values
    ypred = model.predict(x)
    # Import metrics
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.metrics import mean_absolute_percentage_error
    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    r2 = r2_score(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    # Print the results
    print(f'MSE : {mse:.2f}')
    print(f'RMSE : {rmse:.2f}')
    print(f'MAE : {mae:.2f}')
    print(f'MAPE : {mape:.4f}')
    print(f'R2 score : {r2:.4f}')

In [72]:
evaluate_model(model, xtrain, ytrain)

MSE : 450900141.20
RMSE : 21234.41
MAE : 14571.10
MAPE : 0.0878
R2 score : 0.9271


In [73]:
evaluate_model(model, xtest , ytest)

MSE : 1244423173.09
RMSE : 35276.38
MAE : 16995.84
MAPE : 0.0990
R2 score : 0.8164


## Above is Generalized model becuse it has good r2 score > 0.8 both in train and test

# 11. Predict the sample_set.csv 

In [74]:
xnew = pd.read_csv('sample_set.csv', na_values=['','NA'], keep_default_na=False)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [75]:
pre2

In [76]:
# Apply pre2.transform on xnew
xnew_pre = pre2.transform(xnew)
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,...,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,-0.872563,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.574410,0.053428,0.604293,-0.672923,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023903,1.051363,-0.288653,-0.365032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.574410,0.761852,-0.288653,-0.974021,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.463612,0.347326,-0.288653,-0.550672,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.574410,-0.396190,-0.288653,1.018211,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.574410,-0.973018,-0.288653,-0.048086,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,2.438219,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.574410,-0.420316,-0.288653,-0.618589,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,-0.872563,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.574410,1.711535,-0.288653,-1.284176,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,0.664586,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.574410,-0.233889,-0.288653,0.017567,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [77]:
# Predict the SalePrice
preds = model.predict(xnew_pre)
preds

array([[121176.],
       [154640.],
       [190872.],
       ...,
       [170920.],
       [115120.],
       [214432.]])

In [78]:
# Save above in xnew
xnew['SalePrice'] = preds

In [79]:
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,121176.0
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,154640.0
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,190872.0
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,188928.0
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,206960.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,81472.0
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,72680.0
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,170920.0
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,115120.0


In [80]:
xnew.to_csv('Prediction.csv', index=False)