# House Price Regression Project

In [1]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter with no category restriction, so
# deprecation and convergence warnings raised by libraries will be hidden
# as well — confirm that is intended.
from warnings import filterwarnings
filterwarnings("ignore")

# Step 1 - Data Ingestion

In [2]:
import pandas as pd
# Load the training data from a CSV file in the working directory.
df = pd.read_csv("training_set.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Target - Sale Price

# Step 2 - Perform basic data quality checks

In [3]:
df.duplicated().sum()

np.int64(0)

In [5]:
m = df.isna().sum()
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Step 3 - Separate X and Y (SalePrice)

In [7]:
# Features: every column except the target (SalePrice) and the Id column.
X = df.drop(columns = ["SalePrice", "Id"])
# Target: the SalePrice column.
Y = df["SalePrice"]

In [8]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

# Train Test Split

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.2, random_state = 10
)

In [37]:
xtrain.shape

(1168, 79)

In [38]:
xtest.shape

(292, 79)

In [39]:
xtrain.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1216,90,RM,68.0,8930,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal
339,20,RL,66.0,12400,Pave,,IR1,Lvl,AllPub,Inside,...,234,0,,,,0,6,2009,WD,Normal
1057,60,RL,,29959,Pave,,IR2,Lvl,AllPub,FR2,...,0,0,,,,0,1,2009,WD,Normal
482,70,RM,50.0,2500,Pave,Pave,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,6,2009,WD,Normal
529,20,RL,,32668,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,,,,0,3,2007,WD,Alloca


In [40]:
ytrain.head()

1216    112000
339     155000
1057    248000
482     155000
529     200624
Name: SalePrice, dtype: int64

In [41]:
xtest.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
854,20,RL,102.0,17920,Pave,,Reg,Lvl,AllPub,Inside,...,312,0,,,,0,7,2006,WD,Abnorml
381,20,FV,60.0,7200,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2006,New,Partial
816,20,RL,,11425,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,7,2006,WD,Normal
577,80,RL,96.0,11777,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2006,WD,Abnorml
35,60,RL,108.0,13418,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Normal


In [42]:
ytest.head()

854    170000
381    187750
816    137000
577    164500
35     309000
Name: SalePrice, dtype: int64

# Step 4 - Apply preprocessing on X

In [43]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [44]:
# Numeric columns: fill missing values with the column median, then standardise.
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

In [45]:
# Categorical columns: replace missing values with the literal "Not_Avail",
# map each category to a numeric code (categories not seen during fit are
# encoded as -1), then standardise the codes.
# NOTE(review): ordinal codes impose an ordering on the categories that a
# linear model will treat as a numeric scale — confirm this is acceptable
# for the nominal columns.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Not_Avail"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler()
)

In [46]:
# Split the feature names by dtype so each group is routed to its own pipeline.
num_cols = list(X.select_dtypes(include="number").columns)
cat_cols = list(X.select_dtypes(include="object").columns)

# Columns not listed in either group are dropped (the transformer's default).
pre = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
# Return DataFrames from transform so the output keeps named columns.
pre = pre.set_output(transform="pandas")

In [47]:
pre.fit(xtrain, ytrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Avail'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [48]:
xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
1216,0.77329,-0.09673,-0.165559,-0.08869,-0.518637,0.219362,-0.338129,-0.572336,-0.957202,-0.285084,...,-0.754478,0.95192,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
339,-0.86847,-0.184996,0.154803,-0.08869,1.284244,-0.442522,0.635935,0.390462,0.300242,-0.285084,...,-0.754478,0.95192,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
1057,0.069678,-0.008464,1.775909,0.640818,0.382804,0.74887,0.441123,-0.572336,0.321737,-0.285084,...,0.287782,-1.557883,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
482,0.304215,-0.891125,-0.759198,0.640818,2.185685,-1.865572,0.976858,-0.572336,-0.314508,-0.285084,...,-0.754478,0.95192,-4.19357,-4.576916,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
529,-0.86847,-0.008464,2.026014,-0.08869,-2.321518,-0.475616,-0.484238,-0.572336,1.663011,-0.285084,...,-0.754478,0.115319,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,-1.60879


In [49]:
xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
854,-0.86847,1.403794,0.664428,-0.818199,-1.420078,-0.541804,-0.532941,-0.572336,-0.299462,6.59017,...,-0.754478,0.95192,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,-3.424314
381,-0.86847,-0.449795,-0.325279,0.640818,-0.518637,1.146,1.025561,-0.572336,-0.957202,-0.285084,...,-0.754478,0.115319,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,-0.966014,1.114495
816,-0.86847,-0.008464,0.064788,-0.818199,0.382804,-0.574898,-1.507005,-0.572336,0.087444,-0.285084,...,-0.754478,0.115319,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
577,0.538753,1.138996,0.097286,-0.818199,0.382804,-0.177768,-0.922567,-0.041703,-0.252173,3.206404,...,-0.754478,0.115319,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,-3.424314
35,0.069678,1.668593,0.248788,1.370326,-0.518637,1.079811,0.976858,0.149763,-0.957202,-0.285084,...,0.287782,-1.557883,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733


# Step 5 - Build a base model

In [50]:
from sklearn.linear_model import LinearRegression

# Baseline: a linear regression fitted on all preprocessed training features.
model1 = LinearRegression()
model1.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [51]:
model1.intercept_

np.float64(181415.76113013714)

In [52]:
model1.coef_

array([ -4562.39557443,  -3522.00280674,   4324.16285435,  15261.31228312,
         6005.87306504,   7111.6271963 ,  -1157.88015053,   6312.72339081,
         1158.67077865,   1041.7895293 ,  -1309.86178424,    286.37540967,
         7889.74449478,   8789.83796616,  -1634.78424095,  12895.85871575,
         3487.91441477,   -228.48391891,   1243.71726573,  -1953.26967674,
        -2512.66514559,  -3009.73944296,   7385.34998891,   2602.99411697,
          425.15558968,  10159.74890534,  -1721.46242088,   3405.14680715,
         -185.97923143,    121.77317137,   1282.99289296,   3056.03649629,
       -11742.776629  ,    132.47146266,   -383.52475612,  -1196.14165509,
         -729.1338242 ,   2416.74921962,    560.52825632,  -1693.30273443,
         2957.39372614,  -1761.07893623,   -611.86283708,   2258.01312279,
         2896.88324401,   -418.74888319,  -2515.75800451,  -2685.17781507,
        -2399.9872877 ,   1760.02395581,   4276.14392907,  -2707.45228266,
         1075.81310111,  

In [53]:
model1.score(xtrain_pre, ytrain)

0.8519533390533331

In [54]:
model1.score(xtest_pre, ytest)

0.8465505584765124

# Feature Selection
Backward elimination

In [55]:
from sklearn.feature_selection import SequentialFeatureSelector

# Backward sequential selection driven by a LinearRegression estimator,
# fitted on the preprocessed training split only.
# NOTE(review): with n_features_to_select="auto" and no tol, scikit-learn
# keeps roughly half of the input features — confirm against the installed
# version's documentation.
model = LinearRegression()
sel = SequentialFeatureSelector(model, n_features_to_select="auto", direction="backward")
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [56]:
sel.get_feature_names_out()

array(['num__OverallQual', 'num__OverallCond', 'num__YearBuilt',
       'num__MasVnrArea', 'num__BsmtFinSF2', 'num__1stFlrSF',
       'num__2ndFlrSF', 'num__LowQualFinSF', 'num__GrLivArea',
       'num__BsmtFullBath', 'num__FullBath', 'num__KitchenAbvGr',
       'num__TotRmsAbvGrd', 'num__Fireplaces', 'num__GarageCars',
       'num__WoodDeckSF', 'num__3SsnPorch', 'num__ScreenPorch',
       'num__PoolArea', 'cat__Street', 'cat__LotShape',
       'cat__LandContour', 'cat__LandSlope', 'cat__Neighborhood',
       'cat__BldgType', 'cat__HouseStyle', 'cat__RoofStyle',
       'cat__RoofMatl', 'cat__Exterior2nd', 'cat__BsmtQual',
       'cat__BsmtCond', 'cat__BsmtExposure', 'cat__HeatingQC',
       'cat__Electrical', 'cat__KitchenQual', 'cat__GarageFinish',
       'cat__GarageQual', 'cat__PoolQC', 'cat__SaleType',
       'cat__SaleCondition'], dtype=object)

In [57]:
# Reduce both splits to the features kept by the selector, which was fitted
# on the training split.
xtrain_pre_sel = sel.transform(xtrain_pre)
xtest_pre_sel = sel.transform(xtest_pre)

In [59]:
xtrain_pre_sel[0:5]

array([[-0.08869022, -0.5186371 ,  0.21936239, -0.57233581, -0.28508365,
         0.38742939,  0.54232028, -0.128577  ,  0.71947909, -0.8224736 ,
         0.78544702,  4.63265998,  0.91599721, -0.9562229 ,  0.30834308,
        -0.7675683 , -0.12179567, -0.27582204, -0.07108341,  0.05862104,
         0.75358397,  0.30525576, -0.22373925,  1.13242686,  1.27510627,
        -1.58485913, -0.49352413, -0.12866801,  0.77302801,  0.21185112,
        -1.80716453,  1.45044121,  1.32514791,  0.30262726,  0.78971739,
         0.95192001,  0.31042061,  0.06450642,  0.31615002,  0.20673345],
       [-0.08869022,  1.28424426, -0.44252154,  0.39046229, -0.28508365,
         0.12481956, -0.79639498, -0.128577  , -0.5751583 , -0.8224736 ,
        -1.03479528, -0.21585871, -0.32588873, -0.9562229 , -1.04051352,
        -0.7675683 , -0.12179567,  4.04030491, -0.07108341,  0.05862104,
        -1.38106295,  0.30525576, -0.22373925, -0.03409979, -0.40774585,
        -0.54168333,  1.88844358, -0.12866801,  1.

In [60]:
xtest_pre_sel[0:5]

array([[-0.81819854, -1.42007779, -0.54180413, -0.57233581,  6.59016968,
         1.56279961, -0.79639498, -0.128577  ,  0.48768812,  1.11431907,
        -1.03479528, -0.21585871, -0.32588873,  0.5885466 ,  0.30834308,
        -0.7675683 , -0.12179567,  5.47901389, -0.07108341,  0.05862104,
         0.75358397,  0.30525576, -0.22373925,  1.13242686, -0.40774585,
        -0.54168333,  1.88844358, -0.12866801, -0.10784683,  0.99242559,
         0.32062596, -0.25733634,  1.32514791,  0.30262726,  0.78971739,
         0.95192001,  0.31042061,  0.06450642,  0.31615002, -3.42431418],
       [ 0.64081809, -0.5186371 ,  1.14599988, -0.57233581, -0.28508365,
         0.34408602, -0.79639498, -0.128577  , -0.41309306,  1.11431907,
         0.78544702, -0.21585871, -0.9468317 ,  0.5885466 ,  0.30834308,
         0.98584096, -0.12179567, -0.27582204, -0.07108341,  0.05862104,
         0.75358397,  0.30525576, -0.22373925,  1.46572018, -0.40774585,
        -0.54168333, -0.49352413, -0.12866801,  0.

In [61]:
model2 = LinearRegression()
model2.fit(xtrain_pre_sel, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [64]:
model2.score(xtrain_pre_sel, ytrain)

0.8384114007904653

In [66]:
model2.score(xtest_pre_sel, ytest)

0.8346642987069534

# Step 6 - Ridge or Lasso

In [67]:
from sklearn.linear_model import Ridge
# First try: ridge regression with a fixed regularisation strength of 0.5,
# fitted on the selected training features.
rr = Ridge(alpha = 0.5)
rr.fit(xtrain_pre_sel, ytrain)

0,1,2
,alpha,0.5
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [68]:
rr.score(xtrain_pre_sel, ytrain)

0.8384113035977316

In [69]:
rr.score(xtest_pre_sel, ytest)

0.8346687604233236

In [70]:
params = {
    "alpha": [0.1, 1, 10, 50, 100, 500, 1000]
}

In [71]:
from sklearn.model_selection import GridSearchCV

# Keep this search's grid under its own name, defined in the same cell that
# uses it. The shared `params` variable is rebound further down to
# pipeline-prefixed grids ("ridge__alpha", "lasso__alpha"), so re-running this
# cell after those cells would otherwise hand Ridge() parameter names it does
# not have.
ridge_params = {"alpha": [0.1, 1, 10, 50, 100, 500, 1000]}

# 5-fold cross-validated search over the regularisation strength, scored by R2.
model3 = Ridge()
gscv_ridge = GridSearchCV(model3, ridge_params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': [0.1, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [72]:
gscv_ridge.best_params_

{'alpha': 100}

In [73]:
gscv_ridge.best_score_

np.float64(0.8045225877502723)

In [75]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [76]:
best_ridge.score(xtrain_pre_sel, ytrain)

0.8366291952332615

In [77]:
best_ridge.score(xtest_pre_sel, ytest)

0.8346186274678655

# Polynomial with Ridge

In [93]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the selected features (no constant
# column), re-standardised, followed by ridge regression.
poly_ridge = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    Ridge(random_state=42)
)

# The "ridge__" prefix addresses the Ridge step inside the pipeline.
params = {
    "ridge__alpha": [0.1, 1, 10, 50, 100, 500, 1000]
}

# 5-fold cross-validated search over alpha, scored by R2.
gscv_poly_ridge = GridSearchCV(poly_ridge, params, cv=5, scoring="r2")
gscv_poly_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'ridge__alpha': [0.1, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1000
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [94]:
gscv_poly_ridge.best_params_

{'ridge__alpha': 1000}

In [95]:
best_poly_ridge = gscv_poly_ridge.best_estimator_
best_poly_ridge

0,1,2
,steps,"[('polynomialfeatures', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,1000
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [96]:
gscv_poly_ridge.best_score_

np.float64(0.6331077107150908)

In [97]:
gscv_poly_ridge.score(xtrain_pre_sel, ytrain)

0.9259046997038105

In [98]:
gscv_poly_ridge.score(xtest_pre_sel, ytest)

0.8297975319165757

# Lasso Model

In [86]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths for the lasso search.
params = {"alpha": [0.1, 0.5, 1, 10, 50, 100, 500, 1000]}

# 5-fold cross-validated search over alpha, scored by R2.
# Note: `model` is also the name given to the estimator passed to the feature
# selector earlier in the notebook; it is rebound to a Lasso here.
model = Lasso()
gscv_lasso = GridSearchCV(model, params, cv=5, scoring="r2")
gscv_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': [0.1, 0.5, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [87]:
gscv_lasso.best_params_

{'alpha': 0.1}

In [88]:
gscv_lasso.best_score_

np.float64(0.8023692915652039)

In [89]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [90]:
best_lasso.score(xtrain_pre_sel, ytrain)

0.8384114007048193

In [91]:
best_lasso.score(xtest_pre_sel, ytest)

0.834665525506319

# Polynomial Lasso

In [99]:
# Degree-2 polynomial expansion of the selected features (no constant
# column), re-standardised, followed by lasso regression.
poly_lasso = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    Lasso(random_state=42)
)

In [100]:
# The "lasso__" prefix addresses the Lasso step inside the pipeline.
params = {"lasso__alpha": [0.1, 1, 10, 50, 100, 500, 1000]}

# 5-fold cross-validated search over alpha, scored by R2.
gscv_poly_lasso = GridSearchCV(poly_lasso, params, cv=5, scoring="r2")
gscv_poly_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'lasso__alpha': [0.1, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [101]:
gscv_poly_lasso.best_params_

{'lasso__alpha': 500}

In [102]:
gscv_poly_lasso.best_score_

np.float64(0.8190757881228242)

In [103]:
best_poly_lasso = gscv_poly_lasso.best_estimator_
best_poly_lasso

0,1,2
,steps,"[('polynomialfeatures', ...), ('standardscaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alpha,500
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [104]:
best_poly_lasso.score(xtrain_pre_sel, ytrain)

0.9447119147560072

In [105]:
best_poly_lasso.score(xtest_pre_sel, ytest)

0.8360538722325842

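# Choose the best ridge model

The plain ridge model (alpha = 100) is kept as the final model: its test R2 (0.8346) is on par with the polynomial variants (0.8298 and 0.8361), and its train R2 (0.8366) is almost identical to its test R2, whereas the polynomial models reach 0.93-0.94 on the training data and therefore overfit.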

In [106]:
best_ridge

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [107]:
best_ridge.score(xtrain_pre_sel, ytrain)

0.8366291952332615

In [108]:
best_ridge.score(xtest_pre_sel, ytest)

0.8346186274678655

In [109]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

In [110]:
def evaluate_model(model, x, y):
    """Print RMSE, MAE, MAPE and R2 for ``model``'s predictions on ``x``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Feature matrix passed to ``model.predict``.
        y: True target values compared against the predictions.

    Returns:
        None. The four metrics are written to standard output, one per line.
    """
    predicted = model.predict(x)
    # Build every report line first so all metrics are computed before
    # anything is printed.
    report = [
        f"RMSE : {root_mean_squared_error(y, predicted):.2f}",
        f"MAE: {mean_absolute_error(y, predicted):.2f}",
        f"MAPE : {mean_absolute_percentage_error(y, predicted):.2%}",
        f"R2 : {r2_score(y, predicted):.2%}",
    ]
    for line in report:
        print(line)

In [113]:
print("Training Results : ")
evaluate_model(best_ridge, xtrain_pre_sel, ytrain)

Training Results : 
RMSE : 32038.44
MAE: 19904.08
MAPE : 11.51%
R2 : 83.66%


In [114]:
print("Test Results : ")
evaluate_model(best_ridge, xtest_pre_sel, ytest)

Test Results : 
RMSE : 32526.00
MAE: 22875.43
MAPE : 13.64%
R2 : 83.46%


# Out of Sample prediction

In [115]:
# Load the out-of-sample rows to be scored from a CSV file in the working
# directory, then preview them.
xnew = pd.read_csv("testing_set.csv")
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [116]:
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Avail'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [117]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.86847,0.432867,0.082975,-0.818199,0.382804,-0.343239,-1.166083,-0.572336,0.048753,0.627392,...,-0.754478,0.95192,0.310421,0.296227,0.284178,0.064506,-1.388538,-0.190791,0.31615,0.206733
1,-0.86847,0.477,0.327171,-0.08869,0.382804,-0.442522,-1.312193,0.018472,1.026765,-0.285084,...,-0.754478,0.95192,0.310421,0.296227,0.284178,0.064506,0.455478,-2.843692,0.31615,0.206733
2,0.069678,0.168068,0.286826,-0.818199,-0.518637,0.848152,0.635935,-0.572336,0.743035,-0.285084,...,-0.754478,-1.557883,0.310421,0.296227,0.284178,0.064506,-1.388538,-0.190791,0.31615,0.206733
3,0.069678,0.344601,-0.068804,-0.08869,0.382804,0.881246,0.635935,-0.462927,0.336783,-0.285084,...,-0.754478,-1.557883,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733
4,1.476901,-1.200057,-0.527928,1.370326,-0.518637,0.682681,0.343716,-0.572336,-0.391889,-0.285084,...,-0.754478,0.115319,0.310421,0.296227,0.284178,0.064506,0.455478,-0.190791,0.31615,0.206733


In [118]:
sel

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [120]:
xnew_pre_sel = sel.transform(xnew_pre)
xnew_pre_sel[0:5]

array([[-0.81819854,  0.38280358, -0.34323895, -0.57233581,  0.62739237,
        -0.68850603, -0.79639498, -0.128577  , -1.17630725, -0.8224736 ,
        -1.03479528, -0.21585871, -0.9468317 , -0.9562229 , -1.04051352,
         0.36890066, -0.12179567,  1.93757639, -0.07108341,  0.05862104,
         0.75358397,  0.30525576, -0.22373925, -0.03409979, -0.40774585,
        -0.54168333, -0.49352413, -0.12866801,  0.77302801,  0.99242559,
         0.32062596,  0.59655243,  1.32514791,  0.30262726,  0.78971739,
         0.95192001,  0.31042061,  0.06450642,  0.31615002,  0.20673345],
       [-0.08869022,  0.38280358, -0.44252154,  0.01847211, -0.28508365,
         0.4154751 , -0.79639498, -0.128577  , -0.36032764, -0.8224736 ,
        -1.03479528, -0.21585871, -0.32588873, -0.9562229 , -1.04051352,
         2.42266243, -0.12179567, -0.27582204, -0.07108341,  0.05862104,
        -1.38106295,  0.30525576, -0.22373925, -0.03409979, -0.40774585,
        -0.54168333,  1.88844358, -0.12866801,  1.

In [123]:
# Predict sale prices for the new rows with the tuned ridge model and show
# the first five predictions.
preds = best_ridge.predict(xnew_pre_sel)
preds[0:5]

array([ 99887.29896749, 159361.03075748, 165859.19778063, 193975.41689427,
       190174.38896846])

In [124]:
# Attach the predictions, rounded to two decimals, as a new SalePrice column.
# This modifies `xnew` in place, so the frame no longer matches the file that
# was read above.
xnew["SalePrice"] = preds.round(2)
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,99887.3
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,159361.03
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,165859.2
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,193975.42
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,190174.39


In [125]:
# Write the input rows together with the predicted SalePrice to results.csv,
# leaving out the DataFrame index.
xnew.to_csv("results.csv", index=False)

# Save the model object

In [126]:
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Avail'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [127]:
sel

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [129]:
best_ridge

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [130]:
import joblib

# Persist the tuned ridge model. The preprocessor (`pre`) and the feature
# selector (`sel`) are saved in the following cells; new data has to pass
# through both before this model can score it.
joblib.dump(best_ridge, "ridge_model.joblib")

['ridge_model.joblib']

In [131]:
joblib.dump(pre, "pre.joblib")

['pre.joblib']

In [132]:
joblib.dump(sel, "sel.joblib")

['sel.joblib']

# Load the model object

In [133]:
pre = joblib.load("pre.joblib")
pre

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Not_Avail'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [134]:
sel = joblib.load("sel.joblib")
sel

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'backward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [135]:
# Reload the saved ridge model to check that it round-trips.
# Note: `m` was used near the top of the notebook for the missing-value
# counts; that value is replaced here.
# NOTE(review): joblib files should only be loaded from trusted sources.
m = joblib.load("ridge_model.joblib")
m

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,
