# House Price Prediction 
for given house details estimate appropriate price

Main steps in machine

1. Data ingestion - read the file
2. Perform basic data quality checks
3. Separate X and Y
4. train test split 
5. Apply preprocessing on xtrain, xtest
6. Feature Select -> select the relevant feature
7. Regurlarization - L1(Lasso) L2(Ridge)
8. Select the best model
9. Out of sample prediction
10. Save the best model and preprocessor

In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 1 - Data Ingestion

In [2]:
import pandas as pd

df = pd.read_csv("training_set.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Target Feature - SalePrice
Estimate SalePrice based on other columns

# Step 2 - Basic Data quality checks

In [3]:
df.shape

(1460, 81)

In [4]:
m = df.isna().sum()
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(0)

In [6]:
cat_unique = df.select_dtypes(include="object").nunique()
cat_unique

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        3
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       4
SaleType          9
SaleCondition     6
dtype: int64

In [7]:
cardinality = cat_unique / len(df)
cardinality

MSZoning         0.003425
Street           0.001370
Alley            0.001370
LotShape         0.002740
LandContour      0.002740
Utilities        0.001370
LotConfig        0.003425
LandSlope        0.002055
Neighborhood     0.017123
Condition1       0.006164
Condition2       0.005479
BldgType         0.003425
HouseStyle       0.005479
RoofStyle        0.004110
RoofMatl         0.005479
Exterior1st      0.010274
Exterior2nd      0.010959
MasVnrType       0.002055
ExterQual        0.002740
ExterCond        0.003425
Foundation       0.004110
BsmtQual         0.002740
BsmtCond         0.002740
BsmtExposure     0.002740
BsmtFinType1     0.004110
BsmtFinType2     0.004110
Heating          0.004110
HeatingQC        0.003425
CentralAir       0.001370
Electrical       0.003425
KitchenQual      0.002740
Functional       0.004795
FireplaceQu      0.003425
GarageType       0.004110
GarageFinish     0.002055
GarageQual       0.003425
GarageCond       0.003425
PavedDrive       0.002055
PoolQC      

In [8]:
cardinality[cardinality > 0.8]

Series([], dtype: float64)

# Step 3 - Seperate X and Y(SalePrice)

In [9]:
X = df.drop(columns=["Id", "SalePrice"])
Y = df["SalePrice"]

In [10]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [11]:
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

# Step 4 - Apply Train Test Split

### Train model on xtrain, ytrain
### Use xtest, ytest for evaluating model

In [12]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=21)

In [13]:
xtrain.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
710,30,RL,56.0,4130,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,7,2008,WD,Normal
1098,50,RM,50.0,6000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,7,2009,WD,Normal
1286,20,RL,,9790,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
992,60,RL,80.0,9760,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnWw,,0,7,2007,WD,Normal
631,120,RL,34.0,4590,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal


In [14]:
ytrain.head()

710      52000
1098    128000
1286    143000
992     187000
631     209500
Name: SalePrice, dtype: int64

In [15]:
xtest.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
880,20,RL,60.0,7024,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2007,WD,Normal
605,60,RL,85.0,13600,Pave,,Reg,Lvl,AllPub,Inside,...,189,0,,,,0,10,2009,WD,Normal
1166,20,RL,64.0,10475,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2010,WD,Normal
216,20,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2008,WD,Normal
970,50,RL,60.0,10800,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,12,2006,WD,Abnorml


In [16]:
ytest.head()

880     157000
605     205000
1166    245350
216     210000
970     135000
Name: SalePrice, dtype: int64

# Step  5 -Apply Preprocessing on xtrain and xtest

1. num cols : SimpleImputer (to replace missing values mean, median) -> StandardScaler (to scale data mean=0, std=1)
2. cat cols : SimpleImputer (to replace missing values most_frequent, constant) -> OrdinalEncoder -> StandardScaler

In [17]:
cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [18]:
num_cols = X.select_dtypes(include="number").columns.tolist()
num_cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [20]:
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [21]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="not_available"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler(),
)

In [22]:
pre = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)], remainder="drop"
).set_output(transform="pandas")

In [23]:
pre.fit(xtrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'not_available'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [24]:
xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
710,-0.639505,-0.640097,-0.61039,-2.264159,0.371687,-1.182858,0.877919,-0.580525,-0.96883,-0.274742,...,1.80698,1.908866,1.663409,1.793075,-3.520289,0.064506,0.458715,0.187833,0.308409,0.200871
1098,-0.166757,-0.906459,-0.437322,-1.536438,0.371687,-1.150075,-1.703259,-0.580525,0.496747,-0.274742,...,1.2907,0.796313,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
1286,-0.875879,-0.018586,-0.086559,-0.080996,-0.521585,-0.264931,-1.07014,1.88276,0.272112,0.229492,...,-0.77442,0.796313,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
992,0.069617,0.42535,-0.089336,-0.080996,2.158232,-0.232148,0.390904,1.276497,0.200142,0.453597,...,-0.77442,-0.316239,0.099108,0.069362,0.306326,0.064506,-0.471457,0.187833,0.308409,0.200871
631,1.487861,-1.616757,-0.567817,1.374445,-0.521585,1.144742,1.024023,0.009352,-0.916488,-0.274742,...,-0.77442,-0.316239,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871


In [25]:
xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
880,-0.875879,-0.462522,-0.342552,-0.808717,-0.521585,1.111959,1.024023,-0.580525,1.16847,-0.274742,...,-0.77442,-1.428792,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
605,0.069617,0.647318,0.266055,0.646725,0.371687,-0.199365,0.2448,0.380757,0.021307,-0.274742,...,-0.77442,0.796313,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
1166,-0.875879,-0.284948,-0.023163,1.374445,-0.521585,1.210308,1.121426,-0.187273,-0.96883,-0.274742,...,-0.77442,-0.316239,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
216,-0.875879,-0.240554,-0.210576,0.646725,-0.521585,1.079176,0.92662,0.872322,1.094318,-0.274742,...,-0.77442,0.796313,0.099108,0.069362,0.306326,0.064506,0.458715,0.187833,0.308409,0.200871
970,-0.166757,-0.462522,0.006916,-1.536438,-1.414858,-0.723895,-1.703259,-0.580525,-0.96883,-0.274742,...,1.80698,1.908866,1.663409,1.793075,0.306326,0.064506,0.458715,0.187833,0.308409,-3.508497


# Without feature selection check model performance

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

baseline_model = LinearRegression()
scores1 = cross_val_score(baseline_model, xtrain_pre, ytrain, cv=5, scoring="r2")
scores1

array([0.47429083, 0.75809139, 0.87674635, 0.84109937, 0.8682828 ])

In [27]:
# cross validiated score > 0.8
scores1.mean()

np.float64(0.7637021490230197)

In [28]:
# std or fluctuation should be as low as possible
scores1.std()

np.float64(0.15066308493133415)

In [29]:
baseline_model.fit(xtrain_pre, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [30]:
type(baseline_model).__name__

'LinearRegression'

In [31]:
train_r2 = baseline_model.score(xtrain_pre, ytrain)
train_r2

0.8659369195929708

In [32]:
test_r2 = baseline_model.score(xtest_pre, ytest)
test_r2

0.8023366414636373

In [33]:
gen_err = abs(train_r2 - test_r2)
print(gen_err)

0.06360027812933355


In [34]:
def evaluate_single_model(model, xtrain, ytrain, xtest, ytest):
    # get cross validated score
    scores = cross_val_score(model, xtrain, ytrain, cv=5, scoring="r2")
    # calculate cv mean and std
    cv_mean = scores.mean().round(4)
    cv_std = scores.std().round(4)
    # Fit the model
    model.fit(xtrain, ytrain)
    # Calcualte train r2 and test r2
    train_r2 = round(model.score(xtrain, ytrain), 4)
    test_r2 = round(model.score(xtest, ytest), 4)
    gen_err = round(abs(train_r2 - test_r2), 4)
    # show results in dictionary
    return {
        "model_name": type(model).__name__,
        "model": model,
        "cv_mean": cv_mean,
        "cv_std": cv_std,
        "train_r2": train_r2,
        "test_r2": test_r2,
        "gen_err": gen_err,
    }

In [35]:
# Results list is created to store differnet model results
results = []

r = evaluate_single_model(baseline_model, xtrain_pre, ytrain, xtest_pre, ytest)
print(r)
results.append(r)

{'model_name': 'LinearRegression', 'model': LinearRegression(), 'cv_mean': np.float64(0.7637), 'cv_std': np.float64(0.1507), 'train_r2': 0.8659, 'test_r2': 0.8023, 'gen_err': 0.0636}


# Feature selection

In [36]:
from sklearn.feature_selection import SequentialFeatureSelector

sel = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="forward"
).set_output(transform="pandas")
sel.fit(xtrain_pre, ytrain)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [37]:
sel_cols = sel.get_feature_names_out()
print(sel_cols)

['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__MasVnrArea' 'num__GrLivArea' 'num__BsmtFullBath'
 'num__BsmtHalfBath' 'num__KitchenAbvGr' 'num__GarageCars'
 'num__WoodDeckSF' 'num__EnclosedPorch' 'num__ScreenPorch' 'num__PoolArea'
 'num__YrSold' 'cat__MSZoning' 'cat__Street' 'cat__Alley' 'cat__Utilities'
 'cat__LotConfig' 'cat__Neighborhood' 'cat__Condition2' 'cat__RoofStyle'
 'cat__RoofMatl' 'cat__Exterior1st' 'cat__MasVnrType' 'cat__ExterQual'
 'cat__BsmtQual' 'cat__BsmtCond' 'cat__BsmtExposure' 'cat__BsmtFinType1'
 'cat__Heating' 'cat__KitchenQual' 'cat__Functional' 'cat__FireplaceQu'
 'cat__GarageType' 'cat__MiscFeature' 'cat__SaleCondition']


In [38]:
len(sel_cols)

39

In [39]:
xtrain_pre_sel = sel.transform(xtrain_pre)
xtrain_pre_sel

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__GrLivArea,num__BsmtFullBath,num__BsmtHalfBath,num__KitchenAbvGr,...,cat__BsmtCond,cat__BsmtExposure,cat__BsmtFinType1,cat__Heating,cat__KitchenQual,cat__Functional,cat__FireplaceQu,cat__GarageType,cat__MiscFeature,cat__SaleCondition
710,-0.639505,-0.610390,-2.264159,0.371687,-1.182858,-0.580525,-1.511625,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,1.163719,-0.120232,0.807064,-5.028120,0.863307,1.80698,0.187833,0.200871
1098,-0.166757,-0.437322,-1.536438,0.371687,-1.150075,-0.580525,-0.373855,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,-0.983554,-0.120232,0.807064,0.253215,0.863307,1.29070,0.187833,0.200871
1286,-0.875879,-0.086559,-0.080996,-0.521585,-0.264931,1.882760,-0.366206,1.105982,-0.237226,-0.217078,...,0.227563,0.601700,-1.520372,-0.120232,0.807064,0.253215,-1.317678,-0.77442,0.187833,0.200871
992,0.069617,-0.089336,-0.080996,2.158232,-0.232148,1.276497,0.633884,-0.807777,-0.237226,-0.217078,...,0.227563,-1.085661,-0.983554,-0.120232,-0.412407,0.253215,-2.044673,-0.77442,0.187833,0.200871
631,1.487861,-0.567817,1.374445,-0.521585,1.144742,0.009352,0.065955,-0.807777,-0.237226,-0.217078,...,-2.693249,-0.241980,-0.446736,-0.120232,-0.412407,0.253215,-1.317678,-0.77442,0.187833,0.200871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,-0.875879,-0.270733,-0.808717,0.371687,-0.559979,0.511841,-0.480939,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,0.090083,-0.120232,0.807064,0.253215,-1.317678,-0.77442,0.187833,0.200871
48,3.142478,-0.580219,-1.536438,-0.521585,-1.674604,-0.580525,-0.129091,-0.807777,-0.237226,8.525917,...,0.227563,0.601700,1.163719,-0.120232,0.807064,0.253215,0.863307,1.80698,0.187833,1.128213
772,0.542365,-0.268975,-0.080996,-0.521585,0.161249,-0.580525,-0.769684,1.105982,-0.237226,-0.217078,...,0.227563,-1.929342,-1.520372,-0.120232,0.807064,0.253215,0.136312,1.29070,0.187833,-3.508497
1231,0.778739,-0.277397,-0.808717,0.371687,-0.297714,0.074894,-0.630092,1.105982,-0.237226,-0.217078,...,0.227563,-1.929342,-1.520372,-0.120232,0.807064,0.253215,0.863307,-0.77442,0.187833,0.200871


In [40]:
xtest_pre_sel = sel.transform(xtest_pre)
xtest_pre_sel

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__GrLivArea,num__BsmtFullBath,num__BsmtHalfBath,num__KitchenAbvGr,...,cat__BsmtCond,cat__BsmtExposure,cat__BsmtFinType1,cat__Heating,cat__KitchenQual,cat__Functional,cat__FireplaceQu,cat__GarageType,cat__MiscFeature,cat__SaleCondition
880,-0.875879,-0.342552,-0.808717,-0.521585,1.111959,-0.580525,-0.821314,1.105982,-0.237226,-0.217078,...,-2.693249,0.601700,-1.520372,-0.120232,0.807064,0.253215,0.863307,-0.77442,0.187833,0.200871
605,0.069617,0.266055,0.646725,0.371687,-0.199365,0.380757,0.892034,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,-0.983554,-0.120232,0.807064,0.253215,-2.044673,-0.77442,0.187833,0.200871
1166,-0.875879,-0.023163,1.374445,-0.521585,1.210308,-0.187273,0.333666,-0.807777,-0.237226,-0.217078,...,0.227563,-1.929342,1.163719,-0.120232,-0.412407,0.253215,0.863307,-0.77442,0.187833,0.200871
216,-0.875879,-0.210576,0.646725,-0.521585,1.079176,0.872322,-0.159687,1.105982,-0.237226,-0.217078,...,0.227563,-0.241980,-0.446736,-0.120232,-0.412407,0.253215,0.863307,-0.77442,0.187833,0.200871
970,-0.166757,0.006916,-1.536438,-1.414858,-0.723895,-0.580525,-0.626268,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,1.163719,-0.120232,0.807064,0.253215,0.863307,1.80698,0.187833,-3.508497
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,-0.166757,0.456708,0.646725,3.051504,-1.051725,1.123566,0.830843,-0.807777,3.960955,-0.217078,...,0.227563,0.601700,-0.983554,-0.120232,-0.412407,0.253215,0.136312,-0.77442,0.187833,0.200871
1228,1.487861,-0.181052,2.102166,-0.521585,1.210308,3.603236,0.348964,1.105982,-0.237226,-0.217078,...,0.227563,0.601700,-0.446736,-0.120232,-2.851347,0.253215,-1.317678,-0.77442,0.187833,1.128213
1007,2.433356,-0.787438,-1.536438,-1.414858,-0.035450,-0.580525,-0.817490,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,-0.983554,-0.120232,0.807064,0.253215,0.863307,-0.77442,0.187833,0.200871
575,-0.166757,-0.207799,-0.808717,-0.521585,-0.789461,-0.580525,-0.580375,-0.807777,-0.237226,-0.217078,...,0.227563,0.601700,0.626901,-0.120232,0.807064,0.253215,0.863307,1.29070,0.187833,-3.508497


In [41]:
r2 = evaluate_single_model(
    LinearRegression(), xtrain_pre_sel, ytrain, xtest_pre_sel, ytest
)
print(r2)
results.append(r2)

{'model_name': 'LinearRegression', 'model': LinearRegression(), 'cv_mean': np.float64(0.8392), 'cv_std': np.float64(0.0794), 'train_r2': 0.8526, 'test_r2': 0.8025, 'gen_err': 0.0501}


In [42]:
results

[{'model_name': 'LinearRegression',
  'model': LinearRegression(),
  'cv_mean': np.float64(0.7637),
  'cv_std': np.float64(0.1507),
  'train_r2': 0.8659,
  'test_r2': 0.8023,
  'gen_err': 0.0636},
 {'model_name': 'LinearRegression',
  'model': LinearRegression(),
  'cv_mean': np.float64(0.8392),
  'cv_std': np.float64(0.0794),
  'train_r2': 0.8526,
  'test_r2': 0.8025,
  'gen_err': 0.0501}]

# Build Ridge and Lasso models 
finetune best alpha value

In [43]:
params = {"alpha": [0.1, 1, 10, 50, 100, 500, 1000]}

In [44]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

gscv_ridge = GridSearchCV(Ridge(random_state=42), params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Ridge(random_state=42)
,param_grid,"{'alpha': [0.1, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [45]:
gscv_ridge.best_score_

np.float64(0.8401653409106904)

In [46]:
gscv_ridge.best_params_

{'alpha': 50}

In [47]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [48]:
r3 = evaluate_single_model(best_ridge, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest)
print(r3)
results.append(r3)

{'model_name': 'Ridge', 'model': Ridge(alpha=50, random_state=42), 'cv_mean': np.float64(0.8402), 'cv_std': np.float64(0.0747), 'train_r2': 0.8521, 'test_r2': 0.8057, 'gen_err': 0.0464}


# Lasso Model

In [49]:
from sklearn.linear_model import Lasso

gscv_lasso = GridSearchCV(Lasso(random_state=42), params, cv=5, scoring="r2")
gscv_lasso.fit(xtrain_pre_sel, ytrain)

0,1,2
,estimator,Lasso(random_state=42)
,param_grid,"{'alpha': [0.1, 1, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,50
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [50]:
gscv_lasso.best_params_

{'alpha': 50}

In [51]:
gscv_lasso.best_score_

np.float64(0.8391726254434546)

In [52]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

0,1,2
,alpha,50
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'


In [53]:
r4 = evaluate_single_model(best_lasso, xtrain_pre_sel, ytrain, xtest_pre_sel, ytest)
print(r4)
results.append(r4)

{'model_name': 'Lasso', 'model': Lasso(alpha=50, random_state=42), 'cv_mean': np.float64(0.8392), 'cv_std': np.float64(0.0791), 'train_r2': 0.8526, 'test_r2': 0.8027, 'gen_err': 0.0499}


In [54]:
results

[{'model_name': 'LinearRegression',
  'model': LinearRegression(),
  'cv_mean': np.float64(0.7637),
  'cv_std': np.float64(0.1507),
  'train_r2': 0.8659,
  'test_r2': 0.8023,
  'gen_err': 0.0636},
 {'model_name': 'LinearRegression',
  'model': LinearRegression(),
  'cv_mean': np.float64(0.8392),
  'cv_std': np.float64(0.0794),
  'train_r2': 0.8526,
  'test_r2': 0.8025,
  'gen_err': 0.0501},
 {'model_name': 'Ridge',
  'model': Ridge(alpha=50, random_state=42),
  'cv_mean': np.float64(0.8402),
  'cv_std': np.float64(0.0747),
  'train_r2': 0.8521,
  'test_r2': 0.8057,
  'gen_err': 0.0464},
 {'model_name': 'Lasso',
  'model': Lasso(alpha=50, random_state=42),
  'cv_mean': np.float64(0.8392),
  'cv_std': np.float64(0.0791),
  'train_r2': 0.8526,
  'test_r2': 0.8027,
  'gen_err': 0.0499}]

In [55]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model_name,model,cv_mean,cv_std,train_r2,test_r2,gen_err
0,LinearRegression,LinearRegression(),0.7637,0.1507,0.8659,0.8023,0.0636
1,LinearRegression,LinearRegression(),0.8392,0.0794,0.8526,0.8025,0.0501
2,Ridge,"Ridge(alpha=50, random_state=42)",0.8402,0.0747,0.8521,0.8057,0.0464
3,Lasso,"Lasso(alpha=50, random_state=42)",0.8392,0.0791,0.8526,0.8027,0.0499


In [57]:
sort_df = results_df.sort_values(by="cv_mean", ascending=False).reset_index(drop=True)
sort_df

Unnamed: 0,model_name,model,cv_mean,cv_std,train_r2,test_r2,gen_err
0,Ridge,"Ridge(alpha=50, random_state=42)",0.8402,0.0747,0.8521,0.8057,0.0464
1,LinearRegression,LinearRegression(),0.8392,0.0794,0.8526,0.8025,0.0501
2,Lasso,"Lasso(alpha=50, random_state=42)",0.8392,0.0791,0.8526,0.8027,0.0499
3,LinearRegression,LinearRegression(),0.7637,0.1507,0.8659,0.8023,0.0636


In [58]:
best_final_model = sort_df.loc[0, "model"]
best_final_model

0,1,2
,alpha,50
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [59]:
sort_df.to_csv("evaluation.csv", index=False)