In [1]:
import pandas as pd
import numpy as np

# **Model Building**

## **Model Training**

### **Load Dataset**

In [2]:
data = pd.read_csv("../data/train.csv")
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

### **Train-Test Split**

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 80)
(365, 80)
(1095,)
(365,)


### **Feature Engineering**

In [6]:
def feature_engineering(df):
    df = df.copy()

    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['TotalBathrooms'] = (
        df['FullBath'] + 0.5 * df['HalfBath'] +
        df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    )
    df['TotalPorchSF'] = (
        df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    )
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['RemodelAge'] = df['YrSold'] - df['YearRemodAdd']
    df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']
    df['HasGarage'] = df['GarageType'].notnull().astype(int)
    df['IsRemodeled'] = (df['YearBuilt'] != df['YearRemodAdd']).astype(int)

    qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, None: 0}
    for col in ['ExterQual', 'KitchenQual', 'FireplaceQu', 'BsmtQual', 'HeatingQC']:
        df[col + '_Num'] = df[col].map(qual_mapping)

    return df

In [7]:
X_train_fe = feature_engineering(X_train)
X_train_fe

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,HouseAge,RemodelAge,GarageAge,HasGarage,IsRemodeled,ExterQual_Num,KitchenQual_Num,FireplaceQu_Num,BsmtQual_Num,HeatingQC_Num
1023,1024,120,RL,43.0,3182,Pave,,Reg,Lvl,AllPub,...,3,2,3.0,1,1,4,4,4.0,4.0,5
810,811,20,RL,78.0,10140,Pave,,Reg,Lvl,AllPub,...,32,7,32.0,1,1,3,4,2.0,3.0,2
1384,1385,50,RL,60.0,9060,Pave,,Reg,Lvl,AllPub,...,70,59,70.0,1,1,3,3,,3.0,3
626,627,20,RL,,12342,Pave,,IR1,Lvl,AllPub,...,47,29,47.0,1,1,3,3,3.0,3.0,3
813,814,20,RL,75.0,9750,Pave,,Reg,Lvl,AllPub,...,49,49,49.0,1,0,3,3,,3.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,1,1,1.0,1,0,4,4,4.0,4.0,4
1130,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,81,59,28.0,1,1,3,4,3.0,3.0,3
1294,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,51,16,49.0,1,1,3,3,,3.0,3
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,89,9,82.0,1,1,4,4,4.0,3.0,4


### **Preprocessing**

In [8]:
# Converting object dtypes to category
def conversion(data):
    for col in data.columns:
        if data[col].dtype == 'object':
            data[col] = data[col].astype('category')
    return data

X_train_mod = conversion(X_train_fe)

In [9]:
num_cols = ['TotalBathrooms','TotalPorchSF','HouseAge','RemodelAge','GarageAge','LotFrontage','LotArea','GarageYrBlt','GarageArea','GrLivArea']
cat_cols = ['LotShape','LotConfig','BldgType','HouseStyle','BsmtQual','GarageType','GarageFinish','RoofStyle','Foundation','SaleCondition','Utilities','LandSlope']

##### **Handling Missing Values**

In [10]:
print("Missing Values for X_Train in Numerical Columns")
print(X_train_mod[num_cols].isna().sum())

Missing Values for X_Train in Numerical Columns
TotalBathrooms      0
TotalPorchSF        0
HouseAge            0
RemodelAge          0
GarageAge          58
LotFrontage       200
LotArea             0
GarageYrBlt        58
GarageArea          0
GrLivArea           0
dtype: int64


In [11]:
print("Missing Values for X_Train in Categorical Columns")
print(X_train_mod[cat_cols].isna().sum())

Missing Values for X_Train in Categorical Columns
LotShape          0
LotConfig         0
BldgType          0
HouseStyle        0
BsmtQual         27
GarageType       58
GarageFinish     58
RoofStyle         0
Foundation        0
SaleCondition     0
Utilities         0
LandSlope         0
dtype: int64


Filling Missing Values in Train and Test Sets
- Numerical Columns: Fill with **Mean**  
- Categorical Columns: Fill with **Mode (Most Frequent)** 

In [12]:
def fill_values(data,num_cols,cat_cols):
    data.fillna({col: data[col].mean() for col in num_cols}, inplace=True)
    data.fillna({col: data[col].mode()[0] for col in cat_cols}, inplace=True)
    return data

X_train_mod = fill_values(X_train_fe,num_cols,cat_cols)

In [13]:
print("After filling Values for X_Train in Numerical Columns")
print(X_train_mod[num_cols].isna().sum())

After filling Values for X_Train in Numerical Columns
TotalBathrooms    0
TotalPorchSF      0
HouseAge          0
RemodelAge        0
GarageAge         0
LotFrontage       0
LotArea           0
GarageYrBlt       0
GarageArea        0
GrLivArea         0
dtype: int64


In [14]:
print("After filling Values for X_Train in Categorical Columns")
print(X_train_mod[cat_cols].isna().sum())

After filling Values for X_Train in Categorical Columns
LotShape         0
LotConfig        0
BldgType         0
HouseStyle       0
BsmtQual         0
GarageType       0
GarageFinish     0
RoofStyle        0
Foundation       0
SaleCondition    0
Utilities        0
LandSlope        0
dtype: int64


##### **Scaling Numerical Features**

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_mod[num_cols])

X_train_num = scaler.transform(X_train_mod[num_cols])

In [16]:
X_train_scaled = pd.DataFrame(X_train_num, columns=num_cols)
X_train_scaled

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea
0,-0.281484,-0.649766,-1.101005,-1.012238,-1.081738,-1.202174,-0.683950,1.090030,-0.192617,-0.051643
1,0.360651,-0.833865,-0.153848,-0.770809,0.119071,0.333763,-0.054883,-0.194518,0.030695,-0.421692
2,-1.565755,-0.833865,1.087254,1.740054,1.692545,-0.456147,-0.152524,-1.644814,-0.938576,-0.518474
3,-1.565755,-0.502488,0.336061,0.291479,0.740179,0.000000,0.144198,-0.774637,-0.910068,-0.207253
4,-0.923619,1.697488,0.401382,1.257195,0.822993,0.202111,-0.090142,-0.857511,-0.838798,-0.169300
...,...,...,...,...,...,...,...,...,...,...
1090,-0.281484,-0.631357,-1.166326,-1.060524,-1.164553,0.333763,-0.129289,1.131467,-0.178363,-0.412204
1091,1.002787,-0.428848,1.446521,1.740054,-0.046558,-0.236728,-0.266078,0.095541,0.467817,0.853554
1092,-0.281484,-0.833865,0.466703,-0.336237,0.822993,-0.456147,-0.232808,-0.898948,0.448812,-1.266163
1093,-0.923619,1.375316,1.707806,-0.674238,2.189431,-0.675567,-0.280725,-2.224933,-1.242660,-0.199663


##### **Encoding Categorical Features** 

In [17]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[cat_cols])

X_train_cat = encoder.transform(X_train_mod[cat_cols]).toarray()

In [18]:
encoded_cols = encoder.get_feature_names_out(cat_cols)

In [19]:
X_train_encoded = pd.DataFrame(X_train_cat, columns=encoded_cols)
X_train_encoded

Unnamed: 0,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,BldgType_1Fam,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1091,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1092,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1093,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


##### **Merging the Data**

In [20]:
X_train_final = pd.concat([X_train_scaled, X_train_encoded], axis=1)
X_train_final

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,-0.281484,-0.649766,-1.101005,-1.012238,-1.081738,-1.202174,-0.683950,1.090030,-0.192617,-0.051643,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.360651,-0.833865,-0.153848,-0.770809,0.119071,0.333763,-0.054883,-0.194518,0.030695,-0.421692,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-1.565755,-0.833865,1.087254,1.740054,1.692545,-0.456147,-0.152524,-1.644814,-0.938576,-0.518474,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,-1.565755,-0.502488,0.336061,0.291479,0.740179,0.000000,0.144198,-0.774637,-0.910068,-0.207253,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.923619,1.697488,0.401382,1.257195,0.822993,0.202111,-0.090142,-0.857511,-0.838798,-0.169300,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,-0.281484,-0.631357,-1.166326,-1.060524,-1.164553,0.333763,-0.129289,1.131467,-0.178363,-0.412204,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1091,1.002787,-0.428848,1.446521,1.740054,-0.046558,-0.236728,-0.266078,0.095541,0.467817,0.853554,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1092,-0.281484,-0.833865,0.466703,-0.336237,0.822993,-0.456147,-0.232808,-0.898948,0.448812,-1.266163,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1093,-0.923619,1.375316,1.707806,-0.674238,2.189431,-0.675567,-0.280725,-2.224933,-1.242660,-0.199663,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


### **Model Training**

In [21]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train_final,y_train)

## **Model evaluation**

### **Feature Engineering**

In [22]:
X_test_fe = feature_engineering(X_test)
X_test_fe

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,HouseAge,RemodelAge,GarageAge,HasGarage,IsRemodeled,ExterQual_Num,KitchenQual_Num,FireplaceQu_Num,BsmtQual_Num,HeatingQC_Num
892,893,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,...,43,3,43.0,1,1,3,3,,3.0,3
1105,1106,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,...,16,15,16.0,1,1,4,4,3.0,5.0,5
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,83,60,83.0,1,1,3,3,4.0,3.0,4
522,523,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,...,59,56,56.0,1,1,3,3,4.0,3.0,5
1036,1037,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,...,2,1,1.0,1,1,4,5,5.0,5.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,989,60,RL,,12046,Pave,,IR1,Lvl,AllPub,...,31,31,31.0,1,0,3,4,3.0,3.0,3
243,244,160,RL,75.0,10762,Pave,,Reg,Lvl,AllPub,...,29,29,29.0,1,0,3,3,3.0,4.0,3
1342,1343,60,RL,,9375,Pave,,Reg,Lvl,AllPub,...,5,5,5.0,1,0,4,4,4.0,4.0,5
1057,1058,60,RL,,29959,Pave,,IR2,Lvl,AllPub,...,15,15,15.0,1,0,4,4,4.0,4.0,5


### **Preprocessing**

In [23]:
X_test_mod = conversion(X_test_fe)
X_test_mod

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,HouseAge,RemodelAge,GarageAge,HasGarage,IsRemodeled,ExterQual_Num,KitchenQual_Num,FireplaceQu_Num,BsmtQual_Num,HeatingQC_Num
892,893,20,RL,70.0,8414,Pave,,Reg,Lvl,AllPub,...,43,3,43.0,1,1,3,3,,3.0,3
1105,1106,60,RL,98.0,12256,Pave,,IR1,Lvl,AllPub,...,16,15,16.0,1,1,4,4,3.0,5.0,5
413,414,30,RM,56.0,8960,Pave,Grvl,Reg,Lvl,AllPub,...,83,60,83.0,1,1,3,3,4.0,3.0,4
522,523,50,RM,50.0,5000,Pave,,Reg,Lvl,AllPub,...,59,56,56.0,1,1,3,3,4.0,3.0,5
1036,1037,20,RL,89.0,12898,Pave,,IR1,HLS,AllPub,...,2,1,1.0,1,1,4,5,5.0,5.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,989,60,RL,,12046,Pave,,IR1,Lvl,AllPub,...,31,31,31.0,1,0,3,4,3.0,3.0,3
243,244,160,RL,75.0,10762,Pave,,Reg,Lvl,AllPub,...,29,29,29.0,1,0,3,3,3.0,4.0,3
1342,1343,60,RL,,9375,Pave,,Reg,Lvl,AllPub,...,5,5,5.0,1,0,4,4,4.0,4.0,5
1057,1058,60,RL,,29959,Pave,,IR2,Lvl,AllPub,...,15,15,15.0,1,0,4,4,4.0,4.0,5


##### **Handling Missing Values**

In [24]:
print("Missing Values for X_test in Numerical Columns")
print(X_test_mod[num_cols].isna().sum())

Missing Values for X_test in Numerical Columns
TotalBathrooms     0
TotalPorchSF       0
HouseAge           0
RemodelAge         0
GarageAge         23
LotFrontage       59
LotArea            0
GarageYrBlt       23
GarageArea         0
GrLivArea          0
dtype: int64


In [25]:
print("Missing Values for X_test in Categorical Columns")
print(X_test_mod[cat_cols].isna().sum())

Missing Values for X_test in Categorical Columns
LotShape          0
LotConfig         0
BldgType          0
HouseStyle        0
BsmtQual         10
GarageType       23
GarageFinish     23
RoofStyle         0
Foundation        0
SaleCondition     0
Utilities         0
LandSlope         0
dtype: int64


In [26]:
X_test_mod = fill_values(X_test_mod,num_cols,cat_cols)

In [27]:
print("After filling Values for X_test in Numerical Columns")
print(X_test_mod[num_cols].isna().sum())

After filling Values for X_test in Numerical Columns
TotalBathrooms    0
TotalPorchSF      0
HouseAge          0
RemodelAge        0
GarageAge         0
LotFrontage       0
LotArea           0
GarageYrBlt       0
GarageArea        0
GrLivArea         0
dtype: int64


In [28]:
print("After filling Values for X_test in Categorical Columns")
print(X_test_mod[cat_cols].isna().sum())

After filling Values for X_test in Categorical Columns
LotShape         0
LotConfig        0
BldgType         0
HouseStyle       0
BsmtQual         0
GarageType       0
GarageFinish     0
RoofStyle        0
Foundation       0
SaleCondition    0
Utilities        0
LandSlope        0
dtype: int64


##### **Scaling Numerical Features** - `StandardScaler`

In [29]:
X_test_num = scaler.transform(X_test_mod[num_cols])

In [30]:
X_test_scaled = pd.DataFrame(X_test_num, columns=num_cols)
X_test_scaled

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea
0,-0.923619,-0.833865,0.205419,-0.963953,0.574550,-0.017308,-0.210929,-0.650326,-1.014597,-0.879035
1,1.644922,-0.539307,-0.676417,-0.384523,-0.543445,1.211441,0.136423,0.634222,1.113997,2.069972
2,-1.565755,0.362775,1.511842,1.788339,2.230838,-0.631683,-0.161565,-2.142059,-0.558469,-0.954942
3,-0.281484,-0.281570,0.727988,1.595196,1.112844,-0.894986,-0.519586,-1.189007,-0.273390,0.251987
4,1.002787,-0.833865,-1.133665,-1.060524,-1.164553,0.816486,0.194466,1.214341,2.064262,0.168489
...,...,...,...,...,...,...,...,...,...,...
360,0.360651,1.228037,-0.186508,0.388050,0.077663,-0.059328,0.117437,-0.111644,0.349034,0.946541
361,-0.923619,-0.576127,-0.251830,0.291479,-0.005151,0.202111,0.001352,0.054104,-0.900565,-0.596279
362,0.360651,-0.033037,-1.035684,-0.867381,-0.998924,-0.059328,-0.124046,0.965719,0.805161,1.210319
363,0.360651,0.068217,-0.709078,-0.384523,-0.584852,-0.059328,1.736937,0.634222,-0.050078,0.604957


#####  **Encoding Categorical Features** - `OneHotEncoder`

In [31]:
X_test_cat = encoder.transform(X_test_mod[cat_cols]).toarray()

In [32]:
X_test_encoded = pd.DataFrame(X_test_cat, columns=encoded_cols)
X_test_encoded

Unnamed: 0,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,BldgType_1Fam,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
361,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
362,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
363,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


##### **Merging the Data — Ready for Model Training**

In [33]:
X_test_final = pd.concat([X_test_scaled, X_test_encoded], axis=1)
X_test_final

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,-0.923619,-0.833865,0.205419,-0.963953,0.574550,-0.017308,-0.210929,-0.650326,-1.014597,-0.879035,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.644922,-0.539307,-0.676417,-0.384523,-0.543445,1.211441,0.136423,0.634222,1.113997,2.069972,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-1.565755,0.362775,1.511842,1.788339,2.230838,-0.631683,-0.161565,-2.142059,-0.558469,-0.954942,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,-0.281484,-0.281570,0.727988,1.595196,1.112844,-0.894986,-0.519586,-1.189007,-0.273390,0.251987,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.002787,-0.833865,-1.133665,-1.060524,-1.164553,0.816486,0.194466,1.214341,2.064262,0.168489,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,0.360651,1.228037,-0.186508,0.388050,0.077663,-0.059328,0.117437,-0.111644,0.349034,0.946541,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
361,-0.923619,-0.576127,-0.251830,0.291479,-0.005151,0.202111,0.001352,0.054104,-0.900565,-0.596279,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
362,0.360651,-0.033037,-1.035684,-0.867381,-0.998924,-0.059328,-0.124046,0.965719,0.805161,1.210319,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
363,0.360651,0.068217,-0.709078,-0.384523,-0.584852,-0.059328,1.736937,0.634222,-0.050078,0.604957,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


### **Model Predictions**

##### **Training Predictions**

In [34]:
y_train_preds = model.predict(X_train_final)
y_train_preds

array([192827.84334766, 180362.86087012, 103831.37412277, ...,
       138841.8371807 , 128678.33548135, 217481.56456297])

##### **Testing Predictions**

In [35]:
y_test_preds = model.predict(X_test_final)
y_test_preds

array([142132.55985058, 332143.89865484, 104427.83258298, 145283.64918362,
       313629.36096751,  80460.06879811, 195216.04987703, 148698.49717673,
        77113.46724205, 143361.62469968, 128804.53603691, 117647.1521444 ,
       129304.90226441, 209398.05788814, 188445.81460962, 130264.85141402,
       205801.46078457, 127564.28087529, 116909.06424727, 217229.7776835 ,
       204000.5442602 , 203828.33014448, 188841.83585198, 132504.25559348,
       220777.44609058, 169769.80408379, 204712.32736044,  85952.08568623,
       189056.82626983, 250546.80641613, 116511.00411783, 254535.19439884,
       235357.14525574,  99998.66802932, 238154.54394347, 135431.53689048,
       175109.09514822, 217706.19444895, 273955.80902241, 103482.39702591,
       106285.06399162, 237847.23538052, 107255.46503236, 369701.7444178 ,
       141251.97721767, 144938.81047228, 100021.72091992, 123863.72850542,
       330169.393827  , 134573.57895424, 104579.56570053, 225228.66236757,
       131718.46426292, 3

### **Model Evaluation**

##### **Performance Analysis with R2-Score**

In [36]:
from sklearn.metrics import r2_score

train_score = r2_score(y_train,y_train_preds)
test_score = r2_score(y_test,y_test_preds)
print("Train Score: ",f'{train_score:.2f}')
print("Test Score: ",f'{test_score:.2f}')

Train Score:  0.79
Test Score:  0.80


##### **Computation Metric with RMSE (Root-Mean-Squared-Error)**

In [37]:
from sklearn.metrics import mean_squared_error

def compute_rmse(test,pred):
    rmse = np.sqrt(mean_squared_error(test,pred))
    return f'{rmse:2f}'

In [38]:
print("RMSE for Train Data")
print(compute_rmse(y_train,y_train_preds))
print('------------------------------------')
print("RMSE for Test Data")
print(compute_rmse(y_test,y_test_preds))

RMSE for Train Data
35444.961137
------------------------------------
RMSE for Test Data
37578.098835


### **Save Model & Objects**

In [39]:
import joblib
import os

# Ensure models folder exists
os.makedirs("../models", exist_ok=True)

# Save model
joblib.dump(model, "../models/model.joblib")

# Save scaler and encoder
joblib.dump(scaler, "../models/scaler.joblib")
joblib.dump(encoder, "../models/encoder.joblib")

print("Model and preprocessing objects saved in 'models/'")

Model and preprocessing objects saved in 'models/'


### **Model Inference**

### **Load Model & Objects**

In [40]:
import joblib

# Load saved objects
model = joblib.load("../models/model.joblib")
scaler = joblib.load("../models/scaler.joblib")
encoder = joblib.load("../models/encoder.joblib")

print("Loaded model and preprocessing objects from 'models/'")

Loaded model and preprocessing objects from 'models/'


### **Load Dataset**

In [41]:
test_data = pd.read_csv("../data/test.csv")
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


### **Feature Engineering**

In [42]:
data_fe = feature_engineering(test_data)

### **Preprocessing**

In [43]:
data_mod = conversion(data_fe)
data_mod

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,HouseAge,RemodelAge,GarageAge,HasGarage,IsRemodeled,ExterQual_Num,KitchenQual_Num,FireplaceQu_Num,BsmtQual_Num,HeatingQC_Num
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,49,49,49.0,1,0,3,3.0,,3.0,3
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,52,52,52.0,1,0,3,4.0,,3.0,3
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,13,12,13.0,1,1,3,3.0,3.0,4.0,4
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,12,12,12.0,1,0,3,4.0,4.0,3.0,5
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,18,18,18.0,1,0,4,4.0,,4.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,36,36,,0,0,3,3.0,,3.0,4
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,36,36,36.0,1,0,3,3.0,,3.0,3
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,46,10,46.0,1,1,3,3.0,3.0,3.0,5
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,14,14,,0,0,3,3.0,,4.0,3


##### **Handling Missing Values**

In [44]:
print("Missing Values for test data in Numerical Columns")
print(data_mod[num_cols].isna().sum())

Missing Values for test data in Numerical Columns
TotalBathrooms      2
TotalPorchSF        0
HouseAge            0
RemodelAge          0
GarageAge          78
LotFrontage       227
LotArea             0
GarageYrBlt        78
GarageArea          1
GrLivArea           0
dtype: int64


In [45]:
print("Missing Values for test data in Categorical Columns")
print(data_mod[cat_cols].isna().sum())

Missing Values for test data in Categorical Columns
LotShape          0
LotConfig         0
BldgType          0
HouseStyle        0
BsmtQual         44
GarageType       76
GarageFinish     78
RoofStyle         0
Foundation        0
SaleCondition     0
Utilities         2
LandSlope         0
dtype: int64


In [46]:
data_mod = fill_values(data_mod,num_cols,cat_cols)

In [47]:
print("After filling Values for test data in Numerical Columns")
print(data_mod[num_cols].isna().sum())

After filling Values for test data in Numerical Columns
TotalBathrooms    0
TotalPorchSF      0
HouseAge          0
RemodelAge        0
GarageAge         0
LotFrontage       0
LotArea           0
GarageYrBlt       0
GarageArea        0
GrLivArea         0
dtype: int64


In [48]:
print("After filling Values for test data in Categorical Columns")
print(data_mod[cat_cols].isna().sum())

After filling Values for test data in Categorical Columns
LotShape         0
LotConfig        0
BldgType         0
HouseStyle       0
BsmtQual         0
GarageType       0
GarageFinish     0
RoofStyle        0
Foundation       0
SaleCondition    0
Utilities        0
LandSlope        0
dtype: int64


##### **Scaling Numerical Features** 

In [49]:
data_num = scaler.transform(data_mod[num_cols])

In [50]:
data_scaled = pd.DataFrame(data_num, columns=num_cols)
data_scaled

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea
0,-1.565755,0.270726,0.401382,1.257195,0.822993,0.421531,0.079104,-0.733200,1.199521,-1.205437
1,-0.923619,-0.502488,0.499364,1.402053,0.947215,0.465415,0.318236,-0.857511,-0.786533,-0.383738
2,0.360651,-0.520898,-0.774399,-0.529380,-0.667666,0.158227,0.278727,0.758533,0.021192,0.185568
3,0.360651,-0.502488,-0.807059,-0.529380,-0.709073,0.333763,-0.069529,0.799970,-0.035824,0.138126
4,-0.281484,1.246447,-0.611096,-0.239665,-0.460630,-1.202174,-0.519134,0.551348,0.135224,-0.476725
...,...,...,...,...,...,...,...,...,...,...
1454,-0.923619,-0.833865,-0.023205,0.629480,0.038145,-2.167620,-0.796599,-0.040322,-2.268947,-0.833490
1455,-0.923619,-0.612947,-0.023205,0.629480,0.284700,-2.167620,-0.800397,-0.360266,-0.910068,-0.833490
1456,-0.281484,-0.833865,0.303400,-0.625952,0.698772,3.932243,0.836552,-0.774637,0.467817,-0.582996
1457,-0.923619,-0.539307,-0.741738,-0.432809,0.038145,-0.368380,-0.027669,-0.040322,-2.268947,-1.065008


##### **Encoding Categorical Features** 

In [51]:
data_cat = encoder.transform(data_mod[cat_cols]).toarray()

In [52]:
data_encoded = pd.DataFrame(data_cat, columns=encoded_cols)
data_encoded

Unnamed: 0,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,BldgType_1Fam,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1455,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1456,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1457,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


##### **Merging the Data**

In [53]:
data_final = pd.concat([data_scaled, data_encoded], axis=1)
data_final

Unnamed: 0,TotalBathrooms,TotalPorchSF,HouseAge,RemodelAge,GarageAge,LotFrontage,LotArea,GarageYrBlt,GarageArea,GrLivArea,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Utilities_AllPub,Utilities_NoSeWa,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev
0,-1.565755,0.270726,0.401382,1.257195,0.822993,0.421531,0.079104,-0.733200,1.199521,-1.205437,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-0.923619,-0.502488,0.499364,1.402053,0.947215,0.465415,0.318236,-0.857511,-0.786533,-0.383738,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.360651,-0.520898,-0.774399,-0.529380,-0.667666,0.158227,0.278727,0.758533,0.021192,0.185568,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.360651,-0.502488,-0.807059,-0.529380,-0.709073,0.333763,-0.069529,0.799970,-0.035824,0.138126,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-0.281484,1.246447,-0.611096,-0.239665,-0.460630,-1.202174,-0.519134,0.551348,0.135224,-0.476725,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-0.923619,-0.833865,-0.023205,0.629480,0.038145,-2.167620,-0.796599,-0.040322,-2.268947,-0.833490,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1455,-0.923619,-0.612947,-0.023205,0.629480,0.284700,-2.167620,-0.800397,-0.360266,-0.910068,-0.833490,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1456,-0.281484,-0.833865,0.303400,-0.625952,0.698772,3.932243,0.836552,-0.774637,0.467817,-0.582996,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1457,-0.923619,-0.539307,-0.741738,-0.432809,0.038145,-0.368380,-0.027669,-0.040322,-2.268947,-1.065008,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


### **Predicting House Prices**

In [54]:
predictions = model.predict(data_final)
predictions

array([127602.81130756, 148082.14285818, 199338.91314595, ...,
       135989.75614396, 129432.01638608, 241855.40843956])

In [55]:
predicted_prices = pd.DataFrame(predictions, columns=['PredictedPrice'])
predicted_prices

Unnamed: 0,PredictedPrice
0,127602.811308
1,148082.142858
2,199338.913146
3,185212.007343
4,179052.703157
...,...
1454,77583.914670
1455,60897.609097
1456,135989.756144
1457,129432.016386
