In [156]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error
)

In [157]:
# 1. Load dataset and EDA
# Read the raw house-price table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="HousePricePrediction.csv")

In [158]:
# Report the (rows, columns) dimensions of the raw frame.
print("Shape: ", df.shape)

Shape:  (2919, 13)


In [159]:
# Inspect column dtypes and non-null counts to spot incomplete columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2919 non-null   int64  
 1   MSSubClass    2919 non-null   int64  
 2   MSZoning      2915 non-null   object 
 3   LotArea       2919 non-null   int64  
 4   LotConfig     2919 non-null   object 
 5   BldgType      2919 non-null   object 
 6   OverallCond   2919 non-null   int64  
 7   YearBuilt     2919 non-null   int64  
 8   YearRemodAdd  2919 non-null   int64  
 9   Exterior1st   2918 non-null   object 
 10  BsmtFinSF2    2918 non-null   float64
 11  TotalBsmtSF   2918 non-null   float64
 12  SalePrice     1460 non-null   float64
dtypes: float64(3), int64(6), object(4)
memory usage: 296.6+ KB


In [160]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
df.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2918.0,2918.0,1460.0
mean,1459.0,57.137718,10168.11408,5.564577,1971.312778,1984.264474,49.582248,1051.777587,180921.19589
std,842.787043,42.517628,7886.996359,1.113131,30.291442,20.894344,169.205611,440.766258,79442.502883
min,0.0,20.0,1300.0,1.0,1872.0,1950.0,0.0,0.0,34900.0
25%,729.5,20.0,7478.0,5.0,1953.5,1965.0,0.0,793.0,129975.0
50%,1459.0,50.0,9453.0,5.0,1973.0,1993.0,0.0,989.5,163000.0
75%,2188.5,70.0,11570.0,6.0,2001.0,2004.0,0.0,1302.0,214000.0
max,2918.0,190.0,215245.0,9.0,2010.0,2010.0,1526.0,6110.0,755000.0


In [161]:
# Count missing values per column in the raw data.
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotArea            0
LotConfig          0
BldgType           0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
Exterior1st        1
BsmtFinSF2         1
TotalBsmtSF        1
SalePrice       1459
dtype: int64

In [162]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [163]:
# 2. Drop rows with missing target
# Rows without a SalePrice cannot be used for supervised training or scoring,
# so keep only the labelled ones.
df = df[df['SalePrice'].notna()]

In [164]:
# 3. Separate features and target
y = df['SalePrice']
# NOTE(review): 'Id' stays in X and is therefore fed to the model as a numeric
# predictor. It looks like a plain row counter (0..2918) -- consider dropping it.
X = df.drop(columns=['SalePrice'])

In [165]:
# Re-check missing values now that the unlabelled rows are gone.
df.isna().sum()

Id              0
MSSubClass      0
MSZoning        0
LotArea         0
LotConfig       0
BldgType        0
OverallCond     0
YearBuilt       0
YearRemodAdd    0
Exterior1st     0
BsmtFinSF2      0
TotalBsmtSF     0
SalePrice       0
dtype: int64

In [166]:
# 4. Encode categorical variables
# One-hot encode the text columns; the first level of each column is dropped
# so the remaining indicator columns are not redundant with the intercept.
categorical_cols = [
    'MSZoning',
    'LotConfig',
    'BldgType',
    'Exterior1st',
]
X = pd.get_dummies(data=X, columns=categorical_cols, drop_first=True)

In [167]:
# Confirm the widened feature matrix and preview its first five rows.
print(X.shape)
X.head(n=5)

(1460, 34)


Unnamed: 0,Id,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,MSZoning_FV,MSZoning_RH,...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,0,60,8450,5,2003,2003,0.0,856.0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,1,20,9600,8,1976,1976,0.0,1262.0,False,False,...,False,False,False,True,False,False,False,False,False,False
2,2,60,11250,5,2001,2002,0.0,920.0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,3,70,9550,5,1915,1970,0.0,756.0,False,False,...,False,False,False,False,False,False,False,False,True,False
4,4,60,14260,5,2000,2000,0.0,1145.0,False,False,...,False,False,False,False,False,False,False,True,False,False


In [168]:
# 5. Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [169]:
# 6. Impute missing values (fit on train, transform on test)
# The column means are learned from the training rows only, so no statistics
# from the test rows leak into the fitted imputer.
imputer = SimpleImputer(strategy='mean')

# The imputer output is wrapped back into DataFrames. Passing the original
# index as well as the column labels keeps every row aligned by label with
# y_train / y_test; without it the frames get a fresh 0..n-1 index that no
# longer matches the targets' index after the shuffled split.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

In [170]:
# 7. Train model
# Baseline: ordinary least-squares regression on the imputed training features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [171]:
# 8. Predict and evaluate
y_pred = model.predict(X_test)

# Collect the hold-out metrics in display order, then print them one per line.
baseline_metrics = [
    ("R2 Score:", r2_score(y_test, y_pred)),
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("MAPE:", mean_absolute_percentage_error(y_test, y_pred)),
]
for metric_label, metric_value in baseline_metrics:
    print(metric_label, metric_value)

R2 Score: 0.6196387511449835
MAE: 34129.48682987312
RMSE: 54013.839479258066
MAPE: 0.19781860268689624


In [172]:
# Feature Scaling - to try & improve baseline performance
# Note: the recorded metrics for this cell match the unscaled baseline to
# within floating-point noise. A plain least-squares fit with an intercept
# makes the same predictions after the features are standardized, so scaling
# alone is not expected to move these numbers.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same shift/scale
# to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
# Labelled like the other metrics so the output reads the same as the baseline cell.
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))

R2 Score: 0.6196387511449681
MAE: 34129.4868298736
RMSE: 54013.83947925916
0.19781860268690488


# Using a scikit-learn Pipeline

In [173]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [174]:
# Start again from the raw CSV so that the pipeline, not hand-written cells,
# performs all of the preprocessing.
df = pd.read_csv(filepath_or_buffer="HousePricePrediction.csv")

# Keep only the rows that have a known target.
df = df[df['SalePrice'].notna()]

y_raw = df['SalePrice']
X_raw = df.drop(columns=['SalePrice'])

# Same test fraction and seed as the manual workflow's split.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

In [175]:
# 1. Define distinct paths for different data types
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first',handle_unknown='ignore'))
])


In [176]:
# 2. Create the 'Pre-processor'
# Route columns by dtype: int64/float64 go down the numeric path, object
# (text) columns go down the categorical path.
# NOTE(review): 'Id' is int64, so it is selected as a numeric feature here;
# it looks like a row counter -- consider excluding it.
numeric_features = list(X_raw.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_raw.select_dtypes(include=['object']).columns)

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [177]:
# 3. Final Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [178]:
# Fit the preprocessing steps and the regressor on the raw training rows.
pipeline.fit(X=X_train_raw, y=y_train_raw)

In [179]:
# Predict on the raw test rows; the fitted preprocessing is applied by the pipeline.
y_pred_pipe = pipeline.predict(X=X_test_raw)

In [180]:
# Report the same four hold-out metrics as the manual workflow so the two
# approaches can be compared line by line.
print("--- Pipeline Results (Raw Data) ---")
print("R2 Score:", r2_score(y_test_raw, y_pred_pipe))
print("MAE:", mean_absolute_error(y_test_raw, y_pred_pipe))
print("RMSE:", np.sqrt(mean_squared_error(y_test_raw, y_pred_pipe)))
print("MAPE:", mean_absolute_percentage_error(y_test_raw, y_pred_pipe))

--- Pipeline Results (Raw Data) ---
R2 Score: 0.6196387511449679
MAE: 34129.48682987361
