# Housing Prices Machine Learning Project

This notebook documents the workflow for predicting housing prices using machine learning techniques.

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [8]:
# Show every column and do not cap the output width when wide frames are printed
pd.options.display.max_columns = None
pd.options.display.width = None

# Read the training and testing splits from their CSV files
train_data_fp = r'C:\Users\thebi\house-prices-ml-project\data\train.csv'
df_train = pd.read_csv(train_data_fp)
test_data_fp = r'C:\Users\thebi\house-prices-ml-project\data\test.csv'
df_test = pd.read_csv(test_data_fp)

# Report the row counts first, then the full (rows, columns) shape of each split
print('Number of Training Examples:', len(df_train))
print('Number of Testing Examples:', len(df_test))
print('Training X Shape:', df_train.shape)
print('Testing X Shape:', df_test.shape)

Number of Training Examples: 1460
Number of Testing Examples: 1459
Training X Shape: (1460, 81)
Testing X Shape: (1459, 80)


In [9]:
# ***EXPLORATORY DATA ANALYSIS***


# Display basic statistics and first few rows of the dataframe
print(df_train.describe())
print(df_train.head())
# print(df_train.columns)
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
df_train.info()
# Last expression of the cell, so it is rendered as the cell output.
# No random_state is passed: the five sampled rows differ from run to run.
df_train.sample(5)

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   
std       1.112799    30.202904     20.645407   181.066207   456.098091   
min       1.000

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
236,237,20,RL,65.0,8773,Pave,,Reg,Lvl,AllPub,FR2,Gtl,CollgCr,Norm,Norm,1Fam,1Story,7,5,2004,2004,Gable,CompShg,VinylSd,VinylSd,BrkFace,98.0,Gd,TA,PConc,Gd,TA,Av,GLQ,24,Unf,0,1390,1414,GasA,Ex,Y,SBrkr,1414,0,0,1414,0,0,2,0,3,1,Gd,6,Typ,0,,Attchd,2004.0,RFn,2,494,TA,TA,Y,132,105,0,0,0,0,,,,0,5,2010,WD,Normal,185500
551,552,20,RM,50.0,6000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1957,1957,Hip,CompShg,BrkFace,BrkFace,,0.0,TA,TA,CBlock,TA,TA,No,Rec,308,Unf,0,620,928,GasA,Gd,Y,FuseA,928,0,0,928,0,0,1,0,3,1,TA,5,Typ,0,,Attchd,1957.0,Fin,1,288,TA,TA,Y,0,0,0,0,0,0,,,,0,6,2008,WD,Normal,112500
470,471,120,RL,,6820,Pave,,IR1,Lvl,AllPub,Corner,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1985,1985,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,Av,GLQ,368,BLQ,1120,0,1488,GasA,TA,Y,SBrkr,1502,0,0,1502,1,0,1,1,1,1,Gd,4,Typ,0,,Attchd,1985.0,RFn,2,528,TA,TA,Y,0,54,0,0,140,0,,,,0,6,2010,WD,Normal,212000
41,42,20,RL,115.0,16905,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Timber,Norm,Norm,1Fam,1Story,5,6,1959,1959,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,Gd,BLQ,967,Unf,0,383,1350,GasA,Gd,Y,SBrkr,1328,0,0,1328,0,1,1,1,2,1,TA,5,Typ,2,Gd,Attchd,1959.0,RFn,1,308,TA,TA,P,0,104,0,0,0,0,,,,0,7,2007,WD,Normal,170000
556,557,20,RL,69.0,14850,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,5,1957,1957,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,Rec,895,Unf,0,197,1092,GasA,TA,Y,FuseA,1092,0,0,1092,1,0,1,0,2,1,TA,6,Typ,1,TA,Attchd,1957.0,Fin,1,299,TA,TA,Y,268,0,0,0,122,0,,MnWw,,0,5,2006,WD,Normal,141000


In [10]:
# ****FEATURE ENGINEERING****

# Check for missing values in the dataset
def display_missing(df_train):
    """Print the number of missing (null) values in every column of a DataFrame.

    One line is printed per column, in column order, followed by a blank
    separator at the end.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    for col in df_train.columns:
        n_missing = df_train[col].isna().sum()
        print(f'{col} column missing values: {n_missing}')
    # print() adds its own newline, so this emits two trailing newlines
    print('\n')

display_missing(df_train)

# Remove columns with a high percentage of missing values.
# .copy() makes the result an independent frame: the column assignments below would
# otherwise write to a subset selection of the original frame, which triggers
# SettingWithCopyWarning in pandas versions without copy-on-write.
threshold = 0.4
df_train = df_train[df_train.columns[df_train.isnull().mean() < threshold]].copy()

# Create indicators for all columns with missing values.
# The list of affected columns is computed once, before any indicator is added,
# so the new '<col>_missing' columns are never themselves inspected.
cols_with_missing = df_train.columns[df_train.isnull().any()]
for col in cols_with_missing:
    df_train[col + '_missing'] = df_train[col].isnull().astype(int)

# Impute missing values for numerical columns with the mean
num_cols = df_train.select_dtypes(include=['int64', 'float64']).columns
df_train[num_cols] = df_train[num_cols].fillna(df_train[num_cols].mean())

# Impute missing values for categorical columns with the mode
# (.mode() can return several rows on ties; .iloc[0] takes the first mode per column)
cat_cols = df_train.select_dtypes(include=['object']).columns
df_train[cat_cols] = df_train[cat_cols].fillna(df_train[cat_cols].mode().iloc[0])
# TODO: Check individual columns where missing values are actually informative (e.g., GarageFinish, etc.)



Id column missing values: 0
MSSubClass column missing values: 0
MSZoning column missing values: 0
LotFrontage column missing values: 259
LotArea column missing values: 0
Street column missing values: 0
Alley column missing values: 1369
LotShape column missing values: 0
LandContour column missing values: 0
Utilities column missing values: 0
LotConfig column missing values: 0
LandSlope column missing values: 0
Neighborhood column missing values: 0
Condition1 column missing values: 0
Condition2 column missing values: 0
BldgType column missing values: 0
HouseStyle column missing values: 0
OverallQual column missing values: 0
OverallCond column missing values: 0
YearBuilt column missing values: 0
YearRemodAdd column missing values: 0
RoofStyle column missing values: 0
RoofMatl column missing values: 0
Exterior1st column missing values: 0
Exterior2nd column missing values: 0
MasVnrType column missing values: 872
MasVnrArea column missing values: 8
ExterQual column missing values: 0
ExterCond

In [12]:
# ****FEATURE SELECTION****

# Correlation-based initial multicollinearity check
# Compute correlation matrix with numeric only
corr = df_train.corr(numeric_only=True)

# Mask everything except the strict upper triangle (k=1 skips the diagonal) so each
# feature pair appears exactly once, then flatten the matrix to one row per pair.
corr_pairs = (
    corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    .stack()
    .reset_index()
    .rename(columns={0: 'correlation', 'level_0': 'feature1', 'level_1': 'feature2'})
)

# Highly correlated pairs (|correlation| > 0.8), sorted by correlation in descending order
high_corr = corr_pairs[abs(corr_pairs['correlation']) > 0.8].sort_values(by='correlation', ascending=False)

# Optional heatmap. It has to be drawn from the square `corr` matrix: `high_corr` is a
# long-format table with string columns and cannot be passed to sns.heatmap.
# plt.figure(figsize=(12, 8))
# sns.heatmap(corr, cmap='coolwarm', center=0)
# plt.show()

# Variance Inflation Factor (VIF) for additional multicollinearity check
# Select numerical features for VIF calculation
X_train = df_train.select_dtypes(include=['int64', 'float64'])

# VIF is only meaningful among predictors. 'SalePrice' is the value this project
# predicts and 'Id' is a row identifier; leaving them in the design matrix inflates
# the VIF of every feature correlated with the price. errors='ignore' keeps the cell
# working if either column has already been removed.
vif_features = X_train.drop(columns=['Id', 'SalePrice'], errors='ignore')
X_constant = add_constant(vif_features)

# NOTE: perfectly collinear columns (e.g. the '*_missing' indicator pairs that show a
# correlation of 1.0 above) give R^2 == 1, so their VIF is infinite and statsmodels
# emits a divide-by-zero RuntimeWarning for them.
vif_data = pd.DataFrame({
    'Feature': X_constant.columns,
    'VIF': [variance_inflation_factor(X_constant.values, i) for i in range(X_constant.shape[1])]
})

# Print the two tables separately: passing both to a single print() joins them with a
# space, so the second table starts on the last line of the first.
print(high_corr)
print(vif_data.sort_values('VIF', ascending=False))

  vif = 1. / (1. - r_squared_i)


                  feature1              feature2  correlation
1220      BsmtQual_missing      BsmtCond_missing     1.000000
1222      BsmtQual_missing  BsmtFinType1_missing     1.000000
1270   GarageYrBlt_missing    GarageQual_missing     1.000000
1271   GarageYrBlt_missing    GarageCond_missing     1.000000
1265    GarageType_missing   GarageYrBlt_missing     1.000000
1266    GarageType_missing  GarageFinish_missing     1.000000
1231      BsmtCond_missing  BsmtFinType1_missing     1.000000
1273  GarageFinish_missing    GarageCond_missing     1.000000
1272  GarageFinish_missing    GarageQual_missing     1.000000
1274    GarageQual_missing    GarageCond_missing     1.000000
1267    GarageType_missing    GarageQual_missing     1.000000
1268    GarageType_missing    GarageCond_missing     1.000000
1269   GarageYrBlt_missing  GarageFinish_missing     1.000000
1223      BsmtQual_missing  BsmtFinType2_missing     0.986408
1232      BsmtCond_missing  BsmtFinType2_missing     0.986408
1247  Bs