https://www.kaggle.com/datasets/yasserh/housing-prices-dataset

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Original Data

In [6]:
df = pd.read_csv(r'C:\Users\Saniya\SANIYA\Projects\HousePricePrediction\dataset\Housing.csv')

Data Exploration

In [7]:
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [8]:
df.shape

(545, 13)

In [9]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [10]:
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [11]:
df.duplicated().sum()

0

In [12]:
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

Data Preprocessing

In [13]:
# One-Hot Encoding

from sklearn.preprocessing import OneHotEncoder

In [14]:
OH_features = df.select_dtypes(['object', 'category']).columns.tolist()
print(OH_features)

['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


In [15]:
OH_encoder = OneHotEncoder(sparse=False)

In [16]:
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[OH_features]))
OH_cols.index = df.index
OH_cols.columns = OH_encoder.get_feature_names_out(OH_features)

df = df.drop(OH_features, axis=1)
df = pd.concat([df, OH_cols], axis=1)

In [17]:
print(df.shape)
print(df.columns)

(545, 21)
Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking',
       'mainroad_no', 'mainroad_yes', 'guestroom_no', 'guestroom_yes',
       'basement_no', 'basement_yes', 'hotwaterheating_no',
       'hotwaterheating_yes', 'airconditioning_no', 'airconditioning_yes',
       'prefarea_no', 'prefarea_yes', 'furnishingstatus_furnished',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')


In [18]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,12250000,8960,4,4,4,3,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,12250000,9960,3,2,2,2,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,12215000,7500,4,2,2,3,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,11410000,7420,4,1,2,2,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


In [19]:
X = df.drop(['price'], axis=1)
y = df['price']

In [20]:
print(f'X.columns: {X.columns}')
print(f'\ny.name: {y.name}')

X.columns: Index(['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'mainroad_no',
       'mainroad_yes', 'guestroom_no', 'guestroom_yes', 'basement_no',
       'basement_yes', 'hotwaterheating_no', 'hotwaterheating_yes',
       'airconditioning_no', 'airconditioning_yes', 'prefarea_no',
       'prefarea_yes', 'furnishingstatus_furnished',
       'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

y.name: price


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [23]:
print(f'X_train.shape: {X_train.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_test.shape: {y_test.shape}')

X_train.shape: (381, 20)
y_train.shape: (381,)
X_test.shape: (164, 20)
y_test.shape: (164,)


Model Implementation & Evaluation

In [24]:
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
def implement(model, X_train, X_test, y_train, y_test):
    
    # Fitting the model
    model.fit(X_train, y_train)
    
    # Making predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluation
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    
    # Final model score
    model_score = [mse_train, mse_test, r2_train, r2_test]
    
    return model_score

In [26]:
score = pd.DataFrame(index = ['Train MSE', 'Test MSE', 'Train R2', 'Test R2'])

1. Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
lr_model = LinearRegression()

In [29]:
lr_model.__dict__

{'fit_intercept': True,
 'normalize': 'deprecated',
 'copy_X': True,
 'n_jobs': None,
 'positive': False}

In [30]:
lr_score = implement(lr_model, X_train, X_test, y_train, y_test)

In [31]:
score['Linear Regression'] = lr_score
score

Unnamed: 0,Linear Regression
Train MSE,1033735000000.0
Test MSE,1365103000000.0
Train R2,0.6812367
Test R2,0.6646299


2. SGD Regressor

In [32]:
from sklearn.linear_model import SGDRegressor

In [33]:
sgdr = SGDRegressor(max_iter=1000)

In [34]:
sgdr_score = implement(sgdr, X_train, X_test, y_train, y_test)

In [35]:
score['SGD Regressor'] = sgdr_score
score

Unnamed: 0,Linear Regression,SGD Regressor
Train MSE,1033735000000.0,8.20716e+31
Test MSE,1365103000000.0,7.204740000000001e+31
Train R2,0.6812367,-2.530766e+19
Test R2,0.6646299,-1.770017e+19


3. Decision Tree Regressor

In [36]:
from sklearn.tree import DecisionTreeRegressor

In [37]:
dt_reg = DecisionTreeRegressor(random_state=42)

In [38]:
dt_score = implement(dt_reg, X_train, X_test, y_train, y_test)

In [39]:
score['Decision Tree Regressor'] = dt_score
score

Unnamed: 0,Linear Regression,SGD Regressor,Decision Tree Regressor
Train MSE,1033735000000.0,8.20716e+31,23213910.0
Test MSE,1365103000000.0,7.204740000000001e+31,2517031000000.0
Train R2,0.6812367,-2.530766e+19,0.9999928
Test R2,0.6646299,-1.770017e+19,0.3816312


4. Random Forest Regressor

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

In [43]:
rf_score = implement(rf_reg, X_train, X_test, y_train, y_test)

In [44]:
score['Random Forest Regressor'] = rf_score
score

Unnamed: 0,Linear Regression,SGD Regressor,Decision Tree Regressor,Random Forest Regressor
Train MSE,1033735000000.0,8.20716e+31,23213910.0,186561500000.0
Test MSE,1365103000000.0,7.204740000000001e+31,2517031000000.0,1436140000000.0
Train R2,0.6812367,-2.530766e+19,0.9999928,0.9424718
Test R2,0.6646299,-1.770017e+19,0.3816312,0.6471779


5. KNN Regressor

In [45]:
from sklearn.neighbors import KNeighborsRegressor

In [46]:
knn_reg = KNeighborsRegressor(n_neighbors=5)

In [47]:
knn_score = implement(knn_reg, X_train, X_test, y_train, y_test)

In [48]:
score['KNN Regressor'] = knn_score
score

Unnamed: 0,Linear Regression,SGD Regressor,Decision Tree Regressor,Random Forest Regressor,KNN Regressor
Train MSE,1033735000000.0,8.20716e+31,23213910.0,186561500000.0,1568198000000.0
Test MSE,1365103000000.0,7.204740000000001e+31,2517031000000.0,1436140000000.0,2394486000000.0
Train R2,0.6812367,-2.530766e+19,0.9999928,0.9424718,0.5164292
Test R2,0.6646299,-1.770017e+19,0.3816312,0.6471779,0.4117374
