# **HOUSE PRICE PREDICTION MODEL BY LINEAR REGRESSION**

In [37]:
import pickle

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw house-price table from CSV and preview the first rows.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the
# drive is mounted (or adjust the path) before running this cell.
DATASET_PATH = "/content/drive/MyDrive/datasets/house_price_dataset.csv"
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,h_type,location,society,size,bathroom,balcony,total_sqft,yr_built,furniture,sale_type,...,college,hospital,population,railway,airport,on_road,air_quality,restaurant,park,price
0,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2011.0,0,new,...,0,1,2,0,0,1,1,1,1,4361705
1,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2012.0,1,new,...,0,1,2,0,0,1,1,1,1,5001905
2,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1860.0,2010.0,1,new,...,0,1,2,0,0,1,1,1,1,5588795
3,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1015.0,2016.0,1,new,...,0,1,2,0,0,1,1,1,1,3184740
4,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1210.0,2019.0,0,new,...,0,1,2,0,0,1,1,1,1,3498895


In [4]:
# Discard the columns that are not used as model inputs; the remaining frame
# holds the house attributes and the price target.
# NOTE: `data` is overwritten, so re-running this cell without reloading the
# CSV raises a KeyError (the columns are already gone).
unused_columns = [
    'society', 'yr_built', 'furniture', 'sale_type', 'amenities', 'market',
    'office', 'school', 'college', 'hospital', 'population', 'railway',
    'airport', 'on_road', 'air_quality', 'restaurant', 'park',
]
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,h_type,location,size,bathroom,balcony,total_sqft,price
0,apartment,Maneja,3 BHK,3,1,1550.0,4361705
1,apartment,Maneja,3 BHK,3,1,1550.0,5001905
2,apartment,Maneja,3 BHK,3,1,1860.0,5588795
3,apartment,Maneja,2 BHK,2,1,1015.0,3184740
4,apartment,Maneja,2 BHK,2,1,1210.0,3498895


In [5]:
# Count the missing values in each remaining column.
data.isna().sum()

Unnamed: 0,0
h_type,0
location,0
size,0
bathroom,0
balcony,0
total_sqft,0
price,0


In [6]:
# Feature matrix: every column except the last one, converted to a NumPy array.
# Despite its name, `df` is an ndarray from here on, not a DataFrame.
feature_columns = data.iloc[:, :-1]
df = feature_columns.to_numpy()
df

array([['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       ['apartment', 'Gotri', '3 BHK', 3, 3, 1550.0],
       ['apartment', 'Gotri', '3 BHK', 3, 2, 1750.0],
       ['apartment', 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [7]:
# Single LabelEncoder instance shared by the encoding cells that follow.
# NOTE(review): each of those cells refits it, so it only ever remembers the
# classes of the column encoded most recently.
label_encoder = LabelEncoder()

In [8]:
# Replace the strings in column 0 (house type) with integer codes, in place.
# NOTE(review): the codes are arbitrary alphabetical ranks; a linear model
# will read them as an ordered numeric scale — consider one-hot encoding.
h_type_codes = label_encoder.fit_transform(df[:, 0])
df[:, 0] = h_type_codes
df

array([[0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 'Gotri', '3 BHK', 3, 3, 1550.0],
       [0, 'Gotri', '3 BHK', 3, 2, 1750.0],
       [0, 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [9]:
# Show which integer code the encoder assigned to each house-type label.
# Must run before the encoder is refitted on another column.
classes = label_encoder.classes_
mapping = dict(zip(classes, label_encoder.transform(classes)))
mapping

{'apartment': np.int64(0),
 'duplex': np.int64(1),
 'pent house': np.int64(2),
 'tenament': np.int64(3),
 'triplex': np.int64(4),
 'villa': np.int64(5)}

In [10]:
# Replace the strings in column 1 (location) with integer codes, in place.
# Refitting the shared encoder discards the classes it learned for column 0.
# NOTE(review): location is a nominal feature; integer codes impose an
# arbitrary order on it — consider one-hot encoding.
location_codes = label_encoder.fit_transform(df[:, 1])
df[:, 1] = location_codes
df

array([[0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 8, '3 BHK', 3, 3, 1550.0],
       [0, 8, '3 BHK', 3, 2, 1750.0],
       [0, 8, '2 BHK', 2, 3, 1200.0]], dtype=object)

In [11]:
# Show which integer code the encoder assigned to each location label.
# Must run before the encoder is refitted on another column.
classes = label_encoder.classes_
mapping = dict(zip(classes, label_encoder.transform(classes)))
mapping

{'Ajwa Road': np.int64(0),
 'Akota': np.int64(1),
 'Alkapuri': np.int64(2),
 'Atladra': np.int64(3),
 'Bhayli': np.int64(4),
 'Chhani': np.int64(5),
 'Fatehgunj': np.int64(6),
 'Gorwa': np.int64(7),
 'Gotri': np.int64(8),
 'Harni': np.int64(9),
 'Karelibaug': np.int64(10),
 'Khodiyar Nagar': np.int64(11),
 'Laxmipura': np.int64(12),
 'Madhav Pura': np.int64(13),
 'Mandvi': np.int64(14),
 'Maneja': np.int64(15),
 'Manjalpur': np.int64(16),
 'Navapura': np.int64(17),
 'New Alkapuri': np.int64(18),
 'New Karelibaugh': np.int64(19),
 'New Sama': np.int64(20),
 'New VIP Road': np.int64(21),
 'Sama': np.int64(22),
 'Sayajipura': np.int64(23),
 'Soma Talav': np.int64(24),
 'Vasant Vihar': np.int64(25),
 'Vasna Road': np.int64(26),
 'Vasna-Bhayli Road': np.int64(27),
 'Waghodia Road': np.int64(28)}

In [12]:
# Replace the strings in column 2 (size, e.g. "3 BHK") with integer codes,
# in place. Refitting the shared encoder discards the classes from column 1.
size_codes = label_encoder.fit_transform(df[:, 2])
df[:, 2] = size_codes
df

array([[0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1860.0],
       ...,
       [0, 8, 2, 3, 3, 1550.0],
       [0, 8, 2, 3, 2, 1750.0],
       [0, 8, 1, 2, 3, 1200.0]], dtype=object)

In [13]:
# Show which integer code the encoder assigned to each size label.
classes = label_encoder.classes_
mapping = dict(zip(classes, label_encoder.transform(classes)))
mapping

{'1 BHK': np.int64(0),
 '2 BHK': np.int64(1),
 '3 BHK': np.int64(2),
 '4 BHK': np.int64(3),
 '5 BHK': np.int64(4)}

In [14]:
# Model inputs are the encoded feature array; the target is the price column.
X = df
y = data["price"].to_numpy()

In [15]:
# Hold out 20% of the rows for validation. The fixed random_state pins the
# shuffle, so the split — and every metric computed from it — is reproducible
# across kernel restarts; without it each run draws a different split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaler instance only; it is fitted on the training split in the next cell.
standard_x = StandardScaler()

In [16]:
# Learn the per-column mean and standard deviation from the training split
# only, then apply that same scaling to both splits so no validation
# statistics leak into the fit.
standard_x.fit(X_train)
X_train = standard_x.transform(X_train)
X_val = standard_x.transform(X_val)

In [17]:
# Report the dimensions of the scaled training matrix, then display it.
train_shape = X_train.shape
print('Train_Shape: ', train_shape)
print("\nX_train:")
X_train

Train_Shape:  (272, 6)

X_train:


array([[-0.38829014,  1.47114255,  0.37553511,  0.4108774 , -0.62570424,
         0.76559597],
       [ 0.5547002 ,  0.62358995,  0.37553511,  0.4108774 ,  0.46526725,
         0.0370897 ],
       [ 0.5547002 ,  1.57708663,  1.56327406,  0.4108774 , -0.62570424,
         0.50798729],
       ...,
       [-0.38829014, -0.75368304, -0.81220384, -0.64344952, -0.62570424,
        -0.93240533],
       [-0.38829014,  1.04736625, -1.9999428 , -1.69777645, -0.62570424,
        -1.14015426],
       [ 0.5547002 ,  1.57708663, -0.81220384, -0.64344952,  0.46526725,
        -0.30777353]])

In [18]:
# Report the dimensions of the scaled validation matrix, then display it.
val_shape = X_val.shape
print('Val_Shape: ', val_shape)
print("\nX_val:")
X_val

Val_Shape:  (68, 6)

X_val:


array([[-0.38829014, -0.75368304,  0.37553511,  0.4108774 ,  0.46526725,
         1.4220826 ],
       [-0.38829014, -0.32990674, -0.81220384, -0.64344952, -0.62570424,
        -0.37840817],
       [-0.38829014, -1.07151527,  1.56327406,  1.46520433,  1.55623874,
         0.86808544],
       [-0.38829014, -0.32990674,  0.37553511,  0.4108774 ,  0.46526725,
         0.38056794],
       [-0.38829014,  1.47114255, -0.81220384, -0.64344952, -1.71667573,
        -0.44765781],
       [ 0.5547002 ,  1.57708663,  0.37553511,  0.4108774 , -0.62570424,
        -0.44765781],
       [-0.38829014,  0.62358995,  0.37553511,  0.4108774 , -0.62570424,
        -0.17065923],
       [-0.38829014, -0.43585081,  0.37553511,  0.4108774 , -0.62570424,
         1.54811696],
       [-0.38829014, -1.28340342,  0.37553511,  0.4108774 ,  0.46526725,
         0.72958615],
       [ 0.5547002 ,  1.57708663,  0.37553511,  0.4108774 ,  0.46526725,
         0.12988423],
       [-0.38829014, -0.22396266,  0.37553511,  0.

In [25]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [26]:
# Fit the regression on the scaled training features and training prices.
model.fit(X_train, y_train)

In [30]:
# Predict prices for the held-out validation rows.
y_pred = model.predict(X_val)

In [36]:
# Evaluate the validation predictions against the true prices.
# The metric functions are imported in the notebook's top import cell.
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
# MSE is computed once and reused, so RMSE is guaranteed to be its square root.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print("R² Score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)


R² Score: 0.6844597792410183
MAE: 1805434.8377981104
MSE: 5344062507310.734
RMSE: 2311722.8439652394
