# House Price Prediction Model

In [2]:
import pandas as pd
import numpy as np

In [41]:
import pickle as pk

In [5]:
data = pd.read_csv("house_price_dataset.csv")
data.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Built-up Area,Under Construction,Indira Nagar,5 Bedroom,Society D,1145,2,1,9364.3
1,Built-up Area,Ready To Move,Koramangala,4 BHK,Society B,1126,5,0,10962.21
2,Plot Area,Immediate Possession,BTM Layout,2 BHK,Society D,810,2,0,4613.14
3,Plot Area,Under Construction,Koramangala,3 BHK,Society A,1643,3,3,12895.12
4,Carpet Area,Under Construction,Electronic City,3 BHK,Society E,1199,1,2,3933.13


In [8]:
data.drop(columns=["area_type","availability","society"] ,inplace=True)

In [9]:
data.dropna(inplace=True)

In [10]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Indira Nagar,5 Bedroom,1145,2,1,9364.30
1,Koramangala,4 BHK,1126,5,0,10962.21
2,BTM Layout,2 BHK,810,2,0,4613.14
3,Koramangala,3 BHK,1643,3,3,12895.12
4,Electronic City,3 BHK,1199,1,2,3933.13
...,...,...,...,...,...,...
2995,HSR Layout,4 Bedroom,1095,4,2,6088.36
2996,Marathahalli,4 BHK,1071,4,2,6906.84
2997,Whitefield,2 Bedroom,1573,3,3,10735.04
2998,BTM Layout,5 Bedroom,1626,4,0,8720.33


In [11]:
data.location.value_counts()

location
BTM Layout         405
Jayanagar          397
Whitefield         385
HSR Layout         380
Marathahalli       372
Koramangala        365
Electronic City    365
Indira Nagar       331
Name: count, dtype: int64

In [12]:
data['size'].unique()

array(['5 Bedroom', '4 BHK', '2 BHK', '3 BHK', '2 Bedroom', '1 Bedroom',
       '5 BHK', '3 Bedroom', '4 Bedroom', '1 BHK'], dtype=object)

In [14]:
data['bedrooms'] = data['size'].apply(lambda x: int(x.split(' ')[0]))

In [15]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedrooms
0,Indira Nagar,5 Bedroom,1145,2,1,9364.30,5
1,Koramangala,4 BHK,1126,5,0,10962.21,4
2,BTM Layout,2 BHK,810,2,0,4613.14,2
3,Koramangala,3 BHK,1643,3,3,12895.12,3
4,Electronic City,3 BHK,1199,1,2,3933.13,3
...,...,...,...,...,...,...,...
2995,HSR Layout,4 Bedroom,1095,4,2,6088.36,4
2996,Marathahalli,4 BHK,1071,4,2,6906.84,4
2997,Whitefield,2 Bedroom,1573,3,3,10735.04,2
2998,BTM Layout,5 Bedroom,1626,4,0,8720.33,5


In [16]:
data.total_sqft.unique()

array([1145, 1126,  810, ...,  622,  650, 1043], shape=(1164,))

In [17]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bedrooms
count,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1298.163,2.977,1.501333,8073.632007,3.001
std,325.09606,1.421435,1.117026,2904.878507,1.423028
min,601.0,1.0,0.0,1988.23,1.0
25%,1044.0,2.0,1.0,5863.9375,2.0
50%,1300.5,3.0,1.0,7717.48,3.0
75%,1552.0,4.0,3.0,9924.58,4.0
max,1997.0,5.0,3.0,20482.09,5.0


In [18]:
data['sqft_per_bed'] = data['total_sqft'] / data['bedrooms']

In [20]:
data.sqft_per_bed.describe()

count    3000.000000
mean      556.159617
std       330.935211
min       200.200000
25%       331.333333
50%       429.000000
75%       681.000000
max      1599.000000
Name: sqft_per_bed, dtype: float64

In [22]:
data['price_per_sqft'] = round(data['price']*100/data['total_sqft'])

In [23]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bedrooms,sqft_per_bed,price_per_sqft
0,Indira Nagar,5 Bedroom,1145,2,1,9364.30,5,229.000000,818.0
1,Koramangala,4 BHK,1126,5,0,10962.21,4,281.500000,974.0
2,BTM Layout,2 BHK,810,2,0,4613.14,2,405.000000,570.0
3,Koramangala,3 BHK,1643,3,3,12895.12,3,547.666667,785.0
4,Electronic City,3 BHK,1199,1,2,3933.13,3,399.666667,328.0
...,...,...,...,...,...,...,...,...,...
2995,HSR Layout,4 Bedroom,1095,4,2,6088.36,4,273.750000,556.0
2996,Marathahalli,4 BHK,1071,4,2,6906.84,4,267.750000,645.0
2997,Whitefield,2 Bedroom,1573,3,3,10735.04,2,786.500000,682.0
2998,BTM Layout,5 Bedroom,1626,4,0,8720.33,5,325.200000,536.0


In [24]:
data.drop(columns=['size' ,'sqft_per_bed' ,'price_per_sqft'], axis=1 , inplace=True)

In [25]:
data

Unnamed: 0,location,total_sqft,bath,balcony,price,bedrooms
0,Indira Nagar,1145,2,1,9364.30,5
1,Koramangala,1126,5,0,10962.21,4
2,BTM Layout,810,2,0,4613.14,2
3,Koramangala,1643,3,3,12895.12,3
4,Electronic City,1199,1,2,3933.13,3
...,...,...,...,...,...,...
2995,HSR Layout,1095,4,2,6088.36,4
2996,Marathahalli,1071,4,2,6906.84,4
2997,Whitefield,1573,3,3,10735.04,2
2998,BTM Layout,1626,4,0,8720.33,5


In [26]:
from sklearn.preprocessing import OneHotEncoder , StandardScaler

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
from sklearn.pipeline import make_pipeline

In [32]:
from sklearn.compose import make_column_transformer

In [31]:
col_trans = make_column_transformer((OneHotEncoder(sparse_output=False) , ['location']) , remainder='passthrough')

In [34]:
lr = LinearRegression()
scaler = StandardScaler()
model = make_pipeline(col_trans, scaler,lr)

In [35]:
data_input = data.drop(columns=['price'])
data_output = data['price']

In [36]:
x_train , x_test , y_train , y_test = train_test_split(data_input , data_output , test_size=0.2)

In [38]:
model.fit(x_train , y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [39]:
model.score(x_test,y_test)

0.7411728128858605

In [40]:
input = pd.DataFrame([['Indira Nagar',1500,2,1,9364.30,5]] , columns=['location', 'total_sqft', 'bath', 'balcony', 'price', 'bedrooms'])

model.predict(input)

array([10870.39715396])

In [42]:
pk.dump(model , open('House_prediction_model.pkl' , 'wb'))

In [43]:
data.to_csv("Cleaned_data.csv")