# Bengaluru House Price prediction

### Importing important libraries and data

In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [143]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [144]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [145]:
features = list(train_data.columns)
features.remove('price')
target = 'price'
target, features

('price',
 ['area_type',
  'availability',
  'location',
  'size',
  'society',
  'total_sqft',
  'bath',
  'balcony'])

## Basic analysis
1. Missing values 
2. Simple description 
3. Properties of the features

#### Missing values

In [146]:
train_data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [147]:
test_data.isnull().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

#### Simple description

In [148]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [149]:
train_data.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


#### 1. Area type

In [150]:
area_types = list(train_data['area_type'].unique())
area_types

['Super built-up  Area', 'Plot  Area', 'Built-up  Area', 'Carpet  Area']

In [151]:
train_data['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

Note: The data contains Super built-up Area type the most number of times. This could be bacause the data is imperfect or maybe there are just more Super built-up Areas in total in Bengaluru

In [152]:
train_data.groupby('area_type').agg({'price': 'mean'}).sort_values(by=['price'], ascending=False)

Unnamed: 0_level_0,price
area_type,Unnamed: 1_level_1
Plot Area,208.495486
Built-up Area,104.285498
Super built-up Area,92.971757
Carpet Area,89.502356


Note: It would make sense to encode the area type in the same order as the mean price. 

#### 2. Availability

In [153]:
train_data['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [154]:
train_data.availability.value_counts()

Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64

In [155]:
train_data.groupby('availability').agg({'price':'mean'})

Unnamed: 0_level_0,price
availability,Unnamed: 1_level_1
14-Jul,43.710000
14-Nov,43.710000
15-Aug,62.000000
15-Dec,130.000000
15-Jun,97.000000
...,...
22-Mar,49.000000
22-May,225.500000
22-Nov,78.370000
Immediate Possession,98.921875


I probably won't use this feature for the training

#### 3. Location

In [156]:
len(train_data.location.unique())

1306

In [157]:
train_data.location.value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

#### 4. Size

In [158]:
train_data['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

Size should be in numerical format but it is in terms of string. To use this feature we will first have to extract the numerical size of the house in terms of bhk. 

In [159]:
train_data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [160]:
train_data['size'].value_counts().head(3)

2 BHK        5199
3 BHK        4310
4 Bedroom     826
Name: size, dtype: int64

In [161]:
# converting the object type value into integer value
def get_size(st):
    st = str(st)
    if st=='nan':
        return 2 # since 2 is the most common value
    return int(st.split()[0])

train_data['size'] = train_data['size'].apply(get_size)
train_data['size']

0        2
1        4
2        3
3        3
4        2
        ..
13315    5
13316    4
13317    2
13318    4
13319    1
Name: size, Length: 13320, dtype: int64

In [162]:
median_size = train_data['size'].median()
median_size

3.0

In [163]:
train_data.isnull().sum()

area_type          0
availability       0
location           1
size               0
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

#### Removing the society feature because it's missing many values in both train and test data

In [164]:
#Removing the society column because it contains so many missing values
train_data.drop('society', inplace=True, axis=1)

In [165]:
train_data.isnull().sum()

area_type         0
availability      0
location          1
size              0
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

#### Total Sqft

In [166]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13320 non-null  int64  
 4   total_sqft    13320 non-null  object 
 5   bath          13247 non-null  float64
 6   balcony       12711 non-null  float64
 7   price         13320 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 832.6+ KB


In [167]:
def convert_to_float(x):
    try:
        x = x.split()[0]
        return float(x)
    except:
        return None

# instead of converting all the non sqft value to sqft (which are only few values)
# I will just fill those values with average value
train_data['total_sqft'] = train_data.total_sqft.apply(convert_to_float)

In [168]:
mean_sqft = train_data.total_sqft.mean() # I will fill all the None values with mean value
train_data.total_sqft.fillna(mean_sqft, inplace=True)

In [169]:
train_data.total_sqft.isnull().sum()

0

In [170]:
train_data.head(3)

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3,1440.0,2.0,3.0,62.0


#### Location

In [171]:
train_data.location.nunique()

1305

In [172]:
train_data.location.value_counts().head(12)

Whitefield               540
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           273
Thanisandra              234
Yelahanka                213
Uttarahalli              186
Hebbal                   177
Marathahalli             175
Raja Rajeshwari Nagar    171
Bannerghatta Road        152
Hennur Road              152
Name: location, dtype: int64

In [173]:
price_location = train_data.groupby('location').agg({'price': 'mean'}).sort_values(by=['price'], ascending=False)
price_location

Unnamed: 0_level_0,price
location,Unnamed: 1_level_1
Cubbon Road,1900.000000
Ashok Nagar,1486.000000
Defence Colony,1167.714286
Yemlur,1093.388889
Church Street,1068.000000
...,...
Celebrity Paradise Layout,19.245000
BAGUR,17.000000
Makali,16.000000
Anekal,16.000000


Cubbon Road is the most expensive area and Alur is the least expensive area. It is also clear that that location affect the price of the property a lot. I think that encoding these values in the same order as the mean price in that location would be very beneficial for the algorithm

In [174]:
price_location[(87==price_location.price)]

Unnamed: 0_level_0,price
location,Unnamed: 1_level_1
bsk 6th stage 2ad block near sri conversation hall,87.0
RBI Layout,87.0
Off Bannergatta Road,87.0
Dasappa Layout,87.0


In [175]:
train_data[train_data.location.isnull() == True]

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
568,Super built-up Area,Ready To Move,,3,1600.0,3.0,2.0,86.0


In [176]:
train_data.iloc[568]

area_type       Super built-up  Area
availability           Ready To Move
location                         NaN
size                               3
total_sqft                    1600.0
bath                             3.0
balcony                          2.0
price                           86.0
Name: 568, dtype: object

In [177]:
train_data.isnull().sum()

area_type         0
availability      0
location          1
size              0
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [178]:
# dropping the null value row
try:
    train_data.drop(568, axis=0, inplace=True)
except:
    pass

#### Bath

In [179]:
train_data.bath.unique()

array([ 2.,  5.,  3.,  4.,  6.,  1.,  9., nan,  8.,  7., 11., 10., 14.,
       27., 12., 16., 40., 15., 13., 18.])

In [180]:
train_data.groupby('bath').agg({'price':'mean'}).head(4)

Unnamed: 0_level_0,price
bath,Unnamed: 1_level_1
1.0,47.584632
2.0,63.404087
3.0,125.143801
4.0,237.005812


In [181]:
train_data.isnull().sum()

area_type         0
availability      0
location          0
size              0
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [182]:
median_bath = train_data.bath.median()
train_data.bath.fillna(median_bath, inplace=True)
train_data.isnull().sum()

area_type         0
availability      0
location          0
size              0
total_sqft        0
bath              0
balcony         609
price             0
dtype: int64

#### Balcony

In [183]:
train_data.balcony.unique()

array([ 1.,  3., nan,  2.,  0.])

In [184]:
median_balcony = train_data.balcony.median()
train_data.balcony.fillna(median_balcony, inplace=True)
train_data.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

### Converting the categorical variable into numerical 

In [185]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13319 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13319 non-null  object 
 1   availability  13319 non-null  object 
 2   location      13319 non-null  object 
 3   size          13319 non-null  int64  
 4   total_sqft    13319 non-null  float64
 5   bath          13319 non-null  float64
 6   balcony       13319 non-null  float64
 7   price         13319 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 936.5+ KB


In [186]:
train_data.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [187]:
train_data.groupby('area_type').agg({'price':'mean'})

Unnamed: 0_level_0,price
area_type,Unnamed: 1_level_1
Built-up Area,104.285498
Carpet Area,89.502356
Plot Area,208.495486
Super built-up Area,92.97255


In [188]:
from sklearn.preprocessing import LabelEncoder

In [189]:
loc_encoder = LabelEncoder()
train_data['location_en'] = loc_encoder.fit_transform(train_data['location'])

ar_type_encoder = LabelEncoder()
train_data['area_type_en'] = ar_type_encoder.fit_transform(train_data['area_type'])

train_data.head(4)

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,location_en,area_type_en
0,Super built-up Area,19-Dec,Electronic City Phase II,2,1056.0,2.0,1.0,39.07,419,3
1,Plot Area,Ready To Move,Chikka Tirupathi,4,2600.0,5.0,3.0,120.0,317,2
2,Built-up Area,Ready To Move,Uttarahalli,3,1440.0,2.0,3.0,62.0,1179,0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3,1521.0,3.0,1.0,95.0,757,3


In [190]:
# We already have mean and median value of some features calculated from the train data
# median_balcony
# median_bath
# mean_sqft
# median_size

def preprocessing(data):
    data.balcony.fillna(median_balcony, inplace=True)
    data.bath.fillna(median_bath, inplace=True)
    data['total_sqft'] = data.total_sqft.apply(convert_to_float)
    data.total_sqft.fillna(mean_sqft, inplace=True)
    data['size'] = train_data['size'].apply(get_size)
    data['size'].fillna(median_size, inplace=True)
    # data['location_en'] = loc_encoder.transform(data['location'])
    data['area_type_en'] = ar_type_encoder.transform(data['area_type'])
    return data
    

In [191]:
test_data.isnull().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [192]:

features = ['area_type_en',  'size', 'bath', 'balcony', 'total_sqft']

In [193]:
X = train_data[features]
y = train_data['price']
X, y

(       area_type_en  size  bath  balcony  total_sqft
 0                 3     2   2.0      1.0      1056.0
 1                 2     4   5.0      3.0      2600.0
 2                 0     3   2.0      3.0      1440.0
 3                 3     3   3.0      1.0      1521.0
 4                 3     2   2.0      1.0      1200.0
 ...             ...   ...   ...      ...         ...
 13315             0     5   4.0      0.0      3453.0
 13316             3     4   5.0      2.0      3600.0
 13317             0     2   2.0      1.0      1141.0
 13318             3     4   4.0      1.0      4689.0
 13319             3     1   1.0      1.0       550.0
 
 [13319 rows x 5 columns], 0         39.07
 1        120.00
 2         62.00
 3         95.00
 4         51.00
           ...  
 13315    231.00
 13316    400.00
 13317     60.00
 13318    488.00
 13319     17.00
 Name: price, Length: 13319, dtype: float64)

In [194]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)
X_train.head()

Unnamed: 0,area_type_en,size,bath,balcony,total_sqft
6099,3,3,2.0,2.0,1455.0
8786,0,3,2.0,1.0,1080.0
6642,3,2,2.0,2.0,1492.0
9646,2,5,5.0,0.0,1800.0
12312,0,2,2.0,2.0,1100.0


One problem I see with the model is that some location are very rare in the data and that might affect the accuracy of the the model

In [195]:
from xgboost import XGBRegressor

In [221]:
model = XGBRegressor(n_estimators=100, max_depth=9, reg_lambda=250)
model.fit(X_train, y_train)
model.score(X_train, y_train), model.score(X_test, y_test)




(0.6758013223119885, 0.5679887643727324)

In [197]:
preprocessing?

In [198]:
test_data = preprocessing(test_data)

In [199]:
test_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,area_type_en
0,Super built-up Area,Ready To Move,Brookefield,2.0,Roeekbl,1225.0,2.0,2.0,,3
1,Plot Area,Ready To Move,Akshaya Nagar,4.0,,2400.0,9.0,2.0,,2
2,Plot Area,18-Apr,Hennur Road,3.0,Saandtt,1650.0,5.0,2.0,,2
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3.0,Winerri,1322.0,3.0,1.0,,3
4,Super built-up Area,Ready To Move,Konanakunte,2.0,AmageSa,1161.0,2.0,1.0,,3


In [200]:
features

['area_type_en', 'size', 'bath', 'balcony', 'total_sqft']

In [222]:
predictions = model.predict(test_data[features])

In [224]:
test_data['price'] = predictions

In [225]:
pred = test_data['price']

In [227]:
pred.to_csv('predictions.csv')