# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)

In [2]:
df1 = pd.read_csv('./data/Bengaluru_House_Data.csv')
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df1.shape

(13320, 9)

In [4]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

## 1.1 Null Values 

For project simplicity, assume availability, society and area_type does not contribute a lot on determining house prices.

In [5]:
df2 = df1.drop(['area_type', 'society', 'availability'], axis = 'columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [25]:
df2.isnull().mean()

location      0.000075
size          0.001201
total_sqft    0.000000
bath          0.005480
balcony       0.045721
price         0.000000
dtype: float64

Check wheather null value in bath and balcony represents to 0 or actually null value

In [24]:
print(sorted(df2['bath'].unique()))
print(sorted(df2['balcony'].unique()))

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, nan, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 18.0, 27.0, 40.0]
[1.0, 3.0, nan, 0.0, 2.0]


Since the largest percentage of null value in fields is only 4%, I dropped null instead of alternate with mean value or other values.

In [32]:
df3 = df2.dropna()
df3.isnull().mean()

location      0.0
size          0.0
total_sqft    0.0
bath          0.0
balcony       0.0
price         0.0
dtype: float64

## 1.2 Feature Engineering

In [33]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In size column, some values are using the unit of BHK and some are using Bedroom. The BHK represents to bedroom, hall and kitchen. It conveys the number of rooms in a preoperty. In this case, I think the number of both units are all about the quantity of bedrooms, so I will create a new column bhk to only save the numbers.

In [35]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [37]:
sorted(df3['bhk'].unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 27, 43]

We can see that there are some houses having large numbers of bhk, and let's take a look.

In [38]:
df3[df3.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43


In [39]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [40]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True