In [1]:
%pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df=df.drop_duplicates()

In [5]:
df['BHK'] = df['size'].str.split(' ').str[0]

In [6]:
df=df.drop(['size'], axis=1)

In [7]:
#First remove Nan
df['BHK'].fillna(0)

0        2
1        4
2        3
3        3
4        2
        ..
13314    3
13315    5
13316    4
13317    2
13318    4
Name: BHK, Length: 12791, dtype: object

In [8]:
df['BHK']=df['BHK'].fillna(0)

In [9]:
df['BHK']=df['BHK'].astype(int)

In [10]:
df['bath'].fillna(0)

0        2.0
1        5.0
2        2.0
3        3.0
4        2.0
        ... 
13314    3.0
13315    4.0
13316    5.0
13317    2.0
13318    4.0
Name: bath, Length: 12791, dtype: float64

In [11]:
df['bath']=df['bath'].fillna(0)

In [12]:
df['bath']=df['bath'].astype(int)

In [13]:
df['balcony']=df['balcony'].fillna(0)

In [14]:
df['balcony']=df['balcony'].astype(int)

In [15]:
#since the 'total_sqft' is object type we can't directly convert, we'll have find all the range values and work accordingly
def isfloat(x):
    try:
        float(x)
    except:
        return False
    return True

In [16]:
# '~' will give all the false values
df[~df['total_sqft'].apply(isfloat)].head(10)

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,BHK
30,Super built-up Area,19-Dec,Yelahanka,LedorSa,2100 - 2850,4,0,186.0,4
56,Built-up Area,20-Feb,Devanahalli,BrereAt,3010 - 3410,0,0,192.0,4
81,Built-up Area,18-Oct,Hennur Road,Gollela,2957 - 3450,0,0,224.5,4
122,Super built-up Area,18-Mar,Hebbal,SNontle,3067 - 8156,4,0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,Vaarech,1042 - 1105,2,0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,Kinuerg,1145 - 1340,2,0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,MCvarar,1015 - 1540,2,0,56.8,2
224,Super built-up Area,19-Dec,Devanahalli,Jurdsig,1520 - 1740,0,0,74.82,3
410,Super built-up Area,Ready To Move,Kengeri,,34.46Sq. Meter,1,0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,Shxorm,1195 - 1440,2,0,63.77,2


In [17]:
#we can see different variations of values and will have to work accordingly to solve it
# 1) we'll create a function to give average value for the range of sqft
def range_to_float(x):
    token=x.split('-')
    if token==2:
        return (float(x[0])+float(x[1]))/2
    try: 
        return float(x)
    except: 
        return None

In [18]:
df['total_sqft']=df['total_sqft'].apply(range_to_float)

In [19]:
df['total_sqft'].unique()

array([1056., 2600., 1440., ..., 2758.,  774., 4689.], shape=(1896,))

In [20]:
df[~df['total_sqft'].apply(isfloat)]

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,BHK


In [21]:
#Now there are 246 flats which do not have sqft mentioned so we can drop them
df = df.dropna(subset=['total_sqft'])

In [22]:
df = df[df['BHK'] != 0]

In [23]:
df11=df.copy()

In [24]:
df11=df11.drop(columns = ['society'])
df11.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,BHK
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2,1,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5,3,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2,3,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3,1,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2,1,51.0,2


In [25]:
df11=df11.dropna()

In [26]:
df11.total_sqft=df11.total_sqft.astype(int)

In [27]:
df12 = df11.copy()

**Now we need to do feature engineering for Dimensionality Reduction**

In [28]:
df12['price_per_sqft'] = df12.price*100000/df12.total_sqft

**We'll reduce the Dimensionality based on Location**

In [29]:
df12.location = df12.location.apply(lambda x: x.strip())
location_stats = df12.groupby('location').location.agg('count').sort_values(ascending = False)

In [30]:
location_stats_less=location_stats[location_stats<=10]

In [31]:
df12.location=df12.location.apply(lambda x: 'other' if x in location_stats_less else x)

In [32]:
df12.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,BHK,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2,1,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5,3,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440,2,3,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3,1,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200,2,1,51.0,2,4250.0


**Now we will remove the outliers! There may be houses with less than 300 sqft area per BHK, in order to avoid conflicts we'll remove that!**

In [33]:
df12.shape

(12536, 9)

In [34]:
df13=df12[~(df12.total_sqft/df12.BHK < 300)]

**Now we'll remove all the instance where price is more than (mean + std) and less than (mean - std)**

In [35]:
def remove_pps_outlier(df):
    df_out=pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m=np.mean(subdf.price_per_sqft)
        st=np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

df14 = remove_pps_outlier(df13)

In [36]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats={}
        for bhk, bhk_df in location_df.groupby('BHK'):
            bhk_stats[bhk] = {
                'mean': np.mean (bhk_df.price_per_sqft),
                'std': np.std (bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('BHK'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df [bhk_df.price_per_sqft<(stats['mean'])]. index. values)
    return df.drop(exclude_indices, axis='index')

In [37]:
df15 = remove_bhk_outliers(df14)

In [38]:
df16 = df15[df15.bath<df15.BHK+2]

In [39]:
df20 = df16.drop(['price_per_sqft'], axis=1)

In [40]:
df20.area_type = df20.area_type.apply(lambda x: x.strip())

In [41]:
dummies1=pd.get_dummies(df20.location).astype(int)
dummies1.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
dummies2=pd.get_dummies(df20.area_type).astype(int)
dummies2.head(3)

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1


In [43]:
df21 = pd.concat([df20, dummies1.drop(['other'], axis=1)], axis=1)

In [44]:
df21 = pd.concat([df21, dummies2.drop(['Plot  Area'], axis=1)], axis=1)
df21.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,BHK,1st Block Jayanagar,1st Phase JP Nagar,...,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,Built-up Area,Carpet Area,Super built-up Area
0,Super built-up Area,20-May,1st Block Jayanagar,2850,4,1,428.0,4,1,0,...,0,0,0,0,0,0,0,0,0,1
1,Super built-up Area,18-Jun,1st Block Jayanagar,1630,3,2,194.0,3,1,0,...,0,0,0,0,0,0,0,0,0,1
2,Super built-up Area,Ready To Move,1st Block Jayanagar,1875,2,3,235.0,3,1,0,...,0,0,0,0,0,0,0,0,0,1
3,Built-up Area,15-Dec,1st Block Jayanagar,1200,2,0,130.0,3,1,0,...,0,0,0,0,0,0,0,1,0,0
4,Super built-up Area,18-Jun,1st Block Jayanagar,1235,2,2,148.0,2,1,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
df22=df21.drop(["area_type", "location"], axis=1)
df22.head(3)

Unnamed: 0,availability,total_sqft,bath,balcony,price,BHK,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,Built-up Area,Carpet Area,Super built-up Area
0,20-May,2850,4,1,428.0,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,18-Jun,1630,3,2,194.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Ready To Move,1875,2,3,235.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [46]:
df22.availability = df22.availability.apply(lambda x: 1 if x=="Ready To Move" else 0)

In [47]:
df22.head()

Unnamed: 0,availability,total_sqft,bath,balcony,price,BHK,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,Built-up Area,Carpet Area,Super built-up Area
0,0,2850,4,1,428.0,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1630,3,2,194.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1875,2,3,235.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1200,2,0,130.0,3,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1235,2,2,148.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [48]:
x = df22.drop(['price'], axis=1)
x.head(5)

Unnamed: 0,availability,total_sqft,bath,balcony,BHK,1st Block Jayanagar,1st Phase JP Nagar,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,Built-up Area,Carpet Area,Super built-up Area
0,0,2850,4,1,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1630,3,2,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,1875,2,3,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1200,2,0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1235,2,2,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [49]:
y = df22.price
y

0       428.0
1       194.0
2       235.0
3       130.0
4       148.0
        ...  
9729     64.0
9731     70.0
9732    200.0
9735     26.0
9738    400.0
Name: price, Length: 6827, dtype: float64

In [50]:
DFCOLS = [k for k in x.columns]
DFCOLS

['availability',
 'total_sqft',
 'bath',
 'balcony',
 'BHK',
 '1st Block Jayanagar',
 '1st Phase JP Nagar',
 '2nd Stage Nagarbhavi',
 '5th Block Hbr Layout',
 '5th Phase JP Nagar',
 '6th Phase JP Nagar',
 '7th Phase JP Nagar',
 '8th Phase JP Nagar',
 '9th Phase JP Nagar',
 'AECS Layout',
 'Abbigere',
 'Akshaya Nagar',
 'Ambalipura',
 'Ambedkar Nagar',
 'Amruthahalli',
 'Anandapura',
 'Ananth Nagar',
 'Anekal',
 'Anjanapura',
 'Ardendale',
 'Arekere',
 'Attibele',
 'BTM 2nd Stage',
 'BTM Layout',
 'Babusapalaya',
 'Balagere',
 'Banashankari',
 'Banashankari Stage II',
 'Banashankari Stage III',
 'Banashankari Stage V',
 'Banashankari Stage VI',
 'Banaswadi',
 'Banjara Layout',
 'Bannerghatta',
 'Bannerghatta Road',
 'Basavangudi',
 'Basaveshwara Nagar',
 'Battarahalli',
 'Begur',
 'Begur Road',
 'Bellandur',
 'Benson Town',
 'Bharathi Nagar',
 'Bhoganhalli',
 'Billekahalli',
 'Binny Pete',
 'Bisuvanahalli',
 'Bommanahalli',
 'Bommasandra',
 'Bommasandra Industrial Area',
 'Bommenahalli'

In [51]:
x.columns

Index(['availability', 'total_sqft', 'bath', 'balcony', 'BHK',
       '1st Block Jayanagar', '1st Phase JP Nagar', '2nd Stage Nagarbhavi',
       '5th Block Hbr Layout', '5th Phase JP Nagar',
       ...
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur', 'Built-up  Area',
       'Carpet  Area', 'Super built-up  Area'],
      dtype='object', length=239)

In [52]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [55]:

import pickle

model = pickle.load(open('model.pickle','rb'))

def predict_price(area_type, location, availability, sqft, bath, balcony, bhk):
    locidx = DFCOLS.index(location) if location in DFCOLS else -1
    areaidx = DFCOLS.index(area_type) if area_type in DFCOLS else -1

    z = np.zeros(len(DFCOLS))

    z[0] = availability
    z[1] = sqft
    z[2] = bath
    z[3] = balcony
    z[4] = bhk

    if locidx >= 0:
        z[locidx] = 1
    if areaidx >= 0:
        z[areaidx] = 1

    z_df = pd.DataFrame([z], columns=DFCOLS)

    return model.predict(z_df)[0].item()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [56]:
predict_price("Super  built-up  Area", "1st Phase JP Nagar", 1, 2000, 2, 1, 2)

177.02077854826484