## Bengaluru House Price Prediction

In [2]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.simplefilter('ignore')

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [7]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
#Check any missing values are present in the data
miss_data=pd.DataFrame({"Missing_values":df.isnull().sum(),'Percentage':(df.isnull().sum()/df.shape[0])*100})
miss_data

Unnamed: 0,Missing_values,Percentage
area_type,0,0.0
availability,0,0.0
location,1,0.007508
size,16,0.12012
society,5502,41.306306
total_sqft,0,0.0
bath,73,0.548048
balcony,609,4.572072
price,0,0.0


In [9]:
for column in df.columns:
    print(df[column].value_counts())
    print("*" * 20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Jul               1
14-Jul               1
16-Nov               1
16-Oct               1
15-Aug               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield             540
Sarjapur  Road         399
Electronic City        302
Kanakpura Road         273
Thanisandra            234
                      ... 
Tharabanahalli           1
Sector 4 HSR Layout      1
BEL Layout               1
Navodaya Nagar           1
poornaprajna layout      1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6

In [10]:
df.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace = True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [13]:
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [12]:
df['location'].value_counts()

Whitefield             540
Sarjapur  Road         399
Electronic City        302
Kanakpura Road         273
Thanisandra            234
                      ... 
Tharabanahalli           1
Sector 4 HSR Layout      1
BEL Layout               1
Navodaya Nagar           1
poornaprajna layout      1
Name: location, Length: 1305, dtype: int64

In [14]:
df['location'] = df['location'].fillna('Sarjapur  Road')

In [15]:
df['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 Bedroom       2
11 BHK           2
10 BHK           2
19 BHK           1
14 BHK           1
13 BHK           1
16 BHK           1
43 Bedroom       1
27 BHK           1
18 Bedroom       1
12 Bedroom       1
Name: size, dtype: int64

In [16]:
df['size'] = df['size'].fillna('2 BHK')

In [17]:
df['bath'].value_counts()

2.0     6908
3.0     3286
4.0     1226
1.0      788
5.0      524
6.0      273
7.0      102
8.0       64
9.0       43
10.0      13
12.0       7
13.0       3
11.0       3
16.0       2
27.0       1
18.0       1
40.0       1
15.0       1
14.0       1
Name: bath, dtype: int64

In [18]:
df['bath'] = df['bath'].fillna(df['bath'].median())

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [20]:
# here remove the string part of the size and add one more col bhk
df['bhk'] = df['size'].str.split().str.get(0).astype(int)
df['bhk']

0        2
1        4
2        3
3        3
4        2
        ..
13315    5
13316    4
13317    2
13318    4
13319    1
Name: bhk, Length: 13320, dtype: int32

In [21]:
df[df.bhk >20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [22]:
df['total_sqft'].unique()  # see here are some data in range so we have to fix it

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [27]:
def convertRange(x):
    
    temp = x.split('-')
    if len(temp)==2:
        return (float(temp[0]) + float(temp[1]))/2
                
    try:
        return float(x)
                
    except:
        return None      

In [28]:
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [29]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [30]:
## Price per sq feet

df['price_per_sqft'] = df['price']*100000 / df['total_sqft']

In [31]:
df['price_per_sqft'] 

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
13319     3090.909091
Name: price_per_sqft, Length: 13320, dtype: float64

In [32]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [33]:
df['location'].value_counts()  # we have to reduce location

Whitefield             540
Sarjapur  Road         400
Electronic City        302
Kanakpura Road         273
Thanisandra            234
                      ... 
Tharabanahalli           1
Sector 4 HSR Layout      1
BEL Layout               1
Navodaya Nagar           1
poornaprajna layout      1
Name: location, Length: 1305, dtype: int64

In [34]:
df['location'] = df['location'].apply(lambda x:x.strip())

In [35]:
df['location']

0        Electronic City Phase II
1                Chikka Tirupathi
2                     Uttarahalli
3              Lingadheeranahalli
4                        Kothanur
                   ...           
13315                  Whitefield
13316               Richards Town
13317       Raja Rajeshwari Nagar
13318             Padmanabhanagar
13319                Doddathoguru
Name: location, Length: 13320, dtype: object

In [36]:
location_counts = df['location'].value_counts()

In [38]:
location_counts

Whitefield             541
Sarjapur  Road         400
Electronic City        304
Kanakpura Road         273
Thanisandra            237
                      ... 
Tharabanahalli           1
Sector 4 HSR Layout      1
BEL Layout               1
Navodaya Nagar           1
Chikkajala               1
Name: location, Length: 1294, dtype: int64

In [40]:
location_counts_less10 = location_counts[location_counts <= 10]
location_counts_less10

1st Block Koramangala    10
Dairy Circle             10
BTM 1st Stage            10
Naganathapura            10
Dodsworth Layout         10
                         ..
Tharabanahalli            1
Sector 4 HSR Layout       1
BEL Layout                1
Navodaya Nagar            1
Chikkajala                1
Name: location, Length: 1053, dtype: int64

In [41]:
df['location'] = df['location'].apply(lambda x:'other' if x in location_counts_less10 else x)

In [45]:
df['location'].value_counts()

other                 2885
Whitefield             541
Sarjapur  Road         400
Electronic City        304
Kanakpura Road         273
                      ... 
Marsur                  11
Thyagaraja Nagar        11
HAL 2nd Stage           11
Banjara Layout          11
Pattandur Agrahara      11
Name: location, Length: 242, dtype: int64

In [46]:
df.describe()  # in total sqft mim value is  1 sqft which is not possible

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [47]:
(df['total_sqft']/df['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [50]:
df = df[((df['total_sqft']/df['bhk']) >= 300)] # let take area for 1 bhk > 300

In [51]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [52]:
df.shape

(12530, 7)

In [54]:
## see price per sqft has max value 	176470.588235 which is outlier, we have to remove this

def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        
        st = np.std(subdf.price_per_sqft)
        
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m + st))]
        df_output = pd.concat([df_output, gen_df], ignore_index = True)
    return df_output

df= remove_outliers_sqft(df)

In [55]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [59]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
                
            }
            print(location, bhk_stats)
#         for bhk, bhk_df in location_df.groupby('bhk'):
#             stats = bhk_stats.get(bhk-1)
#             if stats and stats['count']>5:
#                 exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
                
    return df.drop(exclude_indices, axis = 'index')       

In [60]:
df = bhk_outlier_remover(df)


1st Block Jayanagar {2: {'mean': 11983.805668016194, 'std': 0.0, 'count': 1}}
1st Block Jayanagar {2: {'mean': 11983.805668016194, 'std': 0.0, 'count': 1}, 3: {'mean': 11756.16905248807, 'std': 701.6243657657865, 'count': 3}}
1st Block Jayanagar {2: {'mean': 11983.805668016194, 'std': 0.0, 'count': 1}, 3: {'mean': 11756.16905248807, 'std': 701.6243657657865, 'count': 3}, 4: {'mean': 15018.711280365416, 'std': 1.2278182423353805, 'count': 3}}
1st Phase JP Nagar {1: {'mean': 5952.380952380952, 'std': 0.0, 'count': 1}}
1st Phase JP Nagar {1: {'mean': 5952.380952380952, 'std': 0.0, 'count': 1}, 2: {'mean': 7931.806799837383, 'std': 1534.1422783514054, 'count': 8}}
1st Phase JP Nagar {1: {'mean': 5952.380952380952, 'std': 0.0, 'count': 1}, 2: {'mean': 7931.806799837383, 'std': 1534.1422783514054, 'count': 8}, 3: {'mean': 9151.192151725822, 'std': 1054.731726021645, 'count': 7}}
1st Phase JP Nagar {1: {'mean': 5952.380952380952, 'std': 0.0, 'count': 1}, 2: {'mean': 7931.806799837383, 'std': 

Banashankari Stage II {2: {'mean': 8701.636799298034, 'std': 1030.287972433991, 'count': 4}}
Banashankari Stage II {2: {'mean': 8701.636799298034, 'std': 1030.287972433991, 'count': 4}, 3: {'mean': 10723.280423280425, 'std': 4231.90962768462, 'count': 6}}
Banashankari Stage II {2: {'mean': 8701.636799298034, 'std': 1030.287972433991, 'count': 4}, 3: {'mean': 10723.280423280425, 'std': 4231.90962768462, 'count': 6}, 4: {'mean': 12500.0, 'std': 0.0, 'count': 1}}
Banashankari Stage II {2: {'mean': 8701.636799298034, 'std': 1030.287972433991, 'count': 4}, 3: {'mean': 10723.280423280425, 'std': 4231.90962768462, 'count': 6}, 4: {'mean': 12500.0, 'std': 0.0, 'count': 1}, 5: {'mean': 18214.285714285714, 'std': 3035.7142857142853, 'count': 2}}
Banashankari Stage III {2: {'mean': 5430.250387687147, 'std': 1871.4365421808766, 'count': 4}}
Banashankari Stage III {2: {'mean': 5430.250387687147, 'std': 1871.4365421808766, 'count': 4}, 3: {'mean': 5330.43635317069, 'std': 1014.1370099612311, 'count'

Chikkabanavar {2: {'mean': 3979.526682999282, 'std': 296.21104114438486, 'count': 3}}
Chikkabanavar {2: {'mean': 3979.526682999282, 'std': 296.21104114438486, 'count': 3}, 3: {'mean': 4242.424242424242, 'std': 757.5757575757575, 'count': 2}}
Chikkabanavar {2: {'mean': 3979.526682999282, 'std': 296.21104114438486, 'count': 3}, 3: {'mean': 4242.424242424242, 'std': 757.5757575757575, 'count': 2}, 4: {'mean': 3252.032520325203, 'std': 0.0, 'count': 1}}
Chikkabanavar {2: {'mean': 3979.526682999282, 'std': 296.21104114438486, 'count': 3}, 3: {'mean': 4242.424242424242, 'std': 757.5757575757575, 'count': 2}, 4: {'mean': 3252.032520325203, 'std': 0.0, 'count': 1}, 5: {'mean': 3645.833333333333, 'std': 395.83333333333326, 'count': 2}}
Chikkabanavar {2: {'mean': 3979.526682999282, 'std': 296.21104114438486, 'count': 3}, 3: {'mean': 4242.424242424242, 'std': 757.5757575757575, 'count': 2}, 4: {'mean': 3252.032520325203, 'std': 0.0, 'count': 1}, 5: {'mean': 3645.833333333333, 'std': 395.833333333

Gottigere {1: {'mean': 3201.9704433497536, 'std': 0.0, 'count': 2}, 2: {'mean': 4187.198530290943, 'std': 615.4290329261535, 'count': 18}, 3: {'mean': 4243.533192141064, 'std': 545.165771797653, 'count': 14}, 4: {'mean': 4433.333333333333, 'std': 286.74417556808754, 'count': 3}}
Green Glen Layout {2: {'mean': 5874.64951344219, 'std': 208.0884384424191, 'count': 3}}
Green Glen Layout {2: {'mean': 5874.64951344219, 'std': 208.0884384424191, 'count': 3}, 3: {'mean': 6639.764255543528, 'std': 469.4210339773065, 'count': 22}}
Green Glen Layout {2: {'mean': 5874.64951344219, 'std': 208.0884384424191, 'count': 3}, 3: {'mean': 6639.764255543528, 'std': 469.4210339773065, 'count': 22}, 4: {'mean': 6835.898556082042, 'std': 333.3960060843011, 'count': 4}}
Gubbalala {2: {'mean': 5018.120021443538, 'std': 616.24268015249, 'count': 4}}
Gubbalala {2: {'mean': 5018.120021443538, 'std': 616.24268015249, 'count': 4}, 3: {'mean': 5794.291811767968, 'std': 701.7168143003905, 'count': 7}}
Gubbalala {2: {'

Hoskote {1: {'mean': 4518.664047151277, 'std': 0.0, 'count': 1}}
Hoskote {1: {'mean': 4518.664047151277, 'std': 0.0, 'count': 1}, 2: {'mean': 3174.4098068983926, 'std': 410.28654938450563, 'count': 7}}
Hoskote {1: {'mean': 4518.664047151277, 'std': 0.0, 'count': 1}, 2: {'mean': 3174.4098068983926, 'std': 410.28654938450563, 'count': 7}, 3: {'mean': 3600.5359456638253, 'std': 751.6624895989765, 'count': 8}}
Hosur Road {1: {'mean': 4565.217391304348, 'std': 0.0, 'count': 1}}
Hosur Road {1: {'mean': 4565.217391304348, 'std': 0.0, 'count': 1}, 2: {'mean': 5644.8823512611825, 'std': 1217.5145047356148, 'count': 9}}
Hosur Road {1: {'mean': 4565.217391304348, 'std': 0.0, 'count': 1}, 2: {'mean': 5644.8823512611825, 'std': 1217.5145047356148, 'count': 9}, 3: {'mean': 5646.277406311303, 'std': 1116.583963171691, 'count': 14}}
Hosur Road {1: {'mean': 4565.217391304348, 'std': 0.0, 'count': 1}, 2: {'mean': 5644.8823512611825, 'std': 1217.5145047356148, 'count': 9}, 3: {'mean': 5646.277406311303, 

Kambipura {2: {'mean': 4617.214043035106, 'std': 371.95620126318187, 'count': 13}}
Kambipura {2: {'mean': 4617.214043035106, 'std': 371.95620126318187, 'count': 13}, 3: {'mean': 4613.88537985688, 'std': 442.39294983399617, 'count': 7}}
Kammanahalli {1: {'mean': 8428.57142857143, 'std': 0.0, 'count': 1}}
Kammanahalli {1: {'mean': 8428.57142857143, 'std': 0.0, 'count': 1}, 2: {'mean': 6384.469696969697, 'std': 537.9688091548688, 'count': 4}}
Kammanahalli {1: {'mean': 8428.57142857143, 'std': 0.0, 'count': 1}, 2: {'mean': 6384.469696969697, 'std': 537.9688091548688, 'count': 4}, 4: {'mean': 6896.551724137931, 'std': 0.0, 'count': 1}}
Kammanahalli {1: {'mean': 8428.57142857143, 'std': 0.0, 'count': 1}, 2: {'mean': 6384.469696969697, 'std': 537.9688091548688, 'count': 4}, 4: {'mean': 6896.551724137931, 'std': 0.0, 'count': 1}, 5: {'mean': 6962.25766229163, 'std': 2041.3437782846004, 'count': 2}}
Kammasandra {1: {'mean': 2916.3934426229507, 'std': 116.39344262295072, 'count': 2}}
Kammasandra

Kumaraswami Layout {1: {'mean': 9176.470588235294, 'std': 0.0, 'count': 1}}
Kumaraswami Layout {1: {'mean': 9176.470588235294, 'std': 0.0, 'count': 1}, 2: {'mean': 5400.1387604070305, 'std': 402.12974684506605, 'count': 3}}
Kumaraswami Layout {1: {'mean': 9176.470588235294, 'std': 0.0, 'count': 1}, 2: {'mean': 5400.1387604070305, 'std': 402.12974684506605, 'count': 3}, 3: {'mean': 5822.598600508905, 'std': 555.2046988018691, 'count': 4}}
Kumaraswami Layout {1: {'mean': 9176.470588235294, 'std': 0.0, 'count': 1}, 2: {'mean': 5400.1387604070305, 'std': 402.12974684506605, 'count': 3}, 3: {'mean': 5822.598600508905, 'std': 555.2046988018691, 'count': 4}, 4: {'mean': 5719.897959183673, 'std': 1230.1020408163267, 'count': 2}}
Kumaraswami Layout {1: {'mean': 9176.470588235294, 'std': 0.0, 'count': 1}, 2: {'mean': 5400.1387604070305, 'std': 402.12974684506605, 'count': 3}, 3: {'mean': 5822.598600508905, 'std': 555.2046988018691, 'count': 4}, 4: {'mean': 5719.897959183673, 'std': 1230.10204081

Padmanabhanagar {2: {'mean': 6353.3984086808305, 'std': 1620.4038971043078, 'count': 8}}
Padmanabhanagar {2: {'mean': 6353.3984086808305, 'std': 1620.4038971043078, 'count': 8}, 3: {'mean': 6103.133559343508, 'std': 1413.7555415488598, 'count': 11}}
Padmanabhanagar {2: {'mean': 6353.3984086808305, 'std': 1620.4038971043078, 'count': 8}, 3: {'mean': 6103.133559343508, 'std': 1413.7555415488598, 'count': 11}, 4: {'mean': 6176.470588235294, 'std': 0.0, 'count': 1}}
Pai Layout {2: {'mean': 4275.598787873179, 'std': 847.9319844868568, 'count': 11}}
Pai Layout {2: {'mean': 4275.598787873179, 'std': 847.9319844868568, 'count': 11}, 3: {'mean': 4565.193665374716, 'std': 692.5145408804893, 'count': 5}}
Pai Layout {2: {'mean': 4275.598787873179, 'std': 847.9319844868568, 'count': 11}, 3: {'mean': 4565.193665374716, 'std': 692.5145408804893, 'count': 5}, 6: {'mean': 4605.263157894737, 'std': 0.0, 'count': 1}}
Panathur {1: {'mean': 6051.437216338881, 'std': 0.0, 'count': 1}}
Panathur {1: {'mean': 

Sarjapur  Road {1: {'mean': 5133.293879111827, 'std': 1040.4585511011248, 'count': 9}, 2: {'mean': 5376.764753392969, 'std': 940.2539293683719, 'count': 114}, 3: {'mean': 6069.097454249021, 'std': 1210.6553782479018, 'count': 153}, 4: {'mean': 6515.374368146306, 'std': 1322.5099449631382, 'count': 26}}
Sarjapur  Road {1: {'mean': 5133.293879111827, 'std': 1040.4585511011248, 'count': 9}, 2: {'mean': 5376.764753392969, 'std': 940.2539293683719, 'count': 114}, 3: {'mean': 6069.097454249021, 'std': 1210.6553782479018, 'count': 153}, 4: {'mean': 6515.374368146306, 'std': 1322.5099449631382, 'count': 26}, 5: {'mean': 5797.897681160006, 'std': 1209.3915232816719, 'count': 4}}
Sarjapura - Attibele Road {1: {'mean': 3195.945945945946, 'std': 0.0, 'count': 1}}
Sarjapura - Attibele Road {1: {'mean': 3195.945945945946, 'std': 0.0, 'count': 1}, 2: {'mean': 2980.300110459942, 'std': 240.74806987622512, 'count': 6}}
Sarjapura - Attibele Road {1: {'mean': 3195.945945945946, 'std': 0.0, 'count': 1}, 2

Vishwapriya Layout {2: {'mean': 3916.9969520750556, 'std': 177.84130871379733, 'count': 4}}
Vittasandra {2: {'mean': 5265.466588027837, 'std': 172.6857542886576, 'count': 31}}
Vittasandra {2: {'mean': 5265.466588027837, 'std': 172.6857542886576, 'count': 31}, 3: {'mean': 5113.14174326139, 'std': 100.56430016629145, 'count': 7}}
Whitefield {1: {'mean': 5765.189888976461, 'std': 1080.9401060122182, 'count': 26}}
Whitefield {1: {'mean': 5765.189888976461, 'std': 1080.9401060122182, 'count': 26}, 2: {'mean': 4996.6795555450735, 'std': 1261.8095382276895, 'count': 232}}
Whitefield {1: {'mean': 5765.189888976461, 'std': 1080.9401060122182, 'count': 26}, 2: {'mean': 4996.6795555450735, 'std': 1261.8095382276895, 'count': 232}, 3: {'mean': 5530.481945905363, 'std': 1329.8231587008422, 'count': 165}}
Whitefield {1: {'mean': 5765.189888976461, 'std': 1080.9401060122182, 'count': 26}, 2: {'mean': 4996.6795555450735, 'std': 1261.8095382276895, 'count': 232}, 3: {'mean': 5530.481945905363, 'std': 1

In [61]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
                
            }
          
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
                
    return df.drop(exclude_indices, axis = 'index')   

In [62]:
df = bhk_outlier_remover(df)

In [63]:
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10292,other,2 BHK,1200.0,2.0,70.0,2,5833.333333
10293,other,1 BHK,1800.0,1.0,200.0,1,11111.111111
10296,other,2 BHK,1353.0,2.0,110.0,2,8130.081301
10297,other,1 Bedroom,812.0,1.0,26.0,1,3201.970443


In [65]:
df.drop(columns=['size', 'price_per_sqft'],inplace=True)#price_per_sqft we use this col to remove outliers now we dont need this

In [66]:
df

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2
...,...,...,...,...,...
10292,other,1200.0,2.0,70.0,2
10293,other,1800.0,1.0,200.0,1
10296,other,1353.0,2.0,110.0,2
10297,other,812.0,1.0,26.0,1


In [67]:
df.to_csv('cleaned_data.csv')

### Cleaned Data

In [3]:
df = pd.read_csv("cleaned_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,location,total_sqft,bath,price,bhk
0,0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [10]:
df.drop(columns=['Unnamed: 0'], axis = 1, inplace=True)

In [11]:
## split the data for train and test

x = df.drop(columns=['price'])
y = df['price']

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [13]:
x_train,x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=0)

In [14]:
print(x_train.shape)
print(x_test.shape)

(5888, 4)
(1472, 4)


### Applying Linear Regressor

In [15]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']), remainder='passthrough')

In [16]:
scaler = StandardScaler()

In [17]:
lr = LinearRegression(normalize=True)

In [18]:
pipe = make_pipeline(column_trans, scaler,lr)

In [19]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [20]:
y_pred_lr = pipe.predict(x_test)
r2_score(y_test, y_pred_lr)

0.8295405537662313

### Applying Lasso

In [21]:
lasso = Lasso()

In [23]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [24]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [25]:
y_pred_lasso = pipe.predict(x_test)
r2_score(y_test, y_pred_lasso)

0.8199181874762704

### Applying ridge

In [26]:
ridge = Ridge()

In [27]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [28]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [29]:
y_pred_ridge = pipe.predict(x_test)
r2_score(y_test, y_pred_ridge)

0.8296651410179636

In [31]:
print("No Regularization: ", r2_score(y_test, y_pred_lr))
print("Lasso: ", r2_score(y_test, y_pred_lasso))
print("Ridge: ",r2_score(y_test, y_pred_ridge) )

No Regularization:  0.8295405537662313
Lasso:  0.8199181874762704
Ridge:  0.8296651410179636


In [32]:
import pickle

In [33]:
pickle.dump(pipe, open('RidgeModel.pkl', 'wb'))