In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Bengaluru_house_cleaned_data.csv')
df

Unnamed: 0,location,total_sqft,bath,price,bhk
0,other,1672.0,3.0,150.00,3
1,other,1750.0,3.0,149.00,3
2,other,1750.0,3.0,150.00,3
3,Devarachikkanahalli,1250.0,2.0,44.00,3
4,Devarachikkanahalli,1250.0,2.0,40.00,2
...,...,...,...,...,...
8395,other,3800.0,6.0,390.00,6
8396,other,1780.0,3.0,84.83,3
8397,other,880.0,2.0,48.00,2
8398,other,1000.0,2.0,55.00,2


In [4]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk
count,8400.0,8400.0,8400.0,8400.0
mean,1513.598988,2.45381,92.15156,2.525952
std,892.394753,0.913713,96.321402,0.805265
min,400.0,1.0,10.0,1.0
25%,1125.0,2.0,50.0,2.0
50%,1300.0,2.0,67.38,2.0
75%,1650.0,3.0,100.0,3.0
max,30400.0,12.0,2200.0,10.0


Here we can see that maximum `sqft` is `30400`.

In [5]:
df[df.total_sqft >15000]

Unnamed: 0,location,total_sqft,bath,price,bhk
1992,other,30000.0,4.0,2100.0,4
1998,other,30400.0,4.0,1824.0,6


Here we can see that there are only two values in `total_sqft` is greater that `15000` so we can remove it.

In [6]:
df = df[df.total_sqft <15000]

In [7]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,other,1672.0,3.0,150.0,3
1,other,1750.0,3.0,149.0,3
2,other,1750.0,3.0,150.0,3
3,Devarachikkanahalli,1250.0,2.0,44.0,3
4,Devarachikkanahalli,1250.0,2.0,40.0,2


In [8]:
X = df.drop('price', 'columns')

  X = df.drop('price', 'columns')


In [9]:
X.head()

Unnamed: 0,location,total_sqft,bath,bhk
0,other,1672.0,3.0,3
1,other,1750.0,3.0,3
2,other,1750.0,3.0,3
3,Devarachikkanahalli,1250.0,2.0,3
4,Devarachikkanahalli,1250.0,2.0,2


In [10]:
y = df.price
y.head()

0    150.0
1    149.0
2    150.0
3     44.0
4     40.0
Name: price, dtype: float64

# Model Training

## `train_test_split`

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train, val_train, train_targets, val_targets = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
x_train

Unnamed: 0,location,total_sqft,bath,bhk
2485,Electronics City Phase 1,1080.0,2.0,2
893,Banashankari Stage V,1540.0,3.0,3
7979,Whitefield,1185.0,2.0,2
2346,Electronic City Phase II,900.0,2.0,2
2728,HRBR Layout,1374.0,2.0,2
...,...,...,...,...
5736,Parappana Agrahara,1194.0,2.0,2
5193,Marathahalli,1550.0,3.0,3
5392,Mysore Road,1029.5,2.0,2
860,other,1250.0,2.0,2


In [14]:
train_targets

2485     46.000
893      48.510
7979     56.000
2346     32.500
2728     95.000
         ...   
5736     46.000
5193     83.000
5392     50.855
860     110.000
7272     56.000
Name: price, Length: 6718, dtype: float64

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([('trf',OneHotEncoder(sparse=False, drop='first'),['location'])], remainder='passthrough')

In [16]:
from sklearn.pipeline import Pipeline

## LinearRegression

In [17]:
from sklearn.linear_model import LinearRegression
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LinearRegression())
])

In [18]:
pipe.fit(x_train, train_targets)



In [19]:
pipe.score(x_train, train_targets)

0.7632533861077619

In [20]:
pipe.score(val_train, val_targets)

0.7401493525170908

Here we can see that our `LinearRegression` is only `76%` accurate on Training set and `74%` on Validation set .

## DecisionTreeRegressor

In [21]:
from sklearn.tree import DecisionTreeRegressor
pipe2 = Pipeline(steps=[
    ('step1',trf),
    ('step2',DecisionTreeRegressor())
])

In [22]:
pipe2.fit(x_train, train_targets)



In [23]:
pipe2.score(x_train, train_targets)

0.985791578866338

In [24]:
pipe2.score(val_train, val_targets)

0.8045771381747767

Here `DecisionTreeRegressor` is `98.6%` accurate on Training set and `65.4%` accurate on Validation set.

### Hyperparameter and Tuning

In [25]:
from sklearn.metrics import mean_squared_error
def hyperparameter(**params):
    pipe3 = Pipeline(steps=[
    ('step1',trf),
    ('step2',DecisionTreeRegressor(**params))
    ])
    pipe3.fit(x_train, train_targets)
    print('Training Accuracy:- ',(pipe3.score(x_train, train_targets))*100,'%')
    print('Validation Accuracy:- ',(pipe3.score(val_train, val_targets))*100,'%')
    
    train_pred = pipe3.predict(x_train)
    val_pred = pipe3.predict(val_train)
    
    print('Training Loss:- ',mean_squared_error(train_pred, train_targets, squared=False))
    print('Validation Loss:- ',mean_squared_error(val_pred, val_targets, squared=False))

#### `max_depth`

In [26]:
hyperparameter(max_depth=3)



Training Accuracy:-  73.06306580141906 %
Validation Accuracy:-  55.75249395434345 %
Training Loss:-  49.16425622423544
Validation Loss:-  52.7728326439832


In [27]:
hyperparameter(max_depth=5)

Training Accuracy:-  81.42227396340233 %
Validation Accuracy:-  67.36624953311055 %
Training Loss:-  40.82927494824099
Validation Loss:-  45.321001847351475




In [28]:
hyperparameter(max_depth=10)



Training Accuracy:-  92.20444378887711 %
Validation Accuracy:-  62.60652378388913 %
Training Loss:-  26.448371982831627
Validation Loss:-  48.51364742755195


In [29]:
hyperparameter(max_depth=70)



Training Accuracy:-  98.57496034093268 %
Validation Accuracy:-  61.713650280190755 %
Training Loss:-  11.308079841010274
Validation Loss:-  49.08942989892523


Here we can see that Training Accuracy is increasing and Loss are decreasing and best at `70` but Validation Accuracy and remains same but Validation Loss is best at `max_depth=70`.

#### `max_leaf_nodes`

In [30]:
hyperparameter(max_leaf_nodes=20)



Training Accuracy:-  82.46240297205412 %
Validation Accuracy:-  65.67841837897835 %
Training Loss:-  39.66983865191205
Validation Loss:-  46.47823783151817


In [31]:
hyperparameter(max_leaf_nodes=50)

Training Accuracy:-  88.55405712342323 %
Validation Accuracy:-  58.95855487937678 %
Training Loss:-  32.04801813835332
Validation Loss:-  50.8249932520262




In [32]:
hyperparameter(max_leaf_nodes=100)



Training Accuracy:-  91.77234064084513 %
Validation Accuracy:-  81.27306025861665 %
Training Loss:-  27.171495482944987
Validation Loss:-  34.33202451441454


In [33]:
hyperparameter(max_leaf_nodes=150)



Training Accuracy:-  93.17290866636804 %
Validation Accuracy:-  67.26842790570021 %
Training Loss:-  24.75102832352215
Validation Loss:-  45.38887723306384


Here the Accuracy Score for Both Training set and Validation set are same. But best `max_leaf_nodes` is `100`.

Now Let's try to putting all together.

In [34]:
hyperparameter(max_depth=70,max_leaf_nodes=100)



Training Accuracy:-  91.77234064084513 %
Validation Accuracy:-  81.48401304623282 %
Training Loss:-  27.171495482944987
Validation Loss:-  34.13810740941063


## RandomForest

In [35]:
from sklearn.ensemble import RandomForestRegressor
pipe4 = Pipeline(steps=[
    ('step1',trf),
    ('step2',RandomForestRegressor())
])

In [36]:
pipe4.fit(x_train, train_targets)



In [37]:
pipe4.score(x_train, train_targets)

0.9559116986529111

In [38]:
pipe4.score(val_train, val_targets)

0.7817597308218681

Here `RandomForestRegressor` is best among all the model.

Here In `RandomForestRegressor` model `Training Score` is `95%` and for `Validation Score` is `80%`.

In [39]:
train_pred2 = pipe4.predict(x_train)
val_pred2 = pipe4.predict(val_train)

print('Training Loss:- ',mean_squared_error(train_pred2, train_targets, squared=False))
print('Validation Loss:- ',mean_squared_error(val_pred2, val_targets, squared=False))

Training Loss:-  19.890104914746153
Validation Loss:-  37.0623925616468


Here `Training Loss` is `20 Lakhs` and`Validation Loss` is `35 Lakhs`.

### Hyperparameter and Tuning


In [40]:
def hyperparameter_random(**params):
    pipe5 = Pipeline(steps=[
    ('step1',trf),
    ('step2',RandomForestRegressor(**params))
    ])
    pipe5.fit(x_train, train_targets)
    print('Training Accuracy:- ',(pipe5.score(x_train, train_targets))*100,'%')
    print('Validation Accuracy:- ',(pipe5.score(val_train, val_targets))*100,'%')
    
    train_pred = pipe5.predict(x_train)
    val_pred = pipe5.predict(val_train)
    
    print('Training Loss:- ',mean_squared_error(train_pred, train_targets, squared=False))
    print('Validation Loss:- ',mean_squared_error(val_pred, val_targets, squared=False))

#### `n_estimators`

In [41]:
hyperparameter_random(n_estimators=50)



Training Accuracy:-  95.26319814010269 %
Validation Accuracy:-  76.99959333652903 %
Training Loss:-  20.61664533928336
Validation Loss:-  38.04816924804704


In [42]:
hyperparameter_random(n_estimators=70)



Training Accuracy:-  95.37904753364776 %
Validation Accuracy:-  78.97648538451668 %
Training Loss:-  20.362970929462517
Validation Loss:-  36.376312154435645


In [43]:
hyperparameter_random(n_estimators=100)



Training Accuracy:-  95.14198096478691 %
Validation Accuracy:-  78.16106559300768 %
Training Loss:-  20.878774164675157
Validation Loss:-  37.075048633304064


In [44]:
hyperparameter_random(n_estimators=500)



Training Accuracy:-  95.39844082321218 %
Validation Accuracy:-  77.71165941757046 %
Training Loss:-  20.320196176618275
Validation Loss:-  37.45457514765667


Best `n_estimators` is `70`.

#### `max_depth`

In [45]:
hyperparameter_random(max_depth=5)



Training Accuracy:-  83.27203861415332 %
Validation Accuracy:-  71.49203140407158 %
Training Loss:-  38.743325861764326
Validation Loss:-  42.35933595591711


In [46]:
hyperparameter_random(max_depth=10)



Training Accuracy:-  91.29333780871454 %
Validation Accuracy:-  77.01929009141742 %
Training Loss:-  27.951250217905727
Validation Loss:-  38.03187418881474


In [47]:
hyperparameter_random(max_depth=50)



Training Accuracy:-  95.28955796835284 %
Validation Accuracy:-  78.01114935875745 %
Training Loss:-  20.55920052521758
Validation Loss:-  37.20208426395048


In [48]:
hyperparameter_random(max_depth=70)



Training Accuracy:-  95.50498356345206 %
Validation Accuracy:-  79.02612394457724 %
Training Loss:-  20.083575476036092
Validation Loss:-  36.33334277257731


Best maximum `max_depth` is `70`.

#### `max_features`

In [49]:
hyperparameter_random(max_features=3)



Training Accuracy:-  95.54853606750814 %
Validation Accuracy:-  81.53596574532683 %
Training Loss:-  19.986043122866622
Validation Loss:-  34.09018091604664


In [50]:
hyperparameter_random(max_features=6)



Training Accuracy:-  95.44143315473181 %
Validation Accuracy:-  80.02909969831533 %
Training Loss:-  20.225047694953787
Validation Loss:-  35.453966437388814


In [51]:
hyperparameter_random(max_features=6)



Training Accuracy:-  95.3086826692775 %
Validation Accuracy:-  78.95544326967989 %
Training Loss:-  20.51742223098696
Validation Loss:-  36.39451184938535


In [52]:
hyperparameter_random(max_features=10)



Training Accuracy:-  95.5286067315867 %
Validation Accuracy:-  80.64824702885878 %
Training Loss:-  20.03073221375038
Validation Loss:-  34.90005916492804


Best `max_features` is `3`.

#### Putting All Together

In [63]:
hyperparameter_random(n_estimators=70,max_depth=70, max_features=3)



Training Accuracy:-  92.47591564652221 %
Validation Accuracy:-  78.78264448202601 %
Training Loss:-  25.983773305942837
Validation Loss:-  36.5436257175461


Putting all Together is not well where `max_feature` has highest `accuracy score` and lowest `Loss`.

## XGBoost

In [54]:
from xgboost import XGBRegressor

In [55]:
pipe6 = Pipeline(steps=[
    ('step1',trf),
    ('step2',XGBRegressor())
])

In [56]:
pipe6.fit(x_train, train_targets)



In [57]:
pipe6.score(x_train, train_targets)

0.9411533965307344

In [58]:
pipe6.score(val_train, val_targets)

0.8085866860894552

In [59]:
train_prediction = pipe6.predict(x_train)
val_prediction = pipe6.predict(val_train)
print('Training Loss:- ',mean_squared_error(train_prediction, train_targets, squared=False))
print('Validation Loss:- ',mean_squared_error(val_prediction, val_targets, squared=False))

Training Loss:-  22.979262763614344
Validation Loss:-  34.70979738577674


Here `XGBRegressor` give good model.

# Saving the Best model

Best model is `XGBRegressor` with `94%` `accuracy score` and `₹23 Lakhs` `Loss` with `Training Data set` and `80%` `accuracy score` and `₹34 Lakhs` `Loss` with `Validation Data set`

In [60]:
pipe6.score(x_train, train_targets)

0.9411533965307344

In [61]:
pipe6.score(val_train, val_targets)

0.8085866860894552

In [120]:
import pickle
pickle.dump(pipe6, open('model.pkl', 'wb'))

In [7]:
df.columns

Index(['location', 'total_sqft', 'bath', 'price', 'bhk'], dtype='object')

In [6]:
df.location.unique()

array(['other', ' Devarachikkanahalli', '1st Phase JP Nagar',
       '5th Phase JP Nagar', '6th Phase JP Nagar', '7th Phase JP Nagar',
       '8th Phase JP Nagar', '9th Phase JP Nagar', 'Abbigere',
       'Akshaya Nagar', 'Ambalipura', 'Ambedkar Nagar', 'Amruthahalli',
       'Anandapura', 'Ananth Nagar', 'Anekal', 'Anjanapura', 'Ardendale',
       'Arekere', 'Attibele', 'BEML Layout', 'BTM 2nd Stage',
       'BTM Layout', 'Babusapalaya', 'Balagere', 'Banashankari',
       'Banashankari Stage III', 'Banashankari Stage V', 'Banaswadi',
       'Bannerghatta Road', 'Basavangudi', 'Battarahalli', 'Begur',
       'Begur Road', 'Bellandur', 'Bhoganhalli', 'Billekahalli',
       'Binny Pete', 'Bommanahalli', 'Bommasandra',
       'Bommasandra Industrial Area', 'Brookefield', 'Budigere',
       'CV Raman Nagar', 'Chandapura', 'Channasandra', 'Chikka Tirupathi',
       'Chikkalasandra', 'Choodasandra', 'Devanahalli', 'Dodda Nekkundi',
       'Doddathoguru', 'Domlur', 'EPIP Zone', 'Electronic Ci

In [None]:
['other', ' Devarachikkanahalli', '1st Phase JP Nagar', '5th Phase JP Nagar', '6th Phase JP Nagar', '7th Phase JP Nagar', '8th Phase JP Nagar', '9th Phase JP Nagar', 'Abbigere', 'Akshaya Nagar', 'Ambalipura', 'Ambedkar Nagar', 'Amruthahalli', 'Anandapura', 'Ananth Nagar', 'Anekal', 'Anjanapura', 'Ardendale', 'Arekere', 'Attibele', 'BEML Layout', 'BTM 2nd Stage', 'BTM Layout', 'Babusapalaya', 'Balagere', 'Banashankari', 'Banashankari Stage III', 'Banashankari Stage V', 'Banaswadi', 'Bannerghatta Road', 'Basavangudi', 'Battarahalli', 'Begur', 'Begur Road', 'Bellandur', 'Bhoganhalli', 'Billekahalli', 'Binny Pete', 'Bommanahalli', 'Bommasandra', 'Bommasandra Industrial Area', 'Brookefield', 'Budigere', 'CV Raman Nagar', 'Chandapura', 'Channasandra', 'Chikka Tirupathi', 'Chikkalasandra', 'Choodasandra', 'Devanahalli', 'Dodda Nekkundi', 'Doddathoguru', 'Domlur', 'EPIP Zone', 'Electronic City', 'Electronic City Phase II', 'Electronics City Phase 1', 'Frazer Town', 'Garudachar Palya', 'Gottigere', 'Green Glen Layout', 'Gubbalala', 'Gunjur', 'HBR Layout', 'HRBR Layout', 'HSR Layout', 'Haralur Road', 'Harlur', 'Hebbal', 'Hebbal Kempapura', 'Hegde Nagar', 'Hennur', 'Hennur Road', 'Hoodi', 'Horamavu Agara', 'Horamavu Banaswadi', 'Hormavu', 'Hosa Road', 'Hosakerehalli', 'Hosur Road', 'Hulimavu', 'Iblur Village', 'Indira Nagar', 'JP Nagar', 'Jakkur', 'Jalahalli', 'Jigani', 'KR Puram', 'Kadugodi', 'Kaggadasapura', 'Kaggalipura', 'Kaikondrahalli', 'Kalena Agrahara', 'Kalyan nagar', 'Kambipura', 'Kammasandra', 'Kanakapura', 'Kanakpura Road', 'Kannamangala', 'Kasavanhalli', 'Kathriguppe', 'Kaval Byrasandra', 'Kengeri', 'Kengeri Satellite Town', 'Kereguddadahalli', 'Kodichikkanahalli', 'Kogilu', 'Koramangala', 'Kothannur', 'Kothanur', 'Kudlu', 'Kudlu Gate', 'Kumaraswami Layout', 'Kundalahalli', 'Lakshminarayana Pura', 'Lingadheeranahalli', 'Mahadevpura', 'Mallasandra', 'Malleshpalya', 'Malleshwaram', 'Marathahalli', 'Margondanahalli', 'Munnekollal', 'Murugeshpalya', 'Mysore Road', 'Nagarbhavi', 'Nagavara', 'Neeladri Nagar', 'OMBR Layout', 'Old Airport Road', 'Old Madras Road', 'Padmanabhanagar', 'Pai Layout', 'Panathur', 'Parappana Agrahara', 'Poorna Pragna Layout', 'R.T. Nagar', 'Rachenahalli', 'Raja Rajeshwari Nagar', 'Rajaji Nagar', 'Ramagondanahalli', 'Ramamurthy Nagar', 'Rayasandra', 'Sahakara Nagar', 'Sanjay nagar', 'Sarjapur', 'Sarjapur  Road', 'Sarjapura - Attibele Road', 'Sector 7 HSR Layout', 'Seegehalli', 'Singasandra', 'Somasundara Palya', 'Sonnenahalli', 'Subramanyapura', 'TC Palaya', 'Talaghattapura', 'Thanisandra', 'Thigalarapalya', 'Thubarahalli', 'Tumkur Road', 'Ulsoor', 'Uttarahalli', 'Varthur', 'Vidyaranyapura', 'Vijayanagar', 'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka', 'Yelahanka New Town', 'Yeshwanthpur']