## Banglore House Price Prediction

Bangalore House Price Prediction model using a Machine Learning algorithm. This model predicts the price of Bangalore's house with the help of a few parameters like availability, size, total square feet, bath, location, etc. 

In [1]:
#Import dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.model_selection import ShuffleSplit,cross_val_score,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import catboost as cb
import lightgbm as lgb
from sklearn.feature_selection import VarianceThreshold,chi2
import pickle
import json

### Datasets

What are the things that a potential home buyer considers before purchasing a house? The location, the size of the property, schools, parks, restaurants, hospitals etc...? What about the most important factor — the price?

Buying a home, especially in a city like Bengaluru, is a tricky choice.
While the major factors are usually the same for all metros, there are others to be considered for the Silicon Valley of India.With its huge crowd, vibrant culture, great climate and a slew of job opportunities, it is difficult to ascertain the price of a house in Bengaluru.

The dataset which used for this project is open source and collected from Kaggle


In [2]:
#Import dataset
housepricedata = pd.read_csv('../Dataset/Bengaluru_House_Data.csv')
housepricedf = housepricedata.copy()

In [3]:
housepricedata.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
def isfloat(x):
    '''convert totsl_sqrt to numerical values'''
    try:
        float(x)
    except:
        return False
    return True
def convert_sqft_to_num(x):
    '''To convert totsl_sqrt to numerical values'''
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None   
def iqr(dff):
    ''' Return iqr range'''
    q1 = dff.quantile(0.25)
    q3 = dff.quantile(0.75)
    iqr = q3-q1
    return q1-1.5*iqr,q3+1.5*iqr

def remove_pps_outliers(df):
    ''' Remove the outlies in price_per_sqrft'''
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        q1,q3 = iqr(df['price_per_sqrft'])
        reduced_df = subdf[(subdf.price_per_sqrft>(q1)) & (subdf.price_per_sqrft<=(q3))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

def remove_pps_outliers_(df):
    ''' Remove the outlies in price_per_sqrft'''
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqrft)
        st = np.std(subdf.price_per_sqrft)
        reduced_df = subdf[(subdf.price_per_sqrft>(m-st)) & (subdf.price_per_sqrft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

### Data Preprocessing 

In [5]:
#drop null values from the data
housepricedf.dropna(inplace=True)

#convert size to bhk as numerical values
housepricedf['bhk'] = housepricedf['size'].apply(lambda x:int(x.split(' ')[0]))

#convert total_sqrt to numerical
housepricedf['total_sqft'] = housepricedf['total_sqft'].apply(convert_sqft_to_num)

#let's creat new features 
#price per sqrft
housepricedf['price_per_sqrft'] = housepricedf['price']*100000/housepricedf['total_sqft']

#remove the locations with less obervations
housepricedf['location'] = housepricedf['location'].apply(lambda x : x.strip())
location_stats = housepricedf['location'].value_counts(ascending=False)
location_stats_less_than_10 = location_stats[location_stats<=10]
housepricedf['location'] = housepricedf['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

#remove obervations with sqrft/bhk less than 300(Since minimum sqrft is 300 for single room)
housepricedf = housepricedf[~((housepricedf['total_sqft']/housepricedf['bhk'])<300)]

### Outliers

In [6]:
#remove outlies in price_per_sqrft using iqr range
housepricedf = remove_pps_outliers_(housepricedf)

#remove obeservations having more number of bathrooms than bhk
housepricedf = housepricedf[housepricedf['bath']<housepricedf['bhk']+2]

#drop columns - size
housepricedf.drop(columns=['size','price_per_sqrft'],inplace=True)

#Get copy of housedf
housedfcopy = housepricedf.copy()

### Encoding

In [7]:
#Encoding location features - 
locationdummydf = pd.get_dummies(housepricedf['location'])
locationdummydf.drop(columns = ['other'],inplace=True)
housepricedf = pd.concat([housepricedf,locationdummydf],axis=1)
housepricedf.drop(columns=['location'],inplace=True)

### Feature Selection

In [8]:
#drop columns which are not relevent for price predictions
housepricedf.drop(columns=['area_type','availability','society','balcony'],inplace=True)

##Variance Threshold Feature selection for continuos features
# var_thresh = VarianceThreshold(threshold=0)
# var_thresh.fit(housepricedf)
# var_thresh.get_support()

## Chi2 test for categorical features
#catfeatures = ['area_type','size']
## split the data into train and test for feature selection
# pvalues = chi2(x_train,y_train)

## Model Training

In [9]:
#split the data int train and test
Y = housepricedf['price']
X = housepricedf.drop(['price'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,shuffle=True)

### 1 - Linear Regression

In [10]:
lr_clf = LinearRegression()
#Fit the data using Linear Regression model
lr_clf.fit(x_train,y_train)

#Cross validation scores 
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=2021)
cross_valscore = cross_val_score(LinearRegression(),X,Y,cv=cv)
print('Cross Val Score is ',cross_valscore)

Cross Val Score is  [0.89428649 0.89702319 0.90166458 0.87920837 0.89596262]


### Gridsearch CV
Gridsearch for three models : Linear Regression, Decision Tree Regression,Lasso Regression

In [11]:
def best_model_gridsearch(x,y):
    algos = {
            'lr':{
                 'model':LinearRegression(),
                 'params':{ 
                          'normalize' :[True,False]}
                 },
            'lasso':{
                    'model' : Lasso(),
                    'params' : {
                                'alpha' : [1,2],
                                'selection' : ['random','cyclic']
                                }
                    },
            'dt' :{
                  'model' : DecisionTreeRegressor(),
                  'params':{
                            'criterion':['mse','friedman_mse'],
                            'splitter' : ['best','random']
                           }
                  }
            }
    scores = []
    cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=2021)
    for algo_name,config in algos.items():
        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)
        gs.fit(X,Y)
        scores.append({
                    'model':algo_name,
                    'best_score':gs.best_score_,
                    'best_params':gs.best_params_
                     })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [12]:
best_model_gridsearch(X,Y)

Unnamed: 0,model,best_score,best_params
0,lr,0.893629,{'normalize': False}
1,lasso,0.768875,"{'alpha': 1, 'selection': 'random'}"
2,dt,0.819474,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


In [13]:
#Test for some obeservations
def predict_price(location,sqft,bath,bhk):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

### Save Model

In [14]:
#save model using Pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [15]:
#save location names for deploymnet 
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))

## Advanced Models
Train advanced models like XGBoost and Catboost Regressor

## Catboost Regressor

In [16]:
# Split the train data into train and validation
train, validation = train_test_split(housedfcopy, test_size=0.2)

In [None]:
avoid_cols = ['price']
feats = [col for col in train.columns if col not in avoid_cols]
target = 'price'

# Categorical features
cat_cols = ['location']
# Categorical features index 
cat_indx = [i for i,c in enumerate(feats) if c in cat_cols]

# Catboost pool
val_pool = cb.Pool(validation[feats].values, validation[target], cat_features=cat_indx)

# Model setup
cat_reg = cb.CatBoostRegressor(iterations=3000,
                                   learning_rate=0.005,
                                   l2_leaf_reg=5,
                                   eval_metric='RMSE',
                                   random_seed = 42,
                                   metric_period=500,
                                   od_wait=20,
                                   use_best_model=True
                                   )
# Model fit
cat_reg.fit(X=train[feats], y=train[target], eval_set=val_pool, 
            cat_features=cat_indx, use_best_model=True, verbose=True, plot=False)

## LightGBM Regressor

In [17]:
# Spli the data into train and test
Y = housepricedf['price']
X = housepricedf.drop(['price'],axis=1)
train_x,validation_x,train_y ,validation_y = train_test_split(X,Y,test_size=0.2)

In [18]:
def light_gbm_model_run(train_x, train_y, validation_x, validation_y):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" :70,
        "min_data_in_leaf" :20,
        "learning_rate" : 0.001,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42}
    
    # Given its a regression case, I am using the RMSE as the metric.

    lg_train = lgb.Dataset(train_x, label=train_y)
    lg_validation = lgb.Dataset(validation_x, label=validation_y)
    evals_result_lgbm = {}

    model_light_gbm = lgb.train(params, lg_train, 5000,
                      valid_sets=[lg_train, lg_validation],
                      early_stopping_rounds=100,
                      verbose_eval=500,
                      evals_result=evals_result_lgbm )

    return model_light_gbm
#Run Lightgbm code
lgbmmodel = light_gbm_model_run(train_x, train_y, validation_x, validation_y)



Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 37.3702	valid_1's rmse: 36.9855
[1000]	training's rmse: 29.5978	valid_1's rmse: 28.6942
[1500]	training's rmse: 25.1462	valid_1's rmse: 24.4712
[2000]	training's rmse: 22.6631	valid_1's rmse: 22.4895
[2500]	training's rmse: 21.4742	valid_1's rmse: 21.6051
[3000]	training's rmse: 20.8184	valid_1's rmse: 21.0658
[3500]	training's rmse: 20.358	valid_1's rmse: 20.8036
[4000]	training's rmse: 20.0469	valid_1's rmse: 20.6137
[4500]	training's rmse: 19.8263	valid_1's rmse: 20.5638
Early stopping, best iteration is:
[4410]	training's rmse: 19.862	valid_1's rmse: 20.5579
