In [1]:
#Import dependencies
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.model_selection import ShuffleSplit,cross_val_score,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import pickle
import json

In [2]:
# Read the raw Bengaluru housing CSV and keep it untouched; all cleaning is
# done on an independent working copy.
housepricedata = pd.read_csv(filepath_or_buffer='./Dataset/Bengaluru_House_Data.csv')
housepricedf = housepricedata.copy(deep=True)

FileNotFoundError: [Errno 2] File ./Dataset/Bengaluru_House_Data.csv does not exist: './Dataset/Bengaluru_House_Data.csv'

## Data Preprocessing and Feature Engineering

In [237]:
def isfloat(x):
    '''Return True if ``x`` can be converted with ``float()``, otherwise False.'''
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric text;
        # OverflowError: integer too large to be represented as a float.
        return False
    return True
def convert_sqft_to_num(x):
    '''Convert a ``total_sqft`` entry to a float.

    A range written as ``"low - high"`` is replaced by its midpoint and a
    plain number is converted directly. Anything that cannot be parsed
    (text with units, missing values, ...) yields ``None``.
    '''
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "-5" or "1e-5" also split in two);
                # fall through and try to parse the whole string instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None
def iqr(dff):
    '''Return the outlier fences ``(Q1 - 1.5*IQR, Q3 + 1.5*IQR)`` of ``dff``.'''
    lower_quartile = dff.quantile(0.25)
    upper_quartile = dff.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)
    return lower_quartile - whisker, upper_quartile + whisker

def remove_pps_outliers(df):
    '''Drop rows whose ``price_per_sqrft`` is an outlier within its location.

    For every ``location`` group the fences returned by ``iqr`` are computed
    from that group's own prices, and only rows with
    ``lower < price_per_sqrft <= upper`` are kept.

    Returns a new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    '''
    kept = []
    for _, subdf in df.groupby('location'):
        # The fences must come from this location's prices, not from the whole
        # frame, otherwise grouping by location has no effect.
        lower, upper = iqr(subdf['price_per_sqrft'])
        kept.append(subdf[(subdf.price_per_sqrft > lower) & (subdf.price_per_sqrft <= upper)])
    if not kept:
        # No groups (empty input): return an empty frame with the same columns.
        return df.iloc[0:0].copy()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)

def remove_pps_outliers_(df):
    '''Keep rows whose ``price_per_sqrft`` is within one std of its location mean.

    For every ``location`` group, rows are kept when
    ``mean - std < price_per_sqrft <= mean + std``, where mean and std are
    computed from that group only (population standard deviation, ddof=0,
    which is what ``np.std`` uses).

    Returns a new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    '''
    kept = []
    for _, subdf in df.groupby('location'):
        prices = subdf['price_per_sqrft']
        center = prices.mean()
        spread = prices.std(ddof=0)
        kept.append(subdf[(prices > (center - spread)) & (prices <= (center + spread))])
    if not kept:
        # No groups (empty input): return an empty frame with the same columns.
        return df.iloc[0:0].copy()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)

In [238]:
# Start again from the untouched raw frame so this cell can be re-run safely,
# and drop the columns that are not used for price prediction.
housepricedf = housepricedata.drop(columns=['area_type','availability','society','balcony'])

# Drop rows with missing values.
housepricedf = housepricedf.dropna()

# Extract the leading integer of 'size' as the numeric 'bhk' feature.
housepricedf['bhk'] = housepricedf['size'].apply(lambda x: int(x.split(' ')[0]))

# Convert total_sqft to numeric (ranges become their midpoint, entries that
# cannot be parsed become missing).
housepricedf['total_sqft'] = housepricedf['total_sqft'].apply(convert_sqft_to_num)

# New feature: price per square foot (price is multiplied by 100000 first).
housepricedf['price_per_sqrft'] = housepricedf['price']*100000/housepricedf['total_sqft']

# Group locations with 10 or fewer observations under 'other'.
housepricedf['location'] = housepricedf['location'].apply(lambda x: x.strip())
location_stats = housepricedf['location'].value_counts(ascending=False)
location_stats_less_than_10 = location_stats[location_stats<=10]
# `in` on a Series tests its index, which here holds the location names.
housepricedf['location'] = housepricedf['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

# Remove observations with less than 300 sqft per bhk (the notebook treats
# 300 sqft as the minimum for a single room).
housepricedf = housepricedf[~((housepricedf['total_sqft']/housepricedf['bhk'])<300)]

# Remove price_per_sqrft outliers per location: keep values within one
# standard deviation of the location mean.
housepricedf = remove_pps_outliers_(housepricedf)

# Keep only observations with fewer than bhk + 2 bathrooms.
housepricedf = housepricedf[housepricedf['bath']<housepricedf['bhk']+2]

# Drop helper columns that must not become model features.
housepricedf = housepricedf.drop(columns=['size','price_per_sqrft'])

# One-hot encode location; 'other' is left out so it is represented by all
# location indicators being 0 (ignored if no location was grouped as 'other').
locationdummydf = pd.get_dummies(housepricedf['location'])
locationdummydf = locationdummydf.drop(columns=['other'], errors='ignore')
# The raw text column has to go as well: X is built from every column except
# price, and the regressors cannot be fitted on strings.
housepricedf = pd.concat([housepricedf,locationdummydf],axis=1).drop(columns=['location'])

## Model Training

In [244]:
# Split the data into train and test sets.
Y = housepricedf['price']
X = housepricedf.drop(['price'],axis=1)
# Fixed seed so the split (and therefore the fitted model) is reproducible
# across kernel restarts; same seed as the cross-validation splitter.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,shuffle=True,random_state=2021)

### Linear Regression

In [253]:
# Shuffled 80/20 cross-validation splitter with a fixed seed.
cv = ShuffleSplit(test_size=0.2, n_splits=5, random_state=2021)

# Fit a linear regression on the training split.
lr_clf = LinearRegression()
lr_clf.fit(x_train, y_train)

# Score a fresh linear regression on the full data over those splits.
cross_valscore = cross_val_score(LinearRegression(), X, Y, cv=cv)
print('Cross Val Score is ',cross_valscore)

Cross Val Score [0.81928163 0.78652451 0.79986841 0.77168779 0.8490252 ]


### Gridsearch CV
Grid search over three models: Linear Regression, Decision Tree Regression, and Lasso Regression.

In [290]:
def best_model_gridsearch(x,y):
    '''Grid-search three regressors and report the best setting of each.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` over 5 shuffled 80/20 splits (seed 2021).

    Parameters
    ----------
    x : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame with one row per model and the columns
    ``model``, ``best_score`` and ``best_params``.
    '''
    # 'normalize' only exists in older scikit-learn releases; when the
    # installed LinearRegression does not accept it, tune 'fit_intercept'.
    if 'normalize' in LinearRegression().get_params():
        lr_params = {'normalize': [True, False]}
    else:
        lr_params = {'fit_intercept': [True, False]}

    # The squared-error criterion is spelled 'mse' in older scikit-learn and
    # 'squared_error' in newer releases. It is the estimator's default, so
    # read the spelling from a fresh instance instead of hard-coding it.
    squared_error_criterion = DecisionTreeRegressor().criterion

    algos = {
        'lr': {
            'model': LinearRegression(),
            'params': lr_params,
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic'],
            },
        },
        'dt': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': [squared_error_criterion, 'friedman_mse'],
                'splitter': ['best', 'random'],
            },
        },
    }
    scores = []
    cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=2021)
    for algo_name,config in algos.items():
        gs = GridSearchCV(config['model'],config['params'],cv=cv,return_train_score=False)
        # Fit on the function arguments, not on the notebook-level X and Y.
        gs.fit(x,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [291]:
# Compare the three candidate models on the full feature matrix X and target Y;
# the returned DataFrame is displayed as the cell output.
best_model_gridsearch(X,Y)

Unnamed: 0,model,best_score,best_params
0,lr,0.805278,{'normalize': False}
1,lasso,0.682656,"{'alpha': 1, 'selection': 'random'}"
2,dt,0.722964,"{'criterion': 'mse', 'splitter': 'best'}"


In [292]:
#Test for some observations
def predict_price(location,sqft,bath,bhk):
    '''Predict the price of a single home with the fitted model ``lr_clf``.

    Parameters
    ----------
    location : name of a one-hot location column of ``X``. A location that
        has no column of its own leaves every location indicator at 0
        instead of raising ``IndexError``.
    sqft, bath, bhk : values written to the first three feature positions.

    Returns
    -------
    The model's prediction for that one observation.
    '''
    x = np.zeros(len(X.columns))
    # NOTE(review): assumes the first three columns of X are total_sqft, bath
    # and bhk, in that order - confirm against the preprocessing cell.
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # Search only the indicator columns after the three numeric features, so a
    # location string can never overwrite sqft/bath/bhk.
    loc_matches = np.where(X.columns[3:] == location)[0]
    if len(loc_matches) > 0:
        x[3 + loc_matches[0]] = 1

    # Pass a DataFrame with the training column names so the model sees the
    # same feature layout it was fitted on.
    return lr_clf.predict(pd.DataFrame([x], columns=X.columns))[0]

### Save Model

In [298]:
# Save the fitted linear regression model (lr_clf) to disk using pickle.
# NOTE(review): only load this file back from a trusted location -
# unpickling untrusted data can execute arbitrary code.
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [300]:
# Export the lower-cased feature column names of X (in column order) as JSON
# for deployment.
columns = dict(data_columns=[name.lower() for name in X.columns])
with open("columns.json","w") as f:
    json.dump(columns, f)