# Modeling

In [13]:
import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (20,10)

In [2]:
# Load the cleaned listings table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='./data/data_cleaned.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491
1,1st Block Jayanagar,1200.0,6.0,2.0,125.0,6,10416.666667
2,1st Block Jayanagar,1000.0,3.0,2.0,60.0,2,6000.0
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,10833.333333
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,11983.805668


## One-hot encoding

In [5]:
# Expand the categorical `location` column into one indicator column per
# distinct location value.
dummies = pd.get_dummies(df['location'])
dummies.head(n=3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Append the location indicators to the original columns. The 'Yeshwanthpur'
# indicator is left out, so rows from that location are the ones whose
# remaining indicators are all 0.
location_indicators = dummies.drop(columns='Yeshwanthpur')
df2 = pd.concat([df, location_indicators], axis='columns')
df2.head(n=3)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vidyaranyapura,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli
0,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,11901.840491,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1200.0,6.0,2.0,125.0,6,10416.666667,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1000.0,3.0,2.0,60.0,2,6000.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Remove the raw `location` text (now carried by the indicator columns) and
# `price_per_sqft`. In the previewed rows price_per_sqft matches
# price * 100000 / total_sqft (194.0 * 100000 / 1630.0 ~= 11901.84), i.e. it is
# derived from the value being predicted.
df3 = df2.drop(columns=['location', 'price_per_sqft'])
df3.head(n=3)

Unnamed: 0,total_sqft,bath,balcony,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,...,Vidyaranyapura,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli
0,1630.0,3.0,2.0,194.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,6.0,2.0,125.0,6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000.0,3.0,2.0,60.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Feature matrix: every remaining column except the `price` target.
X = df3.drop(columns='price')
X.head(n=3)

Unnamed: 0,total_sqft,bath,balcony,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Phase JP Nagar,6th Phase JP Nagar,...,Vidyaranyapura,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli
0,1630.0,3.0,2.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,6.0,2.0,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000.0,3.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Target vector: the `price` column on its own.
Y = df3['price']
Y.head(n=3)

0    194.0
1    125.0
2     60.0
Name: price, dtype: float64

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2023,
)

In [14]:
# Fit ordinary least squares on the training rows (fit() returns the estimator
# itself), then report its score on the held-out rows. Despite the `_clf`
# suffix this is a regressor, so score() is the coefficient of determination
# (R^2).
lr_clf = LinearRegression().fit(X_train, Y_train)
lr_clf.score(X_test, Y_test)

0.7288990172680351

In [16]:
# Repeat the hold-out evaluation on 5 independent random 80/20 splits of the
# full data set; each entry of the returned array is LinearRegression's score
# (R^2) on one split.
# NOTE(review): in the recorded run one split scored about -2.18e17 while the
# other four were 0.73-0.77. That looks like a numerically degenerate fit on
# that split (possibly a location indicator with no rows in its training part)
# rather than a real performance figure -- confirm before quoting a mean.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, Y, cv=cv)

array([ 7.30120510e-01,  7.69771752e-01, -2.18075569e+17,  7.40733546e-01,
        7.66523065e-01])