In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import nbformat
import numpy as np
import os
import warnings
import pickle
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
pd.set_option('display.float_format', lambda x: '%.2f' % x)
plt.rcParams['axes.formatter.useoffset'] = False
plt.rcParams['axes.formatter.limits'] = (-5, 5)

In [16]:
data = pd.read_csv('house_price_india.csv')

In [17]:
data

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.50,3650,9050,2.00,0,4,5,...,1921,0,122003,52.86,-114.56,2880,5400,2,58,2380000
1,6762810635,42491,4,2.50,2920,4000,1.50,0,0,5,...,1909,0,122004,52.89,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.50,0,0,3,...,1939,0,122004,52.89,-114.47,2940,6600,1,53,1200000
3,6762812605,42491,4,2.50,3310,42998,2.00,0,0,3,...,2001,0,122005,52.95,-114.32,3350,42847,3,76,838000
4,6762812919,42491,3,2.00,2710,4500,1.50,0,0,4,...,1929,0,122006,52.90,-114.48,2060,4500,1,51,805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,6762830250,42734,2,1.50,1556,20000,1.00,0,0,4,...,1957,0,122066,52.62,-114.47,2250,17286,3,76,221700
14616,6762830339,42734,3,2.00,1680,7000,1.50,0,0,4,...,1968,0,122072,52.51,-114.39,1540,7480,3,59,219200
14617,6762830618,42734,2,1.00,1070,6120,1.00,0,0,3,...,1962,0,122056,52.73,-114.51,1130,6120,2,64,209000
14618,6762830709,42734,4,1.00,1030,6621,1.00,0,0,4,...,1955,0,122042,52.72,-114.41,1420,6631,3,54,205000


To predict the prices we do not need the id and date columns so we will drop them

In [18]:
data_df = data.drop(columns=['id','Date'])

In [19]:
data_df

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,grade of the house,Area of the house(excluding basement),...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,5,2.50,3650,9050,2.00,0,4,5,10,3370,...,1921,0,122003,52.86,-114.56,2880,5400,2,58,2380000
1,4,2.50,2920,4000,1.50,0,0,5,8,1910,...,1909,0,122004,52.89,-114.47,2470,4000,2,51,1400000
2,5,2.75,2910,9480,1.50,0,0,3,8,2910,...,1939,0,122004,52.89,-114.47,2940,6600,1,53,1200000
3,4,2.50,3310,42998,2.00,0,0,3,9,3310,...,2001,0,122005,52.95,-114.32,3350,42847,3,76,838000
4,3,2.00,2710,4500,1.50,0,0,4,8,1880,...,1929,0,122006,52.90,-114.48,2060,4500,1,51,805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,2,1.50,1556,20000,1.00,0,0,4,7,1556,...,1957,0,122066,52.62,-114.47,2250,17286,3,76,221700
14616,3,2.00,1680,7000,1.50,0,0,4,7,1680,...,1968,0,122072,52.51,-114.39,1540,7480,3,59,219200
14617,2,1.00,1070,6120,1.00,0,0,3,6,1070,...,1962,0,122056,52.73,-114.51,1130,6120,2,64,209000
14618,4,1.00,1030,6621,1.00,0,0,4,6,1030,...,1955,0,122042,52.72,-114.41,1420,6631,3,54,205000


Now we will create a new column of total area where we will add the values of Area of the house(excluding basement) and area of the basement to get the total area and then remove the columns Area of the house(excluding basement) and area of the basement

In [24]:
data_df['Total_area'] = data_df['Area of the house(excluding basement)'] + data_df['Area of the basement']
data_df.drop(columns=['Area of the house(excluding basement)','Area of the basement'])

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,grade of the house,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price,Total_area
0,5,2.50,3650,9050,2.00,0,4,5,10,1921,0,122003,52.86,-114.56,2880,5400,2,58,2380000,3650
1,4,2.50,2920,4000,1.50,0,0,5,8,1909,0,122004,52.89,-114.47,2470,4000,2,51,1400000,2920
2,5,2.75,2910,9480,1.50,0,0,3,8,1939,0,122004,52.89,-114.47,2940,6600,1,53,1200000,2910
3,4,2.50,3310,42998,2.00,0,0,3,9,2001,0,122005,52.95,-114.32,3350,42847,3,76,838000,3310
4,3,2.00,2710,4500,1.50,0,0,4,8,1929,0,122006,52.90,-114.48,2060,4500,1,51,805000,2710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,2,1.50,1556,20000,1.00,0,0,4,7,1957,0,122066,52.62,-114.47,2250,17286,3,76,221700,1556
14616,3,2.00,1680,7000,1.50,0,0,4,7,1968,0,122072,52.51,-114.39,1540,7480,3,59,219200,1680
14617,2,1.00,1070,6120,1.00,0,0,3,6,1962,0,122056,52.73,-114.51,1130,6120,2,64,209000,1070
14618,4,1.00,1030,6621,1.00,0,0,4,6,1955,0,122042,52.72,-114.41,1420,6631,3,54,205000,1030


In [25]:
data_df

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,grade of the house,Area of the house(excluding basement),...,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price,Total_area
0,5,2.50,3650,9050,2.00,0,4,5,10,3370,...,0,122003,52.86,-114.56,2880,5400,2,58,2380000,3650
1,4,2.50,2920,4000,1.50,0,0,5,8,1910,...,0,122004,52.89,-114.47,2470,4000,2,51,1400000,2920
2,5,2.75,2910,9480,1.50,0,0,3,8,2910,...,0,122004,52.89,-114.47,2940,6600,1,53,1200000,2910
3,4,2.50,3310,42998,2.00,0,0,3,9,3310,...,0,122005,52.95,-114.32,3350,42847,3,76,838000,3310
4,3,2.00,2710,4500,1.50,0,0,4,8,1880,...,0,122006,52.90,-114.48,2060,4500,1,51,805000,2710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,2,1.50,1556,20000,1.00,0,0,4,7,1556,...,0,122066,52.62,-114.47,2250,17286,3,76,221700,1556
14616,3,2.00,1680,7000,1.50,0,0,4,7,1680,...,0,122072,52.51,-114.39,1540,7480,3,59,219200,1680
14617,2,1.00,1070,6120,1.00,0,0,3,6,1070,...,0,122056,52.73,-114.51,1130,6120,2,64,209000,1070
14618,4,1.00,1030,6621,1.00,0,0,4,6,1030,...,0,122042,52.72,-114.41,1420,6631,3,54,205000,1030


In [27]:
data_df_cols = data_df.columns.tolist()

In [28]:
data_df_cols

['number of bedrooms',
 'number of bathrooms',
 'living area',
 'lot area',
 'number of floors',
 'waterfront present',
 'number of views',
 'condition of the house',
 'grade of the house',
 'Area of the house(excluding basement)',
 'Area of the basement',
 'Built Year',
 'Renovation Year',
 'Postal Code',
 'Lattitude',
 'Longitude',
 'living_area_renov',
 'lot_area_renov',
 'Number of schools nearby',
 'Distance from the airport',
 'Price',
 'Total_area']

In [None]:
cols = ['number of bedrooms',
 'number of bathrooms',
 'living area',
 'lot area',
 'number of floors',
 'waterfront present',
 'number of views',
 'condition of the house',
 'grade of the house',
 'Area of the house(excluding basement)',
 'Area of the basement',
 'Built Year',
 'Renovation Year',
 'Postal Code',
 'Lattitude',
 'Longitude',
 'living_area_renov',
 'lot_area_renov',
 'Number of schools nearby',
 'Distance from the airport',
 'Total_area',
 'Price',
 ]