Import Libraries

In [None]:
import folium
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from geopy.geocoders import ArcGIS

import warnings
warnings.filterwarnings('ignore')

Load Dataset

In [None]:
# Load the property listings from 'NJ_Property.csv' (path is relative to the working directory).
NJ_prop = pd.read_csv('NJ_Property.csv')

In [None]:
# Preview the first rows of the loaded frame.
NJ_prop.head()

In [None]:
# (rows, columns) of the frame as loaded.
NJ_prop.shape

In [None]:
# Column dtypes and non-null counts.
NJ_prop.info()

In [None]:
# Number of rows per Availability value, most frequent first.
NJ_prop.Availability.value_counts()

In [None]:
# The 20 most frequent Region values and their row counts.
NJ_prop.Region.value_counts().head(20)

In [None]:
# Number of distinct regions (missing values are not counted by nunique()).
NJ_prop.Region.nunique()

In [None]:
# Summary statistics, rounded to two decimals.
NJ_prop.describe().round(2)

In [None]:
# Drop rows whose Area_SqFt lies outside wide percentile-based fences.
# Despite their names, Q1 and Q3 are the 2nd and 98th percentiles (not quartiles),
# so "IQR" here is the 2%-98% spread and the fences sit 1.5 spreads beyond it.
# The pandas reductions skip NaN; np.quantile / np.median would return NaN if any
# Area_SqFt value were missing, which makes every comparison False and silently
# disables the filter.
area_sqft = NJ_prop.Area_SqFt
Q1 = area_sqft.quantile(0.02)
Q3 = area_sqft.quantile(0.98)
med = area_sqft.median()
IQR = Q3 - Q1
upper_bound = Q3+(1.5*IQR)
lower_bound = Q1-(1.5*IQR)
outliers1 = area_sqft[(area_sqft <= lower_bound) | (area_sqft >= upper_bound)]

# NOTE(review): this rebinds NJ_prop, so re-running the cell recomputes the fences
# on already-filtered data and may drop further rows.
NJ_prop = NJ_prop.drop(outliers1.index).reset_index(drop=True)

In [None]:
# Summary statistics again, for comparison with the earlier describe() output.
NJ_prop.describe().round(2)

In [None]:
# Number of rows that exactly repeat an earlier row.
NJ_prop.duplicated().sum()

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
NJ_prop = NJ_prop.drop_duplicates(ignore_index=True)

In [None]:
# Missing-value count per column.
NJ_prop.isna().sum()

In [None]:
# (rows, columns) after the cleaning steps.
NJ_prop.shape

In [None]:
# Persist the cleaned frame to 'Final_Project.csv', then read that file back.
# index_label=False writes the index values without a header label for that column.
# NOTE(review): the reloaded frame is bound to `NJ_proj` (not `NJ_prop`) and is not
# referenced by any of the visible cells — confirm the name is intentional.
NJ_prop.to_csv('Final_Project.csv', index_label=False)
NJ_proj = pd.read_csv('Final_Project.csv')

Data Visualization

Set rcParams

In [None]:
# Shared matplotlib font sizes, applied by the plotting cells via pylab.rcParams.update().
rcParams = {
    'xtick.labelsize': '14',
    'ytick.labelsize': '14',
    'axes.labelsize': '16',
}

Find Outliers

In [None]:
# Apply the shared font sizes BEFORE the axes exist: matplotlib reads these settings
# when axes and labels are created, so updating afterwards does not restyle this figure.
pylab.rcParams.update(rcParams)

# One row per numeric feature: density curve on the left, box plot on the right.
numeric_features = ['Area_SqFt', 'Rate_SqFt', 'Floor_No', 'Bedroom', 'Bathroom', 'Price']
fig, ax = plt.subplots(len(numeric_features), 2, figsize = (12,14))

# NOTE(review): the left column is a KDE plot although the title says "Histplot" — confirm wording.
fig.suptitle('Histplot & Box Plot : With Outliers', size = 18, fontweight="bold")
for row, feature in enumerate(numeric_features):
    sns.kdeplot(ax=ax[row, 0], data=NJ_prop, x=feature)
    sns.boxplot(ax=ax[row, 1], data=NJ_prop, x=feature)

fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

Heatmap

In [None]:
# Apply the shared font sizes before the axes are created so they affect this figure.
pylab.rcParams.update(rcParams)
fig = plt.figure(figsize=(10,8))

# Correlate numeric/boolean columns only: DataFrame.corr() on a frame that still holds
# text columns raises in pandas >= 2.0 (it used to drop them silently).
numeric_corr = NJ_prop.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, annot = True, cmap='YlGnBu', linewidth=.5)
fig.suptitle('Heatmap NJ Property Data',fontsize=18, fontweight="bold")
fig.tight_layout()
plt.show()

Exploratory Data Analysis

Checking for binary, ordinal, continuous data and target columns

In [None]:
# Column names that describe() summarises by default (the numeric summary).
num_columns = NJ_prop.describe().columns
# Column names with object dtype, as selected by describe(include='object').
categorical_cols = NJ_prop.describe(include= 'object').columns

In [None]:
# Distinct-value count per column (missing values excluded).
NJ_prop.nunique()

In [None]:
# Columns with exactly two distinct values (a missing value counts as one of them).
binary_variables = [col for col in NJ_prop.columns if NJ_prop[col].nunique(dropna=False) == 2]

In [None]:
# Display the columns classified as binary.
binary_variables

In [None]:
# Columns with 3 to 53 distinct values (a missing value counts as one of them).
ordinal_variables = [col for col in NJ_prop.columns if 2 < NJ_prop[col].nunique(dropna=False) <= 53]


In [None]:
# Display the columns classified as ordinal.
ordinal_variables

In [None]:
# Columns with more than 53 distinct values (a missing value counts as one of them).
continous_variable = [col for col in NJ_prop.columns if NJ_prop[col].nunique(dropna=False) > 53]
# NOTE(review): positional trim — drops the first three and the last entry, so the
# result depends on the column order of NJ_prop.
continous_variable = continous_variable[3:-1] # Remove Price and Location from data

In [None]:
# Display the columns kept as continuous.
continous_variable

In [None]:
# Name of the column treated as the target.
target_variable = 'Price'

In [None]:
# Display the target column name.
target_variable

Binary Variable

In [None]:
# Print the binary columns and how many there are.
print(binary_variables)
print('Element in binary category :', len(binary_variables))

Availability Count

In [None]:
# Apply the shared font sizes before the axes are created so they affect this figure.
pylab.rcParams.update(rcParams)
fig = plt.figure(figsize=(6,8))

# Count each Availability value once and reuse the result for both axes.
availability_counts = NJ_prop.Availability.value_counts()
# The aggregated vectors are passed on their own: their length differs from NJ_prop,
# and newer seaborn releases may reject mismatched vectors when `data=` is also given.
plot = sns.barplot(x = availability_counts.index, y = availability_counts.values)
# Write each bar's height just above it.
for bar in plot.patches:
    plot.annotate(format(bar.get_height(), '.0f'),(bar.get_x() + bar.get_width()/2, bar.get_height()),
                  ha='center', va='center', size=15, xytext=(0,8), textcoords='offset points')

fig.suptitle('Availability : Counts',fontsize=18, fontweight="bold")
plt.xlabel('Availability',)
plt.ylabel('Count')

fig.tight_layout()
plt.show()

#fig.savefig('Availability_Bar', dpi = 500)

Ordinal Variables

In [None]:
# Print the ordinal columns and how many there are.
print(ordinal_variables)
print('Number of ordinal features is :', len(ordinal_variables))

Price with respect to Property Age

In [None]:
# Apply the shared font sizes before the axes are created so they affect this figure.
pylab.rcParams.update(rcParams)
fig = plt.figure(figsize=(10,6))

# Aggregate only the Price column: taking .median() over every column fails on
# non-numeric columns in pandas >= 2.0, and the original computed it twice.
median_price_by_age = NJ_prop.groupby('Property_Age')['Price'].median().reset_index()
# Horizontal bars: median price on x, one bar per Property_Age value on y.
sns.barplot(data = median_price_by_age, x = 'Price', y = 'Property_Age')
fig.suptitle('Median Price with respect to Property Age', fontsize= 18, fontweight="bold")
fig.tight_layout()
plt.show()

#fig.savefig('Property_Age_Price_Bar', dpi = 500)

In [None]:
fig = plt.figure(figsize = (8,8))
# value_counts() orders slices by frequency, so labels are taken from its index
# rather than hard-coded: a fixed list mislabels slices whenever the frequency
# order changes and raises if the number of categories is not exactly five.
age_counts = NJ_prop['Property_Age'].value_counts()
plt.pie(age_counts,
       labels=age_counts.index.tolist(),
       autopct= '%.2f%%', textprops = {'size':'large'},explode = [0.005] * len(age_counts))
plt.legend(loc= 'upper left')
plt.title("Property Age Distribution", fontsize = 18, fontweight = 'bold')

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the title after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('Price_Age_Distribution', dpi = 500)

In [None]:
fig = plt.figure(figsize=(12,6))

# Every listing's price against its Property_Age value.
sns.scatterplot(data = NJ_prop, x = 'Property_Age', y = 'Price')
fig.suptitle('All Prices with respect to Property Age', size = 18, fontweight="bold")

plt.xlabel("Property Age", size=16)
plt.ylabel("Price (Dollars)", size=16)
# Rotate the tick labels to vertical.
plt.xticks(rotation=90)

pylab.rcParams.update(rcParams)
fig.tight_layout()
plt.show()

#fig.savefig('Property_Age_Price_Scatter', dpi = 500)

Price with respect to Bed and Bath

In [None]:
# Apply the shared font sizes before the axes are created so they affect this figure.
pylab.rcParams.update(rcParams)
fig, ax = plt.subplots(2,1, figsize = (12,12))

# Top panel: median price per bedroom count; bottom panel: per bathroom count.
# Only Price is aggregated: .median() over every column fails on non-numeric
# columns in pandas >= 2.0.
for panel, room_col in zip(ax, ['Bedroom', 'Bathroom']):
    median_price = NJ_prop.groupby(room_col)['Price'].median().reset_index()
    sns.barplot(ax = panel, data = median_price, x = room_col, y = 'Price')
fig.suptitle('Median Price with respect to Bed & Bath', size = 18, fontweight="bold")

fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('BednBath_Price_Bar', dpi = 500)

In [None]:
fig, ax = plt.subplots(2,1, figsize = (12,10))

# Top panel: price against bedroom count; bottom panel: price against bathroom count.
for panel, room_col in zip(ax, ['Bedroom', 'Bathroom']):
    sns.scatterplot(ax = panel, data = NJ_prop, x = room_col, y = 'Price')
fig.suptitle('All Prices with respect to Bed & Bath', size = 18, fontweight="bold")

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('Bed_Bath_Price_Scatter', dpi = 500)

Area Type Distribution

In [None]:
fig = plt.figure(figsize = (8,8))
# value_counts() orders slices by frequency, so labels are taken from its index
# rather than hard-coded: a fixed list mislabels slices whenever the frequency
# order changes and raises if the number of categories is not exactly four.
area_type_counts = NJ_prop['Area_Type'].value_counts()
plt.pie(area_type_counts,
       labels=area_type_counts.index.tolist(),
       autopct= '%.2f%%', textprops = {'size':'large'},explode = [0.005] * len(area_type_counts))
plt.legend(loc= 'upper left')
plt.title("Type of Area", fontsize = 18, fontweight = 'bold')

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the title after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('Area_Type_Distribution', dpi = 500)

In [None]:
# Apply the shared font sizes before the axes are created so they affect this figure.
pylab.rcParams.update(rcParams)
fig = plt.figure(figsize=(10,4))

# Aggregate only the Price column: taking .median() over every column fails on
# non-numeric columns in pandas >= 2.0, and the original computed it twice.
median_price_by_area_type = NJ_prop.groupby('Area_Type')['Price'].median().reset_index()
# Horizontal bars: median price on x, one bar per Area_Type value on y.
sns.barplot(data = median_price_by_area_type, x = 'Price', y = 'Area_Type')
fig.suptitle('Median Price with respect to Type of Area', fontsize= 18, fontweight="bold")
fig.tight_layout()
plt.show()

#fig.savefig('Area_Type_Count_Bar', dpi = 500)

Price with respect to Floor Numbers

In [None]:
# Smaller x tick labels for this figure only.
params = {'xtick.labelsize':'8','ytick.labelsize':'14','axes.labelsize':'16'}

# Aggregate only the Price column: taking .median() over every column fails on
# non-numeric columns in pandas >= 2.0, and the original computed it twice.
median_price_by_floor = NJ_prop.groupby('Floor_No')['Price'].median().reset_index()

# rc_context applies `params` while the figure is built and shown, then restores the
# previous settings. The original updated the global rcParams after plotting, which
# left this figure unstyled and leaked the 8pt ticks into the next figure instead.
with plt.rc_context(rc=params):
    fig = plt.figure(figsize=(12,6))
    sns.barplot(data= median_price_by_floor, x= 'Floor_No', y= 'Price', palette= 'muted')
    fig.suptitle('Median Price with respect to Floor Numbers', fontsize= 18, fontweight="bold")
    fig.tight_layout()
    plt.show()

#fig.savefig('Property_Floor_Numbers_Bar', dpi = 500)

In [None]:
# Restore the shared font sizes BEFORE creating the figure: the preceding cell switches
# to 8pt x tick labels, and settings applied after plotting do not restyle this figure.
pylab.rcParams.update(rcParams)
fig = plt.figure(figsize=(12,6))

# Mean price per floor number, as a two-column frame.
group_full = NJ_prop.groupby('Floor_No')['Price'].mean()
group = group_full.reset_index()
# Keep floors strictly between 0 and 60.
group = group[(group['Floor_No'] > 0) & (group['Floor_No'] < 60)]

x = group['Floor_No']
y = group['Price']
fig.suptitle('Mean Price with respect to Floor No', fontsize= 18 , fontweight='bold')
sns.scatterplot(x=x, y=y)

fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('FloorNo_Price_Scatter', dpi = 500)

Continuous Variables

In [None]:
# Report how many distinct values each continuous column holds (NaN counts as a value).
for column in continous_variable:
    print("Length of", column, ":", NJ_prop[column].nunique(dropna=False))

Price with respect to SqFt Area

In [None]:
fig = plt.figure(figsize=(12,8))

# Mean price for every distinct Area_SqFt value, as a two-column frame.
group_full = NJ_prop.groupby('Area_SqFt')['Price'].mean()
group = group_full.reset_index()
# Keep areas strictly between 0 and 2000.
group = group[(group['Area_SqFt'] > 0) & (group['Area_SqFt'] < 2000)]

x, y = group['Area_SqFt'], group['Price']
fig.suptitle('Mean Price with respect to SqFt Area', fontsize= 18 , fontweight='bold')
sns.scatterplot(x=x, y=y)

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('SqFt_Area_Price_Scatter', dpi = 500)

Target Variable

In [None]:
# Bucket Price into three left-closed intervals: [13, 70), [70, 190), [190, 16500).
# Prices outside these edges are left without a category (NaN).
price_bin_edges = [13, 70, 190, 16500]
price_bin_labels = ['Low', 'Medium', 'High']
NJ_prop['Price_Cat'] = pd.cut(NJ_prop['Price'], bins=price_bin_edges,
                              labels=price_bin_labels, right=False)
NJ_prop['Price_Cat'].value_counts()

Checking whether the dataset is balanced or imbalanced with respect to the target

In [None]:
fig = plt.figure(figsize = (8,8))

# value_counts() orders slices by frequency, so each label is looked up from its
# category name; a hard-coded list only matches one particular frequency order.
# The ranges follow the Price_Cat bin edges (Low < 70 <= Medium < 190 <= High).
price_cat_counts = NJ_prop['Price_Cat'].value_counts()
range_labels = {'Low': 'up to $70', 'Medium': '$70 - $190', 'High': '$190 onwards'}
slice_labels = [range_labels.get(cat, str(cat)) for cat in price_cat_counts.index]

plt.pie(price_cat_counts, labels= slice_labels,
        explode= [0.005] * len(price_cat_counts), autopct= '%.2f%%', startangle= 181,
        textprops= {'size':'large','fontweight':'bold'})
plt.legend(loc= 'upper right')
plt.title("Price Range Level Distribution", fontsize = 18, fontweight = 'bold')

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the title after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('Price_Range_Distribution', dpi = 500)

In [None]:
fig, ax = plt.subplots(2,1, figsize = (12,10))
hist_ax, box_ax = ax
price = NJ_prop["Price"]

fig.suptitle('Price with respect to Property Count', size = 18, fontweight="bold")
# Top: 200-bin histogram with a KDE overlay; bottom: horizontal box plot of the same values.
sns.histplot(data=price, ax=hist_ax, bins=200, kde=True)
sns.boxplot(data=price, ax=box_ax, orient="h", palette='muted')

pylab.rcParams.update(rcParams)
fig.tight_layout()
# Leave headroom for the suptitle after tight_layout.
fig.subplots_adjust(top=0.93)
plt.show()

#fig.savefig('Price_Count_Bar', dpi = 500)


Bar plot Region-wise Price

In [None]:
# NOTE(review): these aliases are swapped relative to the usual plotly convention
# (graph_objects is normally `go`, express is normally `px`). Within this cell only
# `px` (graph_objects) is used; the aliases are kept as-is in case later cells rely on them.
import plotly.graph_objects as px
import plotly.express as go
import numpy as np

x = NJ_prop['Region']
y = NJ_prop['Price']

# One bar trace built from every row: Region on x, Price on y.
region_bars = px.Bar(x=x, y=y)
plot = px.Figure(data=[region_bars])

# X axis gets a range selector with a single "backward" button and a visible range slider.
x_axis_options = dict(
    rangeselector=dict(buttons=[dict(count=1, stepmode="backward")]),
    rangeslider=dict(visible=True),
)
plot.update_layout(xaxis=x_axis_options, width=900, height=900)

plot.show()

Check how the prices of 2, 3, and 4 BHK (bedroom, hall, kitchen) properties are distributed within a given region

In [None]:
# Number of distinct regions available to plot (missing values are not counted).
NJ_prop.Region.nunique()

In [None]:
def plot_scatter_chart1(NJ_prop,Region):
    """Scatter Price against Area_SqFt for the 2, 3 and 4 bedroom listings of one region.

    Parameters
    ----------
    NJ_prop : DataFrame providing the Region, Bedroom, Area_SqFt and Price columns.
    Region : value compared against the Region column to select rows; also used
        as the plot title.

    Returns
    -------
    None. The chart is drawn on a new 10x7 matplotlib figure.

    Side effects
    ------------
    Sets the global default 'figure.figsize' to (14, 12) and applies the
    notebook-level `rcParams` dict through pylab; both happen after this
    function's own figure has been created.
    """
    fig = plt.figure(figsize=(10,7))
    # (bedroom count, marker, colour, legend label, marker size) for each series.
    series_styles = [
        (2, 'p', 'blue', '2 BHK', 100),
        (3, 'o', 'red', '3 BHK', 100),
        (4, '*', 'green', '4 BHK', 300),
    ]
    in_region = NJ_prop.Region == Region
    subsets = [NJ_prop[in_region & (NJ_prop.Bedroom == beds)] for beds, *_ in series_styles]
    plt.rcParams['figure.figsize'] = (14,12)
    for subset, (_, marker, color, label, size) in zip(subsets, series_styles):
        sns.scatterplot(x=subset.Area_SqFt, y=subset.Price,
                        marker=marker, color=color, label=label, s=size)
    plt.xlabel("Total Square Feet Area", )
    plt.ylabel("Price (US Dollars)")
    plt.title(Region, fontsize = 18, fontweight="bold")

    pylab.rcParams.update(rcParams)
    plt.legend(fontsize = 12)
    fig.tight_layout()
    # Leave headroom for the title after tight_layout.
    fig.subplots_adjust(top=0.90)
    #fig.savefig(Region, dpi = 500)

In [None]:
# plot_scatter_chart1(NJ_prop,"Flemington")