In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from sklearn.metrics import *
import folium as fs
from statistics import mean
import seaborn as sns

## Exploratory data analysis 

In [45]:
# Load the raw dataset used by every cell below.
df = pd.read_csv("hdb_details.csv")


def pdtabulate(df):
    """Render a DataFrame as a psql-style text table.

    The frame's column names are used as the table headers.
    Relies on `tabulate`, which is imported in the notebook's import cell.

    Usage: print(pdtabulate(df.head()))
    """
    return tabulate(df, headers='keys', tablefmt='psql')

In [46]:
# Number of rows per town, listed alphabetically by town name.
# The by-count ordering is skipped because sort_index() re-orders the result anyway.
df["town"].value_counts(sort=False).sort_index()

Ang Mo Kio          88
Bedok               91
Bishan              61
Bukit Batok         79
Bukit Merah         90
Bukit Panjang       76
Bukit Timah          8
Central Area        14
Choa Chu Kang      112
Clementi            45
Geylang             64
Hougang             78
Jurong East         48
Jurong West        120
Kallang/Whampoa     78
Marine Parade       36
Pasir Ris           51
Punggol            133
Queenstown         104
Sembawang           99
Sengkang           144
Serangoon           29
Tampines           128
Toa Payoh          126
Woodlands          185
Yishun             148
Name: town, dtype: int64

In [47]:
# Lookup table: region name -> towns that belong to that region.
# check_area() scans this mapping to label each row with its region.
area = {
    "West": [
        "Jurong West",
        "Bukit Batok",
        "Bukit Panjang",
        "Choa Chu Kang",
        "Clementi",
        "Jurong East",
    ],
    "East": [
        "Bedok",
        "Pasir Ris",
        "Tampines",
    ],
    "Central": [
        "Bishan",
        "Bukit Merah",
        "Bukit Timah",
        "Central Area",
        "Geylang",
        "Kallang/Whampoa",
        "Marine Parade",
        "Queenstown",
        "Toa Payoh",
    ],
    "North-East": [
        "Ang Mo Kio",
        "Hougang",
        "Punggol",
        "Sengkang",
        "Serangoon",
    ],
    "North": [
        "Sembawang",
        "Woodlands",
        "Yishun",
    ],
}

In [48]:
def check_area(town, area):
    """Return the region whose town collection contains `town`.

    `area` maps a region name to a collection of town names. Regions are
    examined in the mapping's iteration order and the first match wins.
    When no region contains `town`, the town is printed (so unmapped towns
    show up in the cell output) and None is returned.
    """
    # Unique sentinel so that a legitimate key of None is not mistaken for a miss.
    not_found = object()
    region = next(
        (name for name, towns in area.items() if town in towns),
        not_found,
    )
    if region is not_found:
        print(town)
        return None
    return region


In [49]:
def create_dummy(area):
    """Translate a region label into its integer code.

    "West" -> 0, "East" -> 1, "Central" -> 2, "North-East" -> 3.
    Every other value falls through to 4; that covers "North" but also
    None or any unrecognised label.
    """
    coded_labels = ("West", "East", "Central", "North-East")
    for code, label in enumerate(coded_labels):
        if area == label:
            return code
    # Catch-all bucket: anything that did not match one of the labels above.
    return 4


    

In [50]:
# Label every row with its region ("Area"), then add the integer region code
# ("Area_Encoding") that is later used as a regression feature.
# The "town" column is iterated directly: DataFrame.iterrows() would build a
# whole Series per row only to read this single field.
areas = [check_area(town, area) for town in df["town"]]
df = df.assign(Area=areas)
# create_dummy returns one integer code per region (label encoding, not one-hot columns).
df["Area_Encoding"] = df["Area"].apply(create_dummy)

In [51]:
# Initial histogram of price, drawn before any outlier filtering.
# The title distinguishes it from the otherwise identical histogram plotted after filtering.
fig = px.histogram(
    data_frame=df,
    x='price',
    nbins=50,
    title="Price distribution (before outlier removal)",
)
fig.update_layout(bargap=0.1)  # small gap so neighbouring bars are visually separable
fig.show()

In [52]:
# Box plot of price (on the x axis) to make extreme outliers visible.
boxplot = px.box(data_frame=df, x="price")
boxplot.show()

In [53]:
# Drop outliers: keep only rows whose price lies strictly between 50,000 and 2,000,000.
df = df.loc[lambda frame: frame["price"].lt(2_000_000) & frame["price"].gt(50_000)]


In [54]:
# Histogram of price for the current `df`, i.e. after the price-range filter
# (50,000 < price < 2,000,000) has been applied.
fig = px.histogram(
    data_frame=df,
    x='price',
    nbins=50,
    title="Price distribution (after outlier removal)",
)
fig.update_layout(bargap=0.1)  # small gap so neighbouring bars are visually separable
fig.show()

In [55]:
# Mean price per region, shown as a bar chart.
# No loop variable named `area` is used here: `area` is the region -> towns
# lookup dict, and rebinding it to a string would break any later re-run of
# the cell that calls check_area(town, area).
area_df = (
    df.groupby("Area")["price"]   # groups come out sorted by region name
    .mean()
    .rename_axis("Areas")
    .reset_index(name="Avg Price")
)
px.bar(area_df, x="Areas", y="Avg Price").show()

In [56]:
# One price box per region: region on the x axis, price on the y axis.
px.box(data_frame=df, x="Area", y="price").show()

In [57]:
# Mean price for every town, with towns in alphabetical order.
towns = df["town"].unique()
towns.sort()  # in-place sort of the array of unique town names
avg_price = [mean(df.loc[df["town"] == name, "price"]) for name in towns]
details = {'Town': towns, 'Avg Price': avg_price}

new_df = pd.DataFrame(details, columns=['Town', 'Avg Price'])
print(new_df)

               Town      Avg Price
0        Ang Mo Kio  594914.034091
1             Bedok  573354.288889
2            Bishan  810237.163934
3       Bukit Batok  624375.468354
4       Bukit Merah  748455.555556
5     Bukit Panjang  647320.078947
6       Bukit Timah  650000.000000
7      Central Area  598698.285714
8     Choa Chu Kang  597058.473214
9          Clementi  614841.955556
10          Geylang  610191.250000
11          Hougang  611105.282051
12      Jurong East  563243.000000
13      Jurong West  576293.383333
14  Kallang/Whampoa  680261.487179
15    Marine Parade  570694.444444
16        Pasir Ris  748145.882353
17          Punggol  609983.278195
18       Queenstown  739436.884615
19        Sembawang  571951.868687
20         Sengkang  599513.090909
21        Serangoon  716966.275862
22         Tampines  625873.960938
23        Toa Payoh  707131.571429
24        Woodlands  593120.016216
25           Yishun  583560.551020


In [58]:
# Scatter of mean price per town.
# Passing the frame plus column names (instead of bare Series) lets plotly
# label the axes "Town" and "Avg Price" rather than the generic "x" and "y".
px.scatter(new_df, x="Town", y="Avg Price").show()

In [59]:
# Split features and target 70/30 into train and test sets for the linear
# regression model; random_state is fixed at 101.
# NOTE(review): "Area_Encoding" is a single integer code (0-4) per region, so a
# linear model will treat the regions as ordered magnitudes - confirm this is intended.
feature_columns = ["size", "bedrooms", "bathrooms", "Area_Encoding"]
X = df.loc[:, feature_columns]
y = df.loc[:, "price"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=101,
)


### Linear Regression

In [60]:
# Fit ordinary least squares on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
# One coefficient per feature column, in the column order of X_train.
print(model.coef_)
# Predict on the held-out rows; the R^2 on the test split is the cell's displayed value.
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

[   229.75729123 128938.70506915  68048.73734638  -5839.37426402]


0.2342166145331872