In [1]:
import plotly_express as px
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy
import os

In [2]:
PRICES_PATH = "../data_processed/resale_flat_prices/engineered_data.csv"
df = pd.read_csv(PRICES_PATH)

In [3]:
display(df.head(5))

Unnamed: 0,id,town,flat_type,flat_model,sale_date,floor,age,relative_tenure,floor_area,psm,...,nearest_post_offices,nearest_bus_stops,nearest_pharmacies,nearest_primary_schools,nearest_parks,nearest_trains,sale_month,cpi,cpi_psm,avg_sora
0,1,JURONG_WEST,3_ROOM,IMPROVED,1990-01-01,5.0,16,0.899953,70.0,334.285714,...,0.798752,0.195397,0.761661,0.541789,0.947906,0.904104,1,61.9,595.125779,
1,2,OTHER,5_ROOM,STANDARD,1990-01-01,5.0,15,0.90228,120.0,916.666667,...,3.042559,0.090376,1.133326,0.159056,0.33027,1.954674,1,61.9,1631.933226,
2,3,BUKIT_MERAH,3_ROOM,IMPROVED,1990-01-01,5.0,15,0.90228,63.0,722.222222,...,1.148847,0.446105,0.850333,0.943485,0.737703,0.932695,1,61.9,1285.765572,
3,4,BUKIT_MERAH,OTHER,IMPROVED,1990-01-01,8.0,21,0.887322,29.0,251.724138,...,1.294327,1.620819,0.431442,0.691163,0.311404,0.448964,1,61.9,448.142165,
4,5,BUKIT_MERAH,OTHER,IMPROVED,1990-01-01,11.0,21,0.887322,29.0,251.724138,...,1.294327,1.620819,0.431442,0.691163,0.311404,0.448964,1,61.9,448.142165,


In [4]:
def town_data_over_time(df):
  new_df = deepcopy(df)
  new_df = new_df[(103 <= new_df["longitude"]) & (new_df["longitude"] <= 104.5) &
                  (1 <= new_df["latitude"]) & (new_df["latitude"] < 2)]
  new_df = new_df.assign(sale_year = lambda x: np.floor(x["sale_month"] / 12 + 1990).astype(np.int64))
  res_df = new_df.groupby(by=["sale_year", "town"]).agg({"flat_type":"count", "longitude": "mean", "latitude": "mean"})
  res_df = res_df.rename(columns = {"flat_type": "num_datapoints", "longitude": "mean_longitude", "latitude": "mean_latitude"}).reset_index()
  return res_df

In [5]:
town_over_time_df = town_data_over_time(df)
display(town_over_time_df.head(5))

Unnamed: 0,sale_year,town,num_datapoints,mean_longitude,mean_latitude
0,1990,ANG_MO_KIO,1436,103.847328,1.370549
1,1990,BEDOK,1080,103.929756,1.32888
2,1990,BISHAN,89,103.839208,1.352446
3,1990,BUKIT_BATOK,591,103.748986,1.350473
4,1990,BUKIT_MERAH,606,103.822145,1.280924


In [6]:
year_df = town_over_time_df.groupby("sale_year").agg(num_datapoints = ("num_datapoints", lambda x: np.average(x, weights=town_over_time_df.loc[x.index, "num_datapoints"])))
year_df = year_df.reset_index()
display(year_df.head(10))

Unnamed: 0,sale_year,num_datapoints
0,1990,754.96284
1,1991,866.148326
2,1992,938.040433
3,1993,1179.438069
4,1994,1760.295061
5,1995,1762.022266
6,1996,2211.885681
7,1997,1897.824623
8,1998,3002.171024
9,1999,3571.172736


In [7]:
import plotly.graph_objects as go
groups = town_over_time_df["town"].unique()

town_over_time_fig = go.Figure()
for group in groups:
  df_group = town_over_time_df[town_over_time_df["town"] == group]
  town_over_time_fig.add_trace(go.Scatter(x = df_group["sale_year"], y = df_group["num_datapoints"],
                                name = group, line=dict(width = 1.5), mode="lines+markers", marker=dict(size=4)))
town_over_time_fig.add_trace(go.Scatter(x = year_df["sale_year"], y = year_df["num_datapoints"], name = "Mean", 
                                    line=dict(width = 4, dash="dash"), mode="lines+markers", marker=dict(size=4)))
town_over_time_fig.update_layout(title="Number of datapoints per town across time", xaxis_title="Year of sale",
                        yaxis_title="Number of datapoints")
town_over_time_fig.write_html("town-over-time-fig.html")


In [8]:
def town_data_overall(df):
  new_df = deepcopy(df)
  new_df = new_df[(103 <= new_df["longitude"]) & (new_df["longitude"] <= 104.5) &
                  (1 <= new_df["latitude"]) & (new_df["latitude"] < 2)]
  res_df = new_df.groupby(by=["town"]).agg({"flat_type":"count", "longitude": "mean", "latitude": "mean"})
  res_df = res_df.rename(columns = {"flat_type": "num_datapoints", "longitude": "mean_longitude", "latitude": "mean_latitude"}).reset_index()
  return res_df

In [9]:
town_overall_df = town_data_overall(df)
display(town_overall_df.head(5))

Unnamed: 0,town,num_datapoints,mean_longitude,mean_latitude
0,ANG_MO_KIO,50307,103.847228,1.370894
1,BEDOK,61894,103.927488,1.329329
2,BISHAN,20590,103.846147,1.352746
3,BUKIT_BATOK,41542,103.750141,1.353053
4,BUKIT_MERAH,31801,103.821729,1.281152


In [10]:
town_overall_fig = px.scatter_mapbox(
  town_overall_df, lat="mean_latitude", lon="mean_longitude", 
  hover_name="town", size="num_datapoints", hover_data=["num_datapoints"], width = 1000, height = 800
).update_layout(
  mapbox={
    "style": "carto-positron",
    "zoom": 10,
  }
)
town_overall_fig.write_html("./town-overall-fig.html")

In [19]:
def all_geocoding(df):
  new_df = deepcopy(df)
  new_df = new_df[(103 <= new_df["longitude"]) & (new_df["longitude"] <= 104.5) &
                  (1 <= new_df["latitude"]) & (new_df["latitude"] < 2)]
  new_df = new_df.assign(sale_year = lambda x: np.floor(x["sale_month"] / 12 + 1990).astype(np.int64))
  res_df = new_df.groupby(by=["sale_year", "longitude","latitude"]).agg({"flat_type":"count", "address": "first"})
  res_df = res_df.rename(columns = {"flat_type": "num_datapoints"}).reset_index()
  return res_df

In [20]:
geocoding_df = all_geocoding(df)
display(geocoding_df.head(5))

Unnamed: 0,sale_year,longitude,latitude,num_datapoints,address
0,1990,103.704256,1.421215,2,BLOCK 3 LIM CHU KANG RD
1,1990,103.704754,1.421395,1,BLOCK 4 LIM CHU KANG RD
2,1990,103.706381,1.346952,11,BLOCK 168 BOON LAY DR
3,1990,103.706519,1.34677,12,BLOCK 167 BOON LAY DR
4,1990,103.707197,1.347256,10,BLOCK 170 BOON LAY DR


In [21]:
geocoding_fig = px.scatter_mapbox(
  geocoding_df, lat="latitude", lon="longitude", 
  hover_name="address", size="num_datapoints", hover_data=["num_datapoints"], width = 1000, height = 800,
  animation_frame="sale_year",
).update_layout(
  mapbox={
    "style": "carto-positron",
    "zoom": 10,
  }
)
geocoding_fig.write_html("./geocoding_fig.html")

In [22]:
def town_geocoding(df):
  new_df = deepcopy(df)
  new_df = new_df[(103 <= new_df["longitude"]) & (new_df["longitude"] <= 104.5) &
                  (1 <= new_df["latitude"]) & (new_df["latitude"] < 2)]
  new_df = new_df.assign(sale_year = lambda x: np.floor(x["sale_month"] / 12 + 1990).astype(np.int64))
  res_df = new_df.groupby(by=["sale_year", "town"]).agg({"flat_type":"count", "latitude":"mean", "longitude":"mean"})
  res_df = res_df.rename(columns = {"flat_type": "num_datapoints"}).reset_index()
  return res_df

In [23]:
town_geocoding_df = town_geocoding(df)
display(town_geocoding_df.head(5))

Unnamed: 0,sale_year,town,num_datapoints,latitude,longitude
0,1990,ANG_MO_KIO,1436,1.370549,103.847328
1,1990,BEDOK,1080,1.32888,103.929756
2,1990,BISHAN,89,1.352446,103.839208
3,1990,BUKIT_BATOK,591,1.350473,103.748986
4,1990,BUKIT_MERAH,606,1.280924,103.822145


In [25]:
town_geocoding_fig = px.scatter_mapbox(
  town_geocoding_df, lat="latitude", lon="longitude", 
  hover_name="town", size="num_datapoints", hover_data=["num_datapoints"], width = 1000, height = 800,
  animation_frame="sale_year",
).update_layout(
  mapbox={
    "style": "carto-positron",
    "zoom": 10,
  }
)
town_geocoding_fig.write_html("./town_geocoding_fig.html")