# Data Preparation

In [1]:
import warnings, os, numpy as np, pandas as pd, geopandas as gpd
warnings.filterwarnings("ignore")

from shapely.strtree import STRtree
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_context("notebook")

from libpysal.weights import KNN
from esda.moran import Moran, Moran_Local

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
import mapclassify as mc

## Paths (processed layers in EPSG:3414)

In [12]:
# Input point layers, one GeoJSON file per dataset (file names carry the 3414 EPSG suffix).
PATH_HDB = "processed/hdb_2023_points_3414.geojson"
PATH_MRT = "processed/mrt_2024_points_3414.geojson"
PATH_BUS = "processed/bus_2025_points_3414.geojson"

# Facility layers that are stored as centroids.
PATH_HEALTH_CEN = "processed/health_centroids_3414.geojson"
PATH_SPORTS_CEN = "processed/sports_centroids_3414.geojson"

# Polygon layers, optional (for mapping/aggregation). The planning-area path is
# currently disabled and left here for reference only.
# PATH_PA    = "data/mp19_planning_areas_3414.geojson"
PATH_SUB = "processed/mp19_subzones_3414.geojson"

## Load layers

In [13]:
# Read every pre-processed point layer from disk.
gdf_hdb   = gpd.read_file(PATH_HDB)
gdf_mrt   = gpd.read_file(PATH_MRT)
gdf_bus   = gpd.read_file(PATH_BUS)
gdf_hcen  = gpd.read_file(PATH_HEALTH_CEN)   # centroids
gdf_scen  = gpd.read_file(PATH_SPORTS_CEN)   # centroids

# Fail fast unless every layer is in EPSG:3414.
# The resolved EPSG code is compared instead of the CRS string's suffix: the
# string form depends on how the CRS was defined (authority code, WKT, PROJ
# string), and a suffix match would also accept unrelated codes that merely
# end in "3414".
for name, g in [("HDB", gdf_hdb), ("MRT", gdf_mrt), ("BUS", gdf_bus), ("HEALTH", gdf_hcen), ("SPORTS", gdf_scen)]:
    assert g.crs is not None and g.crs.to_epsg() == 3414, f"{name} not in EPSG:3414."

# Quick preview of the HDB layer.
gdf_hdb.head(3)

Unnamed: 0.1,Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,resale_year,resale_month,resale_age,ADDRESS,LAT,LNG,X,Y,POSTAL,geometry
0,0,2023-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,01 TO 03,44.0,Improved,1979,...,2023,1,44,406 ANG MO KIO AVE 10,1.362005,103.85388,30288.234663,38229.067463,560406,POINT (30288.235 38229.067)
1,1,2023-01,ANG MO KIO,2 ROOM,323,ANG MO KIO AVE 3,04 TO 06,49.0,Improved,1977,...,2023,1,46,323 ANG MO KIO AVE 3,1.367908,103.847714,29602.047153,38881.891694,560323,POINT (29602.047 38881.892)
2,2,2023-01,ANG MO KIO,2 ROOM,314,ANG MO KIO AVE 3,04 TO 06,44.0,Improved,1978,...,2023,1,45,314 ANG MO KIO AVE 3,1.366227,103.850086,29865.998046,38695.970271,560314,POINT (29865.998 38695.97)


In [14]:
# List the columns of the raw HDB layer to decide which ones to keep for export.
gdf_hdb.columns

Index(['Unnamed: 0', 'month', 'town', 'flat_type', 'block', 'street_name',
       'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'resale_year', 'resale_month', 'resale_age', 'ADDRESS',
       'LAT', 'LNG', 'X', 'Y', 'POSTAL', 'geometry'],
      dtype='object')

In [16]:
# NOTE(review): disabled earlier variant of the HDB export. Unlike the live export
# (which writes the same "processed_n/hdb.geojson" file), this column list also
# keeps "floor_area_sqm" and "flat_model". Confirm whether dropping those two
# columns is intentional, then delete this cell.
# gdf_hdb_n = gdf_hdb[["month", "town", "flat_type", "storey_range", "floor_area_sqm","flat_model", "lease_commence_date", "resale_price", "resale_year", "resale_age", "LAT", "LNG", "X", "Y", "geometry"]].copy()
# gdf_hdb_n.to_file("processed_n/hdb.geojson", driver="GeoJSON")

In [8]:
# Directory that receives the trimmed layers. Created up front so the first
# write does not fail on a checkout where it does not exist yet.
OUT_DIR = "processed_n"
os.makedirs(OUT_DIR, exist_ok=True)


def _with_xy(gdf, columns=None):
    """Return a copy of `gdf` with `x` / `y` columns taken from its point geometry.

    Parameters
    ----------
    gdf : GeoDataFrame
        Source layer; it is not modified.
    columns : list of str, optional
        Columns to keep (must include the geometry column). When omitted,
        all columns are kept.

    Returns
    -------
    GeoDataFrame
        The copied (and optionally trimmed) layer with `x` and `y` appended
        after the existing columns.
    """
    out = gdf.copy() if columns is None else gdf[columns].copy()
    out["x"] = out.geometry.x
    out["y"] = out.geometry.y
    return out


def _export_geojson(gdf, stem):
    """Write `gdf` to `<OUT_DIR>/<stem>.geojson` using the GeoJSON driver."""
    gdf.to_file(f"{OUT_DIR}/{stem}.geojson", driver="GeoJSON")


# Facility and transport layers: keep the identifying column(s) plus coordinates.
gdf_scen_n = _with_xy(gdf_scen, ["id", "geometry"])
_export_geojson(gdf_scen_n, "recreation")

gdf_hcen_n = _with_xy(gdf_hcen, ["id", "geometry"])
_export_geojson(gdf_hcen_n, "healthcare")

gdf_bus_n = _with_xy(gdf_bus)  # bus stops keep every original column
_export_geojson(gdf_bus_n, "bus")

gdf_mrt_n = _with_xy(gdf_mrt, ["station_name", "geometry"])
_export_geojson(gdf_mrt_n, "mrt")

# HDB already carries X / Y columns, so it is only trimmed to the listed columns.
gdf_hdb_n = gdf_hdb[["month", "town", "flat_type", "storey_range", "lease_commence_date", "resale_price", "resale_year", "resale_age", "LAT", "LNG", "X", "Y", "geometry"]].copy()
_export_geojson(gdf_hdb_n, "hdb")

In [9]:
# Display the trimmed HDB frame; the rendered output is truncated to the first and last rows.
gdf_hdb_n

Unnamed: 0,month,town,flat_type,storey_range,lease_commence_date,resale_price,resale_year,resale_age,LAT,LNG,X,Y,geometry
0,2023-01,ANG MO KIO,2 ROOM,01 TO 03,1979,267000.0,2023,44,1.362005,103.853880,30288.234663,38229.067463,POINT (30288.235 38229.067)
1,2023-01,ANG MO KIO,2 ROOM,04 TO 06,1977,300000.0,2023,46,1.367908,103.847714,29602.047153,38881.891694,POINT (29602.047 38881.892)
2,2023-01,ANG MO KIO,2 ROOM,04 TO 06,1978,280000.0,2023,45,1.366227,103.850086,29865.998046,38695.970271,POINT (29865.998 38695.97)
3,2023-01,ANG MO KIO,2 ROOM,07 TO 09,1978,282000.0,2023,45,1.366227,103.850086,29865.998046,38695.970271,POINT (29865.998 38695.97)
4,2023-01,ANG MO KIO,2 ROOM,01 TO 03,1986,289800.0,2023,37,1.374001,103.836432,28346.433332,39555.534275,POINT (28346.433 39555.534)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25755,2023-12,YISHUN,EXECUTIVE,10 TO 12,1987,838000.0,2023,36,1.421335,103.837437,28458.318995,44789.537249,POINT (28458.319 44789.537)
25756,2023-12,YISHUN,EXECUTIVE,10 TO 12,1987,755000.0,2023,36,1.421335,103.837437,28458.318995,44789.537249,POINT (28458.319 44789.537)
25757,2023-12,YISHUN,EXECUTIVE,01 TO 03,1988,780000.0,2023,35,1.414174,103.832454,27903.805992,43997.703489,POINT (27903.806 43997.703)
25758,2023-12,YISHUN,EXECUTIVE,01 TO 03,1988,785000.0,2023,35,1.415088,103.834000,28075.777355,44098.765290,POINT (28075.777 44098.765)


In [11]:
# Report the record count and CRS of every exported layer, followed by a
# two-row preview. One (label, layer) table drives a single loop instead of
# five copy-pasted print/display pairs; the printed text is unchanged.
layer_summaries = [
    ("HDB 2023", gdf_hdb_n),
    ("MRT Station", gdf_mrt_n),
    ("Bus Station", gdf_bus_n),
    ("Health Care Location", gdf_hcen_n),
    ("Recreation Location", gdf_scen_n),
]

for label, layer in layer_summaries:
    print(f"\n '{label}': {len(layer)} records, CRS={layer.crs}")
    display(layer.head(2))



 'HDB 2023': 25760 records, CRS=EPSG:3414


Unnamed: 0,month,town,flat_type,storey_range,lease_commence_date,resale_price,resale_year,resale_age,LAT,LNG,X,Y,geometry
0,2023-01,ANG MO KIO,2 ROOM,01 TO 03,1979,267000.0,2023,44,1.362005,103.85388,30288.234663,38229.067463,POINT (30288.235 38229.067)
1,2023-01,ANG MO KIO,2 ROOM,04 TO 06,1977,300000.0,2023,46,1.367908,103.847714,29602.047153,38881.891694,POINT (29602.047 38881.892)



 'MRT Station': 563 records, CRS=EPSG:3414


Unnamed: 0,station_name,geometry,x,y
0,kml_1,POINT (36438.828 35234.527),36438.8275,35234.5268
1,kml_2,POINT (39147.711 35415.194),39147.711,35415.1942



 'Bus Station': 5172 records, CRS=EPSG:3414


Unnamed: 0,bus_stop_no,desc,geometry,x,y
0,65059,ST ANNE'S CH,POINT (35565.661 41659.525),35565.6608,41659.5249
1,16171,YUSOF ISHAK HSE,POINT (21439.911 31253.632),21439.9113,31253.6323



 'Health Care Location': 198 records, CRS=EPSG:3414


Unnamed: 0,id,geometry,x,y
0,kml_1544,POINT (19767.829 40055.89),19767.828666,40055.88968
1,kml_1866,POINT (17843.486 36837.994),17843.486421,36837.994327



 'Recreation Location': 243 records, CRS=EPSG:3414


Unnamed: 0,id,geometry,x,y
0,kml_178,POINT (44373.085 41293.406),44373.08508,41293.405809
1,kml_233,POINT (19062.126 37852.066),19062.125758,37852.066398


**All layers are projected in SVY21 (EPSG:3414, units in metres), so distances can be computed directly from the coordinates and the same coordinates can be used as-is for GWR.**

Data Summary:
| Dataset                           | Records | CRS       | Geometry | Role in Analysis                                                       |
| --------------------------------- | ------- | --------- | -------- | ---------------------------------------------------------------------- |
| **HDB 2023**                      | 25,760  | EPSG:3414 | Points   | Core dataset — dependent variable (resale price) & internal attributes |
| **MRT stations**                  | 563     | EPSG:3414 | Points   | Compute nearest distance                                               |
| **Bus stops**                     | 5,172   | EPSG:3414 | Points   | Count within buffers (400 m or 800 m)                                  |
| **Healthcare locations**          | 198     | EPSG:3414 | Points   | Compute nearest distance                                               |
| **Recreation (sports) locations** | 243     | EPSG:3414 | Points   | Compute nearest distance                                               |
