In [1]:
from pathlib import Path
import warnings

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

In [3]:
# ------------ input locations ------------
# HDB transactions table (Parquet)
hdb_parquet = "processed_n/hdb_ols.parquet"
# Master Plan 2019 subzone layer (zipped archive, read with geopandas below)
subzone_shp = "data/master-plan-2019-subzone_landuse_area.zip"

In [4]:
# ------------ load HDB transactions ------------
# Requires pyarrow or fastparquet. Install one if needed:
# pip install pyarrow
df = pd.read_parquet(hdb_parquet)


def _first_column_named(columns, names):
    """Return the column whose lower-cased label matches the earliest entry of `names`.

    `names` is tried in order, so its ordering (not the column order of the
    table) decides which column wins. Labels are compared via ``str(label).lower()``
    so non-string labels do not raise. Returns None when nothing matches.
    """
    by_lower = {}
    for col in columns:
        # keep the first column for each case-insensitive label
        by_lower.setdefault(str(col).lower(), col)
    for name in names:
        if name in by_lower:
            return by_lower[name]
    return None


# If your Parquet is already a GeoDataFrame with point geometry, keep it.
# Otherwise, build point geometry from lon/lat columns (auto-detect common names).
if isinstance(df, gpd.GeoDataFrame) and "geometry" in df.columns:
    # NOTE(review): pd.read_parquet returns a plain DataFrame, so this branch is
    # only reached if `df` is loaded with gpd.read_parquet instead — confirm intent.
    gdf_points = df.copy()
    if gdf_points.crs is None:
        # assume WGS84 if CRS missing
        gdf_points = gdf_points.set_crs(4326)
else:
    # Explicit geographic names win over generic x/y, whatever the column order:
    # a table that carries both LNG/LAT and projected X/Y must not have X/Y
    # labelled as EPSG:4326 just because they appear first.
    lon_col = _first_column_named(df.columns, ("lon", "lng", "longitude", "long"))
    lat_col = _first_column_named(df.columns, ("lat", "latitude"))
    if lon_col is None:
        lon_col = _first_column_named(df.columns, ("x",))
    if lat_col is None:
        lat_col = _first_column_named(df.columns, ("y",))
    if lon_col is None or lat_col is None:
        raise ValueError(
            "Could not find longitude/latitude columns. "
            "Please ensure columns like lon/lng/longitude and lat/latitude exist."
        )
    # NOTE(review): x/y fallbacks are still treated as degrees (EPSG:4326) — verify
    # before relying on them for a projected dataset.
    gdf_points = gpd.GeoDataFrame(
        df.copy(),
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs=4326
    )

In [5]:
# ------------ load Subzones ------------
subzones = gpd.read_file(subzone_shp)

# buffer(0) is a best-effort repair of invalid polygons. If it fails the layer is
# used as loaded, but the failure is reported instead of being swallowed silently.
try:
    subzones["geometry"] = subzones.buffer(0)
except Exception as exc:
    warnings.warn(
        f"Could not repair subzone geometries with buffer(0); using them as loaded: {exc}"
    )

# Find a good name column in the subzones layer
name_candidates = ["SUBZONE_N", "SUBZONE_NAME", "SUBZONE", "Name", "name"]
subzone_name_col = next((c for c in name_candidates if c in subzones.columns), None)
if subzone_name_col is None:
    # fallback to first non-geometry column
    attribute_cols = [c for c in subzones.columns if c != "geometry"]
    if not attribute_cols:
        raise ValueError("Subzone layer has no attribute column to use as the subzone name.")
    subzone_name_col = attribute_cols[0]

# Carry polygon geometry through the join by duplicating it into a data column:
# sjoin keeps the left (point) geometry as the active one, so the polygon has to
# travel as an ordinary attribute.
subzones = subzones.copy()
subzones["subzone_geom"] = subzones.geometry
subzones = subzones[[subzone_name_col, "subzone_geom", "geometry"]].rename(
    columns={subzone_name_col: "subzone_name"}
)

In [6]:
# ------------ align CRS, then point-in-polygon join ------------
# MP19 layers are usually SVY21 / EPSG:3414; assume that when the layer has no CRS
if subzones.crs is None:
    subzones = subzones.set_crs(3414)

# Reproject the points into the polygon CRS only when the two differ
if gdf_points.crs != subzones.crs:
    gdf_points = gdf_points.to_crs(subzones.crs)

# Left join: every point row is kept and the point stays the active 'geometry'
joined = gpd.sjoin(gdf_points, subzones, how="left", predicate="within")

# Remove the index_* bookkeeping columns that sjoin adds
helper_cols = [col for col in joined.columns if col.startswith("index_")]
joined = joined.drop(columns=helper_cols, errors="ignore")

# Put the two subzone columns first, everything else in its existing order
front = ["subzone_name", "subzone_geom"]
ordered_cols = [*front, *(col for col in joined.columns if col not in front)]
joined = joined[ordered_cols]

In [7]:
# ------------ save outputs ------------
# 1) GeoParquet (preserves point geometry and subzone_geom column)
out_parquet = "processed_n/hdb_features_with_subzone_geometry.parquet"
# Create the target folder on demand so a fresh checkout does not fail on write
Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
joined.to_parquet(out_parquet, index=False)

# # 2) CSV (drops geometry columns for light sharing)
# out_csv = "processed_n/hdb_features_with_subzone_light.csv"
# joined.drop(columns=["geometry", "subzone_geom"], errors="ignore").to_csv(out_csv, index=False)

# 3) Optional: GeoPackage (widely compatible). Saves *point* layer; polygon geometry is kept as an attribute column.
# Comment out if you don't need it.
# out_gpkg = "processed_n/hdb_features_with_subzone.gpkg"
# joined.to_file(out_gpkg, layer="hdb_with_subzone", driver="GPKG")

print("Done.")
# Report the source column that was detected in the subzone layer; the output
# column itself is always called "subzone_name".
print("Subzone name field used:", subzone_name_col)
print("Saved:", out_parquet)
# print("Saved:", out_csv)
# print("Saved:", out_gpkg)

Done.
Subzone name field used: subzone_name
Saved: processed_n/hdb_features_with_subzone_geometry.parquet


In [8]:
# Read the joined table back from disk (same path the save cell wrote to)
join_hdb_data = "processed_n/hdb_features_with_subzone_geometry.parquet"
df_1 = gpd.read_parquet(path=join_hdb_data)

In [9]:
# Preview the first two rows of the reloaded table
df_1.head(2)

Unnamed: 0,subzone_name,subzone_geom,month,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,model_Model A2,model_Multi Generation,model_New Generation,model_Premium Apartment,model_Premium Apartment Loft,model_Simplified,model_Standard,model_Terrace,model_Type S1,model_Type S2
0,CHONG BOON,"POLYGON Z ((30676.168 39006.867 0, 30761.414 3...",2023-01,ANG MO KIO,2 ROOM,01 TO 03,44.0,Improved,1979,267000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TOWNSVILLE,"POLYGON Z ((29649.875 38978.996 0, 29671.324 3...",2023-01,ANG MO KIO,2 ROOM,04 TO 06,49.0,Improved,1977,300000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# List the distinct column labels available in df_1
df_1.columns.unique()

Index(['subzone_name', 'subzone_geom', 'month', 'town', 'flat_type',
       'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'resale_year', 'resale_age', 'LAT', 'LNG', 'X', 'Y',
       'geometry', 'log_price', 'dist_mrt', 'dist_hcen', 'dist_scen',
       'bus_count_400m', 'storey_mid', 'type_2 ROOM', 'type_3 ROOM',
       'type_4 ROOM', 'type_5 ROOM', 'type_EXECUTIVE', 'type_MULTI-GENERATION',
       'model_3Gen', 'model_Adjoined flat', 'model_Apartment', 'model_DBSS',
       'model_Improved', 'model_Improved-Maisonette', 'model_Maisonette',
       'model_Model A', 'model_Model A-Maisonette', 'model_Model A2',
       'model_Multi Generation', 'model_New Generation',
       'model_Premium Apartment', 'model_Premium Apartment Loft',
       'model_Simplified', 'model_Standard', 'model_Terrace', 'model_Type S1',
       'model_Type S2'],
      dtype='object')

In [15]:
# ============================================================
# Full coefficient table per subzone (with geometry export)
# ============================================================
import pandas as pd
import geopandas as gpd
import numpy as np
import statsmodels.api as sm
from shapely.geometry import MultiPoint

# --------- columns ---------
Y = "log_price"
X_vars = [
    "dist_mrt", "dist_hcen", "dist_scen", "bus_count_400m", "resale_age", "storey_mid", "floor_area_sqm",
    "type_2 ROOM", "type_3 ROOM", "type_4 ROOM", "type_5 ROOM", "type_EXECUTIVE", "type_MULTI-GENERATION",
    "model_3Gen", "model_Adjoined flat", "model_Apartment", "model_DBSS", "model_Improved",
    "model_Improved-Maisonette", "model_Maisonette", "model_Model A", "model_Model A-Maisonette",
    "model_Model A2", "model_Multi Generation", "model_New Generation", "model_Premium Apartment",
    "model_Premium Apartment Loft", "model_Simplified", "model_Standard", "model_Terrace",
    "model_Type S1", "model_Type S2"
]
# NOTE(review): all type_* and model_* dummies enter together with a constant. If
# they are exhaustive one-hot encodings the individual dummy coefficients are not
# separately identified — confirm whether a reference level should be dropped.

# --------- ensure GeoDataFrame & decode WKB if needed ---------
df1 = df_1.copy()

# decode point geometry from WKB if necessary
if not isinstance(df1, gpd.GeoDataFrame):
    df1["geometry"] = gpd.GeoSeries.from_wkb(df1["geometry"])
    df1 = gpd.GeoDataFrame(df1, geometry="geometry")

# decode polygon geometry from WKB if necessary (checked on the first non-null value)
has_poly = "subzone_geom" in df1.columns and df1["subzone_geom"].notna().any()
if has_poly:
    first_poly = df1["subzone_geom"].dropna().iloc[0]
    if df1["subzone_geom"].dtype == object and isinstance(first_poly, (bytes, bytearray)):
        df1["subzone_geom"] = gpd.GeoSeries.from_wkb(df1["subzone_geom"])

# set CRS if missing (MP19 is usually EPSG:3414)
if df1.crs is None:
    df1 = df1.set_crs(epsg=3414)

# --------- representative geometry per subzone ---------
subzone_geoms = {}

if has_poly:
    # GeoSeries.unary_union is deprecated in newer geopandas in favour of
    # union_all(); use the new method when it exists, the old one otherwise.
    _has_union_all = hasattr(gpd.GeoSeries, "union_all")
    poly = (
        df1.loc[df1["subzone_geom"].notna(), ["subzone_name", "subzone_geom"]]
        .drop_duplicates(subset=["subzone_name", "subzone_geom"])
        .groupby("subzone_name")["subzone_geom"]
        .apply(
            lambda s: gpd.GeoSeries(s).union_all()
            if _has_union_all
            else gpd.GeoSeries(s).unary_union
        )
    )
    subzone_geoms = poly.to_dict()

# fallback: convex hull (>= 3 points) or envelope (1-2 points) of the transactions
for sz, g in df1.groupby("subzone_name"):
    if subzone_geoms.get(sz) is not None:
        continue
    pts = [geom for geom in g.geometry.values if geom is not None]
    if len(pts) >= 3:
        subzone_geoms[sz] = MultiPoint(pts).convex_hull
    elif pts:
        subzone_geoms[sz] = MultiPoint(pts).envelope

# --------- fit OLS per subzone & collect coefficients ---------
coeff_rows = []   # wide table rows
tidy_rows  = []   # long/tidy rows
min_n = len(X_vars) + 5  # require a few more observations than regressors

for sz, sub in df1.groupby("subzone_name"):
    sub = sub[[Y] + X_vars].dropna()
    n_obs = len(sub)
    if n_obs < min_n:
        continue
    try:
        # has_constant="add": always append the intercept column. The default
        # ("skip") leaves it out when some regressor is constant and non-zero
        # within the subzone, which would drop 'const' from that row of the table.
        X = sm.add_constant(sub[X_vars], has_constant="add")
        y = sub[Y]
        model = sm.OLS(y, X).fit()
    except Exception as exc:
        # skip groups that cannot be fitted, but leave a trace of which and why
        warnings.warn(f"OLS failed for subzone {sz!r} (n={n_obs}); skipped: {exc}")
        continue

    # One pass over the parameters fills both the wide row and the tidy rows
    row = {"subzone_name": sz, "n_obs": n_obs, "R2": model.rsquared, "Adj_R2": model.rsquared_adj}
    for param, val in model.params.items():
        row[param] = val
        tidy_rows.append({
            "subzone_name": sz,
            "n_obs": n_obs,
            "term": param,
            "coef": float(val),
            "std_err": float(model.bse.get(param, np.nan)),
            "t": float(model.tvalues.get(param, np.nan)),
            "pval": float(model.pvalues.get(param, np.nan))
        })
    coeff_rows.append(row)

if not coeff_rows:
    # Without this check the failure surfaces as an opaque KeyError in set_index below
    raise RuntimeError(
        f"No subzone had at least {min_n} complete observations; nothing to export."
    )

# --------- build DataFrames ---------
coeff_wide = pd.DataFrame(coeff_rows).set_index("subzone_name").sort_index()
coeff_tidy = pd.DataFrame(tidy_rows).sort_values(["subzone_name", "term"]).reset_index(drop=True)

# attach geometry to wide table
geo_df = coeff_wide.reset_index()
geo_df["geometry"] = geo_df["subzone_name"].map(subzone_geoms)
gcoeff_wide = gpd.GeoDataFrame(geo_df, geometry="geometry", crs=df1.crs).dropna(subset=["geometry"])

# --------- save outputs ---------
# CSVs (no geometry)
coeff_wide.to_csv("processed_n/subzone_level_ols_coefficients_wide.csv", index=True)
coeff_tidy.to_csv("processed_n/subzone_level_ols_coefficients_tidy.csv", index=False)

# GeoPackage + GeoParquet (with geometry)
gcoeff_wide.to_file("processed_n/subzone_level_ols_coefficients_wide.gpkg", driver="GPKG")
# gcoeff_wide.to_parquet("processed_n/subzone_level_ols_coefficients_wide.parquet", index=False)

print("Saved:")
# print(" - processed_n/subzone_level_ols_coefficients_wide.csv")
# print(" - processed_n/subzone_level_ols_coefficients_tidy.csv")
print(" - processed_n/subzone_level_ols_coefficients_wide.gpkg")
# print(" - processed_n/subzone_level_ols_coefficients_wide.parquet")
print(gcoeff_wide.head())


  .apply(lambda s: s.unary_union)


Saved:
 - processed_n/subzone_level_ols_coefficients_wide.gpkg
     subzone_name  n_obs        R2    Adj_R2     const  dist_mrt  dist_hcen  \
0       ADMIRALTY     93  0.969211  0.964593  8.743540  0.000161  -0.000201   
1  ALEXANDRA HILL     71  0.984519  0.980297  9.099769  0.000376   0.000030   
2        ALJUNIED    174  0.973979  0.971143  9.992028 -0.000225   0.000031   
3      ANCHORVALE    385  0.914311  0.910585  9.384674  0.000056   0.000104   
4       BALESTIER    222  0.979386  0.977885  9.283172 -0.000086   0.000019   

   dist_scen  bus_count_400m  resale_age  ...  model_Multi Generation  \
0   0.000062        0.002110   -0.005458  ...                     0.0   
1  -0.000594        0.004871   -0.012904  ...                     0.0   
2  -0.000200       -0.000154   -0.011066  ...                     0.0   
3  -0.000005        0.000598   -0.013421  ...                     0.0   
4   0.000088       -0.001503   -0.009023  ...                     0.0   

   model_New Generation

In [None]:
# # ============================================================
# # OLS per subzone + attach geometry (GeoParquet output)
# # ============================================================
# import pandas as pd
# import geopandas as gpd
# import numpy as np
# import statsmodels.api as sm
# from shapely.geometry import MultiPoint

# # ------- Columns -------
# Y = "log_price"
# X_vars = [
#     "dist_mrt", "dist_hcen", "dist_scen", "bus_count_400m", "resale_age", "storey_mid",
#     "type_2 ROOM", "type_3 ROOM", "type_4 ROOM", "type_5 ROOM", "type_EXECUTIVE", "type_MULTI-GENERATION",
#     "model_3Gen", "model_Adjoined flat", "model_Apartment", "model_DBSS", "model_Improved",
#     "model_Improved-Maisonette", "model_Maisonette", "model_Model A", "model_Model A-Maisonette",
#     "model_Model A2", "model_Multi Generation", "model_New Generation", "model_Premium Apartment",
#     "model_Premium Apartment Loft", "model_Simplified", "model_Standard", "model_Terrace",
#     "model_Type S1", "model_Type S2"
# ]

# # ------- Ensure GeoDataFrame + decode WKB if needed -------
# df1 = df_1.copy()

# # Decode main 'geometry' (points) if it looks like WKB (bytes)
# if not isinstance(df1, gpd.GeoDataFrame):
#     try:
#         df1["geometry"] = gpd.GeoSeries.from_wkb(df1["geometry"])
#         df1 = gpd.GeoDataFrame(df1, geometry="geometry")
#     except Exception as e:
#         raise ValueError("Failed to interpret 'geometry' column as WKB/geometry.") from e

# # Decode 'subzone_geom' (polygons) if stored as WKB bytes
# if "subzone_geom" in df1.columns and df1["subzone_geom"].notna().any():
#     if df1["subzone_geom"].dtype == object and isinstance(df1["subzone_geom"].dropna().iloc[0], (bytes, bytearray)):
#         df1["subzone_geom"] = gpd.GeoSeries.from_wkb(df1["subzone_geom"])

# # Set/keep CRS (Master Plan 2019 is typically EPSG:3414)
# if df1.crs is None:
#     df1.set_crs(epsg=3414, inplace=True)

# # ------- Build a representative geometry per subzone -------
# subzone_geoms = {}

# has_poly = "subzone_geom" in df1.columns and df1["subzone_geom"].notna().any()

# if has_poly:
#     # dissolve available polygons by subzone_name
#     poly = (
#         df1.loc[df1["subzone_geom"].notna(), ["subzone_name", "subzone_geom"]]
#         .drop_duplicates(subset=["subzone_name", "subzone_geom"])
#         .groupby("subzone_name")["subzone_geom"]
#         .apply(lambda s: s.unary_union)
#         .reset_index()
#         .rename(columns={"subzone_geom": "geometry"})
#     )
#     poly_gdf = gpd.GeoDataFrame(poly, geometry="geometry", crs=df1.crs)
#     subzone_geoms = dict(zip(poly_gdf["subzone_name"], poly_gdf["geometry"]))

# # fallback: convex hull of transaction points when polygon missing
# for sz, g in df1.groupby("subzone_name"):
#     if sz not in subzone_geoms or subzone_geoms[sz] is None:
#         pts = [geom for geom in g.geometry.values if geom is not None]
#         if len(pts) >= 3:
#             subzone_geoms[sz] = MultiPoint(pts).convex_hull
#         elif len(pts) > 0:
#             # hull not defined for <3 points—use point buffer(0) (degenerate) or the point itself
#             subzone_geoms[sz] = MultiPoint(pts).envelope

# # ------- OLS per subzone + collect metrics -------
# results_rows = []
# min_n = len(X_vars) + 5   # simple safeguard against underspecified models

# for sz, sub in df1.groupby("subzone_name"):
#     sub = sub[[Y] + X_vars].dropna()
#     if len(sub) < min_n:
#         continue

#     X = sm.add_constant(sub[X_vars])
#     y = sub[Y]
#     model = sm.OLS(y, X).fit()

#     results_rows.append({
#         "subzone_name": sz,
#         "n_obs": len(sub),
#         "R2": model.rsquared,
#         "Adj_R2": model.rsquared_adj,
#         "AIC": model.aic,
#         "BIC": model.bic,
#         "F_pval": model.f_pvalue
#     })

# # ------- Attach geometry & save as GeoParquet -------
# results_df = pd.DataFrame(results_rows).sort_values("R2", ascending=False)
# results_df["geometry"] = results_df["subzone_name"].map(subzone_geoms)

# gresults = gpd.GeoDataFrame(results_df, geometry="geometry", crs=df1.crs)
# gresults = gresults.dropna(subset=["geometry"]).reset_index(drop=True)

# # out_parquet = "processed_n/subzone_level_ols_summary.parquet"
# # gresults.to_parquet(out_parquet, index=False)

# # ---------- Save as GeoPackage (.gpkg) ----------
# out_gpkg = "processed_n/subzone_level_ols_summary.gpkg"

# # The 'driver' argument tells GeoPandas to use the GPKG format
# gresults.to_file(out_gpkg, driver="GPKG")

# print(f"Saved GeoPackage: {out_gpkg}")


# # print(f"Saved GeoParquet: {out_parquet}")
# # print(gresults.head())


In [None]:
# gresults.head()

#### GWR model

In [13]:
# List the distinct column labels of df_1 before choosing the GWR predictors
df_1.columns.unique()

Index(['subzone_name', 'subzone_geom', 'month', 'town', 'flat_type',
       'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
       'resale_price', 'resale_year', 'resale_age', 'LAT', 'LNG', 'X', 'Y',
       'geometry', 'log_price', 'dist_mrt', 'dist_hcen', 'dist_scen',
       'bus_count_400m', 'storey_mid', 'type_2 ROOM', 'type_3 ROOM',
       'type_4 ROOM', 'type_5 ROOM', 'type_EXECUTIVE', 'type_MULTI-GENERATION',
       'model_3Gen', 'model_Adjoined flat', 'model_Apartment', 'model_DBSS',
       'model_Improved', 'model_Improved-Maisonette', 'model_Maisonette',
       'model_Model A', 'model_Model A-Maisonette', 'model_Model A2',
       'model_Multi Generation', 'model_New Generation',
       'model_Premium Apartment', 'model_Premium Apartment Loft',
       'model_Simplified', 'model_Standard', 'model_Terrace', 'model_Type S1',
       'model_Type S2'],
      dtype='object')

In [None]:
!pip install mgwr

In [14]:
# ============================================================
# Robust GWR per-point + subzone aggregation (version-safe)
# ============================================================
import pandas as pd
import geopandas as gpd
import numpy as np
import statsmodels.api as sm
from shapely.geometry import MultiPoint
import warnings
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -------- Model specification --------
Y = "log_price"
X_vars = [
    "dist_mrt",
    "dist_hcen",
    "dist_scen",
    "bus_count_400m",
    "resale_age",
    "storey_mid",
    "floor_area_sqm",
]

# ============================================================
# 1) Prepare data
# ============================================================
df1 = df_1.copy()

# Promote to a GeoDataFrame, decoding the point column from WKB, if it is not one yet
if not isinstance(df1, gpd.GeoDataFrame):
    df1["geometry"] = gpd.GeoSeries.from_wkb(df1["geometry"])
    df1 = gpd.GeoDataFrame(df1, geometry="geometry")

# Decode the polygon column only when its first non-null value is raw bytes
if (
    "subzone_geom" in df1.columns
    and df1["subzone_geom"].notna().any()
    and isinstance(df1["subzone_geom"].dropna().iloc[0], (bytes, bytearray))
):
    df1["subzone_geom"] = gpd.GeoSeries.from_wkb(df1["subzone_geom"])

# Work in EPSG:3414: label the data when it has no CRS, reproject when it has another
if df1.crs is None:
    df1 = df1.set_crs(3414)
elif df1.crs.to_epsg() != 3414:
    df1 = df1.to_crs(3414)

# Drop rows with a missing value in any column the model needs
cols_needed = [Y, *X_vars, "subzone_name", "geometry"]
present_cols = [col for col in cols_needed if col in df1.columns]
df1 = df1.dropna(subset=present_cols).copy()

# ============================================================
# 2) Fix collinearity
# ============================================================
# Predictors grouped by column-name prefix (type_* and model_* families)
type_cols = [name for name in X_vars if name.startswith("type_")]
model_cols = [name for name in X_vars if name.startswith("model_")]

def drop_one_present(cols, prefer):
    """Pick one entry of `cols`, favouring the names listed in `prefer`.

    Returns the first name in `prefer` that is also in `cols`; if none is,
    the first element of `cols`; and None when `cols` is empty.
    """
    preferred_hits = [name for name in prefer if name in cols]
    if preferred_hits:
        return preferred_hits[0]
    if not cols:
        return None
    return cols[0]

# Leave one level of each dummy family out of the predictor list
drop_type = drop_one_present(type_cols, prefer=["type_2 ROOM"])
drop_model = drop_one_present(model_cols, prefer=["model_Standard"])
to_drop = list(filter(None, (drop_type, drop_model)))
X_vars = [name for name in X_vars if name not in to_drop]

# Remove predictors whose population standard deviation is exactly zero
zero_var = [name for name in X_vars if df1[name].std(ddof=0) == 0]
if len(zero_var) > 0:
    warnings.warn(f"Dropping zero-variance vars: {zero_var}")
    X_vars = [name for name in X_vars if name not in zero_var]

# Iterative VIF pruning
def prune_by_vif(df, predictors, thresh=30.0, max_loops=10):
    """Iteratively drop the predictor with the largest VIF until all are below `thresh`.

    Parameters
    ----------
    df : DataFrame holding the predictor columns.
    predictors : sequence of column names; it is copied, never modified.
    thresh : pruning stops once the largest VIF is below this value.
    max_loops : maximum number of predictors that may be removed.

    Returns
    -------
    list of the predictor names that were kept (possibly empty).
    """
    preds = list(predictors)
    for _ in range(max_loops):
        if not preds:
            # nothing left to test; np.nanmax would raise on an empty list
            break
        X = sm.add_constant(df[preds].values, has_constant='add')
        # column 0 is the constant, so predictor i sits at column i + 1
        vifs = np.asarray(
            [variance_inflation_factor(X, i + 1) for i in range(len(preds))],
            dtype=float,
        )
        if np.isnan(vifs).all():
            # no usable VIF to rank by; np.nanargmax would raise
            warnings.warn("All VIFs are NaN; stopping VIF pruning.")
            break
        worst_idx = int(np.nanargmax(vifs))
        if vifs[worst_idx] < thresh:
            break
        worst = preds[worst_idx]
        warnings.warn(f"Dropping high-VIF var: {worst} (VIF≈{np.nanmax(vifs):.1f})")
        # delete by position so a repeated name cannot remove the wrong entry
        del preds[worst_idx]
    else:
        # loop ran out without reaching the threshold check after the last removal
        warnings.warn(
            f"VIF pruning stopped after max_loops={max_loops}; remaining VIFs were not re-checked."
        )
    return preds

X_vars = prune_by_vif(df1, X_vars, thresh=30.0)

# ============================================================
# 3) Design matrices and bandwidth search bounds
# ============================================================
point_geoms = df1.geometry
coords = np.column_stack((point_geoms.x.values, point_geoms.y.values))
y = df1[[Y]].values          # kept 2-D: one column
X = df1[X_vars].values
n = X.shape[0]
k = X.shape[1]
p = k + 1                    # predictors plus one

# Lower bound: the largest of p + 5, 2% of the rows, and 60
bw_min_safe = max(p + 5, int(0.02 * n), 60)
# Upper bound: 25% of the rows, but at least 40 above the lower bound
bw_max_safe = max(bw_min_safe + 40, int(0.25 * n))

def pick_bw(coords, y, X, bw_min, bw_max, fixed=False, kernel='bisquare'):
    """Search for a GWR bandwidth, retrying once with a wider range on failure.

    Parameters
    ----------
    coords, y, X : arrays passed straight to ``Sel_BW``.
    bw_min, bw_max : bounds handed to ``Sel_BW.search``.
    fixed, kernel : kernel options passed to ``Sel_BW``.

    Returns
    -------
    The bandwidth returned by ``Sel_BW.search``.

    The retry keeps the caller's `fixed` setting. Switching kernel type on retry
    would return a bandwidth in different units (distance vs. neighbour count)
    from the one the caller goes on to fit with.
    """
    sel = Sel_BW(coords, y, X, fixed=fixed, kernel=kernel)
    try:
        return sel.search(bw_min=bw_min, bw_max=bw_max)
    except Exception as exc:
        warnings.warn(f"Bandwidth search failed ({exc}); retrying with a wider search range.")
        sel2 = Sel_BW(coords, y, X, fixed=fixed, kernel=kernel)
        return sel2.search(bw_min=bw_min + 40, bw_max=bw_max + 200)

# Bandwidth search with fixed=False, bounded by the limits computed above
bw = pick_bw(coords, y, X, bw_min=bw_min_safe, bw_max=bw_max_safe, fixed=False)
print(f"Selected bandwidth: {bw}")

# ============================================================
# 4) Fit GWR safely
# ============================================================
def fit_gwr(coords, y, X, bw, fixed=False, kernel='bisquare'):
    """Fit a GWR model, retrying once with a larger bandwidth if the fit raises.

    Parameters
    ----------
    coords, y, X : arrays passed straight to ``GWR``.
    bw : bandwidth to fit with.
    fixed, kernel : kernel options passed to ``GWR``.

    Returns
    -------
    The result object returned by ``GWR(...).fit()``.

    The retry keeps the caller's `fixed` setting so the enlarged bandwidth stays
    in the same units as `bw`, and the parameter count comes from `X` itself
    instead of a notebook-level global.
    """
    try:
        mdl = GWR(coords, y, X, bw=bw, fixed=fixed, kernel=kernel, spherical=False)
        return mdl.fit()
    except Exception as e:
        warnings.warn(f"GWR fit failed ({e}); retrying with larger bw.")
        bw2 = bw + 40
        if not fixed:
            # with fixed=False the bandwidth is a neighbour count, so also keep
            # it well above the number of estimated parameters
            n_params = (X.shape[1] if getattr(X, "ndim", 1) > 1 else 1) + 1
            bw2 = max(bw2, n_params + 40)
        mdl2 = GWR(coords, y, X, bw=bw2, fixed=fixed, kernel=kernel, spherical=False)
        return mdl2.fit()

# Fit with the selected bandwidth and report the fit statistics
gwr_res = fit_gwr(coords, y, X, bw=bw, fixed=False)
print(f"AICc: {gwr_res.aicc:.3f}, RSS: {gwr_res.resid_ss:.3f}")

# ============================================================
# 5) Collect local results (robust SE logic)
# ============================================================
# One column per estimated parameter: the intercept first, then the predictors
param_names = ["Intercept", *X_vars]
params = pd.DataFrame(gwr_res.params, columns=param_names)
tvals = pd.DataFrame(gwr_res.tvalues, columns=["t_" + name for name in param_names])

# Standard errors: read them from the result object when it exposes them
# (`se` first, then `bse`); otherwise back them out as coefficient / t-value.
if hasattr(gwr_res, "se"):
    se_arr = gwr_res.se
elif hasattr(gwr_res, "bse"):
    se_arr = gwr_res.bse
else:
    tiny = 1e-12
    # t-values at or below `tiny` in magnitude become NaN to avoid dividing by ~0
    t_only = tvals.where(tvals.abs() > tiny, np.nan)
    se_arr = params.values / t_only.values

se = pd.DataFrame(se_arr, columns=["se_" + name for name in param_names])

# Positional indices are reset so the per-point results line up row by row with df1
subzone_labels = df1[["subzone_name"]].reset_index(drop=True)
locals_df = pd.concat([subzone_labels, params, tvals, se], axis=1)
locals_df["local_R2"] = gwr_res.localR2

point_geometry = df1[["geometry"]].reset_index(drop=True)
locals_gdf = gpd.GeoDataFrame(
    pd.concat([locals_df, point_geometry], axis=1),
    geometry="geometry",
    crs=df1.crs,
)

# ============================================================
# 6) Subzone-level aggregation (same as before)
# ============================================================
# Representative geometry per subzone: dissolved polygon when available,
# otherwise a hull/envelope of the transaction points.
subzone_geoms = {}
has_poly = "subzone_geom" in df1.columns and df1["subzone_geom"].notna().any()
if has_poly:
    # GeoSeries.unary_union is deprecated in newer geopandas in favour of
    # union_all(); use the new method when it exists, the old one otherwise.
    _has_union_all = hasattr(gpd.GeoSeries, "union_all")
    poly = (
        df1.loc[df1["subzone_geom"].notna(), ["subzone_name", "subzone_geom"]]
        .drop_duplicates(subset=["subzone_name", "subzone_geom"])
        .groupby("subzone_name")["subzone_geom"]
        .apply(
            lambda s: gpd.GeoSeries(s).union_all()
            if _has_union_all
            else gpd.GeoSeries(s).unary_union
        )
    )
    subzone_geoms = poly.to_dict()

for sz, g in df1.groupby("subzone_name"):
    if subzone_geoms.get(sz) is not None:
        continue
    pts = [geom for geom in g.geometry.values if geom is not None]
    if len(pts) >= 3:
        subzone_geoms[sz] = MultiPoint(pts).convex_hull
    elif pts:
        subzone_geoms[sz] = MultiPoint(pts).envelope

# Average every local coefficient, t-value, standard error and local R2 per subzone
agg_cols = param_names + [f"t_{c}" for c in param_names] + [f"se_{c}" for c in param_names] + ["local_R2"]
agg_map = {c: "mean" for c in agg_cols}
subzone_wide = locals_df.groupby("subzone_name").agg(agg_map).reset_index()
subzone_wide["geometry"] = subzone_wide["subzone_name"].map(subzone_geoms)
gsubzone = gpd.GeoDataFrame(subzone_wide, geometry="geometry", crs=df1.crs).dropna(subset=["geometry"])

# ============================================================
# 7) Save outputs
# ============================================================
# Per-point results: GeoParquet and GeoPackage
points_parquet = "processed_n/gwr_locals_points.parquet"
points_gpkg = "processed_n/gwr_locals_points.gpkg"
locals_gdf.to_parquet(points_parquet, index=False)
locals_gdf.to_file(points_gpkg, driver="GPKG")

# Subzone averages: GeoParquet and GeoPackage
subzone_parquet = "processed_n/gwr_subzone_coeffs_wide.parquet"
subzone_gpkg = "processed_n/gwr_subzone_coeffs_wide.gpkg"
gsubzone.to_parquet(subzone_parquet, index=False)
gsubzone.to_file(subzone_gpkg, driver="GPKG")

# Plain CSV copies of both tables, written from the frames without geometry
points_csv = "processed_n/gwr_locals_points_wide.csv"
subzone_csv = "processed_n/gwr_subzone_coeffs_wide.csv"
locals_df.to_csv(points_csv, index=False)
subzone_wide.to_csv(subzone_csv, index=False)

print("✅ GWR completed successfully and files saved.")

Selected bandwidth: 516.0
AICc: -72261.306, RSS: 85.806


  .groupby("subzone_name")["subzone_geom"].apply(lambda s: s.unary_union)


✅ GWR completed successfully and files saved.


#### Scratch cells: exploratory test code below, not part of the main analysis (safe to ignore)

In [None]:
# ============================================================
# Run OLS separately by town (local models)
# ============================================================

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Assuming df is already loaded (from hdb_ols.parquet)
# and you have a 'town' column
# assert "subzone_name" in df.columns, "Your dataset must contain a 'subzone_name' column."

# Response column
Y = "log_price"

# Predictors, assembled from three groups in their original order
_continuous_vars = [
    "dist_mrt", "dist_hcen", "dist_scen", "bus_count_400m", "resale_age", "storey_mid",
]
_type_dummies = [
    "type_2 ROOM", "type_3 ROOM", "type_4 ROOM", "type_5 ROOM",
    "type_EXECUTIVE", "type_MULTI-GENERATION",
]
_model_dummies = [
    "model_3Gen", "model_Adjoined flat", "model_Apartment", "model_DBSS", "model_Improved",
    "model_Improved-Maisonette", "model_Maisonette", "model_Model A", "model_Model A-Maisonette",
    "model_Model A2", "model_Multi Generation", "model_New Generation", "model_Premium Apartment",
    "model_Premium Apartment Loft", "model_Simplified", "model_Standard", "model_Terrace",
    "model_Type S1", "model_Type S2",
]
X_vars = _continuous_vars + _type_dummies + _model_dummies

# Filled by the per-town loop in the next cell
results_summary = []

In [None]:
# Start from an empty list on every run: the list used to be created in a
# different cell, so re-executing this cell appended every town a second time.
results_summary = []
min_obs = len(X_vars) + 5  # require a few more observations than regressors

for town, df_sub in df_1.groupby("town"):
    df_sub = df_sub[[Y] + X_vars].dropna()
    if len(df_sub) < min_obs:
        print(f"Skipping {town}: not enough observations ({len(df_sub)})")
        continue

    X = sm.add_constant(df_sub[X_vars])
    y = df_sub[Y]

    model = sm.OLS(y, X).fit()

    results_summary.append({
        "town": town,
        "n_obs": len(df_sub),
        "R2": model.rsquared,
        "Adj_R2": model.rsquared_adj,
        "AIC": model.aic,
        "BIC": model.bic,
        "F_pval": model.f_pvalue
    })

results_df = pd.DataFrame(results_summary)
if not results_df.empty:
    # an empty frame has no "R2" column to sort on
    results_df = results_df.sort_values("R2", ascending=False)
results_df.head(10)

In [None]:
# ============================================================
# Extract full coefficient table per town
# ============================================================

coeffs_list = []
min_obs = len(X_vars) + 5

for town, df_sub in df_1.groupby("town"):
    complete = df_sub[[Y] + X_vars].dropna()
    if len(complete) < min_obs:
        continue

    fitted = sm.OLS(complete[Y], sm.add_constant(complete[X_vars])).fit()

    # One wide row per town: identifiers first, then one entry per fitted parameter
    coeffs_list.append({"town": town, "n_obs": len(complete), **fitted.params.to_dict()})

coeff_df = pd.DataFrame(coeffs_list).set_index("town")
coeff_df.to_csv("town_level_ols_coefficients_resaleprice.csv")
coeff_df.head()


In [None]:
# Scratch loop: applies the same complete-case filter and size check as the
# cells above, but does nothing with the groups that pass (the running-total
# lines below are commented out and `a` is never initialised).
for town, df_sub in df_1.groupby("town"):
    df_sub = df_sub[[Y] + X_vars].dropna()
    if len(df_sub) < len(X_vars) + 5:
        continue
    # a += len(df_sub)
    # print(a)

In [None]:
# Display the resale_age column of df_1
df_1['resale_age']