In [1]:
import os
import zipfile
from pathlib import Path
import warnings

import pandas as pd
import geopandas as gpd
import fiona
from shapely.geometry import Point
from shapely.ops import transform as shp_transform
from pyproj import Transformer

warnings.filterwarnings("ignore")

In [2]:
# INPUTS (as you uploaded)
PATH_HDB_CSV = "data/hdb_data_2023.csv.xz"
PATH_ZIP_MRT = "data/MRT Station_06 Jun 2024.zip"
PATH_ZIP_BUS = "data/BusStopLocation_Aug2025.zip"
PATH_ZIP_PA  = "data/master-plan-2019-planningarea.zip"
PATH_ZIP_SZ  = "data/master-plan-2019-subzone_landuse_area.zip"

# OUTPUT DIRECTORY (created if missing)
OUT_DIR = Path("processed"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# OUTPUT FILES (GeoJSON recommended for portability)
OUT_HDB = OUT_DIR / "hdb_2023_points_3414.geojson"
OUT_MRT = OUT_DIR / "mrt_2024_points_3414.geojson"
OUT_BUS = OUT_DIR / "bus_2025_points_3414.geojson"
OUT_PA  = OUT_DIR / "mp19_planning_areas_3414.geojson"
OUT_SZ  = OUT_DIR / "mp19_subzones_3414.geojson"

# CRS TARGET
TARGET_EPSG = 3414
TARGET_CRS = f"EPSG:{TARGET_EPSG}"

# TEMP extraction
TMP_DIR = Path("tmp_extract"); TMP_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
def unzip_to_dir(zip_path: str | Path, base_dir: Path = TMP_DIR) -> Path:
    """Extract a .zip to a temp folder and return the folder path."""
    zip_path = Path(zip_path)
    out_dir = base_dir / zip_path.stem
    out_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(out_dir)
    return out_dir

def first_shp(dir_path: Path) -> Path | None:
    """Find the first .shp in a directory tree."""
    shps = list(Path(dir_path).rglob("*.shp"))
    return shps[0] if shps else None

def to_2d_geometry(geom):
    """Drop Z dimension from any Shapely geometry. Works for points, lines, polygons."""
    if geom is None:
        return None
    try:
        return shp_transform(lambda x, y, z=None: (x, y), geom)
    except Exception:
        # Fallback for simple Point-like
        if hasattr(geom, "x") and hasattr(geom, "y"):
            return Point(geom.x, geom.y)
        return geom

def reproject_gdf(gdf: gpd.GeoDataFrame, target_crs: str = TARGET_CRS) -> gpd.GeoDataFrame:
    """Reproject GeoDataFrame to target_crs; if crs missing, you may set gdf.set_crs(...) first."""
    if gdf.crs is None:
        raise ValueError("Input layer has no CRS. Set the source CRS before calling reproject.")
    if str(gdf.crs).lower() != target_crs.lower():
        gdf = gdf.to_crs(target_crs)
    return gdf

def robust_reproject_write_polygons_with_fiona(zip_path: str | Path, out_geojson: str | Path):
    """
    Robustly read (with Fiona), reproject (pyproj), convert Polygon->MultiPolygon if needed,
    and write to GeoJSON. This avoids common Shapely multipart array interface issues.
    """
    shp_dir = unzip_to_dir(zip_path)
    shp_path = first_shp(shp_dir)
    if shp_path is None:
        raise FileNotFoundError(f"No .shp found in {zip_path}")

    with fiona.open(shp_path) as src:
        # Build coordinate transformer using source CRS -> target EPSG
        src_crs = src.crs_wkt or src.crs
        transformer = Transformer.from_crs(src_crs, TARGET_CRS, always_xy=True)

        # Destination schema: force MultiPolygon (safe superset)
        schema = {
            "geometry": "MultiPolygon",
            "properties": {k: v for k, v in src.schema["properties"].items()},
        }

        # Write GeoJSON output
        with fiona.open(out_geojson, mode="w", driver="GeoJSON", crs=TARGET_CRS, schema=schema) as dst:
            for feat in src:
                geom = feat["geometry"]
                if geom is None:
                    continue

                # Normalize Polygon to MultiPolygon to match schema
                if geom["type"] == "Polygon":
                    geom = {"type": "MultiPolygon", "coordinates": [geom["coordinates"]]}
                elif geom["type"] != "MultiPolygon":
                    # Attempt to coerce any other type if present (rare)
                    continue

                # Transform coordinates & drop Z
                tgeom = transform_geom_xy(geom, transformer)
                dst.write({"geometry": tgeom, "properties": feat["properties"]})

def transform_geom_xy(geom, transformer: Transformer):
    """Transform all coordinates in a (Multi)Polygon dict to target CRS and drop Z."""
    gtype = geom["type"]
    coords = geom["coordinates"]

    def tx_xy(xy):
        # xy may include Z; only use x,y
        x, y = xy[0], xy[1]
        X, Y = transformer.transform(x, y)
        return (X, Y)

    if gtype == "MultiPolygon":
        return {
            "type": "MultiPolygon",
            "coordinates": [
                [[tx_xy(c) for c in ring] for ring in poly]
                for poly in coords
            ],
        }
    elif gtype == "Polygon":  # kept for completeness; we normalize to MultiPolygon above
        return {
            "type": "Polygon",
            "coordinates": [[tx_xy(c) for c in ring] for ring in coords],
        }
    else:
        return geom  # should not occur here


In [4]:
# Load HDB CSV and build point geometry from X/Y (already in SVY21 in your file)
hdb = pd.read_csv(PATH_HDB_CSV)

# Confirm XY columns
assert {"X", "Y"}.issubset(hdb.columns), "Expected columns 'X' and 'Y' in HDB CSV."

# Clean coordinate types
hdb["X"] = pd.to_numeric(hdb["X"], errors="coerce")
hdb["Y"] = pd.to_numeric(hdb["Y"], errors="coerce")
hdb = hdb.dropna(subset=["X", "Y"])

# Build GeoDataFrame with EPSG:3414
gdf_hdb = gpd.GeoDataFrame(hdb, geometry=gpd.points_from_xy(hdb["X"], hdb["Y"]), crs=TARGET_CRS)

# Save to GeoJSON (or GPKG if you prefer)
gdf_hdb.to_file(OUT_HDB, driver="GeoJSON")
print(f"✅ HDB saved: {OUT_HDB}  |  rows={len(gdf_hdb)}")

✅ HDB saved: processed\hdb_2023_points_3414.geojson  |  rows=25760


In [5]:
mrt_dir = unzip_to_dir(PATH_ZIP_MRT)
mrt_shp = first_shp(mrt_dir)
if mrt_shp is None:
    raise FileNotFoundError("No .shp found in MRT zip.")

gdf_mrt = gpd.read_file(mrt_shp)
gdf_mrt = reproject_gdf(gdf_mrt, TARGET_CRS)
gdf_mrt["geometry"] = gdf_mrt["geometry"].apply(to_2d_geometry)

# Standardize a name column if present
if "Name" in gdf_mrt.columns and "station_name" not in gdf_mrt.columns:
    gdf_mrt = gdf_mrt.rename(columns={"Name": "station_name"})

gdf_mrt.to_file(OUT_MRT, driver="GeoJSON")
print(f"✅ MRT saved: {OUT_MRT}  |  rows={len(gdf_mrt)}  |  crs={gdf_mrt.crs}")

✅ MRT saved: processed\mrt_2024_points_3414.geojson  |  rows=563  |  crs=EPSG:3414


In [6]:
bus_dir = unzip_to_dir(PATH_ZIP_BUS)
bus_shp = first_shp(bus_dir)
if bus_shp is None:
    raise FileNotFoundError("No .shp found in Bus zip.")

gdf_bus = gpd.read_file(bus_shp)
# Reproject if needed
if str(gdf_bus.crs).lower() != TARGET_CRS.lower():
    gdf_bus = gdf_bus.to_crs(TARGET_CRS)

# Rename helpful fields (optional)
rename_map = {}
if "BUS_STOP_N" in gdf_bus.columns: rename_map["BUS_STOP_N"] = "bus_stop_no"
if "LOC_DESC"   in gdf_bus.columns: rename_map["LOC_DESC"]   = "desc"
if rename_map:
    gdf_bus = gdf_bus.rename(columns=rename_map)

gdf_bus.to_file(OUT_BUS, driver="GeoJSON")
print(f"✅ Bus saved: {OUT_BUS}  |  rows={len(gdf_bus)}  |  crs={gdf_bus.crs}")

✅ Bus saved: processed\bus_2025_points_3414.geojson  |  rows=5172  |  crs=EPSG:3414


In [7]:
# Uses a Fiona-based pipeline to avoid Shapely MultiPolygon array-interface hiccups
robust_reproject_write_polygons_with_fiona(PATH_ZIP_PA, OUT_PA)
print(f"✅ Planning Areas saved: {OUT_PA}")

✅ Planning Areas saved: processed\mp19_planning_areas_3414.geojson


In [8]:
robust_reproject_write_polygons_with_fiona(PATH_ZIP_SZ, OUT_SZ)
print(f"✅ Subzones saved: {OUT_SZ}")

✅ Subzones saved: processed\mp19_subzones_3414.geojson


In [9]:
# Reload the GeoJSONs (optional) and print compact stats
def layer_stats(path):
    g = gpd.read_file(path)
    return {
        "path": str(path),
        "rows": len(g),
        "crs": str(g.crs),
        "geom_types": list(g.geom_type.unique())
    }

report = {
    "HDB": layer_stats(OUT_HDB),
    "MRT": layer_stats(OUT_MRT),
    "Bus": layer_stats(OUT_BUS),
    "PlanningAreas": layer_stats(OUT_PA),
    "Subzones": layer_stats(OUT_SZ),
}

report

{'HDB': {'path': 'processed\\hdb_2023_points_3414.geojson',
  'rows': 25760,
  'crs': 'EPSG:3414',
  'geom_types': ['Point']},
 'MRT': {'path': 'processed\\mrt_2024_points_3414.geojson',
  'rows': 563,
  'crs': 'EPSG:3414',
  'geom_types': ['Point']},
 'Bus': {'path': 'processed\\bus_2025_points_3414.geojson',
  'rows': 5172,
  'crs': 'EPSG:3414',
  'geom_types': ['Point']},
 'PlanningAreas': {'path': 'processed\\mp19_planning_areas_3414.geojson',
  'rows': 55,
  'crs': 'EPSG:3414',
  'geom_types': ['MultiPolygon']},
 'Subzones': {'path': 'processed\\mp19_subzones_3414.geojson',
  'rows': 332,
  'crs': 'EPSG:3414',
  'geom_types': ['MultiPolygon']}}

Preprocess Healthcare and Recreation Poltgon data.

In [10]:
# INPUT GPKGs (edit paths if different)
GPKG_HEALTH = "data/SG_HEALTH & MEDICAL CARE.gpkg"
GPKG_SPORTS = "data/SG_SPORTS & RECREATION.gpkg"

# Output folder
OUT_DIR = Path("processed"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# Target CRS (meters)
TARGET_CRS = "EPSG:3414"

# ---- Inspect helper (robust) ----
def inspect_with_fiona(path):
    layers = fiona.listlayers(path)
    summaries = []
    for lyr in layers:
        with fiona.open(path, layer=lyr) as src:
            crs = src.crs_wkt or src.crs
            try:
                crs_str = CRS.from_wkt(crs).to_string() if isinstance(crs, str) else CRS.from_user_input(crs).to_string()
            except Exception:
                crs_str = str(crs)
            geom_type = src.schema.get("geometry", "Unknown")
            props = list(src.schema.get("properties", {}).keys())
            cnt = sum(1 for _ in src)
            summaries.append({"layer": lyr, "count": cnt, "crs": crs_str,
                              "geometry_type": geom_type, "fields_sample": props[:12]})
    return layers, summaries

print("HEALTH:", inspect_with_fiona(GPKG_HEALTH))
print("SPORTS:", inspect_with_fiona(GPKG_SPORTS))

# ---- Polygon centroid utils (no Shapely dependency) ----
def polygon_centroid(coords):
    # coords = [ring0, ring1, ...]; ring0 = outer ring [(x,y), ...]
    if not coords or not coords[0]:
        return None
    ring = coords[0]
    if ring[0] != ring[-1]:
        ring = ring + [ring[0]]
    A = Cx = Cy = 0.0
    for i in range(len(ring)-1):
        x0,y0 = ring[i]; x1,y1 = ring[i+1]
        cross = x0*y1 - x1*y0
        A += cross; Cx += (x0 + x1)*cross; Cy += (y0 + y1)*cross
    if A == 0:
        xs = [p[0] for p in ring[:-1]]; ys = [p[1] for p in ring[:-1]]
        return (sum(xs)/len(xs), sum(ys)/len(ys))
    A *= 0.5; Cx /= (6*A); Cy /= (6*A)
    return (Cx, Cy)

def multipolygon_centroid(multi_coords):
    # area-weighted centroid across polygons
    total_A = Cx_sum = Cy_sum = 0.0
    for poly in multi_coords:
        ring = poly[0] if poly else None
        if not ring: 
            continue
        if ring[0] != ring[-1]:
            ring = ring + [ring[0]]
        A = Cx = Cy = 0.0
        for i in range(len(ring)-1):
            x0,y0 = ring[i]; x1,y1 = ring[i+1]
            cross = x0*y1 - x1*y0
            A += cross; Cx += (x0 + x1)*cross; Cy += (y0 + y1)*cross
        A *= 0.5
        if A == 0: 
            continue
        Cx /= (6*A); Cy /= (6*A)
        total_A += abs(A); Cx_sum += Cx*abs(A); Cy_sum += Cy*abs(A)
    if total_A == 0:
        return None
    return (Cx_sum/total_A, Cy_sum/total_A)

def reproject_write_polygons_and_centroids(src_gpkg, src_layer, out_poly_path, out_pts_path):
    with fiona.open(src_gpkg, layer=src_layer) as src:
        transformer = Transformer.from_crs(src.crs_wkt or src.crs, TARGET_CRS, always_xy=True)
        # Polygon writer (MultiPolygon schema)
        schema_poly = {"geometry":"MultiPolygon", "properties": {k: v for k,v in src.schema["properties"].items()}}
        with fiona.open(out_poly_path, mode="w", driver="GeoJSON", crs=TARGET_CRS, schema=schema_poly) as dst_poly, \
             fiona.open(out_pts_path,  mode="w", driver="GeoJSON", crs=TARGET_CRS,
                        schema={"geometry":"Point","properties":schema_poly["properties"]}) as dst_pt:
            for feat in src:
                geom = feat["geometry"]
                if geom is None:
                    continue
                # normalize to MultiPolygon
                if geom["type"] == "Polygon":
                    mcoords = [geom["coordinates"]]
                elif geom["type"] == "MultiPolygon":
                    mcoords = geom["coordinates"]
                else:
                    continue  # skip non-polygons
                # transform coords to 3414 + drop Z
                mcoords_tx = []
                for poly in mcoords:
                    poly_tx = []
                    for ring in poly:
                        ring_tx = []
                        for c in ring:
                            X,Y = transformer.transform(c[0], c[1])
                            ring_tx.append((X, Y))
                        poly_tx.append(ring_tx)
                    mcoords_tx.append(poly_tx)
                # write polygon
                dst_poly.write({"geometry": {"type":"MultiPolygon", "coordinates": mcoords_tx},
                                "properties": feat["properties"]})
                # area-weighted centroid (in 3414)
                cen = multipolygon_centroid(mcoords_tx)
                if cen is not None:
                    dst_pt.write({"geometry": {"type":"Point", "coordinates": cen},
                                  "properties": feat["properties"]})

# ---- Run for HEALTH ----
OUT_HEALTH_POLY = OUT_DIR / "health_polygons_3414.geojson"
OUT_HEALTH_PTS  = OUT_DIR / "health_centroids_3414.geojson"
reproject_write_polygons_and_centroids(GPKG_HEALTH, "LU_DESC_HEALTH & MEDICAL CARE",
                                       OUT_HEALTH_POLY, OUT_HEALTH_PTS)
print("Saved:", OUT_HEALTH_POLY, OUT_HEALTH_PTS)

# ---- Run for SPORTS ----
OUT_SPORTS_POLY = OUT_DIR / "sports_polygons_3414.geojson"
OUT_SPORTS_PTS  = OUT_DIR / "sports_centroids_3414.geojson"
reproject_write_polygons_and_centroids(GPKG_SPORTS, "LU_DESC_SPORTS & RECREATION",
                                       OUT_SPORTS_POLY, OUT_SPORTS_PTS)
print("Saved:", OUT_SPORTS_POLY, OUT_SPORTS_PTS)

HEALTH: (['LU_DESC_HEALTH & MEDICAL CARE'], [{'layer': 'LU_DESC_HEALTH & MEDICAL CARE', 'count': 198, 'crs': 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG","4326"]]', 'geometry_type': '3D Polygon', 'fields_sample': ['id', 'Name', 'description', 'timestamp', 'begin', 'end', 'altitudeMode', 'tessellate', 'extrude', 'visibility', 'drawOrder', 'icon']}])
SPORTS: (['LU_DESC_SPORTS & RECREATION'], [{'layer': 'LU_DESC_SPORTS & RECREATION', 'count': 243, 'crs': 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AXIS["Latitude",NORTH],AXIS["Longitude",EAST],AUTHORITY["EPSG"