# Facilities equity analysis (planning-area level)

This notebook parses the SportSG GeoJSON, extracts facility counts from the HTML `Description` field, computes centroids, and (optionally) spatially joins to the MasterPlan planning-area polygons to produce per-planning-area supply metrics.


## Setup: create and activate the conda environment (one-time)
Run these commands in a terminal BEFORE executing heavy spatial cells: 

```bash
conda env create -f environment.yml
conda activate urban-transformer
jupyter notebook
```

In [15]:
# Cell 1: imports and environment checks
import sys
from pathlib import Path
import json
import re
import pandas as pd

# geopandas/bs4 are optional: we will attempt to import and fall back gracefully
try:
    import geopandas as gpd
    from shapely.geometry import shape
    GEOPANDAS_OK = True
except Exception:
    GEOPANDAS_OK = False

try:
    from bs4 import BeautifulSoup
    BS4_OK = True
except Exception:
    BS4_OK = False

print('geopandas available:', GEOPANDAS_OK)
print('beautifulsoup4 available:', BS4_OK)

ROOT = Path('..').resolve() if Path('.').resolve().name == 'notebooks' else Path('.').resolve()
DATA_DIR = ROOT / 'data'
OUT_DIR = ROOT / 'outputs'
OUT_DIR.mkdir(exist_ok=True)

print('DATA_DIR =', DATA_DIR)
print('OUT_DIR =', OUT_DIR)

geopandas available: True
beautifulsoup4 available: True
DATA_DIR = /home/amber/Urban-Transformer/data
OUT_DIR = /home/amber/Urban-Transformer/outputs


In [16]:
# Cell 2: helper to parse the HTML Description table (uses BeautifulSoup if available)
def parse_description_table(html):
    if not html:
        return {}
    if BS4_OK:
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table')
        if not table:
            return {}
        out = {}
        for tr in table.find_all('tr'):
            th = tr.find('th')
            td = tr.find('td')
            if th and td:
                out[th.get_text(strip=True)] = td.get_text(strip=True)
        return out
    # fallback: simple regex extraction of <th>..</th><td>..</td> pairs
    out = {}
    pairs = re.findall(r'<th[^>]*>([^<]+)</th>*<td[^>]*>([^<]+)</td>', html, flags=re.I)
    for k, v in pairs:
        out[k.strip()] = v.strip()
    return out

def to_number(s):
    if s is None:
        return 0
    s = str(s).strip()
    if s == '':
        return 0
    s2 = re.sub(r'[^0-9.-]', '', s)
    try:
        if '.' in s2:
            return float(s2)
        return int(s2)
    except Exception:
        return 0

In [17]:
# Cell 3: parse facility GeoJSON into a DataFrame (non-spatial fallback works without geopandas)
FAC_PATH = DATA_DIR / 'SportSGSportFacilitiesGEOJSON.geojson'
assert FAC_PATH.exists(), f'Missing {FAC_PATH}'

with open(FAC_PATH, 'r', encoding='utf8') as fh:
    gj = json.load(fh)

records = []
for feat in gj.get('features', []):
    props = feat.get('properties', {}) or {}
    desc = props.get('Description') or props.get('description') or ''
    attrs = parse_description_table(desc)
    rec = {}
    rec['name'] = attrs.get('SPORTS_CEN') or props.get('SPORTS_CEN') or props.get('Name') or ''
    # extract some common numeric fields (if present)
    for key in ['BADMINTON_', 'TABLE_TENN', 'TENNIS_COU', 'SWIMMING_C', 'WADING_POO', 'GYM', 'FOOTBALL_F', 'ATHLETICS_']:
        if key in attrs:
            rec[key] = to_number(attrs.get(key))
        elif key in props:
            rec[key] = to_number(props.get(key))
        else:
            rec[key] = 0
    # geometry centroid (lon, lat) if geometry present
    geom = feat.get('geometry')
    if geom and geom.get('type') in ('Polygon', 'MultiPolygon'):
        # compute centroid roughly by averaging coordinates (fast fallback)
        coords = []
        def collect_coords(g):
            if isinstance(g, list):
                for x in g:
                    collect_coords(x)
            elif isinstance(g, (int, float)):
                pass
        # safer approach: find the first list of coordinate tuples
        try:
            rings = geom.get('coordinates', [])
            # dig until we find numeric pairs
            def find_pairs(o):
                if isinstance(o, list) and len(o) and isinstance(o[0], (list, tuple)):
                    return o
                if isinstance(o, list):
                    for e in o:
                        res = find_pairs(e)
                        if res:
                            return res
                return None
            pairs = find_pairs(rings) or []
            xs = [p[0] for p in pairs if isinstance(p, (list, tuple)) and len(p)>=2]
            ys = [p[1] for p in pairs if isinstance(p, (list, tuple)) and len(p)>=2]
            if xs and ys:
                rec['centroid_lon'] = sum(xs)/len(xs)
                rec['centroid_lat'] = sum(ys)/len(ys)
            else:
                rec['centroid_lon'] = None
                rec['centroid_lat'] = None
        except Exception:
            rec['centroid_lon'] = None
            rec['centroid_lat'] = None
    else:
        rec['centroid_lon'] = None
        rec['centroid_lat'] = None
    records.append(rec)

fac_df = pd.DataFrame(records)
fac_out = OUT_DIR / 'parsed_sportsg_facilities.csv'
fac_df.to_csv(fac_out, index=False)
print('Wrote parsed facility attributes to', fac_out)
fac_df.head(10)

Wrote parsed facility attributes to /home/amber/Urban-Transformer/outputs/parsed_sportsg_facilities.csv


Unnamed: 0,name,BADMINTON_,TABLE_TENN,TENNIS_COU,SWIMMING_C,WADING_POO,GYM,FOOTBALL_F,ATHLETICS_,centroid_lon,centroid_lat
0,Clementi Stadium,0,0,0,0,0,0,1,1,,
1,Clementi Sports Centre,14,10,0,0,1,1,0,0,,
2,Jurong West Sports Centre,11,3,2,0,1,1,1,1,,
3,Kallang Basin Swimming Complex,0,0,0,0,1,0,0,0,,
4,Kallang Sports Centre,0,0,14,0,0,0,1,1,,
5,Katong Swimming Complex,0,0,0,0,1,0,0,0,,
6,AMK Swimming Complex,0,0,0,0,1,0,0,0,,
7,Bishan Sports Centre,4,9,0,0,1,1,1,1,,
8,Bukit Batok Swimming Complex,0,0,0,0,1,0,0,0,,
9,Farrer Park Field and Tennis Centre,0,0,8,0,0,0,1,0,,


In [18]:
# Cell 4: optional spatial join (requires geopandas)
if not GEOPANDAS_OK:
    print('geopandas not available — skip spatial join. Create conda env and enable geopandas to run this cell.')
else:
    PLAN_PATH = DATA_DIR / 'MasterPlan2019PlanningAreaBoundaryNoSea.geojson'
    assert PLAN_PATH.exists(), f'Missing {PLAN_PATH}'
    print('Loading planning areas...')
    plan_gdf = gpd.read_file(PLAN_PATH)
    # try to extract a planning-area name from Description table where present
    def get_plan_name(desc):
        d = parse_description_table(desc)
        for key in ('PLN_AREA_N', 'PLN_AREA_N'.upper(), 'PLN_AREA_N'.lower()):
            if key in d:
                return d[key]
        return None
    if 'planning_area' not in plan_gdf.columns:
        # create a planning_area column heuristically
        if 'Description' in plan_gdf.columns:
            plan_gdf['planning_area'] = plan_gdf['Description'].apply(lambda x: get_plan_name(x) or '')
        else:
            plan_gdf['planning_area'] = plan_gdf.index.astype(str)
    # load parsed facilities and convert to GeoDataFrame
    pf = fac_df.copy()
    pf = pf.dropna(subset=['centroid_lon', 'centroid_lat']).copy()
    pf['geometry'] = gpd.points_from_xy(pf['centroid_lon'], pf['centroid_lat'])
    pf_gdf = gpd.GeoDataFrame(pf, geometry='geometry', crs='EPSG:4326')
    # ensure same CRS and spatial join
    pf_gdf = pf_gdf.to_crs(plan_gdf.crs)
    joined = gpd.sjoin(pf_gdf, plan_gdf[['planning_area', 'geometry']], how='left', predicate='within')
    agg = joined.groupby('planning_area').agg(total_facilities=('name','count')).reset_index()
    merged = plan_gdf.merge(agg, on='planning_area', how='left')
    merged['total_facilities'] = merged['total_facilities'].fillna(0).astype(int)
    merged_out = OUT_DIR / 'facilities_by_planning_area.geojson'
    merged.to_file(merged_out, driver='GeoJSON')
    print('Wrote spatial aggregation to', merged_out)

Loading planning areas...
Wrote spatial aggregation to /home/amber/Urban-Transformer/outputs/facilities_by_planning_area.geojson


## Alignment with Master Plan 2025

This analysis supports Master Plan objectives such as promoting liveable and inclusive neighbourhoods, improving active mobility and sustainable transport, and ensuring equitable distribution of social infrastructure (sports facilities, cycling paths).

The planning-area summaries below produce per-area facility counts and (where population data exists) simple supply rates. These can be used to identify gaps relative to Master Plan spatial strategies and to inform targeted facility or cycling-path investments.

In [None]:
# Cell 5: quick alignment check with planning-area outputs
import numpy as np
import geopandas as gpd
from pathlib import Path
OUT_DIR = Path('..') / 'outputs' if Path('.').resolve().name == 'notebooks' else Path('outputs')
pa_fp = OUT_DIR / 'facilities_by_planning_area.geojson'
if not pa_fp.exists():
    print('Missing', pa_fp, "— run the spatial join cell (Cell 4) to create it.")
else:
    pa = gpd.read_file(pa_fp)
    # try find population column heuristically
    pop_cols = [c for c in pa.columns if 'pop' in c.lower() or 'population' in c.lower()]
    if pop_cols:
        pop_col = pop_cols[0]
        pa['fac_per_1000'] = pa['total_facilities'] / (pa[pop_col].replace(0, np.nan) / 1000)
        print('Using population column:', pop_col)
        display(pa[['planning_area','total_facilities', pop_col, 'fac_per_1000']].sort_values('fac_per_1000').head(10))
    else:
        print('No population column found in planning-area GeoJSON. Showing facility counts per planning area:')
        display(pa[['planning_area','total_facilities']].sort_values('total_facilities').head(20))
    print('\nFacility count summary:')
    display(pa['total_facilities'].describe())
    # write a CSV summary for downstream use
    out = OUT_DIR / 'facilities_by_pa_summary.csv'
    pa[['planning_area','total_facilities']].to_csv(out, index=False)
    print('Wrote summary to', out)

## Outcomes and deliverables (evidence of thorough understanding)

Below is a comprehensive list of concrete outcomes this project will (or already does) produce, mapped to the datasets and analyses in this repository.

### Data ingestion and cleaning
- Parsed SportSG facilities with attributes extracted from HTML Description table → `outputs/parsed_sportsg_facilities.csv` (from Cell 3).
- Valid centroids generated for facilities; handling of missing/invalid geometries documented.
- Planning areas loaded from Master Plan 2019 boundaries; heuristic extraction of `planning_area` field from polygon metadata where needed (Cell 4).
- CRS harmonisation to a projected CRS suitable for distance analysis (Singapore SVY21 EPSG:3414 recommended).
- Cycling network KML inspected and prepared for use in accessibility metrics (conversion to GeoDataFrame and, where feasible, routable network graph).

### Spatial aggregation and baseline supply
- Spatial join of facilities to planning areas using point-in-polygon → `outputs/facilities_by_planning_area.geojson` (Cell 4).
- Per planning area facility counts and summary statistics → `outputs/facilities_by_pa_summary.csv` (Cell 5).
- Optional subzone-level aggregation (using `MasterPlan2019SubzoneBoundaryNoSeaGEOJSON.geojson`) to test Modifiable Areal Unit Problem (MAUP).

### Accessibility metrics
- Euclidean metrics per planning area:
  - Nearest facility distance (meters) from area centroid.
  - Counts of facilities within 400/800/1600 m buffers.
- Network-aware metrics (if a routable cycling/street graph is available):
  - Shortest-path distance along network to nearest facility.
  - Cumulative opportunities (count of facilities within X minutes cycling).
- Cycling infrastructure indicators: total cycling-path length within area, density per km².

### Demographics and participation
- Integration of population/household stats with the planning areas (e.g., age bands, income).
- Participation metrics from "Top Sports & Exercise – ..." CSVs aligned to demographic strata (age/ethnicity/sex).
- Derived rates: participants per group population; per-capita facility access by group (facilities per 1,000 group members).

### Equity measurement and statistical testing
- Inequality indices over access metrics (e.g., Gini, Concentration/Atkinson indices).
- Hypothesis tests comparing access across demographic groups (ANOVA/Kruskal–Wallis, bootstrapped mean differences).
- Identification and mapping of underserved areas (e.g., bottom 20% by access and/or participation).

### Geospatial machine learning
- Predictive models for participation rate or underserved classification using built-environment and accessibility features:
  - Spatial regression (lag/error) via PySAL.
  - Geographically Weighted Regression (GWR) via mgwr.
  - Tree-based models (RandomForest/XGBoost) with spatial cross-validation.
- Model interpretability: feature importances and SHAP value explanations; spatial mapping of local contributions.

### Validation, sensitivity, and robustness
- Spatial block cross-validation to reduce spatial leakage; report of spatial-CV vs random-CV performance.
- Sensitivity to spatial unit (planning area vs subzone vs hex grid) and buffer thresholds (400/800/1600 m).
- Robustness checks: outlier handling, imputation choices, and alternative access definitions (Euclidean vs network).

### Visualizations and communication
- Choropleths of facilities per 1,000 residents and nearest-distance surfaces by area.
- Overlays of facilities and cycling paths for visual context.
- Interactive maps (folium/kepler.gl) to explore access and equity by demographic group.
- Summary dashboards/figures to support planning recommendations.

### Policy alignment and impact
- Direct alignment to Master Plan 2025 objectives: liveability, inclusivity, active mobility, and equitable social infrastructure distribution.
- Actionable insights: list of candidate areas for new/expanded sports facilities or cycling-path improvements based on quantified gaps.

### Reproducibility
- Single-source notebook (`notebooks/facilities_equity_analysis.ipynb`) with documented steps and outputs saved under `outputs/`.
- `environment.yml` updated with geospatial/ML dependencies for deterministic setup.
- Intermediate artifacts stored as GeoJSON/CSV/Parquet to accelerate iteration and enable auditability.

## Datasets needed (core + optional)

Below is a checklist of datasets required for the analysis, with purpose and the exact filenames found under `data/` in this repo.

### Core geospatial datasets
- Sport facilities (supply, geometry): `data/SportSGSportFacilitiesGEOJSON.geojson`
- Planning area boundaries (analysis geography): `data/MasterPlan2019PlanningAreaBoundaryNoSea.geojson`
- Subzone boundaries (alternative geography for MAUP checks): `data/MasterPlan2019SubzoneBoundaryNoSeaGEOJSON.geojson`
- Cycling paths (active mobility network; for network distance/coverage): `data/CyclingPathNetworkKML.kml`

### Demographics and participation (outcomes/explanatory)
- Participation by age: 
  - `data/Top Sports & Exercise – 13 to 19 years old.csv`
  - `data/Top Sports & Exercise – 20 to 39 years old.csv`
  - `data/Top Sports & Exercise – 40 to 59 years old.csv`
  - `data/Top Sports & Exercise – 60+ years old.csv`
- Participation by ethnicity:
  - `data/Top Sports & Exercise – Chinese.csv`
  - `data/Top Sports & Exercise – Indians.csv`
  - `data/Top Sports & Exercise – Malays.csv`
- Participation by sex:
  - `data/Top Sports & Exercise – Females.csv`
- Socioeconomic / population denominators:
  - Income (household): `data/ResidentHouseholdsbyMonthlyHouseholdIncomefromWork1andNumberofWorkingPersonsinHouseholdGeneralHouseholdSurvey2015.csv`
  - Working persons by planning area (proxy for population distribution): `data/ResidentWorkingPersonsAged15YearsandOverbyPlanningAreaandUsualModeofTransporttoWorkGeneralHouseholdSurvey2015.csv`
  - Mode to work by age/sex (additional demographic context): `data/ResidentWorkingPersonsAged15YearsandOverbyUsualModeofTransporttoWorkAgeGroupandSexGeneralHouseholdSurvey2015.csv`
  - Students travel time (youth distribution proxy): `data/ResidentStudentsAged5YearsandOverbyTravellingTimetoSchoolLevelofEducationAttendingandSexGeneralHouseholdSurvey2015.csv`

Note: If available, a direct "Population by Planning Area (by age/sex)" dataset would provide better denominators; otherwise, the above serve as proxies. Consider adding a SingStat population-by-planning-area file for current year.

### Built-environment and transport covariates (optional but valuable)
- Land use polygons (for land-use mix indices): `data/MasterPlan2019LandUselayer.geojson`
- Rail/MRT stations:
  - `data/AmendmenttoMasterPlan2019RailStationlayer.geojson`
  - `data/LTAMRTStationExitGEOJSON.geojson`
- School zones (youth-related context): `data/LTASchoolZone.geojson`

### Socioeconomic proxies (optional)
- Housing market proxies:
  - `data/Resale Flat Prices (Based on Approval Date), 1990 - 1999.csv`
  - `data/Resale Flat Prices (Based on Approval Date), 2000 - Feb 2012.csv`
  - `data/Resale Flat Prices (Based on Registration Date), From Mar 2012 to Dec 2014.csv`
  - `data/Resale Flat Prices (Based on Registration Date), From Jan 2015 to Dec 2016.csv`
  - `data/Resale flat prices based on registration date from Jan-2017 onwards.csv`
- Public transport usage (control): `data/PublicTransportUtilisationAveragePublicTransportRidership.csv`

### Not required for this study (present but out-of-scope)
- Industry/business sentiment and building energy datasets (e.g., manufacturing expectations, commercial building energy) unless used as broad economic context:
  - `data/Business Expectations for the Next Three and Six Months, Manufacturing clusters.csv`
  - `data/Business Expectations for the Next Three and Six Months, Manufacturing sub-clusters.csv`
  - `data/Business Expectations for the Next Three and Six Months, Total Manufacturing.csv`
  - `data/Listing of Building Energy Performance Data 2020.csv`
  - `data/Listing of Building Energy Performance Data for Commercial Buildings.csv`

### Minimal must-have subset (to run core pipeline)
1) `SportSGSportFacilitiesGEOJSON.geojson`
2) `MasterPlan2019PlanningAreaBoundaryNoSea.geojson`
3) At least one population/denominator table (preferably population by planning area; otherwise the working-persons proxy)
4) Participation CSVs (age/ethnicity/sex as available)
5) `CyclingPathNetworkKML.kml` (for network-based metrics; can be skipped for Euclidean-only)


## Integrating SG Bus Data (cheeaun/sgbusdata)

This project can leverage the SG Bus Data repo in two ways:

- Online (no build): read clean JSON from the public data server `https://data.busrouter.sg/v1/`.
- Offline (vendor a snapshot): clone/download `cheeaun/sgbusdata` and copy required JSON files from its `data/v1/` folder into `data/sgbusdata/` here.

Notes:
- Building the dataset from scratch (Node+scripts) requires an LTA Datamall API key and running the repo commands; you don’t need this if you just consume the published data.
- We’ll primarily use bus stops (points) and optionally routes/patterns (lines) to create access features: nearest stop distance, stop counts within 400/800 m, and stop density per planning area.


In [3]:
# Cell 6: Load SG Bus Data (stops) and compute proximity & coverage metrics (improved + empty dataset handling + dict-mapping fix)
"""
Enhancements over initial version:
- Robust handling if `fac_df` or `plan_gdf` are not yet in the session.
- Faster nearest-stop distance using spatial index (R-tree) instead of unary union distance.
- More efficient stop counts within radii using spatial index + filtering rather than per-facility buffer `.within` scans.
- Optional integration of bus service route lines if available (to compute route coverage length per planning area and facility proximity to routes).
- Graceful fallbacks, clear logging, and empty data protection.
- NEW: Correctly parse `stops.json` structure (dict mapping code -> [lon, lat, name, road]) instead of assuming list or `{'stops': ...}` only.
- NEW: Added additional candidate filenames (compressed / geojson variants) for stops data.
"""
import os
import json
from pathlib import Path
import math
import pandas as pd
from datetime import datetime

try:
    import geopandas as gpd
    from shapely.geometry import Point
    from shapely.strtree import STRtree  # if shapely >=2; fallback to sindex for geopandas
    GEOPANDAS_OK = True
except Exception:
    GEOPANDAS_OK = False

BASE_URL = os.environ.get('SGBUSDATA_BASE_URL', 'https://data.busrouter.sg/v1')
LOCAL_DIR = Path('..') / 'data' / 'sgbusdata' if Path('.').resolve().name == 'notebooks' else Path('data') / 'sgbusdata'
CACHE_DIR = Path('..') / 'outputs' / 'cache' if Path('.').resolve().name == 'notebooks' else Path('outputs') / 'cache'
CACHE_DIR.mkdir(parents=True, exist_ok=True)

STOP_FILES_CANDIDATES = [
    'bus-stops.json',          # historical naming (often 404)
    'stops.json',              # main JSON (dict mapping code->[lon,lat,name,road])
    'stops.min.json',          # minified variant
    'stops.geojson',           # geojson with FeatureCollection
    'stops.min.geojson',       # minified geojson
]
ROUTE_FILES_CANDIDATES = [
    'bus-services.json',       # list of services (sometimes)
    'services.json',
    'routes.json',             # route geometry (non-geojson)
    'routes.geojson',          # geojson routes
    'routes.min.geojson'
]
PATTERN_PREFIXES = [  # heuristic prefixes for pattern files; repository structure may differ
    'patterns', 'routes'
]


def _http_get(url, timeout=20):
    import requests
    try:
        r = requests.get(url, timeout=timeout)
        if r.ok:
            return r
    except Exception:
        return None
    return None


def load_json_candidates(base_url, local_dir, candidates):
    """Try remote then local for a list of candidate filenames; return first successful JSON object + origin."""
    for fname in candidates:
        url = f"{base_url.rstrip('/')}/{fname}"
        r = _http_get(url)
        if r and r.ok:
            # attempt parse regardless of content-type (server sometimes serves text/plain)
            try:
                return r.json(), f"remote:{fname}"
            except Exception:
                pass
    for fname in candidates:
        fp = local_dir / fname
        if fp.exists():
            try:
                with open(fp, 'r', encoding='utf-8') as fh:
                    return json.load(fh), f"local:{fname}"
            except Exception:
                pass
    return None, None


def load_stops():
    data, origin = load_json_candidates(BASE_URL, LOCAL_DIR, STOP_FILES_CANDIDATES)
    if data is None:
        return pd.DataFrame(), origin

    # Normalise to tabular rows
    # Supported shapes:
    # 1) {'stops': [...]} list of dicts
    # 2) [ {...}, {...} ] list of dicts
    # 3) { '01012': [lon, lat, name, road], ... } mapping code -> array
    # 4) GeoJSON FeatureCollection (if stops.geojson)
    items = []
    if isinstance(data, dict):
        if 'stops' in data and isinstance(data['stops'], list):
            items = data['stops']
        elif 'features' in data and isinstance(data['features'], list):  # geojson
            for feat in data['features']:
                props = feat.get('properties', {}) or {}
                geom = feat.get('geometry') or {}
                coords = geom.get('coordinates') if geom.get('type') == 'Point' else None
                rec = {
                    'code': props.get('code') or props.get('id') or props.get('BusStopCode'),
                    'lon': coords[0] if coords else props.get('lon') or props.get('lng'),
                    'lat': coords[1] if coords else props.get('lat'),
                    'name': props.get('name') or props.get('Description'),
                    'road': props.get('road') or props.get('Road')
                }
                items.append(rec)
        else:
            # Dict mapping code -> [lon, lat, name, road]
            # Create structured records
            sample_val = next(iter(data.values())) if data else []
            if isinstance(sample_val, (list, tuple)) and len(sample_val) >= 2:
                for code, arr in data.items():
                    lon = arr[0] if len(arr) > 0 else None
                    lat = arr[1] if len(arr) > 1 else None
                    name = arr[2] if len(arr) > 2 else None
                    road = arr[3] if len(arr) > 3 else None
                    items.append({'code': code, 'lon': lon, 'lat': lat, 'name': name, 'road': road})
            else:
                # Fallback: treat as list of unknown structure
                pass
    elif isinstance(data, list):
        items = data
    # Build DataFrame
    df = pd.DataFrame(items)

    # Harmonise column names (create standard lon/lat if alternate names exist)
    if 'longitude' in df.columns and 'lon' not in df.columns:
        df['lon'] = df['longitude']
    if 'latitude' in df.columns and 'lat' not in df.columns:
        df['lat'] = df['latitude']
    if 'lng' in df.columns and 'lon' not in df.columns:
        df['lon'] = df['lng']

    if not df.empty:
        df['__origin__'] = origin
    return df, origin


stops_df, stops_origin = load_stops()
if stops_df.empty:
    print(f"[WARN] Bus stops dataset is empty after parsing. Tried candidates: {STOP_FILES_CANDIDATES}. Origin: {stops_origin or 'none found'}")
    print("Set SGBUSDATA_BASE_URL env var or place a stops JSON/GeoJSON under data/sgbusdata/. Skipping bus stop proximity metrics.")
else:
    print(f"[{datetime.utcnow().isoformat()}] Loaded {len(stops_df)} bus stops from {stops_origin}")
    # Show simple head for verification
    display(stops_df.head())

# Heuristic column mapping (only proceed if data present)
lat_col = next((c for c in stops_df.columns if c.lower() in ['lat','latitude']), None)
lng_col = next((c for c in stops_df.columns if c.lower() in ['lon','lng','longitude']), None)
name_col = next((c for c in stops_df.columns if c.lower() in ['name','stopname','description']), None)
code_col = next((c for c in stops_df.columns if c.lower() in ['code','stopcode','busstopcode','id']), None)

if stops_df.empty:
    # Nothing further to compute
    print('Skipping spatial proximity calculations due to empty stops dataset.')
else:
    if not GEOPANDAS_OK:
        print('geopandas not available — skipping spatial metrics. Install geopandas to enable proximity calculations.')
    else:
        if not (lat_col and lng_col):
            print('[WARN] Could not infer lat/lon columns from stops data; skipping proximity metrics.')
        else:
            g_stops = gpd.GeoDataFrame(
                stops_df,
                geometry=gpd.points_from_xy(stops_df[lng_col], stops_df[lat_col]),
                crs='EPSG:4326'
            )
            target_crs = 'EPSG:3414'
            g_stops = g_stops.to_crs(target_crs)

            # Facility points from earlier parsing
            if 'fac_df' not in globals():
                print('fac_df not found; define facilities (Cell 3) before running proximity metrics.')
            else:
                pf = fac_df.dropna(subset=['centroid_lon','centroid_lat']).copy()
                if pf.empty:
                    print('[WARN] No facility points with valid centroids; skipping facility proximity metrics.')
                else:
                    g_fac_pts = gpd.GeoDataFrame(
                        pf,
                        geometry=gpd.points_from_xy(pf['centroid_lon'], pf['centroid_lat']),
                        crs='EPSG:4326'
                    ).to_crs(target_crs)

                    # Build spatial index for stops
                    sindex = g_stops.sindex

                    def nearest_stop_distance(pt):
                        possible_matches_index = list(sindex.nearest(pt.bounds, 1))
                        if not possible_matches_index:
                            return math.nan
                        stop_geom = g_stops.geometry.iloc[possible_matches_index[0]]
                        return pt.distance(stop_geom)

                    g_fac_pts['dist_to_bus_stop_m'] = g_fac_pts.geometry.apply(nearest_stop_distance)

                    def count_within_radius(pt, r):
                        bounds = (pt.x - r, pt.y - r, pt.x + r, pt.y + r)
                        candidate_idx = list(sindex.intersection(bounds))
                        if not candidate_idx:
                            return 0
                        candidates = g_stops.geometry.iloc[candidate_idx]
                        return int((candidates.distance(pt) <= r).sum())

                    for r in [400, 800]:
                        col = f'stops_within_{r}m'
                        g_fac_pts[col] = g_fac_pts.geometry.apply(lambda p, rad=r: count_within_radius(p, rad))

                    fac_bus_out = OUT_DIR / 'facilities_with_bus_proximity.csv'
                    g_fac_pts.drop(columns='geometry').to_csv(fac_bus_out, index=False)
                    print('Wrote', fac_bus_out)

            # Planning-area aggregation (optional)
            if 'plan_gdf' in globals():
                try:
                    plan_gdf = plan_gdf.to_crs(target_crs)
                    stop_in_pa = gpd.sjoin(g_stops, plan_gdf[['planning_area','geometry']], how='left', predicate='within')
                    pa_stop_counts = stop_in_pa.groupby('planning_area').size().reset_index(name='bus_stops_count')
                    pa_aug = plan_gdf.merge(pa_stop_counts, on='planning_area', how='left')
                    pa_aug['bus_stops_count'] = pa_aug['bus_stops_count'].fillna(0).astype(int)
                    out_fp = OUT_DIR / 'planning_areas_with_bus_stops.geojson'
                    pa_aug.to_file(out_fp, driver='GeoJSON')
                    print('Wrote', out_fp)
                except Exception as e:
                    print('[WARN] Failed planning-area bus stop aggregation:', e)
            else:
                print('plan_gdf not defined; skipping planning-area bus stop aggregation (run Cell 4 first).')

# Optional: attempt to load route/service data for advanced features (only if not empty)
if not stops_df.empty:
    route_data, route_origin = load_json_candidates(BASE_URL, LOCAL_DIR, ROUTE_FILES_CANDIDATES)
    if route_data:
        print(f"Loaded bus service/route metadata from {route_origin}")
        with open(CACHE_DIR / 'bus_service_metadata.json', 'w', encoding='utf-8') as fh:
            json.dump(route_data, fh)
    else:
        print('Bus service metadata not found (optional).')

print('Cell 6 completed.')

[2025-11-09T19:40:00.048493] Loaded 5175 bus stops from remote:stops.json


  print(f"[{datetime.utcnow().isoformat()}] Loaded {len(stops_df)} bus stops from {stops_origin}")


Unnamed: 0,code,lon,lat,name,road,__origin__
0,10009,103.81722,1.2821,Bt Merah Int,Bt Merah Ctrl,remote:stops.json
1,10011,103.8375,1.27774,Bef Neil Rd,New Bridge Rd,remote:stops.json
2,10017,103.83763,1.27832,Aft Hosp Dr,Eu Tong Sen St,remote:stops.json
3,10018,103.8386,1.27901,Outram Pk Stn Exit 6/SGH,Eu Tong Sen St,remote:stops.json
4,10021,103.83839,1.27745,Blk 3,Neil Rd,remote:stops.json


fac_df not found; define facilities (Cell 3) before running proximity metrics.
plan_gdf not defined; skipping planning-area bus stop aggregation (run Cell 4 first).
Loaded bus service/route metadata from remote:services.json
Cell 6 completed.
Loaded bus service/route metadata from remote:services.json
Cell 6 completed.


In [1]:
# Import necessary libraries
import geopandas as gpd
import pandas as pd

In [3]:
file = "../data/data_raw/LTAMRTStationExitGEOJSON.geojson"
LRT_MRT_gdf = gpd.read_file(file)
LRT_MRT_gdf.head()

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.90915 1.33492 0)
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.93349 1.33655 0)
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.84927 1.2977 0)
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.85084 1.2992 0)
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.90941 1.33531 0)


In [4]:
LRT_MRT_gdf.explore()