In [None]:
import sys
from pathlib import Path
# Set the notebook's CWD to your repo root.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your checkout.
%cd D:/deepdemand
# NOTE(review): after the %cd above, Path.cwd().parents[0] is the directory that
# *contains* the repo, not the repo root itself, so ROOT (and the sys.path entry
# added below) point one level above the CWD -- confirm this is intended.
ROOT = Path.cwd().parents[0]   # go up one level
sys.path.insert(0, str(ROOT))


In [4]:
# ============================================
# build_edge_region_mapping.py
# --------------------------------------------
# Map each edge_id to an English region
# using the start point of data/subgraphs/subgraphs/{edge_id}/edge.geojson
# and save as JSON: {edge_id: region_code or null}.
# ============================================

import os
import json
import numpy as np
from typing import Dict, List

import geopandas as gpd
from shapely.geometry import Point
from tqdm import tqdm

from model.dataloader import load_gt  # gives you edge_to_gt

# ---- PATHS ----
# All paths are relative, so they resolve against the current working directory.
# REGION_SHP   : region boundary shapefile read by main()
# SUBGRAPH_DIR : directory holding one folder per edge_id (each with an edge.geojson)
# OUT_JSON     : destination of the edge_id -> region code mapping
REGION_SHP = (
    "data/node_features/boundaries"
    "/Regions_December_2022_Boundaries_EN_BFC_V2"
    "/RGN_DEC_2022_EN_BFC_V2.shp"
)
SUBGRAPH_DIR = "data/subgraphs/subgraphs"
OUT_JSON = "data/traffic_volume/edge_to_region.json"


def detect_region_code_column(gdf: gpd.GeoDataFrame) -> str:
    """
    Pick the column of the regions table that holds the region code.

    A fixed list of known column names is tried in priority order
    (RGN22CD, RGN21CD, RGN20CD, RGN19CD, RGNCD, then lowercase / generic
    variants) and the first one present in ``gdf.columns`` is returned.

    Raises
    ------
    ValueError
        If none of the known names is a column of ``gdf``; the message
        lists the columns that are available.
    """
    known_names = (
        "RGN22CD", "RGN21CD", "RGN20CD", "RGN19CD", "RGNCD",
        "rgn22cd", "rgncd", "code", "CODE",
    )
    available = gdf.columns
    match = next((name for name in known_names if name in available), None)
    if match is None:
        raise ValueError(
            "Could not find a region code column in regions shapefile. "
            f"Available columns: {list(gdf.columns)}"
        )
    return match


def get_edge_ids_from_gt(gt_path: str = "data/traffic_volume/GT_2022_car.json") -> List[str]:
    """
    Load a raw ground-truth volume JSON and return the list of its edge_ids.

    Parameters
    ----------
    gt_path : str
        Path to a JSON file whose top level is an object keyed by edge_id.
        Defaults to the 2022 car-volume file, so existing zero-argument
        calls behave as before.

    Returns
    -------
    List[str]
        The top-level keys of the JSON object, in file order.

    Raises
    ------
    ValueError
        If the JSON top level is not an object (dict).
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(gt_path, "r", encoding="utf-8") as f:
        data = json.load(f)   # expected: dict edge_id -> value

    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object keyed by edge_id in {gt_path}, "
            f"got {type(data).__name__}."
        )

    return list(data.keys())


def _edge_start_xy(geom):
    """
    Return the (x, y) start coordinate of a shapely geometry, or None.

    LineString -> first vertex; MultiLineString -> first vertex of its first
    non-empty part; Point -> the point itself; any other type -> centroid.
    A Z value, if present, is dropped. None is returned for a missing (None)
    or empty geometry, which has no start coordinate.
    """
    if geom is None or geom.is_empty:
        return None

    kind = geom.geom_type
    if kind == "LineString":
        first = geom.coords[0]
    elif kind == "MultiLineString":
        # A non-empty MultiLineString has at least one non-empty part.
        first_ls = next(ls for ls in geom.geoms if not ls.is_empty)
        first = first_ls.coords[0]
    elif kind == "Point":
        first = (geom.x, geom.y)
    else:
        # fallback: centroid
        centroid = geom.centroid
        first = (centroid.x, centroid.y)

    # Index instead of tuple-unpacking so 3D coordinates (x, y, z) do not raise.
    return first[0], first[1]


def main():
    """
    Build the edge_id -> region code mapping and write it to OUT_JSON.

    Steps:
      1. Read the region polygons from REGION_SHP and pick the code column.
      2. Take the CRS of the first existing edge.geojson as the CRS of all
         edges (EPSG:4326 is assumed if that file declares none).
      3. Reproject the regions to that CRS if they differ.
      4. For every GT edge_id, read its edge.geojson and take the start
         coordinate of its first usable geometry as a point.
      5. Left spatial join (predicate "within") of points against regions.
      6. Save {edge_id: region code string, or null when no region matched}.

    Edges whose edge.geojson is missing or has no usable geometry are left
    out of the mapping; their count is printed.

    Raises
    ------
    ValueError
        If the region shapefile has no CRS or no recognised code column.
    RuntimeError
        If the GT file yields no edge_ids, or no edge point could be built.
    FileNotFoundError
        If no GT edge_id has an edge.geojson under SUBGRAPH_DIR.
    """
    # 1) Load regions
    regions = gpd.read_file(REGION_SHP)
    if regions.crs is None:
        raise ValueError("Region shapefile has no CRS; please set it before using this script.")

    region_code_col = detect_region_code_column(regions)
    print(f"Using region code column: {region_code_col}")

    # 2) Detect edge CRS from a sample edge.geojson
    edge_ids = get_edge_ids_from_gt()
    if not edge_ids:
        raise RuntimeError("No edge_ids found from GT; check get_edge_ids_from_gt() and paths.")

    # Missing edge folders are tolerated in step 4, so do not abort just because
    # the *first* edge_id happens to be one of them: sample the first file that exists.
    candidate_paths = (
        os.path.join(SUBGRAPH_DIR, edge_id, "edge.geojson") for edge_id in edge_ids
    )
    sample_path = next((p for p in candidate_paths if os.path.isfile(p)), None)
    if sample_path is None:
        raise FileNotFoundError(
            f"No edge.geojson found under {SUBGRAPH_DIR} for any GT edge_id."
        )

    edge_sample_gdf = gpd.read_file(sample_path)
    edge_crs = edge_sample_gdf.crs
    if edge_crs is None:
        # assume WGS84 if missing (typical for geojson)
        print("edge.geojson has no CRS; assuming EPSG:4326.")
        edge_crs = "EPSG:4326"

    # 3) Ensure both are in the same CRS (use edge CRS as canonical)
    if regions.crs != edge_crs:
        print(f"Reprojecting regions from {regions.crs} to {edge_crs}")
        regions = regions.to_crs(edge_crs)

    # 4) Build points for all edges (start coordinate of geometry)
    # NOTE(review): every edge file is assumed to share the sample's CRS; the
    # per-file CRS is not checked here -- confirm this holds for the dataset.
    edge_points_records = []
    n_skipped = 0
    for edge_id in tqdm(edge_ids, desc="Reading edge.geojson → points"):
        edge_path = os.path.join(SUBGRAPH_DIR, edge_id, "edge.geojson")
        if not os.path.isfile(edge_path):
            # skip missing edge folders
            n_skipped += 1
            continue
        g = gpd.read_file(edge_path)
        if g.empty:
            n_skipped += 1
            continue

        # First geometry that actually yields a coordinate; null or empty
        # geometries ahead of it are passed over instead of crashing.
        xy = next(
            (pt for pt in map(_edge_start_xy, g.geometry) if pt is not None),
            None,
        )
        if xy is None:
            n_skipped += 1
            continue

        edge_points_records.append({"edge_id": edge_id, "geometry": Point(xy[0], xy[1])})

    if n_skipped:
        print(f"Skipped {n_skipped} edges with missing or unusable edge.geojson")

    if not edge_points_records:
        raise RuntimeError("No edge points built; check edge.geojson files.")

    edge_points = gpd.GeoDataFrame(edge_points_records, geometry="geometry", crs=edge_crs)

    # 5) Spatial join: each edge point → region polygon
    joined = gpd.sjoin(
        edge_points,
        regions[[region_code_col, "geometry"]],
        how="left",
        predicate="within"
    )

    # 6) Build mapping dict and save
    # isna() recognises every missing marker (NaN, None, pd.NA), so an unmatched
    # edge is always stored as null rather than as the string "None"/"<NA>".
    codes = joined[region_code_col]
    mapping = {
        edge_id: (None if is_missing else str(code))
        for edge_id, code, is_missing in zip(joined["edge_id"], codes, codes.isna())
    }

    out_dir = os.path.dirname(OUT_JSON)
    if out_dir:
        # os.makedirs("") raises, so only create a directory when there is one.
        os.makedirs(out_dir, exist_ok=True)
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(mapping, f, indent=2)

    print(f"Saved edge → region mapping to {OUT_JSON}")
    print(f"Total edges in mapping: {len(mapping)}")


# Run the pipeline when this code is executed as a script. The guard is also
# true inside a notebook kernel, so executing this cell runs main() immediately.
if __name__ == "__main__":
    main()

Using region code column: RGN22CD
Reprojecting regions from EPSG:27700 to EPSG:4326


Reading edge.geojson → points: 100%|██████████| 4530/4530 [00:19<00:00, 230.44it/s]


Saved edge → region mapping to data/traffic_volume/edge_to_region.json
Total edges in mapping: 4530


In [1]:
import json
from collections import Counter

# Path to the region-mapping JSON. This is the file the mapping cell writes
# (its OUT_JSON), relative to the working directory, so the notebook can be
# run top to bottom without copying the file elsewhere.
REGION_JSON = "data/traffic_volume/edge_to_region.json"     # update if needed

with open(REGION_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)    # {edge_id : region_code, or None when no region matched}

# Number of edges per region code; JSON null values are counted under None.
freq = Counter(data.values())

for region, count in freq.items():
    print(region, count)

E12000002 632
E12000004 445
E12000008 1002
E12000003 589
E12000009 403
E12000006 695
E12000005 525
E12000007 85
E12000001 147
None 7
