# Graph & Routing

This notebook contains the code for building our network graph and routing algorithm.

In [None]:
# necessary imports
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
from shapely.geometry import LineString, MultiLineString, Point
from collections import defaultdict

import random
import math
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union

import folium

# module imports
from src.routing_graph import (
    GraphBuildConfig,
    RiskConfig,
    RoutingMonthArtifacts,
    build_routing_graph_for_month,
    build_graph_with_risk_for_month,
    verify_graph_sanity,
    NodeKey,
)
from src.routing_algorithms import (
    nearest_graph_node,
    route_stats,
    shortest_path_by,
    constrained_min_risk_route,
    run_one_od_routing,
    path_to_multiline_latlon,
)

# set project root
from pathlib import Path
import sys

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

# Save map to figs folder
fig_dir = PROJECT_ROOT / "figs"
fig_dir.mkdir(parents=True, exist_ok=True)



The next cell loads the data frames that were constructed through the merging procedure executed in `01_data_preparation.ipynb` and derives longitude and latitude data from the the geometry column.

In [28]:
junction_df = gpd.read_parquet("../data/merged/berlin_bike_accident_node_panel.parquet")
acc_node_df = gpd.read_parquet("../data/merged/acc_node.parquet")
segments_df = gpd.read_parquet("../data/merged/berlin_bike_accident_strava_risk_core_panel.parquet")

# Ensure expected CRS for folium (lat/lon)
junction_df = junction_df.to_crs(epsg=4326)
acc_node_df = acc_node_df.to_crs(epsg=4326)
segments_df = segments_df.to_crs(epsg=4326)

# Create longitude and latitude columns from geometry
if "longitude" not in junction_df.columns:
    junction_df["longitude"] = junction_df.geometry.x
if "latitude" not in junction_df.columns:
    junction_df["latitude"] = junction_df.geometry.y

if "longitude" not in acc_node_df.columns:
    acc_node_df["longitude"] = acc_node_df.geometry.x
if "latitude" not in acc_node_df.columns:
    acc_node_df["latitude"] = acc_node_df.geometry.y


acc_node_df


Unnamed: 0,acc_id,year,month,geometry,weekday_type,time_of_day,light_condition,accident_type,accident_kind,injury_severity,index_node,node_id,dist_node,has_crossing,longitude,latitude
0,29872,2021,4,POINT (13.48861 52.46449),weekday,work_hours (7h-18h),0.0,5,1,2,,,,False,13.488608,52.464495
1,25448,2024,3,POINT (13.19382 52.5337),weekend,work_hours (7h-18h),0.0,3,5,3,,,,False,13.193820,52.533695
2,9074,2019,9,POINT (13.60631 52.45275),weekday,night (22h-7h),1.0,2,5,3,,,,False,13.606309,52.452751
3,6943,2019,5,POINT (13.3133 52.57682),weekday,work_hours (7h-18h),0.0,2,5,3,,,,False,13.313300,52.576824
4,7115,2019,5,POINT (13.68352 52.37053),weekday,evening (18h-22h),0.0,1,0,1,,,,False,13.683520,52.370531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21661,2500,2018,6,POINT (13.34109 52.4852),weekday,work_hours (7h-18h),0.0,5,1,3,,,,False,13.341093,52.485197
21662,1817,2018,5,POINT (13.44447 52.53412),weekday,work_hours (7h-18h),0.0,7,6,3,,,,False,13.444473,52.534120
21663,32165,2021,9,POINT (13.45723 52.56107),weekday,evening (18h-22h),0.0,2,5,3,,,,False,13.457235,52.561067
21664,29554,2021,3,POINT (13.55801 52.45571),weekday,work_hours (7h-18h),0.0,2,5,3,1790.0,1852.0,15.145493,True,13.558009,52.455714


## Prepare spatial data

The following cell constructs a crossings GeoDataFrame with one geometry per `node_id`, derives longitude and latitude coordinates from segment geometries, and creates a coordinate-to-segment mapping for efficient spatial lookup.

In [30]:
# one geometry per node_id
crossings_gdf = (
    junction_df[["node_id", "geometry"]]
    .dropna(subset=["node_id", "geometry"])
    .drop_duplicates(subset=["node_id"])
    .copy()
)

crossings_gdf = gpd.GeoDataFrame(
    crossings_gdf,
    geometry="geometry",
    crs=junction_df.crs,
)

rep = segments_df.geometry.representative_point()
segments_df["longitude"] = rep.x
segments_df["latitude"] = rep.y

coords_to_segments = defaultdict(set)

for name, geom in zip(segments_df["counter_name"], segments_df.geometry):
    for lon, lat in geom.coords:
        coords_to_segments[(lat, lon)].add(name)


## Run routing for one OD pair

In [None]:
# Example origin-destination routing for Berlin
year = 2021
month = 6
metric_epsg = 32633
ETA = 1.0
EPS = 0.10

# These are Berlin coordinates (lon, lat)
origin_lonlat = (13.3777, 52.5163)    # Brandenburg Gate area
dest_lonlat = (13.4541, 52.5110)      # East Berlin

result_berlin = run_one_od_routing(
    segments_panel_gdf=segments_df,
    crossings_gdf=crossings_gdf,
    junction_panel_gdf=junction_df,
    year=year,
    month=month,
    origin_lonlat=origin_lonlat,
    dest_lonlat=dest_lonlat,
    eps=EPS,
    eta=ETA,
)

if result_berlin.get("status") == "disconnected":
    raise RuntimeError(f"OD pair disconnected for {year}-{month} after exposure filtering.")

G = result_berlin.get("graph", None)
if G is None:
    # fallback 
    artifacts = build_graph_with_risk_for_month(
        segments_panel_gdf=segments_df,
        crossings_gdf=crossings_gdf,
        junction_panel_gdf=junction_df,
        year=year,
        month=month,
        risk_cfg=RiskConfig(eta=ETA),
    )
    G = artifacts.G

# Shortest path geometry: choose_by length
shortest_geom = path_to_multiline_latlon(
    G, result_berlin["shortest_length_path"], metric_epsg=metric_epsg, choose_by="length_m"
)

# Safe path geometry: choose_by risk_total
safe_geom = path_to_multiline_latlon(
    G, result_berlin["constrained_min_risk_path"], metric_epsg=metric_epsg, choose_by="risk_total"
)


## Run routing over many OD pairs

In [9]:
# Extensive evaluation loop over MANY OD pairs

def _sample_reachable_od_pairs(
    G: nx.Graph,
    n_pairs: int,
    *,
    seed: int = 7,
    min_euclid_m: float = 1500.0,
    max_tries: int = 200000,
) -> list[tuple[NodeKey, NodeKey]]:
    """
    Samples OD pairs as graph nodes ensuring:
      - O != D
      - straight-line distance >= min_euclid_m
      - O and D are connected (same component) in the undirected sense
    """
    rng = random.Random(seed)
    nodes = list(G.nodes())
    if len(nodes) < 2:
        return []

    comp_map = {}
    for cid, comp in enumerate(nx.connected_components(G.to_undirected())):
        for n in comp:
            comp_map[n] = cid

    pairs: list[tuple[NodeKey, NodeKey]] = []
    tries = 0
    while len(pairs) < n_pairs and tries < max_tries:
        tries += 1
        a = rng.choice(nodes)
        b = rng.choice(nodes)
        if a == b:
            continue
        if comp_map.get(a) != comp_map.get(b):
            continue

        dx = float(a[0] - b[0])
        dy = float(a[1] - b[1])
        if math.sqrt(dx * dx + dy * dy) < min_euclid_m:
            continue

        pairs.append((a, b))

    return pairs


def evaluate_many_od_pairs(
    *,
    segments_panel_gdf: gpd.GeoDataFrame,
    crossings_gdf: gpd.GeoDataFrame,
    junction_panel_gdf: gpd.GeoDataFrame,
    year: int,
    month: int,
    n_pairs: int = 200,
    eps: float = 0.10,
    eta: float = 1.0, # junction importance
    metric_epsg: int = 32633,
    seed: int = 7,
    min_euclid_m: float = 1500.0,
) -> pd.DataFrame:
    """
    Builds month graph once, samples many reachable OD pairs on the graph,
    and evaluates:
      - shortest-by-length (P_dist)
      - optional shortest-by-cost (mixed objective)
      - constrained min-risk (P_safe) minimizing risk_total under length constraint

    Returns a DataFrame with per-OD statistics + metadata.
    """
    artifacts = build_graph_with_risk_for_month(
        segments_panel_gdf,
        year,
        month,
        crossings_gdf=crossings_gdf,
        junction_panel_gdf=junction_panel_gdf,
        graph_cfg=GraphBuildConfig(metric_epsg=metric_epsg),
        risk_cfg=RiskConfig(eta=eta),
        node_snap_m=20.0,
    )

    sanity = verify_graph_sanity(
        artifacts,
        expect_junction_penalties=(eta != 0.0),
    )
    G = artifacts.G

    od_pairs = _sample_reachable_od_pairs(G, n_pairs, seed=seed, min_euclid_m=min_euclid_m)
    if len(od_pairs) == 0:
        raise ValueError("Could not sample any reachable OD pairs. Check graph connectivity / size.")

    rows = []
    n_skipped = 0

    for i, (src, dst) in enumerate(od_pairs):
        # 1) Shortest length (P_dist)
        p_len = shortest_path_by(G, src, dst, weight="length_m")
        if p_len is None:
            n_skipped += 1
            continue
        st_len = route_stats(G, p_len, choose_by="length_m")
        shortest_len = float(st_len["length_m"])
        max_len = (1.0 + eps) * shortest_len

        
        # 2) Constrained min-risk minimizing risk_total
        p_safe = constrained_min_risk_route(
            G,
            src,
            dst,
            eps=eps,
            length_attr="length_m",
            risk_attr="risk_total",
        )
        if p_safe is None:
            n_skipped += 1
            continue
        st_safe = route_stats(G, p_safe, choose_by="risk_total")

        # Route-level risk in Methods is R(P) = sum risk_total
        R_len = float(st_len["risk_total_sum"])
        R_safe = float(st_safe["risk_total_sum"])

        row = {
            "pair_idx": i,
            "src_x": float(src[0]),
            "src_y": float(src[1]),
            "dst_x": float(dst[0]),
            "dst_y": float(dst[1]),

            "shortest_len_m": shortest_len,
            "shortest_risk_total_sum": R_len,

            "safe_len_m": float(st_safe["length_m"]),
            "safe_risk_total_sum": R_safe,

            "len_constraint_max_m": float(max_len),
            "safe_feasible": bool(float(st_safe["length_m"]) <= max_len + 1e-6),
        }

        
        rows.append(row)

    df = pd.DataFrame(rows)

    # Trade-offs relative to shortest-distance route
    df["delta_L"] = (df["safe_len_m"] - df["shortest_len_m"]) / df["shortest_len_m"]
    df["delta_R"] = (df["shortest_risk_total_sum"] - df["safe_risk_total_sum"]) / df["shortest_risk_total_sum"].replace(0, np.nan)

    
    # Metadata for reproducibility
    df.attrs["graph_sanity"] = sanity
    df.attrs["notes"] = artifacts.notes
    df.attrs["year"] = year
    df.attrs["month"] = month
    df.attrs["eps"] = eps
    df.attrs["eta"] = eta
    df.attrs["seed"] = seed
    df.attrs["min_euclid_m"] = min_euclid_m
    df.attrs["n_pairs_requested"] = n_pairs
    df.attrs["n_pairs_sampled"] = len(od_pairs)
    df.attrs["n_pairs_used"] = len(df)
    df.attrs["n_pairs_skipped"] = n_skipped

    return df


# Example usage:
eval_df = evaluate_many_od_pairs(
    segments_panel_gdf=segments_df,
    crossings_gdf=crossings_gdf,
    junction_panel_gdf=junction_df,
    year=2021, month=6,
    n_pairs=300,
    eps=0.10,
    eta=1.0,             
    seed=7,
    min_euclid_m=2000.0,
)
print(eval_df.attrs["graph_sanity"])
print(eval_df[["delta_L", "delta_R"]].describe())


{'n_nodes': 3074, 'n_edges': 4395, 'seg_risk_min': 0.0, 'seg_risk_median': 0.0, 'seg_risk_max': 4000.0, 'risk_total_median': 0.0, 'risk_total_max': 4000.0, 'length_median': 371.8095742150327, 'node_penalty_nonzero_share': 0.08964732650739476, 'graph_node_ids_attached': 2882, 'notes': 'Attached node_id to 2882/3074 graph nodes (snap<= 20.0m). Risk objective uses eta=1.0 for junction penalty weighting.'}
          delta_L     delta_R
count  300.000000  281.000000
mean     0.041628    0.738203
std      0.031892    0.340096
min      0.000000    0.000000
25%      0.010479    0.609771
50%      0.040833    0.902206
75%      0.068713    1.000000
max      0.099990    1.000000


## Loop routing over all months

In [12]:
results = []

ETA = 1.0        
EPS = 0.10       
N_PAIRS = 200

for (year, month), _ in segments_df.groupby(["year", "month"]):
    try:
        eval_df = evaluate_many_od_pairs(
            segments_panel_gdf=segments_df,
            crossings_gdf=crossings_gdf,
            junction_panel_gdf=junction_df,
            year=year,
            month=month,
            n_pairs=N_PAIRS,
            eps=EPS,
            eta=ETA,                
        )
    except ValueError as e:
        # e.g. no reachable OD pairs in this month
        print(f"[skip] {year}-{month}: {e}")
        continue

    if len(eval_df) == 0:
        print(f"[skip] {year}-{month}: empty evaluation frame")
        continue

    eval_df["year"] = year
    eval_df["month"] = month
    eval_df["eta"] = ETA
    eval_df["eps"] = EPS

    results.append(eval_df)

if len(results) == 0:
    raise RuntimeError("No valid OD-pair evaluations across any month.")

all_months_eval = pd.concat(results, ignore_index=True)


In [14]:
all_months_eval.attrs if hasattr(all_months_eval, "attrs") else "attrs lost on concat"

{}