# Bicycle Accidents x Geo Data x Strava Exposure Merge


#### Import some standard libraries and helper scripts:

In [1]:
import sys
from pathlib import Path

# Locate the project root so `import src...` works whether the kernel was
# started in the repository root or one level below it (e.g. notebooks/).
_cwd = Path.cwd().resolve()
_cwd_has_src = (_cwd / "src").exists()
_parent_has_src = (_cwd.parent / "src").exists()
# Step up one directory only when "src" is missing here but present in the parent.
_project_root = _cwd.parent if (not _cwd_has_src and _parent_has_src) else _cwd
sys.path.insert(0, str(_project_root))

import datetime as dt

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt

# Optional: reload project modules without restarting the kernel
import importlib
import src.accidents as accidents
import src.segments as segments
import src.strava_exposure as strava_exposure
import src.panels as panels
import src.nodes as nodes
# Re-import every project module so on-disk edits are picked up by this kernel.
_project_modules = [accidents, segments, strava_exposure, panels, nodes]
for _module in _project_modules:
    importlib.reload(_module)

from src.accidents import (
    ACCIDENT_COLUMNS_EN,
    assign_accidents_to_nearest_segment,
    load_accidents_raw,
    prepare_accidents_bike_berlin,
)
from src.segments import load_segment_geometry
from src.strava_exposure import (
    build_exposure_panel_segment_year_month,
    column_stability_summary,
    load_strava_berlin_data,
)
from src.panels import (
    aggregate_accidents_segment_year_month_rich,
    build_core_risk_panel,
    merge_exposure_and_accidents,
    sanity_check_merge,
)
from src.nodes import (
    assign_accidents_to_nearest_crossing,
    build_node_exposure_panel_from_segments,
    build_node_risk_panel,
    build_nodes_from_segment_endpoints,
    cluster_nodes_snap_grid,
    select_crossings_by_degree,
)


In [2]:
import sys

# Show which Python interpreter backs this kernel (useful to confirm the venv).
_interpreter_path = sys.executable
print(_interpreter_path)

/Users/tobias/src/data_literacy/.venv/bin/python


In [3]:
# Kernel cleanup: discard variables left over from pre-refactor runs.
# (For the pipeline variables this is equivalent to restarting the kernel.)
import gc
import sys

# Forget the removed module in case an earlier run still has it loaded
sys.modules.pop("src.merge_datasets", None)

# Previously computed objects/dataframes whose stale values must not survive
_STALE_PIPELINE_NAMES = (
    "seg",
    "clustering",
    "accidents_raw",
    "accidents_bike_berlin",
    "segment_geo_gdf",
    "segment_static",
    "strava_berlin_data",
    "summary_df",
    "final_exposure_ym",
    "accidents_agg_ym_rich",
    "merged_accidents_strava_ym",
    "core_panel",
    "nodes_raw",
    "node_points",
    "crossings_gdf",
    "crossing_ids",
    "segment_node_map",
    "node_exposure_ym",
    "node_panel_ym",
    "acc_node",
    "acc_node_ym",
    "joined_nearest_unique",
    "stats",
    "merge_keys",
    "min_year",
    "max_year",
    "out_dir",
    "out_path",
)
for _stale_name in _STALE_PIPELINE_NAMES:
    # pop with a default: names that were never defined are silently skipped
    globals().pop(_stale_name, None)

# Release the memory held by the objects dropped above
gc.collect()
print("kernel_cleanup_done")


kernel_cleanup_done


## Bicycle data for Berlin

In [4]:
import gc

# Load the raw accident table and report its size
accidents_raw = load_accidents_raw()
print("Raw accidents shape:", accidents_raw.shape)

# English column names come from the shared map in src.accidents
accident_columns_en = ACCIDENT_COLUMNS_EN
accidents_bike_berlin = prepare_accidents_bike_berlin(
    accidents_raw,
    column_map=accident_columns_en,
)
print(f"Filtered to bicycle accidents in Berlin -> shape: {accidents_bike_berlin.shape}")

# The raw frame is large; drop it now to leave memory for the Strava aggregation
del accidents_raw
gc.collect()

accidents_bike_berlin.head()


Raw accidents shape: (2098019, 32)
Filtered to bicycle accidents in Berlin -> shape: (33181, 32)


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,XGCSWGS84,YGCSWGS84,source_file,object_id_alt,light_condition,involved_goods_vehicle,accident_id_extended,oid,plausibility_level,fid
0,,,11,0,3,3,2018,1,15,4,...,13.403228,52.583472,Unfallorte2018_LinRef.csv,112747.0,1.0,0.0,,,,
1,,,11,0,3,3,2018,1,11,5,...,13.432186,52.535255,Unfallorte2018_LinRef.csv,112892.0,0.0,0.0,,,,
2,,,11,0,2,2,2018,1,8,2,...,13.470897,52.514173,Unfallorte2018_LinRef.csv,112902.0,0.0,0.0,,,,
3,,,11,0,1,1,2018,1,19,4,...,13.394673,52.510848,Unfallorte2018_LinRef.csv,112921.0,2.0,0.0,,,,
4,,,11,0,9,9,2018,1,18,4,...,13.506372,52.458993,Unfallorte2018_LinRef.csv,112947.0,2.0,0.0,,,,


## Inspect accident columns (already renamed to English)

In [5]:
# The column map lives in src.accidents; keep a local alias for later cells
accident_columns_en = ACCIDENT_COLUMNS_EN

# (Optional) per-column overview: number of distinct non-null values + a short preview
for _column in accidents_bike_berlin.columns:
    _values = accidents_bike_berlin[_column]
    _n_unique = _values.nunique(dropna=True)
    _preview = _values.head(5).tolist()
    print(f"{_column}: uniques={_n_unique}; first5={_preview}")


object_id: uniques=14772; first5=[nan, nan, nan, nan, nan]
accident_id: uniques=0; first5=[nan, nan, nan, nan, nan]
land_code: uniques=1; first5=[11, 11, 11, 11, 11]
admin_region_code: uniques=1; first5=[0, 0, 0, 0, 0]
district_code: uniques=12; first5=[3, 3, 2, 1, 9]
municipality_code: uniques=12; first5=[3, 3, 2, 1, 9]
year: uniques=7; first5=[2018, 2018, 2018, 2018, 2018]
month: uniques=12; first5=[1, 1, 1, 1, 1]
hour: uniques=24; first5=[15, 11, 8, 19, 18]
weekday: uniques=7; first5=[4, 5, 2, 4, 4]
injury_severity: uniques=3; first5=[3, 3, 3, 3, 2]
accident_kind: uniques=10; first5=[6, 5, 5, 5, 5]
accident_type: uniques=7; first5=[7, 2, 2, 7, 3]
involved_bicycle: uniques=1; first5=[1, 1, 1, 1, 1]
involved_passenger_car: uniques=2; first5=[0, 1, 1, 1, 1]
involved_pedestrian: uniques=2; first5=[1, 0, 0, 0, 0]
involved_motorcycle: uniques=2; first5=[0, 0, 0, 0, 0]
involved_other_vehicle: uniques=2; first5=[0, 0, 0, 0, 0]
light_condition_old: uniques=0; first5=[nan, nan, nan, nan, nan]

In [6]:
# Load the segment geometry in the project's canonical CRS (EPSG:32633)
seg = load_segment_geometry(canonical_crs="EPSG:32633")

# Unpack the pieces of the result that later cells use
segment_geo_gdf, segment_static = seg.segments_gdf, seg.segment_static
CANONICAL_CRS = seg.canonical_crs

segment_geo_gdf.head()


Unnamed: 0,geometry,counter_name,latitude,longitude
0,"LINESTRING (388283.894 5816533.578, 388349.119...",streetsegment_0,52.486743,13.35535
1,"LINESTRING (389240.438 5813521.134, 389260.513...",streetsegment_1,52.461885,13.369878
2,"LINESTRING (388562.846 5831195.503, 388578.563...",streetsegment_2,52.61982,13.354749
3,"LINESTRING (388683.345 5831306.663, 388717.453...",streetsegment_3,52.620476,13.357354
4,"LINESTRING (386530.508 5820675.884, 386544.692...",streetsegment_4,52.524039,13.328604


## Spatial Join: Accidents with Strava data (code from Luise and Eric) + edited by Tobi to achieve canonical geometry data


### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [7]:
# Map every accident to its single nearest segment, ignoring matches beyond max_distance_m
joined_nearest_unique = assign_accidents_to_nearest_segment(
    accidents_bike_berlin,
    segment_geo_gdf,
    canonical_crs=CANONICAL_CRS,
    max_distance_m=10,
)

# Summary of how much of the accident data could be matched
_n_accidents = len(accidents_bike_berlin)
_n_segments = len(segment_geo_gdf)
_n_assigned = len(joined_nearest_unique)
_n_matched_segments = joined_nearest_unique["counter_name"].nunique()

print(f"Total accidents: {_n_accidents}")
print(f"Total bike network Strava segments: {_n_segments}")
print(f"Unique Strava segments in matched dataset: {_n_matched_segments}")
print(f"Accidents assigned to segments: {_n_assigned}")
print(f"Ratio of assigned accidents: {_n_assigned / _n_accidents:.2%}")

joined_nearest_unique.head()


Total accidents: 33181
Total bike network Strava segments: 4958
Unique Strava segments in matched dataset: 3570
Accidents assigned to segments: 21666
Ratio of assigned accidents: 65.30%


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,oid,plausibility_level,fid,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,,,11,0,9,9,2021,4,9,4,...,210679.0,,,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,,,11,0,5,5,2024,3,9,1,...,235521.0,1.0,,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,199348.0,,11,0,9,9,2019,9,6,4,...,,,,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,194582.0,,11,0,12,12,2019,5,16,5,...,,,,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,194994.0,,11,0,9,9,2019,5,18,3,...,,,,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


## Strava data (bicycle network traffic, other features - daily)

1. We need to aggregate this DataFrame to the same granularity as the accident data (segment, year, month, weekday) before joining.
2. We cannot join on geometry alone: the accident data has no date column, whereas Strava contains daily information (e.g. the traffic volume or weather on a specific day).
3. We cannot simply take the mean of every Strava column per year-month-etc., because some features are categorical (for example `infrastructure_bicyclelane_type`) and some are constant over time for a segment (e.g. `infrastructure_max_speed` has the same value for a segment on all dates).

In [8]:
# Load the daily Strava table for Berlin and list the columns it provides
strava_berlin_data = load_strava_berlin_data()
list(strava_berlin_data.columns)


['counter_name',
 'date',
 'count',
 'year',
 'latitude',
 'longitude',
 'geometry',
 'socioeconomic_total_population',
 'socioeconomic_share_residents_5plus_years_same_address',
 'socioeconomic_net_migration_per_100',
 'socioeconomic_migration_volume_per_100',
 'socioeconomic_share_under_18',
 'socioeconomic_share_65_and_older',
 'socioeconomic_youth_dependency_ratio',
 'socioeconomic_old_age_dependency_ratio',
 'socioeconomic_average_age',
 'socioeconomic_greying_index',
 'socioeconomic_share_with_migration_background',
 'socioeconomic_share_foreign_nationals',
 'socioeconomic_share_foreign_eu_nationals',
 'socioeconomic_share_foreign_non_eu_nationals',
 'socioeconomic_gender_distribution',
 'socioeconomic_total_fertility_rate',
 'socioeconomic_unemployment_rate_age_15_to_65',
 'infrastructure_count_education_within0.05km',
 'infrastructure_count_hospitals_within0.05km',
 'infrastructure_count_shops_within0.05km',
 'infrastructure_count_industry_within0.05km',
 'infrastructure_count_

### Which data types we have as features?

Results:

1. Most features are numerical, but there are also categorical ones such as `'infrastructure_bicyclelane_type'` — we will check whether they need to be aggregated or are constant over time.
2. Analysis shows:
    - **Numeric columns (111)**: Traffic counts, speeds, socioeconomic indicators, weather data
    - **Categorical columns**: Infrastructure types, activity types, street properties
    - **Boolean columns (8)**: Holiday flags, weekend indicators, data quality flags
3. **Key finding**: All connectivity and infrastructure columns are constant per segment, so they only need to be taken once per segment. Socioeconomic, motorized, strava, and weather columns vary over time and require aggregation by year-month-weekday.

In [9]:
# Split the Strava columns into three mutually exclusive dtype groups.
df = strava_berlin_data
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
# "number" does not cover bool, so bool has to be excluded explicitly here;
# otherwise every boolean column would be listed as categorical as well.
categorical_cols = df.select_dtypes(exclude=["number", "bool"]).columns.tolist()

print("Numeric:", len(numeric_cols))
print(numeric_cols)
print("\nCategorical:", len(categorical_cols))
print(categorical_cols)
print("\nBool:", len(bool_cols))
print(bool_cols)


Numeric: 111
['count', 'latitude', 'longitude', 'socioeconomic_total_population', 'socioeconomic_share_residents_5plus_years_same_address', 'socioeconomic_net_migration_per_100', 'socioeconomic_migration_volume_per_100', 'socioeconomic_share_under_18', 'socioeconomic_share_65_and_older', 'socioeconomic_youth_dependency_ratio', 'socioeconomic_old_age_dependency_ratio', 'socioeconomic_average_age', 'socioeconomic_greying_index', 'socioeconomic_share_with_migration_background', 'socioeconomic_share_foreign_nationals', 'socioeconomic_share_foreign_eu_nationals', 'socioeconomic_share_foreign_non_eu_nationals', 'socioeconomic_gender_distribution', 'socioeconomic_total_fertility_rate', 'socioeconomic_unemployment_rate_age_15_to_65', 'infrastructure_count_education_within0.05km', 'infrastructure_count_hospitals_within0.05km', 'infrastructure_count_shops_within0.05km', 'infrastructure_count_industry_within0.05km', 'infrastructure_count_hotels_within0.05km', 'infrastructure_count_education_withi

### Check which features are constant over time within a segment, so we don't need to aggregate them further

In [10]:
# Per column: in how many segments does the value change over time?
summary_df = column_stability_summary(
    strava_berlin_data,
    group_col="counter_name",
)
summary_df.head(n=10)


Unnamed: 0,column,segments_total,segments_varying,max_unique_within_any_segment
0,infrastructure_commercial_area_percent,4958,0,1
1,infrastructure_cemetery_percent,4958,0,1
2,infrastructure_brach3_percent,4958,0,1
3,infrastructure_brach2_percent,4958,0,1
4,infrastructure_brach1_percent,4958,0,1
5,infrastructure_baustelle_percent,4958,0,1
6,infrastructure_horticulture_percent,4958,0,1
7,infrastructure_arable_land_percent,4958,0,1
8,infrastructure_str_flges_percent,4958,0,1
9,infrastructure_public_facilities_percent,4958,0,1


In [11]:
# Tag every summarised column with a coarse dtype bucket.
col_dtype = strava_berlin_data.dtypes


def _dtype_bucket(dtype) -> str:
    """Classify a column dtype as "bool", "numeric" or "categorical".

    Uses the pandas dtype predicates rather than ``np.issubdtype`` so that
    pandas extension dtypes (e.g. category or nullable integer/boolean) are
    classified instead of raising ``TypeError``.
    """
    # Bool is tested first because pandas treats bool as a numeric dtype.
    if pd.api.types.is_bool_dtype(dtype):
        return "bool"
    if pd.api.types.is_numeric_dtype(dtype):
        return "numeric"
    return "categorical"


summary_df["dtype_bucket"] = summary_df["column"].map(
    lambda c: _dtype_bucket(col_dtype[c])
)

# Overall constant/varying summary
overall_stats = {
    "total_columns": len(summary_df),
    "constant_columns": int((summary_df["segments_varying"] == 0).sum()),
    "varying_columns": int((summary_df["segments_varying"] > 0).sum()),
}
# max(..., 1) guards against division by zero for an empty summary
overall_stats["percent_constant"] = round(
    overall_stats["constant_columns"]
    / max(overall_stats["total_columns"], 1)
    * 100,
    1,
)

print("Overall column stability:")
for key, value in overall_stats.items():
    print(f"  {key}: {value}")

# Same statistics, broken down by dtype bucket
dtype_counts = (
    summary_df
    .groupby(["dtype_bucket"])
    .agg(
        total_cols=("column", "count"),
        constant_cols=("segments_varying", lambda s: (s == 0).sum()),
        varying_cols=("segments_varying", lambda s: (s > 0).sum()),
    )
)

dtype_counts["percent_constant"] = (
    dtype_counts["constant_cols"] / dtype_counts["total_cols"] * 100
).round(1)

display(dtype_counts.sort_values("percent_constant", ascending=False))

Overall column stability:
  total_columns: 136
  constant_columns: 70
  varying_columns: 66
  percent_constant: 51.5


Unnamed: 0_level_0,total_cols,constant_cols,varying_cols,percent_constant
dtype_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
categorical,17,12,5,70.6
bool,8,4,4,50.0
numeric,111,54,57,48.6


### Result: All connectivity and infrastructure columns are constant per segment. Socioeconomic, Motorized and weather columns vary, so we need to aggregate them.

- Connectivity (7/7 constant, 2 bool, 5 numeric): treat as static attributes per segment; just carry a single value (e.g., first).
- Infrastructure (58/58 constant, 1 bool, 10 categorical, 47 numeric): fully static; keep one value per segment, no temporal aggregation needed.
- **Other (14 cols, 5 constant/9 varying; 5 bool/6 cat/3 num): mixed bag — decide column by column; reassign misfiled cols if any.**
- Motorized (12/12 varying, all numeric): fully time-varying; aggregate over your time buckets (sum for counts, mean for speeds).
- Socioeconomic (17/17 varying, numeric): varies across time in the data; aggregate over your time buckets (typically mean — these are shares, rates, and population figures rather than counts or speeds).
- **Strava (19/19 varying; 1 categorical, 18 numeric): counts/speeds should be summed/averaged per time bucket; how to aggregate the single categorical (`strava_activity_type`) is still open — TODO.**
- Weather (9/9 varying, numeric): time-varying; aggregate with mean (or min/max if useful).



## Aggregation of Berlin Strava data 
1. Aggregation keys: counter_name (segment), year, month, weekday (to align with accidents). Note: the exposure panel built below is aggregated at segment × year × month.
2. Constant features stay as-is (no aggregation) since they don’t vary over time.

### This code execution can take a while, on Liaisan's pc ~13 minutes.

# Segment level risk

In [12]:
# --- 1. Exposure panel (segment x year x month) from the Strava/sensor data ---
_exposure_inputs = {
    "segment_static": segment_static,
    "summary_df": summary_df,
}
final_exposure_ym = build_exposure_panel_segment_year_month(
    strava_berlin_data,
    **_exposure_inputs,
)

print("Exposure panel (segment–year–month) shape:", final_exposure_ym.shape)
final_exposure_ym.head()


Exposure panel (segment–year–month) shape: (297480, 132)


Unnamed: 0,counter_name,year,month,sum_count,sum_strava_total_trip_count,sum_strava_ride_count,sum_strava_ebike_ride_count,sum_strava_total_people_count,sum_strava_total_commute_trip_count,sum_strava_total_leisure_trip_count,...,infrastructure_count_shops_within0.25km,infrastructure_count_industry_within0.25km,infrastructure_count_hotels_within0.25km,infrastructure_count_education_within0.5km,infrastructure_count_hospitals_within0.5km,infrastructure_count_industry_within0.1km,infrastructure_count_hotels_within0.5km,infrastructure_count_industry_within0.5km,infrastructure_count_shops_within0.5km,geometry
0,streetsegment_3572,2019,1,5866.0,5.0,5.0,0.0,5.0,0.0,5.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
1,streetsegment_3572,2019,2,8437.0,30.0,25.0,0.0,30.0,0.0,30.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
2,streetsegment_3572,2019,3,9993.0,40.0,30.0,0.0,40.0,0.0,40.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
3,streetsegment_3572,2019,4,14961.0,55.0,45.0,0.0,55.0,0.0,55.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
4,streetsegment_3572,2019,5,16605.0,65.0,65.0,0.0,65.0,0.0,65.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."


## Aggregate accidents

In [13]:
# --- 2. Rich accident panel (segment x year x month) from Unfallatlas + segments ---
# The year range of the exposure panel is passed on to the aggregation
_exposure_years = final_exposure_ym["year"]
min_year, max_year = int(_exposure_years.min()), int(_exposure_years.max())

accidents_agg_ym_rich = aggregate_accidents_segment_year_month_rich(
    joined_nearest_unique,
    column_map=accident_columns_en,
    exposure_year_min=min_year,
    exposure_year_max=max_year,
)

print("Rich accident aggregate (segment–year–month) shape:", accidents_agg_ym_rich.shape)
accidents_agg_ym_rich.head()


Rich accident aggregate (segment–year–month) shape: (14085, 64)


Unnamed: 0,counter_name,year,month,total_accidents,acc_involved_bicycle_count,acc_involved_passenger_car_count,acc_involved_pedestrian_count,acc_involved_motorcycle_count,acc_involved_goods_vehicle_count,acc_involved_other_vehicle_count,...,acc_accident_type_share_4,acc_accident_type_share_5,acc_accident_type_share_6,acc_accident_type_share_7,acc_light_condition_count_0.0,acc_light_condition_count_1.0,acc_light_condition_count_2.0,acc_light_condition_share_0.0,acc_light_condition_share_1.0,acc_light_condition_share_2.0
0,streetsegment_0,2019,1,1,1,1,0,0,0.0,0,...,0.0,0.0,0.0,1.0,1,0,0,1.0,0.0,0.0
1,streetsegment_0,2019,4,1,1,1,0,0,0.0,0,...,0.0,0.0,0.0,1.0,1,0,0,1.0,0.0,0.0
2,streetsegment_0,2019,7,1,1,0,0,1,0.0,0,...,0.0,0.0,0.0,0.0,1,0,0,1.0,0.0,0.0
3,streetsegment_0,2019,9,1,1,1,0,0,0.0,0,...,0.0,0.0,0.0,1.0,1,0,0,1.0,0.0,0.0
4,streetsegment_0,2020,5,1,1,1,0,0,0.0,0,...,0.0,1.0,0.0,0.0,0,0,1,0.0,0.0,1.0


## Merge datasets

In [14]:
# --- 3. Merge exposure and accident panels into a risk panel ---

merge_keys = ["counter_name", "year", "month"]

merged_accidents_strava_ym = merge_exposure_and_accidents(
    final_exposure_ym,
    accidents_agg_ym_rich,
    merge_keys=merge_keys,
)

print(
    "Merged risk panel (segment–year–month) shape:",
    merged_accidents_strava_ym.shape,
)

# A bare `.head()` is only rendered when it is the last expression of a cell;
# this one is followed by the save step, so it has to be displayed explicitly.
display(merged_accidents_strava_ym.head())

# Save the merged panel as a GeoDataFrame to a parquet file
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_strava_panel.parquet"

gpd.GeoDataFrame(
    merged_accidents_strava_ym,
    geometry="geometry",
    crs=segment_geo_gdf.crs,  # same CRS as the segment geometry
).to_parquet(
    out_path,
    index=False,
)

Merged risk panel (segment–year–month) shape: (297480, 193)


## Sanity check of the merge

In [15]:
# Compare the merged panel against both of its inputs
stats = sanity_check_merge(
    merged_accidents_strava_ym=merged_accidents_strava_ym,
    accidents_agg_ym_rich=accidents_agg_ym_rich,
    final_exposure_ym=final_exposure_ym,
)

# One "name: value" line per reported check
for _stat_name, _stat_value in stats.items():
    print(f"{_stat_name}: {_stat_value}")


segments_with_accidents: 14085
segments_without_accidents: 283395
accident_groups_missing_exposure: 0
merged_total_accidents: 15398.0
source_total_accidents: 15398.0
lost_accidents_due_to_missing_exposure: 0.0


# Playground: create and test smaller dataset versions. The smaller ones are derived from the completely merged version.

In [16]:
# Reduce the fully merged panel to the core risk columns
core_panel = build_core_risk_panel(merged_accidents_strava_ym)
print("Core panel shape:", core_panel.shape)

# A bare `.head(20)` is only rendered when it is the last expression of a cell;
# this one is followed by the save step, so it has to be displayed explicitly.
display(core_panel.head(20))

# Save the core panel as a GeoDataFrame to a parquet file
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_strava_risk_core_panel.parquet"

gpd.GeoDataFrame(
    core_panel,
    geometry="geometry",
    crs=segment_geo_gdf.crs,  # same CRS as the segment geometry
).to_parquet(
    out_path,
    index=False,
)

Core panel shape: (297480, 56)


In [17]:
# How many distinct monthly accident counts occur in the core panel?
_accident_counts = core_panel["total_accidents"]
unique_accident_counts = _accident_counts.nunique()
print(f"Unique accident counts in core panel: {unique_accident_counts}")

# ...and which values are they?
print("Unique accident counts:", _accident_counts.unique())




Unique accident counts in core panel: 7
Unique accident counts: [0. 2. 1. 3. 4. 5. 6.]


In [18]:
# Sum the monthly accident counts per segment to verify the per-segment totals
accidents_per_segment = core_panel.groupby("counter_name", as_index=False).agg(
    total_accidents_segment=("total_accidents", "sum")
)

# The ten segments with the most accidents over the whole period
top_segments = accidents_per_segment.sort_values(
    "total_accidents_segment",
    ascending=False,
).head(10)
print("Top 10 segments by total accidents:")
display(top_segments)

Top 10 segments by total accidents:


Unnamed: 0,counter_name,total_accidents_segment
1662,streetsegment_2494,57.0
2861,streetsegment_3573,55.0
2740,streetsegment_3464,44.0
1038,streetsegment_1932,44.0
1362,streetsegment_2223,41.0
1376,streetsegment_2236,41.0
2552,streetsegment_3295,41.0
1268,streetsegment_2139,37.0
2417,streetsegment_3173,34.0
4103,streetsegment_4691,33.0


# Crossing (junction) risk

### Build nodes (junction candidates) from segment endpoints

In [19]:
# Every segment contributes its start and end point as a junction candidate
nodes_raw = build_nodes_from_segment_endpoints(segment_geo_gdf, counter_col="counter_name")

nodes_raw.head()

Unnamed: 0,counter_name,role,geometry
0,streetsegment_0,start,POINT (388283.894 5816533.578)
1,streetsegment_0,end,POINT (388370.886 5816366.868)
2,streetsegment_1,start,POINT (389240.438 5813521.134)
3,streetsegment_1,end,POINT (389260.513 5813802.481)
4,streetsegment_2,start,POINT (388562.846 5831195.503)


### Cluster endpoints into nodes (snap grid)

In [20]:
# Merge endpoints into shared nodes using a snap grid with tol_m=2
clustering = cluster_nodes_snap_grid(nodes_raw, tol_m=2, counter_col="counter_name")

# Unpack the clustering result; nodes_raw is replaced by the version it returns
nodes_raw, node_points, segment_node_map = (
    clustering.nodes_raw,
    clustering.node_points,
    clustering.segment_node_map,
)

print("Nodes (raw endpoints):", len(nodes_raw))
print("Nodes (clustered):", len(node_points))
segment_node_map.head()

Nodes (raw endpoints): 9916
Nodes (clustered): 3155


Unnamed: 0,counter_name,node_id,role
0,streetsegment_0,0,start
1,streetsegment_0,1,end
2,streetsegment_1,2,start
3,streetsegment_1,3,end
4,streetsegment_2,4,start


### Define crossings (nodes with degree $\geq$ 3)

In [21]:
# A node counts as a crossing when its degree is at least 3
crossing_ids = select_crossings_by_degree(nodes_raw, min_degree=3, counter_col="counter_name")

# Keep only the node points that were selected as crossings
_is_crossing = node_points["node_id"].isin(crossing_ids)
crossings_gdf = node_points.loc[_is_crossing].copy()

print("Crossings (degree >= 3):", len(crossings_gdf))
crossings_gdf.head()

Crossings (degree >= 3): 2924


Unnamed: 0,node_id,geometry
0,0,POINT (388283.894 5816533.578)
1,1,POINT (388370.886 5816366.868)
2,2,POINT (389240.438 5813521.134)
3,3,POINT (389260.513 5813802.481)
4,4,POINT (388562.846 5831195.503)


### Assign accidents to nearest crossing

In [26]:
# Keep only the accident point rows we actually want to map to junctions
# (still restricted to the Strava study area, because joined_nearest_unique is already filtered)
study_area_accidents = joined_nearest_unique[["acc_id", "year", "month", "geometry"]].copy()

acc_node, acc_node_ym = assign_accidents_to_nearest_crossing(
    study_area_accidents,
    crossings_gdf,
    max_distance_m=20,
)

print("Study-area accidents:", len(acc_node))
print("Assigned to a crossing:", int(acc_node["has_crossing"].sum()))
print("Unassigned (kept):", int((~acc_node["has_crossing"]).sum()))
print("Accident groups (node×year×month):", len(acc_node_ym))

# A bare `.head()` is only rendered when it is the last expression of a cell;
# this one is followed by the save step, so it has to be displayed explicitly.
display(acc_node_ym.head())

# Save accident->junction mapping for visualization / debugging
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

acc_node.to_parquet(
    out_dir / "acc_node.parquet",
    index=False,
)


Study-area accidents: 21666
Assigned to a crossing: 6846
Unassigned (kept): 14820
Accident groups (node×year×month): 6491


### Build node-level exposure from segment flows

In [23]:
# Derive crossing-level exposure from the segment-level Strava trip counts
_node_exposure_args = (final_exposure_ym, segment_node_map, crossing_ids)
node_exposure_ym = build_node_exposure_panel_from_segments(
    *_node_exposure_args,
    trip_col="sum_strava_total_trip_count",
)

print("Node exposure (node×year×month) shape:", node_exposure_ym.shape)
node_exposure_ym.head()

Node exposure (node×year×month) shape: (175440, 4)


Unnamed: 0,node_id,year,month,monthly_strava_trips
0,0,2019,1,345.0
1,0,2019,2,580.0
2,0,2019,3,705.0
3,0,2019,4,1165.0
4,0,2019,5,1240.0


### Combine into node-level risk panel

In [24]:
# Combine node exposure, node-level accident counts and crossing geometry
node_panel_ym = build_node_risk_panel(
    node_exposure_ym,
    acc_node_ym,
    crossings_gdf,
)

print("Node panel (crossing x year x month) shape:", node_panel_ym.shape)

# A bare `.head()` is only rendered when it is the last expression of a cell;
# this one is followed by the save step, so it has to be displayed explicitly.
display(node_panel_ym.head())

# Save the node panel to a parquet file
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_node_panel.parquet"

node_panel_ym.to_parquet(
    out_path,
    index=False,
)

Node panel (crossing x year x month) shape: (175440, 8)


In [25]:
# Summary statistics for every column of the node panel (numeric and non-numeric)
_describe_options = {"include": "all"}
node_panel_ym.describe(**_describe_options)

Unnamed: 0,node_id,year,month,monthly_strava_trips,total_accidents,geometry,risk_accidents_per_trip,risk_accidents_per_10k_trips
count,175440.0,175440.0,175440.0,175440.0,175440.0,175440,166995.0,166995.0
unique,,,,,,2924,,
top,,,,,,POINT (388283.8938791263 5816533.57797254),,
freq,,,,,,60,,
mean,1517.834473,2021.0,6.5,1415.950467,0.027611,,5.7e-05,0.56754
std,893.653188,1.414218,3.452062,2222.060432,0.172861,,0.001675,16.74964
min,0.0,2019.0,1.0,0.0,0.0,,0.0,0.0
25%,743.75,2020.0,3.75,165.0,0.0,,0.0,0.0
50%,1496.5,2021.0,6.5,660.0,0.0,,0.0,0.0
75%,2278.25,2022.0,9.25,1760.0,0.0,,0.0,0.0
