# Bicycle Accidents x Geo Data x Strava Exposure Merge


#### Import some standard libraries and helper scripts:

In [39]:
import sys
from pathlib import Path

# Ensure project root is importable when running from notebooks/
_project_root = Path.cwd().resolve()
if not (_project_root / "src").exists() and (_project_root.parent / "src").exists():
    _project_root = _project_root.parent
sys.path.insert(0, str(_project_root))

import datetime as dt

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt

# Optional: reload project modules without restarting the kernel
import importlib
import src.accidents as accidents
import src.segments as segments
import src.strava_exposure as strava_exposure
import src.panels as panels
import src.nodes as nodes
for _m in (accidents, segments, strava_exposure, panels, nodes):
    importlib.reload(_m)

from src.accidents import (
    ACCIDENT_COLUMNS_EN,
    assign_accidents_to_nearest_segment,
    load_accidents_raw,
    prepare_accidents_bike_berlin,
)
from src.segments import load_segment_geometry
from src.strava_exposure import (
    build_exposure_panel_segment_year_month,
    column_stability_summary,
    load_strava_berlin_data,
)
from src.panels import (
    aggregate_accidents_segment_year_month_rich,
    build_core_risk_panel,
    merge_exposure_and_accidents,
    sanity_check_merge,
)
from src.nodes import (
    assign_accidents_to_nearest_crossing,
    build_node_exposure_panel_from_segments,
    build_node_risk_panel,
    build_nodes_from_segment_endpoints,
    cluster_nodes_snap_grid,
    select_crossings_by_degree,
)


In [3]:
import sys
print(sys.executable)

/Users/laysan/Desktop/University/data_literacy_/data_literacy/venv/bin/python


In [4]:
# Kernel cleanup: drop old variables from pre-refactor runs
# (Equivalent to a kernel restart for the pipeline variables.)
import gc
import sys

# Ensure the removed module isn't lingering in memory
sys.modules.pop("src.merge_datasets", None)

# Drop previously computed objects/dataframes so we don't keep stale types around
for _name in [
    "seg",
    "clustering",
    "accidents_raw",
    "accidents_bike_berlin",
    "segment_geo_gdf",
    "segment_static",
    "strava_berlin_data",
    "summary_df",
    "final_exposure_ym",
    "accidents_agg_ym_rich",
    "merged_accidents_strava_ym",
    "core_panel",
    "nodes_raw",
    "node_points",
    "crossings_gdf",
    "crossing_ids",
    "segment_node_map",
    "node_exposure_ym",
    "node_panel_ym",
    "acc_node",
    "acc_node_ym",
    "joined_nearest_unique",
    "stats",
    "merge_keys",
    "min_year",
    "max_year",
    "out_dir",
    "out_path",
]:
    globals().pop(_name, None)

gc.collect()
print("kernel_cleanup_done")


kernel_cleanup_done


## Bicycle data for Berlin

In [5]:
import gc

accidents_raw = load_accidents_raw()
print("Raw accidents shape:", accidents_raw.shape)

accident_columns_en = ACCIDENT_COLUMNS_EN

accidents_bike_berlin = prepare_accidents_bike_berlin(accidents_raw, column_map=accident_columns_en)
print(f"Filtered to bicycle accidents in Berlin -> shape: {accidents_bike_berlin.shape}")

# Free big raw dataframe early to keep memory low for Strava aggregation
del accidents_raw
gc.collect()

accidents_bike_berlin.head()


Raw accidents shape: (2098019, 32)
Filtered to bicycle accidents in Berlin -> shape: (33181, 32)


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,XGCSWGS84,YGCSWGS84,source_file,object_id_alt,light_condition,involved_goods_vehicle,accident_id_extended,oid,plausibility_level,fid
0,,,11,0,3,3,2018,1,15,4,...,13.403228,52.583472,Unfallorte2018_LinRef.csv,112747.0,1.0,0.0,,,,
1,,,11,0,3,3,2018,1,11,5,...,13.432186,52.535255,Unfallorte2018_LinRef.csv,112892.0,0.0,0.0,,,,
2,,,11,0,2,2,2018,1,8,2,...,13.470897,52.514173,Unfallorte2018_LinRef.csv,112902.0,0.0,0.0,,,,
3,,,11,0,1,1,2018,1,19,4,...,13.394673,52.510848,Unfallorte2018_LinRef.csv,112921.0,2.0,0.0,,,,
4,,,11,0,9,9,2018,1,18,4,...,13.506372,52.458993,Unfallorte2018_LinRef.csv,112947.0,2.0,0.0,,,,


In [57]:
# --- Add temporal features: weekday_type and time_of_day ---
# These will be aggregated as categorical features in the rich accident panel

# weekday_type: weekday (Mon-Fri) vs weekend (Sat-Sun)
# Day of the week: 1=Sunday, 2=Monday, ..., 7=Saturday
if "weekday" in accidents_bike_berlin.columns:
    accidents_bike_berlin["weekday_type"] = accidents_bike_berlin["weekday"].map(
        lambda x: "weekday" if x in [2, 3, 4, 5, 6] else "weekend"
    )
    print("Added weekday_type column (1=Sunday, 7=Saturday)")
else:
    print("Warning: 'weekday' column not found, skipping weekday_type")

# time_of_day: work_hours (7-18), evening (18-22), night (22-7)
if "hour" in accidents_bike_berlin.columns:
    def _classify_time_of_day(hour):
        if pd.isna(hour):
            return None
        h = int(hour)
        if 7 <= h < 18:
            return "work_hours (7h-18h)"
        elif 18 <= h < 22:
            return "evening (18h-22h)"
        else:
            return "night (22h-7h)"
    
    accidents_bike_berlin["time_of_day"] = accidents_bike_berlin["hour"].map(_classify_time_of_day)
    print("Added time_of_day column")
else:
    print("Warning: 'hour' column not found, skipping time_of_day")

# Show distribution of new features
if "weekday_type" in accidents_bike_berlin.columns:
    print("\nweekday_type distribution:")
    print(accidents_bike_berlin["weekday_type"].value_counts())

if "time_of_day" in accidents_bike_berlin.columns:
    print("\ntime_of_day distribution:")
    print(accidents_bike_berlin["time_of_day"].value_counts())

Added weekday_type column (1=Sunday, 7=Saturday)
Added time_of_day column

weekday_type distribution:
weekday_type
weekday    27921
weekend     5260
Name: count, dtype: int64

time_of_day distribution:
time_of_day
work_hours (7h-18h)    24714
evening (18h-22h)       5960
night (22h-7h)          2507
Name: count, dtype: int64


## Rename columns to English

In [58]:
# Column map is now maintained in src.accidents
accident_columns_en = ACCIDENT_COLUMNS_EN

# (Optional) quick per-column uniqueness scan
for col in accidents_bike_berlin.columns:
    uniq_cnt = accidents_bike_berlin[col].nunique(dropna=True)
    first_vals = accidents_bike_berlin[col].head(5).tolist()
    print(f"{col}: uniques={uniq_cnt}; first5={first_vals}")


object_id: uniques=14772; first5=[nan, nan, nan, nan, nan]
accident_id: uniques=0; first5=[nan, nan, nan, nan, nan]
land_code: uniques=1; first5=[11, 11, 11, 11, 11]
admin_region_code: uniques=1; first5=[0, 0, 0, 0, 0]
district_code: uniques=12; first5=[3, 3, 2, 1, 9]
municipality_code: uniques=12; first5=[3, 3, 2, 1, 9]
year: uniques=7; first5=[2018, 2018, 2018, 2018, 2018]
month: uniques=12; first5=[1, 1, 1, 1, 1]
hour: uniques=24; first5=[15, 11, 8, 19, 18]
weekday: uniques=7; first5=[4, 5, 2, 4, 4]
injury_severity: uniques=3; first5=[3, 3, 3, 3, 2]
accident_kind: uniques=10; first5=[6, 5, 5, 5, 5]
accident_type: uniques=7; first5=[7, 2, 2, 7, 3]
involved_bicycle: uniques=1; first5=[1, 1, 1, 1, 1]
involved_passenger_car: uniques=2; first5=[0, 1, 1, 1, 1]
involved_pedestrian: uniques=2; first5=[1, 0, 0, 0, 0]
involved_motorcycle: uniques=2; first5=[0, 0, 0, 0, 0]
involved_other_vehicle: uniques=2; first5=[0, 0, 0, 0, 0]
light_condition_old: uniques=0; first5=[nan, nan, nan, nan, nan]

In [59]:
seg = load_segment_geometry(canonical_crs="EPSG:32633")
CANONICAL_CRS = seg.canonical_crs

segment_geo_gdf = seg.segments_gdf
segment_static = seg.segment_static

segment_geo_gdf.head()


Unnamed: 0,geometry,counter_name,latitude,longitude
0,"LINESTRING (388283.894 5816533.578, 388349.119...",streetsegment_0,52.486743,13.35535
1,"LINESTRING (389240.438 5813521.134, 389260.513...",streetsegment_1,52.461885,13.369878
2,"LINESTRING (388562.846 5831195.503, 388578.563...",streetsegment_2,52.61982,13.354749
3,"LINESTRING (388683.345 5831306.663, 388717.453...",streetsegment_3,52.620476,13.357354
4,"LINESTRING (386530.508 5820675.884, 386544.692...",streetsegment_4,52.524039,13.328604


## Spatial Join: Accidents with Strava data (code from Luise and Eric) + edited by Tobi to achieve canonical geometry data


### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [60]:
# Assign each accident to exactly one nearest segment (within max_distance)
joined_nearest_unique = assign_accidents_to_nearest_segment(
    accidents_bike_berlin,
    segment_geo_gdf,
    canonical_crs=CANONICAL_CRS,
    max_distance_m=10,
)

print(f"Total accidents: {len(accidents_bike_berlin)}")
print(f"Total bike network Strava segments: {len(segment_geo_gdf)}")
print(f"Unique Strava segments in matched dataset: {joined_nearest_unique['counter_name'].nunique()}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print(f"Ratio of assigned accidents: {len(joined_nearest_unique) / len(accidents_bike_berlin):.2%}")

joined_nearest_unique.head()


Total accidents: 33181
Total bike network Strava segments: 4958
Unique Strava segments in matched dataset: 3570
Accidents assigned to segments: 21666
Ratio of assigned accidents: 65.30%


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,fid,weekday_type,time_of_day,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,,,11,0,9,9,2021,4,9,4,...,,weekday,work_hours (7h-18h),POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,,,11,0,5,5,2024,3,9,1,...,,weekend,work_hours (7h-18h),POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,199348.0,,11,0,9,9,2019,9,6,4,...,,weekday,night (22h-7h),POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,194582.0,,11,0,12,12,2019,5,16,5,...,,weekday,work_hours (7h-18h),POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,194994.0,,11,0,9,9,2019,5,18,3,...,,weekday,evening (18h-22h),POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


## Strava data (bicycle network traffic, other features - daily)

1. We need to aggregate this df to the same granularity as in Accidents data (segment, year, month, weekday) to join. 
2. We can not join only by geo data, as Accidents don't have date column, but Strava contains daily info (eg specific traffic volume or weather on specific day)
3. We can not just calculate mean of all columns in Strava data by year-month-etc... as we also have categorial features (for example `infrastructure_bicyclelane_type`) and some features are constant over time for segment (eg `infrastructure_max_speed` in dataset is constant for segment for all dates)

In [61]:
strava_berlin_data = load_strava_berlin_data()
strava_berlin_data.columns.tolist()


['counter_name',
 'date',
 'count',
 'year',
 'latitude',
 'longitude',
 'geometry',
 'socioeconomic_total_population',
 'socioeconomic_share_residents_5plus_years_same_address',
 'socioeconomic_net_migration_per_100',
 'socioeconomic_migration_volume_per_100',
 'socioeconomic_share_under_18',
 'socioeconomic_share_65_and_older',
 'socioeconomic_youth_dependency_ratio',
 'socioeconomic_old_age_dependency_ratio',
 'socioeconomic_average_age',
 'socioeconomic_greying_index',
 'socioeconomic_share_with_migration_background',
 'socioeconomic_share_foreign_nationals',
 'socioeconomic_share_foreign_eu_nationals',
 'socioeconomic_share_foreign_non_eu_nationals',
 'socioeconomic_gender_distribution',
 'socioeconomic_total_fertility_rate',
 'socioeconomic_unemployment_rate_age_15_to_65',
 'infrastructure_count_education_within0.05km',
 'infrastructure_count_hospitals_within0.05km',
 'infrastructure_count_shops_within0.05km',
 'infrastructure_count_industry_within0.05km',
 'infrastructure_count_

### Which data types we have as features?

Results:

1. Mostly we have numerical features, but also categorical ones like `'infrastructure_bicyclelane_type'` - we will check if we need to aggregate them somehow or they are contstant over time.
2. Analysis shows:
    - **Numeric columns (111)**: Traffic counts, speeds, socioeconomic indicators, weather data
    - **Categorical columns**: Infrastructure types, activity types, street properties
    - **Boolean columns (8)**: Holiday flags, weekend indicators, data quality flags
3. **Key finding**: All connectivity and infrastructure columns are constant per segment, so they only need to be taken once per segment. Socioeconomic, motorized, strava, and weather columns vary over time and require aggregation by year-month-weekday.

In [62]:
df = strava_berlin_data
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()

print("Numeric:", len(numeric_cols))
print(numeric_cols)
print("\nCategorical:", len(categorical_cols))
print(categorical_cols)
print("\nBool:", len(bool_cols))
print(bool_cols)


Numeric: 111
['count', 'latitude', 'longitude', 'socioeconomic_total_population', 'socioeconomic_share_residents_5plus_years_same_address', 'socioeconomic_net_migration_per_100', 'socioeconomic_migration_volume_per_100', 'socioeconomic_share_under_18', 'socioeconomic_share_65_and_older', 'socioeconomic_youth_dependency_ratio', 'socioeconomic_old_age_dependency_ratio', 'socioeconomic_average_age', 'socioeconomic_greying_index', 'socioeconomic_share_with_migration_background', 'socioeconomic_share_foreign_nationals', 'socioeconomic_share_foreign_eu_nationals', 'socioeconomic_share_foreign_non_eu_nationals', 'socioeconomic_gender_distribution', 'socioeconomic_total_fertility_rate', 'socioeconomic_unemployment_rate_age_15_to_65', 'infrastructure_count_education_within0.05km', 'infrastructure_count_hospitals_within0.05km', 'infrastructure_count_shops_within0.05km', 'infrastructure_count_industry_within0.05km', 'infrastructure_count_hotels_within0.05km', 'infrastructure_count_education_withi

### Check which features we have contstant for one segment over time, so we don't need to aggregate them futher

In [63]:
summary_df = column_stability_summary(strava_berlin_data, group_col="counter_name")
summary_df.head(10)


Unnamed: 0,column,segments_total,segments_varying,max_unique_within_any_segment
0,latitude,4958,0,1
1,longitude,4958,0,1
2,geometry,4958,0,1
3,infrastructure_count_hotels_within0.05km,4958,0,1
4,infrastructure_count_education_within0.1km,4958,0,1
5,infrastructure_count_hospitals_within0.1km,4958,0,1
6,infrastructure_count_shops_within0.1km,4958,0,1
7,infrastructure_count_industry_within0.1km,4958,0,1
8,infrastructure_count_hospitals_within0.05km,4958,0,1
9,infrastructure_count_shops_within0.05km,4958,0,1


In [64]:
# tag dtypes
col_dtype = strava_berlin_data.dtypes
summary_df["dtype_bucket"] = summary_df["column"].map(
    lambda c: "bool" if col_dtype[c].name == "bool"
    else "numeric" if np.issubdtype(col_dtype[c], np.number)
    else "categorical"
)

# overall constant/varying summary
overall_stats = {
    "total_columns": len(summary_df),
    "constant_columns": int((summary_df["segments_varying"] == 0).sum()),
    "varying_columns": int((summary_df["segments_varying"] > 0).sum()),
}
overall_stats["percent_constant"] = round(
    overall_stats["constant_columns"]
    / max(overall_stats["total_columns"], 1)
    * 100,
    1,
 )

print("Overall column stability:")
for key, value in overall_stats.items():
    print(f"  {key}: {value}")

# dtype-level statistics
dtype_counts = (
    summary_df
    .groupby(["dtype_bucket"])
    .agg(
        total_cols=("column", "count"),
        constant_cols=("segments_varying", lambda s: (s == 0).sum()),
        varying_cols=("segments_varying", lambda s: (s > 0).sum()),
    )
)

dtype_counts["percent_constant"] = (
    dtype_counts["constant_cols"] / dtype_counts["total_cols"] * 100
).round(1)

display(dtype_counts.sort_values("percent_constant", ascending=False))

Overall column stability:
  total_columns: 136
  constant_columns: 70
  varying_columns: 66
  percent_constant: 51.5


Unnamed: 0_level_0,total_cols,constant_cols,varying_cols,percent_constant
dtype_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
categorical,17,12,5,70.6
bool,8,4,4,50.0
numeric,111,54,57,48.6


### Result: All connectivity and infrastructure columns are constant per segment. Socioeconomic, Motorized and weather columns vary, so we need to aggregate them.

- Connectivity (7/7 constant, 2 bool, 5 numeric): treat as static attributes per segment; just carry a single value (e.g., first).
- Infrastructure (58/58 constant, 1 bool, 10 categorical, 47 numeric): fully static; keep one value per segment, no temporal aggregation needed.
**- Other (14 cols, 5 constant/9 varying; 5 bool/6 cat/3 num): mixed bag—decide column by column; reassign misfiled cols if any.**
- Motorized (12/12 varying, all numeric): fully time-varying; aggregate over your time buckets (sum for counts, mean for speeds).
- Socioeconomic (17/17 varying, numeric): varies across time in the data; aggregate over your time buckets (sum for counts, mean for speeds).
**- Strava (19/19 varying; 1 categorical, 18 numeric): counts/speeds should be summed/averaged per time bucket; handle the single categorical (strava_activity_type) via ????**
- Weather (9/9 varying, numeric): time-varying; aggregate with mean (or min/max if useful).



## Aggregation of Berlin Strava data 
1. Aggregation keys: counter_name (segment), year, month, weekday (to align with accidents).
2. Constant features stay as-is (no aggregation) since they don’t vary over time.

### This code execution can take a while, on Liaisan's pc ~13 minutes.

# Segment level risk

In [65]:
# --- 1. Build exposure panel from Strava/sensor data (segment x year x month) ---
final_exposure_ym = build_exposure_panel_segment_year_month(
    strava_berlin_data,
    segment_static=segment_static,
    summary_df=summary_df,
)

print("Exposure panel (segment–year–month) shape:", final_exposure_ym.shape)
final_exposure_ym.head()


Exposure panel (segment–year–month) shape: (297480, 132)


Unnamed: 0,counter_name,year,month,sum_count,sum_strava_total_trip_count,sum_strava_ride_count,sum_strava_ebike_ride_count,sum_strava_total_people_count,sum_strava_total_commute_trip_count,sum_strava_total_leisure_trip_count,...,infrastructure_water_bodies_percent,infrastructure_public_facilities_percent,infrastructure_cemetery_percent,street_name,connectivity_pagerank,connectivity_is_cycling_main_network,connectivity_clustering,is_shortterm,connectivity_is_cycling_minor_network,geometry
0,streetsegment_3572,2019,1,5866.0,5.0,5.0,0.0,5.0,0.0,5.0,...,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True,"LINESTRING (402133.284 5816807.644, 402120.978..."
1,streetsegment_3572,2019,2,8437.0,30.0,25.0,0.0,30.0,0.0,30.0,...,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True,"LINESTRING (402133.284 5816807.644, 402120.978..."
2,streetsegment_3572,2019,3,9993.0,40.0,30.0,0.0,40.0,0.0,40.0,...,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True,"LINESTRING (402133.284 5816807.644, 402120.978..."
3,streetsegment_3572,2019,4,14961.0,55.0,45.0,0.0,55.0,0.0,55.0,...,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True,"LINESTRING (402133.284 5816807.644, 402120.978..."
4,streetsegment_3572,2019,5,16605.0,65.0,65.0,0.0,65.0,0.0,65.0,...,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True,"LINESTRING (402133.284 5816807.644, 402120.978..."


## Aggregate accidents

In [66]:
# --- 2. Build rich accident panel from Unfallatlas+segments (segment x year x month) ---
min_year = int(final_exposure_ym["year"].min())
max_year = int(final_exposure_ym["year"].max())

accidents_agg_ym_rich = aggregate_accidents_segment_year_month_rich(
    joined_nearest_unique,
    column_map=accident_columns_en,
    exposure_year_min=min_year,
    exposure_year_max=max_year,
)

print(
    "Rich accident aggregate (segment–year–month) shape:",
    accidents_agg_ym_rich.shape,
)
accidents_agg_ym_rich.head()


Rich accident aggregate (segment–year–month) shape: (14085, 74)


Unnamed: 0,counter_name,year,month,total_accidents,acc_involved_bicycle_count,acc_involved_passenger_car_count,acc_involved_pedestrian_count,acc_involved_motorcycle_count,acc_involved_goods_vehicle_count,acc_involved_other_vehicle_count,...,acc_weekday_type_count_weekday,acc_weekday_type_count_weekend,acc_weekday_type_share_weekday,acc_weekday_type_share_weekend,acc_time_of_day_count_evening (18h-22h),acc_time_of_day_count_night (22h-7h),acc_time_of_day_count_work_hours (7h-18h),acc_time_of_day_share_evening (18h-22h),acc_time_of_day_share_night (22h-7h),acc_time_of_day_share_work_hours (7h-18h)
0,streetsegment_0,2019,1,1,1,1,0,0,0.0,0,...,0,1,0.0,1.0,0,0,1,0.0,0.0,1.0
1,streetsegment_0,2019,4,1,1,1,0,0,0.0,0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
2,streetsegment_0,2019,7,1,1,0,0,1,0.0,0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
3,streetsegment_0,2019,9,1,1,1,0,0,0.0,0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
4,streetsegment_0,2020,5,1,1,1,0,0,0.0,0,...,1,0,1.0,0.0,1,0,0,1.0,0.0,0.0


## Merge datasets

In [67]:
# --- 3. Merge exposure and accident panels into a risk panel ---

merge_keys = ["counter_name", "year", "month"]

merged_accidents_strava_ym = merge_exposure_and_accidents(
    final_exposure_ym,
    accidents_agg_ym_rich,
    merge_keys=merge_keys,
    trip_col="sum_strava_total_trip_count",  # make merge explicit/robust
)

print("Merged panel (segment–year–month) shape:", merged_accidents_strava_ym.shape)
merged_accidents_strava_ym.head()

# Save merged segment panel
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_strava_panel.parquet"

gpd.GeoDataFrame(
    merged_accidents_strava_ym,
    geometry="geometry",
    crs=segment_geo_gdf.crs,
).to_parquet(out_path, index=False)


Merged panel (segment–year–month) shape: (297480, 204)


## Sanity check of the merge

In [68]:
stats = sanity_check_merge(
    merged_accidents_strava_ym=merged_accidents_strava_ym,
    accidents_agg_ym_rich=accidents_agg_ym_rich,
    final_exposure_ym=final_exposure_ym,
)

for k, v in stats.items():
    print(f"{k}: {v}")


segments_with_accidents: 14085
segments_without_accidents: 283395
accident_groups_missing_exposure: 0
merged_total_accidents: 15398.0
source_total_accidents: 15398.0
lost_accidents_due_to_missing_exposure: 0.0


# Playground, to create and test smaller dataset version. The smaller ones are created based on the completly merged version.

In [69]:
core_panel = build_core_risk_panel(merged_accidents_strava_ym)
print("Core panel shape:", core_panel.shape)
print(core_panel.head(5))

# Save geodataframe to parquet file
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_strava_risk_core_panel.parquet"

gpd.GeoDataFrame(
    core_panel,
    geometry="geometry",
    crs=segment_geo_gdf.crs,
 ).to_parquet(
    out_path,
    index=False,
 )

Core panel shape: (297480, 66)
      counter_name  year  month  \
0  streetsegment_0  2019      1   
1  streetsegment_0  2019      2   
2  streetsegment_0  2019      3   
3  streetsegment_0  2019      4   
4  streetsegment_0  2019      5   

                                            geometry  \
0  LINESTRING (388283.894 5816533.578, 388349.119...   
1  LINESTRING (388283.894 5816533.578, 388349.119...   
2  LINESTRING (388283.894 5816533.578, 388349.119...   
3  LINESTRING (388283.894 5816533.578, 388349.119...   
4  LINESTRING (388283.894 5816533.578, 388349.119...   

   sum_strava_total_trip_count  sum_count  total_accidents  \
0                         75.0        0.0              1.0   
1                        110.0        0.0              0.0   
2                        170.0        0.0              0.0   
3                        340.0        0.0              1.0   
4                        385.0        0.0              0.0   

   acc_injury_severity_count_1  acc_injury_sever

In [70]:
# find unique numbers of accidents in core_panel
unique_accident_counts = core_panel["total_accidents"].nunique()
print(f"Unique accident counts in core panel: {unique_accident_counts}")

# display those unique counts
print("Unique accident counts:", core_panel["total_accidents"].unique())

Unique accident counts in core panel: 7
Unique accident counts: [1. 0. 2. 3. 4. 5. 6.]


In [71]:
# sum up the accidents per segment to verfiy total accidents per segment
accidents_per_segment = (
    core_panel
    .groupby("counter_name", as_index=False)["total_accidents"]
    .sum()
    .rename(columns={"total_accidents": "total_accidents_segment"})
)

# find segments with highest total accidents
top_segments = accidents_per_segment.sort_values("total_accidents_segment", ascending=False).head(10)
print("Top 10 segments by total accidents:")
display(top_segments)

Top 10 segments by total accidents:


Unnamed: 0,counter_name,total_accidents_segment
1662,streetsegment_2494,57.0
2861,streetsegment_3573,55.0
2740,streetsegment_3464,44.0
1038,streetsegment_1932,44.0
2552,streetsegment_3295,41.0
1376,streetsegment_2236,41.0
1362,streetsegment_2223,41.0
1268,streetsegment_2139,37.0
2417,streetsegment_3173,34.0
4103,streetsegment_4691,33.0


# Crossing (junction) risk

### Build nodes (junction candidates) from segment endpoints

In [72]:
nodes_raw = build_nodes_from_segment_endpoints(
    segment_geo_gdf,
    counter_col="counter_name",
)

nodes_raw.head()

Unnamed: 0,counter_name,role,geometry
0,streetsegment_0,start,POINT (388283.894 5816533.578)
1,streetsegment_0,end,POINT (388370.886 5816366.868)
2,streetsegment_1,start,POINT (389240.438 5813521.134)
3,streetsegment_1,end,POINT (389260.513 5813802.481)
4,streetsegment_2,start,POINT (388562.846 5831195.503)


### Cluster endpoints into nodes (snap grid)

In [73]:
clustering = cluster_nodes_snap_grid(
    nodes_raw,
    tol_m=2,
    counter_col="counter_name",
)

nodes_raw = clustering.nodes_raw
node_points = clustering.node_points
segment_node_map = clustering.segment_node_map

print("Nodes (raw endpoints):", len(nodes_raw))
print("Nodes (clustered):", len(node_points))
segment_node_map.head()

Nodes (raw endpoints): 9916
Nodes (clustered): 3155


Unnamed: 0,counter_name,node_id,role
0,streetsegment_0,0,start
1,streetsegment_0,1,end
2,streetsegment_1,2,start
3,streetsegment_1,3,end
4,streetsegment_2,4,start


### Define crossings (nodes with degree $\geq$ 3)

In [74]:
crossing_ids = select_crossings_by_degree(
    nodes_raw,
    min_degree=3,
    counter_col="counter_name",
)

crossings_gdf = node_points[node_points["node_id"].isin(crossing_ids)].copy()

print("Crossings (degree >= 3):", len(crossings_gdf))
crossings_gdf.head()

Crossings (degree >= 3): 2924


Unnamed: 0,node_id,geometry
0,0,POINT (388283.894 5816533.578)
1,1,POINT (388370.886 5816366.868)
2,2,POINT (389240.438 5813521.134)
3,3,POINT (389260.513 5813802.481)
4,4,POINT (388562.846 5831195.503)


### Assign accidents to nearest crossing

In [75]:
# Uses accident data with ALL factor columns (not just geometry)
# We need to preserve incident factors for node-level aggregation
factor_cols = ["acc_id", "year", "month", "geometry"]

# Add all categorical factor columns if they exist
for col in ["weekday_type", "time_of_day", "light_condition", "road_condition", 
            "accident_type", "accident_kind", "injury_severity"]:
    if col in joined_nearest_unique.columns:
        factor_cols.append(col)

study_area_accidents = joined_nearest_unique[factor_cols].copy()
print(f"study_area_accidents columns: {study_area_accidents.columns.tolist()}")

acc_node, acc_node_ym = assign_accidents_to_nearest_crossing(
    study_area_accidents,
    crossings_gdf,
    max_distance_m=20,
)

print("Study-area accidents (after segment corridor filter):", len(acc_node))
print("Assigned to a crossing:", int(acc_node["has_crossing"].sum()))
print("Unassigned (kept):", int((~acc_node["has_crossing"]).sum()))
print("Accident groups (node×year×month):", len(acc_node_ym))
acc_node_ym.head()

# Save accident->crossing mapping for debugging/visualization
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

acc_node.to_parquet(out_dir / "acc_node.parquet", index=False)
acc_node_ym

study_area_accidents columns: ['acc_id', 'year', 'month', 'geometry', 'weekday_type', 'time_of_day', 'light_condition', 'accident_type', 'accident_kind', 'injury_severity']
Study-area accidents (after segment corridor filter): 21666
Assigned to a crossing: 6846
Unassigned (kept): 14820
Accident groups (node×year×month): 6491


Unnamed: 0,node_id,year,month,total_accidents,acc_injury_severity_count_1,acc_injury_severity_count_2,acc_injury_severity_count_3,acc_injury_severity_share_1,acc_injury_severity_share_2,acc_injury_severity_share_3,...,acc_weekday_type_count_weekday,acc_weekday_type_count_weekend,acc_weekday_type_share_weekday,acc_weekday_type_share_weekend,acc_time_of_day_count_evening (18h-22h),acc_time_of_day_count_night (22h-7h),acc_time_of_day_count_work_hours (7h-18h),acc_time_of_day_share_evening (18h-22h),acc_time_of_day_share_night (22h-7h),acc_time_of_day_share_work_hours (7h-18h)
0,0,2020,6,1,0,0,1,0.0,0.0,1.0,...,0,1,0.0,1.0,0,0,1,0.0,0.0,1.0
1,0,2020,9,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
2,0,2023,5,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
3,0,2023,6,2,0,1,1,0.0,0.5,0.5,...,1,1,0.5,0.5,0,0,2,0.0,0.0,1.0
4,1,2018,12,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,1,0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6486,3142,2023,5,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
6487,3142,2024,6,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0
6488,3142,2024,9,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,1,0,0,1.0,0.0,0.0
6489,3142,2024,11,1,0,0,1,0.0,0.0,1.0,...,1,0,1.0,0.0,0,0,1,0.0,0.0,1.0


### Build node-level exposure from segment flows

In [76]:
node_exposure_ym = build_node_exposure_panel_from_segments(
    final_exposure_ym,
    segment_node_map,
    crossing_ids,
    trip_col="sum_strava_total_trip_count",
)

print("Node exposure (node×year×month) shape:", node_exposure_ym.shape)
node_exposure_ym.head()

Node exposure (node×year×month) shape: (175440, 4)


Unnamed: 0,node_id,year,month,monthly_strava_trips
0,0,2019,1,115.0
1,0,2019,2,193.0
2,0,2019,3,235.0
3,0,2019,4,388.0
4,0,2019,5,413.0


### Combine into node-level risk panel

In [77]:
# --- Build node-level risk panel (crossing x year x month) ---

node_panel_ym = build_node_risk_panel(
    node_exposure_ym,
    acc_node_ym,
    crossings_gdf,
)

print("Node panel (crossing x year x month) shape:", node_panel_ym.shape)
node_panel_ym.head()

# Save node panel
out_dir = _project_root / "data" / "merged"
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / "berlin_bike_accident_node_panel.parquet"
node_panel_ym.to_parquet(out_path, index=False)

# Also save node exposure panel for debugging
node_exposure_ym.to_parquet(out_dir / "berlin_bike_node_exposure_panel.parquet", index=False)


Node panel (crossing x year x month) shape: (177334, 64)


In [78]:
# Look at the overall description of the dataset
node_panel_ym.describe(include="all")

Unnamed: 0,node_id,year,month,monthly_strava_trips,total_accidents,acc_injury_severity_count_1,acc_injury_severity_count_2,acc_injury_severity_count_3,acc_injury_severity_share_1,acc_injury_severity_share_2,...,acc_weekday_type_share_weekend,acc_time_of_day_count_evening (18h-22h),acc_time_of_day_count_night (22h-7h),acc_time_of_day_count_work_hours (7h-18h),acc_time_of_day_share_evening (18h-22h),acc_time_of_day_share_night (22h-7h),acc_time_of_day_share_work_hours (7h-18h),geometry,risk_accidents_per_trip,risk_accidents_per_10k_trips
count,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,...,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334.0,177334,166995.0,166995.0
unique,,,,,,,,,,,...,,,,,,,,2924,,
top,,,,,,,,,,,...,,,,,,,,POINT (390444.885303024 5816768.826382071),,
freq,,,,,,,,,,,...,,,,,,,,70,,
mean,1515.829339,2020.998038,6.502966,420.979496,0.038605,9.6e-05,0.005092,0.033417,9.6e-05,0.004839,...,0.005532,0.006846,0.002994,0.028765,0.006428,0.002834,0.027341,,0.000187,1.869092
std,893.612367,1.440406,3.447214,662.015852,0.203666,0.009791,0.071808,0.188663,0.009791,0.068669,...,0.073389,0.08388,0.055152,0.173762,0.078998,0.052543,0.162514,,0.00563,56.29919
min,0.0,2018.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
25%,742.0,2020.0,4.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
50%,1493.0,2021.0,7.0,197.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
75%,2277.0,2022.0,9.0,524.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0


## Sanity Checks

In [79]:
# 1) basic columns exist
assert "node_id" in acc_node.columns
assert "has_crossing" in acc_node.columns
assert "dist_node" in acc_node.columns

# 2) has_crossing matches node_id presence
assert (acc_node["has_crossing"] == acc_node["node_id"].notna()).all()

# 3) assigned accidents respect radius
max_dist = acc_node.loc[acc_node["has_crossing"], "dist_node"].max()
print("max dist among assigned:", max_dist)
# Expect <= your max_distance_m (20), with tiny floating error possible

# 4) acc_node_ym matches acc_node counts by node/month
tmp = acc_node.dropna(subset=["node_id"]).copy()
tmp["node_id"] = tmp["node_id"].astype("int64")

check = (
    tmp.groupby(["node_id", "year", "month"]).size().reset_index(name="n")
    .merge(acc_node_ym, on=["node_id", "year", "month"], how="outer")
)

diff = (check["n"].fillna(0) - check["total_accidents"].fillna(0)).abs().sum()
print("sum absolute differences:", diff)
# Expect 0


max dist among assigned: 19.99986765009113
sum absolute differences: 0


In [80]:
# Sum accidents by node from the node panel (across all months)
panel_counts = node_panel_ym.groupby("node_id")["total_accidents"].sum()

# Sum assigned accidents by node from mapping
map_counts = (
    acc_node.dropna(subset=["node_id"])
    .assign(node_id=lambda d: d["node_id"].astype("int64"))
    .groupby("node_id")
    .size()
)

compare = (
    panel_counts.rename("panel")
    .to_frame()
    .join(map_counts.rename("mapped"), how="outer")
    .fillna(0)
)

bad = compare[compare["panel"] != compare["mapped"]]
print("nodes with mismatch:", len(bad))
display(bad.head(20))
# Expect 0 mismatches


nodes with mismatch: 0


Unnamed: 0_level_0,panel,mapped
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [81]:
# Exposure should never be negative
assert (node_exposure_ym["monthly_strava_trips"] >= 0).all()

# Optional: check that for a given month, total node trips is within a reasonable range of total segment trips
seg_total = final_exposure_ym["sum_strava_total_trip_count"].sum()
node_total = node_exposure_ym["monthly_strava_trips"].sum()

print("segment trips total:", seg_total)
print("node trips total:", node_total)
# node_total will usually be larger than seg_total because segments contribute to 2 endpoints.
# But it should not be absurdly larger. If it suddenly doubles vs prior runs, you may have duplicate segment-node pairs.


segment trips total: 125965380.0
node trips total: 74653978.0


In [82]:
# Crossings are points and have a CRS
assert crossings_gdf.crs is not None
assert crossings_gdf.geometry.geom_type.isin(["Point"]).all()


###################################

In [83]:
segment_node_map_cross = segment_node_map[segment_node_map["node_id"].isin(crossing_ids)].copy()

segments_per_junction = (
    segment_node_map_cross.groupby("node_id")["counter_name"]
    .apply(lambda s: sorted(set(s)))
    .reset_index(name="segments")
)

segments_per_junction.head()


Unnamed: 0,node_id,segments
0,0,"[streetsegment_0, streetsegment_1460, streetse..."
1,1,"[streetsegment_0, streetsegment_3387, streetse..."
2,2,"[streetsegment_1, streetsegment_3231, streetse..."
3,3,"[streetsegment_1, streetsegment_1462, streetse..."
4,4,"[streetsegment_1507, streetsegment_2, streetse..."


In [84]:
junctions_per_segment = (
    segment_node_map_cross.groupby("counter_name")["node_id"]
    .apply(lambda s: sorted(set(s)))
    .reset_index(name="junction_nodes")
)


In [85]:
# exposure is non-negative
assert (node_exposure_ym["monthly_strava_trips"] >= 0).all()

# spot check a node-month: node exposure equals sum of its incident segments that month
nid = int(node_exposure_ym["node_id"].iloc[0])
yy = int(node_exposure_ym["year"].iloc[0])
mm = int(node_exposure_ym["month"].iloc[0])

inc = segment_node_map_cross[segment_node_map_cross["node_id"] == nid]["counter_name"].unique()
seg_sum = final_exposure_ym.query("year == @yy and month == @mm and counter_name in @inc")["sum_strava_total_trip_count"].sum()
node_val = node_exposure_ym.query("node_id == @nid and year == @yy and month == @mm")["monthly_strava_trips"].iloc[0]

print("segment-sum:", seg_sum, "node exposure:", node_val)


segment-sum: 345.0 node exposure: 115.0
