# Bicycle Accidents x Geo Data x Strava Exposure Merge


#### Import some standard libraries and helper scripts:

In [1]:
import gc
import sys
from pathlib import Path
import datetime as dt
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt


In [2]:
# Ensure project root is importable when running from notebooks/
_project_root = Path.cwd().resolve()
if not (_project_root / "src").exists() and (_project_root.parent / "src").exists():
    _project_root = _project_root.parent
sys.path.insert(0, str(_project_root))

In [3]:
# Kernel cleanup: drop old variables from pre-refactor runs
# (Equivalent to a kernel restart for the pipeline variables.)
# Ensure the removed module isn't lingering in memory
sys.modules.pop("src.merge_datasets", None)

# Drop previously computed objects/dataframes so we don't keep stale types around
for _name in [
    "seg", "clustering", "accidents_raw", "accidents_bike_berlin", "segment_geo_gdf",
    "segment_static", "strava_berlin_data", "summary_df", "final_exposure_ym", "accidents_agg_ym_rich",
    "merged_accidents_strava_ym", "core_panel", "nodes_raw", "node_points", "crossings_gdf",
    "crossing_ids", "segment_node_map", "node_exposure_ym", "node_panel_ym", "acc_node", "acc_node_ym",
    "joined_nearest_unique", "stats", "merge_keys", "min_year", "max_year", "out_dir", "out_path",
]:
    globals().pop(_name, None)

gc.collect()
print("kernel_cleanup_done")


kernel_cleanup_done


## Bike Accident data for Berlin

In [4]:
from src.preprocess_data import preprocess_accident_data
accidents_bike_berlin = preprocess_accident_data(save_to_parquet=False)
accidents_bike_berlin.head()

Loaded 9 files -> combined shape: (2098019, 32)
Dropped irrelevant columns -> shape: (2098019, 24)
Filtered to bicycle accidents -> shape: (626844, 24)
Filtered to bicycle accidents in Berlin -> shape: (33181, 19)


Unnamed: 0,year,month,hour,weekday,injury_severity,accident_kind,accident_type,car_involved,pedestrian_involved,motorcycle_involved,other_vehicle_involved,road_condition,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84,source_file,light_condition,goods_vehicle_involved
0,2018,1,15,4,3,6,7,0,1,0,0,1,798261.3849,5835047.0,13.403228,52.583472,Unfallorte2018_LinRef.csv,1,0
1,2018,1,11,5,3,5,2,1,0,0,0,1,800551.721,5829808.0,13.432186,52.535255,Unfallorte2018_LinRef.csv,0,0
2,2018,1,8,2,3,5,2,1,0,0,0,0,803320.7292,5827627.0,13.470897,52.514173,Unfallorte2018_LinRef.csv,0,0
3,2018,1,19,4,3,5,7,1,0,0,0,1,798174.6913,5826940.0,13.394673,52.510848,Unfallorte2018_LinRef.csv,2,0
4,2018,1,18,4,2,5,3,1,0,0,0,1,806109.6297,5821644.0,13.506372,52.458993,Unfallorte2018_LinRef.csv,2,0


## Assigning segments

In [5]:
from src.segments import load_segment_geometry
seg = load_segment_geometry(canonical_crs="EPSG:32633")
CANONICAL_CRS = seg.canonical_crs

segment_geo_gdf = seg.segments_gdf
segment_static = seg.segment_static

segment_geo_gdf.head()


Unnamed: 0,geometry,counter_name,latitude,longitude
0,"LINESTRING (388283.894 5816533.578, 388349.119...",streetsegment_0,52.486743,13.35535
1,"LINESTRING (389240.438 5813521.134, 389260.513...",streetsegment_1,52.461885,13.369878
2,"LINESTRING (388562.846 5831195.503, 388578.563...",streetsegment_2,52.61982,13.354749
3,"LINESTRING (388683.345 5831306.663, 388717.453...",streetsegment_3,52.620476,13.357354
4,"LINESTRING (386530.508 5820675.884, 386544.692...",streetsegment_4,52.524039,13.328604


## Spatial Join: Accidents with Strava data + canonical geometry data
Use sjoin_nearest to assign exactly one (the nearest) segment to each accident


In [6]:
from src.accidents import assign_accidents_to_nearest_segment
# Assign each accident to exactly one nearest segment (within max_distance)
joined_nearest_unique = assign_accidents_to_nearest_segment(
    accidents_bike_berlin,
    segment_geo_gdf,
    canonical_crs=CANONICAL_CRS,
    max_distance_m=10,
)

print(f"Total accidents: {len(accidents_bike_berlin)}")
print(f"Total bike network Strava segments: {len(segment_geo_gdf)}")
print(f"Unique Strava segments in matched dataset: {joined_nearest_unique['counter_name'].nunique()}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print(f"Ratio of assigned accidents: {len(joined_nearest_unique) / len(accidents_bike_berlin):.2%}")

joined_nearest_unique.head()


Total accidents: 33181
Total bike network Strava segments: 4958
Unique Strava segments in matched dataset: 3570
Accidents assigned to segments: 21666
Ratio of assigned accidents: 65.30%


Unnamed: 0,year,month,hour,weekday,injury_severity,accident_kind,accident_type,car_involved,pedestrian_involved,motorcycle_involved,...,source_file,light_condition,goods_vehicle_involved,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,2021,4,9,4,2,1,5,1,0,0,...,Unfallorte_2021_LinRef.csv,0,0,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,2024,3,9,1,3,5,3,1,0,0,...,Unfallorte2024_LinRef.csv,0,0,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,2019,9,6,4,3,5,2,0,0,0,...,Unfallorte2019_LinRef.csv,1,0,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,2019,5,16,5,3,5,2,1,0,0,...,Unfallorte2019_LinRef.csv,0,0,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,2019,5,18,3,1,0,1,0,0,0,...,Unfallorte2019_LinRef.csv,0,0,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


## Strava data (bicycle network traffic, other features - daily)

* take monthly aggregated bike traffic data, don't take other features into account in the first place

In [7]:
from src.strava_exposure import load_and_aggregate_monthly_strava_counts_per_segment
#strava_berlin_data = load_strava_berlin_data()
# aggregate count to monthly level
aggregated_strava_data = load_and_aggregate_monthly_strava_counts_per_segment()
aggregated_strava_data.head()
#strava_berlin_data.columns.tolist()

Unnamed: 0,counter_name,year,month,count
0,streetsegment_0,2019,1,0.0
1,streetsegment_0,2019,2,0.0
2,streetsegment_0,2019,3,0.0
3,streetsegment_0,2019,4,0.0
4,streetsegment_0,2019,5,0.0


### One-Hot-Encoding of accident features

In [8]:
from src.preprocess_data import one_hot_encode_columns
print("All columns:", accidents_bike_berlin.columns.tolist())
categorical_cols = ["weekday", "hour", "weekday", "injury_severity", "accident_kind", "accident_type", "light_condition"]
accidents_one_hot_encoded = one_hot_encode_columns(joined_nearest_unique, categorical_cols)
accidents_one_hot_encoded.head()

All columns: ['year', 'month', 'hour', 'weekday', 'injury_severity', 'accident_kind', 'accident_type', 'car_involved', 'pedestrian_involved', 'motorcycle_involved', 'other_vehicle_involved', 'road_condition', 'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84', 'source_file', 'light_condition', 'goods_vehicle_involved']


Unnamed: 0,year,month,car_involved,pedestrian_involved,motorcycle_involved,other_vehicle_involved,road_condition,LINREFX,LINREFY,XGCSWGS84,...,accident_kind_8,accident_kind_9,accident_type_2,accident_type_3,accident_type_4,accident_type_5,accident_type_6,accident_type_7,light_condition_1,light_condition_2
29872,2021,4,1,0,0,0,0,804865.537885,5822181.0,13.488608,...,False,False,False,False,False,True,False,False,False,False
25448,2024,3,1,0,0,0,0,784405.6655,5828668.0,13.19382,...,False,False,False,True,False,False,False,False,False,False
9074,2019,9,0,0,0,1,0,812938.755016,5821379.0,13.606309,...,False,False,True,False,False,False,False,False,True,False
6943,2019,5,1,0,0,0,0,792217.260845,5833940.0,13.3133,...,False,False,True,False,False,False,False,False,False,False
7115,2019,5,0,0,0,0,0,818774.420586,5812580.0,13.68352,...,False,False,False,False,False,False,False,False,False,False


### Aggregate data for each month and segment

In [None]:
from src.preprocess_data import aggregate_accident_data_by_month, aggregate_accidents_monthwise

monthly_agg_accidents = aggregate_accidents_monthwise(
    accidents_one_hot_encoded,
    segment_col="counter_name"
)
monthly_agg_accidents.head()

Unnamed: 0,counter_name,year,month,total_accidents,car_involved,pedestrian_involved,motorcycle_involved,other_vehicle_involved,goods_vehicle_involved,weekday_2,...,accident_kind_9,accident_type_2,accident_type_3,accident_type_4,accident_type_5,accident_type_6,accident_type_7,light_condition_1,light_condition_2,total_accidents.1
0,streetsegment_0,2018,12,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
1,streetsegment_0,2019,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,streetsegment_0,2019,4,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,streetsegment_0,2019,7,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,streetsegment_0,2019,9,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Merge datasets

## Sanity check of the merge

In [None]:
stats = sanity_check_merge(
    merged_accidents_strava_ym=merged_accidents_strava_ym,
    accidents_agg_ym_rich=accidents_agg_ym_rich,
    final_exposure_ym=final_exposure_ym,
)

for k, v in stats.items():
    print(f"{k}: {v}")
