# Bicycle Accidents x Geo Data x Strava Exposure Merge


#### Import some standard libraries:

In [1]:
import os
import sys
from pathlib import Path
import datetime as dt
import json

import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.wkt

import matplotlib.pyplot as plt



In [2]:
# Sanity check: show which Python interpreter backs this kernel.
# (`sys` is already imported in the imports cell at the top of the notebook.)
print(sys.executable)

/Users/tobias/src/data_literacy/.venv/bin/python


## Bicycle data for Berlin

In [3]:
# Read every ';'-separated accident CSV in data/csv and stack them into one
# frame, remembering the originating file name for each row.
csv_dir = Path("data/csv")
csv_files = sorted(csv_dir.glob("*.csv"))

if len(csv_files) == 0:
    raise FileNotFoundError(f"No CSV files found in {csv_dir.resolve()}")

# One frame per file; `source_file` is appended as the last column.
dfs = [
    pd.read_csv(fp, low_memory=False, delimiter=";").assign(source_file=fp.name)
    for fp in csv_files
]

# The files do not all share the same columns (see the printed column list),
# so concat fills the gaps with NaN.
accidents = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(csv_files)} files -> combined shape: {accidents.shape}")

# print all column names
print("Columns:", accidents.columns.tolist())

accidents.head()

Loaded 9 files -> combined shape: (2098019, 35)
Columns: ['OBJECTID', 'UIDENTSTLA', 'ULAND', 'UREGBEZ', 'UKREIS', 'UGEMEINDE', 'UJAHR', 'UMONAT', 'USTUNDE', 'UWOCHENTAG', 'UKATEGORIE', 'UART', 'UTYP1', 'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstSonstig', 'LICHT', 'STRZUSTAND', 'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84', 'source_file', 'OBJECTID_1', 'ULICHTVERH', 'IstGkfz', 'IstSonstige', 'UIDENTSTLAE', 'IstStrassenzustand', 'OID_', 'PLST', 'FID', 'IstStrasse']


Unnamed: 0,OBJECTID,UIDENTSTLA,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,...,OBJECTID_1,ULICHTVERH,IstGkfz,IstSonstige,UIDENTSTLAE,IstStrassenzustand,OID_,PLST,FID,IstStrasse
0,1.0,1170113152013852017,1,0,55,12,2017,1,5,6,...,,,,,,,,,,
1,2.0,1170113171013912017,1,0,60,53,2017,1,6,6,...,,,,,,,,,,
2,3.0,1170106105132242017,1,0,61,11,2017,1,18,6,...,,,,,,,,,,
3,4.0,1170114152013542018,1,0,55,42,2017,1,12,7,...,,,,,,,,,,
4,5.0,1170106161013732017,1,0,62,60,2017,1,8,6,...,,,,,,,,,,


## Rename columns to English

In [4]:
# Mapping from the original (German) column names to English names.
# Columns that are not listed here (e.g. the coordinate columns) keep their name.
accident_columns_en = {
    # IDs & metadata
    "OBJECTID": "object_id",
    "OBJECTID_1": "object_id_alt",
    "OID_": "oid",
    "FID": "fid",
    "source_file": "source_file",

    # Unique accident identifiers
    "UIDENTSTLA": "accident_id",
    "UIDENTSTLAE": "accident_id_extended",

    # Administrative divisions
    "ULAND": "land_code",
    "UREGBEZ": "admin_region_code",
    "UKREIS": "district_code",
    "UGEMEINDE": "municipality_code",

    # Time
    "UJAHR": "year",
    "UMONAT": "month",
    "USTUNDE": "hour",
    "UWOCHENTAG": "weekday",

    # Accident classification
    "UKATEGORIE": "injury_severity",
    "UART": "accident_kind",
    "UTYP1": "accident_type",

    # Participants involved (0 or 1)
    "IstRad": "involved_bicycle",
    "IstPKW": "involved_passenger_car",
    "IstFuss": "involved_pedestrian",
    "IstKrad": "involved_motorcycle",
    "IstSonstig": "involved_other_vehicle_old",     # older variant
    "IstGkfz": "involved_goods_vehicle",
    "IstSonstige": "involved_other_vehicle",
    "IstStrasse": "involved_road",
    "IstStrassenzustand": "road_condition_flag",

    # Environmental conditions
    "LICHT": "light_condition_old",
    "ULICHTVERH": "light_condition",                # official variable
    "STRZUSTAND": "road_condition",

    # Data quality
    "PLST": "plausibility_level",
}

accidents = accidents.rename(columns=accident_columns_en)

# Keep only accidents with a bicycle involved (flag == 1) in land_code 11 (Berlin).
# Both conditions are applied in one pass to avoid intermediate copies.
is_bike_in_berlin = (accidents["involved_bicycle"] == 1) & (accidents["land_code"] == 11)
accidents_bike_berlin = accidents.loc[is_bike_in_berlin].reset_index(drop=True)
print(f"Filtered to bicycle accidents in Berlin -> shape: {accidents_bike_berlin.shape}")

# The coordinate columns use a decimal comma. Convert them to floats;
# values that cannot be parsed (including missing ones) become NaN.
coordinate_columns = ["XGCSWGS84", "YGCSWGS84", "LINREFX", "LINREFY"]
for coord_col in coordinate_columns:
    accidents_bike_berlin[coord_col] = pd.to_numeric(
        accidents_bike_berlin[coord_col].astype(str).str.replace(",", ".", regex=False),
        errors="coerce",
    )

display(accidents_bike_berlin.describe())
# A bare `.head()` in the middle of a cell is silently discarded; display it explicitly.
display(accidents_bike_berlin.head())

# Quick per-column overview: number of distinct non-null values and a small sample.
for col in accidents_bike_berlin.columns:
    uniq_cnt = accidents_bike_berlin[col].nunique(dropna=True)
    first_vals = accidents_bike_berlin[col].head(5).tolist()
    print(f"{col}: uniques={uniq_cnt}; first5={first_vals}")


Filtered to bicycle accidents in Berlin -> shape: (33181, 35)


Unnamed: 0,object_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,injury_severity,...,YGCSWGS84,object_id_alt,light_condition,involved_goods_vehicle,involved_other_vehicle,road_condition_flag,oid,plausibility_level,fid,involved_road
count,14772.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,...,33181.0,5192.0,33181.0,33181.0,27989.0,17875.0,13217.0,8924.0,0.0,0.0
mean,173918.322299,11.0,0.0,5.011724,5.011724,2020.887375,6.806938,13.481179,4.076731,2.866158,...,52.506832,192975.22188,0.341008,0.021036,0.089428,0.197762,219381.787622,1.013559,,
std,58196.42271,0.0,0.0,3.466895,3.466895,2.009997,2.911945,4.595099,1.741427,0.346628,...,0.044248,27899.278571,0.714326,0.143507,0.285366,0.41524,37183.087676,0.115657,,
min,3331.0,11.0,0.0,1.0,1.0,2018.0,1.0,0.0,1.0,1.0,...,52.366052,112747.0,0.0,0.0,0.0,0.0,112180.0,1.0,,
25%,140264.75,11.0,0.0,2.0,2.0,2019.0,5.0,10.0,3.0,3.0,...,52.481865,199302.5,0.0,0.0,0.0,0.0,213555.0,1.0,,
50%,194507.5,11.0,0.0,4.0,4.0,2021.0,7.0,14.0,4.0,3.0,...,52.5095,202189.5,0.0,0.0,0.0,0.0,236988.0,1.0,,
75%,231123.5,11.0,0.0,8.0,8.0,2023.0,9.0,17.0,5.0,3.0,...,52.534254,205066.75,0.0,0.0,0.0,0.0,241075.0,1.0,,
max,241014.0,11.0,0.0,12.0,12.0,2024.0,12.0,23.0,7.0,3.0,...,52.660146,208851.0,2.0,1.0,1.0,2.0,268445.0,2.0,,


object_id: uniques=14772; first5=[nan, nan, nan, nan, nan]
accident_id: uniques=0; first5=[nan, nan, nan, nan, nan]
land_code: uniques=1; first5=[11, 11, 11, 11, 11]
admin_region_code: uniques=1; first5=[0, 0, 0, 0, 0]
district_code: uniques=12; first5=[3, 3, 2, 1, 9]
municipality_code: uniques=12; first5=[3, 3, 2, 1, 9]
year: uniques=7; first5=[2018, 2018, 2018, 2018, 2018]
month: uniques=12; first5=[1, 1, 1, 1, 1]
hour: uniques=24; first5=[15, 11, 8, 19, 18]
weekday: uniques=7; first5=[4, 5, 2, 4, 4]
injury_severity: uniques=3; first5=[3, 3, 3, 3, 2]
accident_kind: uniques=10; first5=[6, 5, 5, 5, 5]
accident_type: uniques=7; first5=[7, 2, 2, 7, 3]
involved_bicycle: uniques=1; first5=[1, 1, 1, 1, 1]
involved_passenger_car: uniques=2; first5=[0, 1, 1, 1, 1]
involved_pedestrian: uniques=2; first5=[1, 0, 0, 0, 0]
involved_motorcycle: uniques=2; first5=[0, 0, 0, 0, 0]
involved_other_vehicle_old: uniques=2; first5=[0.0, 0.0, 0.0, 0.0, 0.0]
light_condition_old: uniques=0; first5=[nan, nan, 

In [5]:
# CRS that segment and accident geometries are reprojected to before any
# distance-based operation.
CANONICAL_CRS = "EPSG:32633"

# Load canonical segment geometry; the parquet stores geometry as WKT text,
# so parse it into shapely objects first.
segment_geo_df = pd.read_parquet("data/strava/berlin_graph_geometry.parquet").copy()
segment_geo_df["geometry"] = segment_geo_df["geometry"].map(shapely.wkt.loads)

# Declare the stored coordinates as EPSG:4326, then reproject to the canonical CRS.
segment_geo_gdf = gpd.GeoDataFrame(segment_geo_df, geometry="geometry", crs="EPSG:4326")
segment_geo_gdf = segment_geo_gdf.to_crs(CANONICAL_CRS)

# Trusted attributes from geo file (street_name only if the file provides it)
cols_static = ["counter_name", "geometry"] + (
    ["street_name"] if "street_name" in segment_geo_gdf.columns else []
)

# One row per segment.
segment_static = segment_geo_gdf[cols_static].drop_duplicates("counter_name")
segment_static = segment_static.reset_index(drop=True)

## Spatial Join: Accidents with Strava data (code from Luise and Eric) + edited by Tobi to achieve canonical geometry data


### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [6]:
# This code uses sjoin_nearest (attempt 2): every accident gets at most one
# segment, namely the nearest one within MAX_MATCH_DISTANCE_M.

# Maximum accident-to-segment distance for a match, in units of CANONICAL_CRS (meters).
MAX_MATCH_DISTANCE_M = 10

# We already have segment_geo_gdf in CANONICAL_CRS
strava_segments_gdf = segment_geo_gdf.copy()  # geometry: canonical segment lines in CANONICAL_CRS

# Accident points: build GeoDataFrame and reproject to the same CRS.
# The index is reset first so it can double as a stable accident identifier.
accidents_bike_berlin = accidents_bike_berlin.reset_index(drop=True)

accident_locations_gdf = gpd.GeoDataFrame(
    accidents_bike_berlin,
    geometry=gpd.points_from_xy(
        accidents_bike_berlin.XGCSWGS84,
        accidents_bike_berlin.YGCSWGS84,
    ),
    crs="EPSG:4326",
).to_crs(CANONICAL_CRS)

# Add identifier to accidents (index is already 0..n-1 from the reset above)
accident_locations_gdf["acc_id"] = accident_locations_gdf.index

# Compute nearest segment within the distance threshold.
# how="left" keeps unmatched accidents with NaN in index_right.
joined = gpd.sjoin_nearest(
    accident_locations_gdf,
    strava_segments_gdf,
    how="left",
    max_distance=MAX_MATCH_DISTANCE_M,
    distance_col="dist",
)

# Drop accidents without assigned segment (NaN in index_right)
joined = joined.dropna(subset=["index_right"])

# sjoin_nearest returns several rows for an accident when segments are
# equidistant. Keep one row per accident; `index_right` is the tie-breaker so
# that the surviving segment is deterministic across runs.
joined_nearest_unique = (
    joined
    .sort_values(["dist", "index_right"])
    .drop_duplicates(subset=["acc_id"], keep="first")
)

# Number of accidents (not surplus rows) that matched more than one segment.
n_ambiguous_accidents = joined.loc[
    joined["acc_id"].duplicated(keep=False), "acc_id"
].nunique()

print(f"Total accidents: {len(accident_locations_gdf)}")
print(f"Total bike network Strava segments: {len(strava_segments_gdf)}")
print(f"Unique Strava segments in matched dataset: {joined_nearest_unique['counter_name'].nunique()}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print(f"Accidents with ambiguous nearest segment: {n_ambiguous_accidents}")
print(f"Ratio of assigned accidents: {len(joined_nearest_unique) / len(accident_locations_gdf):.2%}")

joined_nearest_unique.head()


Total accidents: 33181
Total bike network Strava segments: 4958
Unique Strava segments in matched dataset: 3570
Accidents assigned to segments: 21666
Accidents with ambiguous nearest segment: 21
Ratio of assigned accidents: 65.30%


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,plausibility_level,fid,involved_road,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,,,11,0,9,9,2021,4,9,4,...,,,,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,,,11,0,5,5,2024,3,9,1,...,1.0,,,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,199348.0,,11,0,9,9,2019,9,6,4,...,,,,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,194582.0,,11,0,12,12,2019,5,16,5,...,,,,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,194994.0,,11,0,9,9,2019,5,18,3,...,,,,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


## Strava data (bicycle network traffic, other features - daily)

1. We need to aggregate this DataFrame to the same granularity as the accident data (segment, year, month, weekday) before joining.
2. We cannot join on geo data alone: the accident data has no date column, whereas the Strava data is daily (e.g. traffic volume or weather on a specific day).
3. We cannot simply take the mean of every Strava column per year-month-etc., because there are also categorical features (for example `infrastructure_bicyclelane_type`) and some features are constant over time for a segment (e.g. `infrastructure_max_speed` has the same value on all dates for a given segment).

In [7]:
# Daily Strava table for Berlin; list its columns to see what is available.
strava_berlin_data = pd.read_parquet("data/strava/berlin_data.parquet")
list(strava_berlin_data.columns)


['counter_name',
 'date',
 'count',
 'year',
 'latitude',
 'longitude',
 'geometry',
 'socioeconomic_total_population',
 'socioeconomic_share_residents_5plus_years_same_address',
 'socioeconomic_net_migration_per_100',
 'socioeconomic_migration_volume_per_100',
 'socioeconomic_share_under_18',
 'socioeconomic_share_65_and_older',
 'socioeconomic_youth_dependency_ratio',
 'socioeconomic_old_age_dependency_ratio',
 'socioeconomic_average_age',
 'socioeconomic_greying_index',
 'socioeconomic_share_with_migration_background',
 'socioeconomic_share_foreign_nationals',
 'socioeconomic_share_foreign_eu_nationals',
 'socioeconomic_share_foreign_non_eu_nationals',
 'socioeconomic_gender_distribution',
 'socioeconomic_total_fertility_rate',
 'socioeconomic_unemployment_rate_age_15_to_65',
 'infrastructure_count_education_within0.05km',
 'infrastructure_count_hospitals_within0.05km',
 'infrastructure_count_shops_within0.05km',
 'infrastructure_count_industry_within0.05km',
 'infrastructure_count_

### Which data types we have as features?

Results:

1. Most features are numerical, but there are also categorical ones such as `'infrastructure_bicyclelane_type'` - we will check whether they need to be aggregated or are constant over time.
2. Analysis shows:
    - **Numeric columns (111)**: Traffic counts, speeds, socioeconomic indicators, weather data
    - **Categorical columns**: Infrastructure types, activity types, street properties
    - **Boolean columns (8)**: Holiday flags, weekend indicators, data quality flags
3. **Key finding**: All connectivity and infrastructure columns are constant per segment, so they only need to be taken once per segment. Socioeconomic, motorized, strava, and weather columns vary over time and require aggregation by year-month-weekday.

In [8]:
# Split the Strava columns into three non-overlapping dtype buckets.
df = strava_berlin_data
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
# "number" does not cover bool, so bool must be excluded explicitly here;
# otherwise the boolean columns would be listed as categorical as well.
categorical_cols = df.select_dtypes(exclude=["number", "bool"]).columns.tolist()

print("Numeric:", len(numeric_cols))
print(numeric_cols)
print("\nCategorical:", len(categorical_cols))
print(categorical_cols)
print("\nBool:", len(bool_cols))
print(bool_cols)


Numeric: 111
['count', 'latitude', 'longitude', 'socioeconomic_total_population', 'socioeconomic_share_residents_5plus_years_same_address', 'socioeconomic_net_migration_per_100', 'socioeconomic_migration_volume_per_100', 'socioeconomic_share_under_18', 'socioeconomic_share_65_and_older', 'socioeconomic_youth_dependency_ratio', 'socioeconomic_old_age_dependency_ratio', 'socioeconomic_average_age', 'socioeconomic_greying_index', 'socioeconomic_share_with_migration_background', 'socioeconomic_share_foreign_nationals', 'socioeconomic_share_foreign_eu_nationals', 'socioeconomic_share_foreign_non_eu_nationals', 'socioeconomic_gender_distribution', 'socioeconomic_total_fertility_rate', 'socioeconomic_unemployment_rate_age_15_to_65', 'infrastructure_count_education_within0.05km', 'infrastructure_count_hospitals_within0.05km', 'infrastructure_count_shops_within0.05km', 'infrastructure_count_industry_within0.05km', 'infrastructure_count_hotels_within0.05km', 'infrastructure_count_education_withi

### Check which features are constant per segment over time, so we don't need to aggregate them further

In [9]:
# For every column except the segment key, count in how many segments the
# column takes more than one distinct non-null value over time.
df = strava_berlin_data
grp = df.groupby("counter_name", sort=False)


def _column_stability(col):
    """Return one summary record describing how `col` varies within segments."""
    distinct_per_segment = grp[col].nunique(dropna=True)
    return {
        "column": col,
        "segments_total": len(distinct_per_segment),
        "segments_varying": int((distinct_per_segment > 1).sum()),
        "max_unique_within_any_segment": int(distinct_per_segment.max()),
    }


summary = [_column_stability(c) for c in df.columns if c != "counter_name"]

# Columns that never vary within a segment (segments_varying == 0) sort first.
summary_df = pd.DataFrame(summary).sort_values("segments_varying", ascending=True)
summary_df.head(10)  # view top constant columns


Unnamed: 0,column,segments_total,segments_varying,max_unique_within_any_segment
67,infrastructure_commercial_area_percent,4958,0,1
64,infrastructure_cemetery_percent,4958,0,1
63,infrastructure_brach3_percent,4958,0,1
62,infrastructure_brach2_percent,4958,0,1
61,infrastructure_brach1_percent,4958,0,1
60,infrastructure_baustelle_percent,4958,0,1
59,infrastructure_horticulture_percent,4958,0,1
58,infrastructure_arable_land_percent,4958,0,1
57,infrastructure_str_flges_percent,4958,0,1
65,infrastructure_public_facilities_percent,4958,0,1


In [10]:
# tag dtypes
col_dtype = strava_berlin_data.dtypes


def _dtype_bucket(dtype) -> str:
    """Classify a column dtype as "bool", "numeric" or "categorical".

    Uses the pandas dtype predicates instead of `np.issubdtype`, which raises
    TypeError for pandas extension dtypes. Bool is tested first because
    `is_numeric_dtype` also returns True for boolean dtypes.
    """
    if pd.api.types.is_bool_dtype(dtype):
        return "bool"
    if pd.api.types.is_numeric_dtype(dtype):
        return "numeric"
    return "categorical"


summary_df["dtype_bucket"] = summary_df["column"].map(
    lambda c: _dtype_bucket(col_dtype[c])
)

# overall constant/varying summary
is_constant = summary_df["segments_varying"] == 0
overall_stats = {
    "total_columns": len(summary_df),
    "constant_columns": int(is_constant.sum()),
    "varying_columns": int((summary_df["segments_varying"] > 0).sum()),
}
# max(..., 1) guards against division by zero for an empty summary
overall_stats["percent_constant"] = round(
    overall_stats["constant_columns"]
    / max(overall_stats["total_columns"], 1)
    * 100,
    1,
)

print("Overall column stability:")
for key, value in overall_stats.items():
    print(f"  {key}: {value}")

# dtype-level statistics: how many columns per bucket are constant vs. varying
dtype_counts = (
    summary_df
    .groupby(["dtype_bucket"])
    .agg(
        total_cols=("column", "count"),
        constant_cols=("segments_varying", lambda s: (s == 0).sum()),
        varying_cols=("segments_varying", lambda s: (s > 0).sum()),
    )
)

dtype_counts["percent_constant"] = (
    dtype_counts["constant_cols"] / dtype_counts["total_cols"] * 100
).round(1)

display(dtype_counts.sort_values("percent_constant", ascending=False))

Overall column stability:
  total_columns: 136
  constant_columns: 70
  varying_columns: 66
  percent_constant: 51.5


Unnamed: 0_level_0,total_cols,constant_cols,varying_cols,percent_constant
dtype_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
categorical,17,12,5,70.6
bool,8,4,4,50.0
numeric,111,54,57,48.6


## Unfortunately I wasn't able to run the code below on my own PC, although it worked for Liaisan. I therefore only commented it out and wrote a simpler version (see the code cell above).

In [11]:
# NOTE(review): this disabled cell groups `summary_df` by a "group" column.
# No cell visible in this notebook creates that column, which is presumably
# why it fails on a fresh kernel - confirm where "group" was defined before
# re-enabling.
# import numpy as np

# # tag dtypes
# col_dtype = strava_berlin_data.dtypes
# summary_df["dtype_bucket"] = summary_df["column"].map(
#     lambda c: "bool" if col_dtype[c].name == "bool"
#     else "numeric" if np.issubdtype(col_dtype[c], np.number)
#     else "categorical"
# )

# # constant/varying summary
# group_stats = (
#     summary_df
#     .groupby("group")
#     .agg(
#         total_cols=("column", "count"),
#         constant_cols=("segments_varying", lambda s: (s == 0).sum()),
#         varying_cols=("segments_varying", lambda s: (s > 0).sum()),
#     )
# )

# # dtype counts per group
# dtype_counts = (
#     summary_df
#     .groupby(["group", "dtype_bucket"])
#     .size()
#     .unstack(fill_value=0)
# )

# # merge everything into one table
# group_stats = group_stats.join(dtype_counts, how="left")
# group_stats["percent_constant"] = (
#     group_stats["constant_cols"] / group_stats["total_cols"] * 100
# ).round(1)

# display(group_stats.sort_values("percent_constant", ascending=False))

# print("\nOther group details:")
# display(summary_df[summary_df["group"] == "other"])


### Result: All connectivity and infrastructure columns are constant per segment. Socioeconomic, Motorized and weather columns vary, so we need to aggregate them.

- Connectivity (7/7 constant, 2 bool, 5 numeric): treat as static attributes per segment; just carry a single value (e.g., first).
- Infrastructure (58/58 constant, 1 bool, 10 categorical, 47 numeric): fully static; keep one value per segment, no temporal aggregation needed.
- **Other (14 cols, 5 constant/9 varying; 5 bool/6 cat/3 num): mixed bag—decide column by column; reassign misfiled cols if any.**
- Motorized (12/12 varying, all numeric): fully time-varying; aggregate over your time buckets (sum for counts, mean for speeds).
- Socioeconomic (17/17 varying, numeric): varies across time in the data; aggregate over your time buckets (the code below uses the mean).
- **Strava (19/19 varying; 1 categorical, 18 numeric): counts/speeds should be summed/averaged per time bucket; the single categorical (`strava_activity_type`) is handled by taking the most frequent value per bucket (see `fast_mode` below). TODO: confirm that the mode is the right choice.**
- Weather (9/9 varying, numeric): time-varying; aggregate with mean (or min/max if useful).



## Aggregation of Berlin Strava data 
1. Aggregation keys: counter_name (segment), year, month, weekday (to align with accidents).
2. Constant features stay as-is (no aggregation) since they don’t vary over time.

### This code execution can take a while, on Liaisan's pc ~13 minutes.

In [12]:
# # Keys and minimal copy
# df = strava_berlin_data.copy()
# df.dropna(subset=["latitude", "longitude", "geometry", "street_name", "is_shortterm"], inplace=True)
# df["date"] = pd.to_datetime(df["date"])
# df["year"] = df["date"].dt.year
# df["month"] = df["date"].dt.month
# df["weekday"] = df["date"].dt.day_name()
# keys = ["counter_name", "year", "month", "weekday"]

# # Constant columns that don't vary over time for one segment, we don't need to aggregate them(from summary_df)
# constant_cols = summary_df.loc[summary_df["segments_varying"] == 0, "column"].tolist()

# # Time-varying columns
# sum_cols = [c for c in [
#     "count","strava_total_trip_count","strava_ride_count","strava_ebike_ride_count",
#     "strava_total_people_count","strava_total_commute_trip_count","strava_total_leisure_trip_count",
#     "strava_total_morning_trip_count","strava_total_midday_trip_count",
#     "strava_total_evening_trip_count","strava_total_overnight_trip_count",
#     "strava_total_male_people_count","strava_total_female_people_count",
#     "strava_total_18_34_people_count","strava_total_35_54_people_count",
#     "strava_total_55_64_people_count","strava_total_65_plus_people_count",
#     "strava_total_unspecified_people_count",
#     "motorized_vehicle_count_all_vehicles_6km","motorized_vehicle_count_cars_6km","motorized_vehicle_count_trucks_6km",
#     "motorized_vehicle_count_all_vehicles","motorized_vehicle_count_cars","motorized_vehicle_count_trucks"
# ] if c in df.columns]

# mean_cols = [c for c in [
#     "strava_total_average_speed_meters_per_second",
#     "motorized_avg_speed_all_vehicles_6km","motorized_avg_speed_cars_6km","motorized_avg_speed_trucks_6km",
#     "motorized_avg_speed_all_vehicles","motorized_avg_speed_cars","motorized_avg_speed_trucks",
#     "infrastructure_distance_citycenter_km",
# ] + [c for c in df.columns if c.startswith("weather_")]
#   + [c for c in df.columns if c.startswith("socioeconomic_")]
#   if c in df.columns]

# # Only varying categorical
# cat_cols = [c for c in ["strava_activity_type"] if c in df.columns]

# # Keep just the columns we need
# vary_cols = keys + sum_cols + mean_cols + cat_cols
# df_var = df[vary_cols]
# df_const = df[["counter_name"] + constant_cols].drop_duplicates("counter_name")

# # Cast keys to category to speed up groupby and reduce memory
# for k in keys:
#     df_var[k] = df_var[k].astype("category")

# def fast_mode(s):
#     vc = s.value_counts(dropna=True)
#     return vc.index[0] if not vc.empty else pd.NA

# agg_map = {**{c: "sum" for c in sum_cols},
#            **{c: "mean" for c in mean_cols},
#            **{c: fast_mode for c in cat_cols}}

# # Group with observed=True to avoid cartesian combos of unused categories
# agg_segment_ymw = (
#     df_var
#     .groupby(keys, sort=False, observed=True)
#     .agg(agg_map)
#     .reset_index()
# )

# # Rename aggregated columns with prefixes
# rename_map = {}
# rename_map.update({c: f"sum_{c}" for c in sum_cols if c in agg_segment_ymw.columns})
# rename_map.update({c: f"mean_{c}" for c in mean_cols if c in agg_segment_ymw.columns})
# rename_map.update({c: f"mode_{c}" for c in cat_cols if c in agg_segment_ymw.columns})
# agg_segment_ymw = agg_segment_ymw.rename(columns=rename_map)

# final_agg = agg_segment_ymw.merge(df_const, on="counter_name", how="left")
# print(final_agg.shape)


In [13]:
# # final_agg.head()
# # after the rename block
# agg_segment_ymw = agg_segment_ymw.rename(columns=rename_map)

# # merge and set the final name you’ll inspect
# final_df = agg_segment_ymw.merge(df_const, on="counter_name", how="left")

# # quick checks
# print([c for c in final_df.columns if c.startswith(("sum_", "mean_", "mode_"))][:10])
# print(final_df.shape)
# final_df.head()


# Segment level risk

In [14]:
# --- 1. Build exposure panel from Strava/sensor data ---

df = strava_berlin_data.copy()

# Basic cleaning
# NOTE: we no longer require geometry here because we will attach canonical geometry later
df.dropna(
    subset=["latitude", "longitude", "street_name", "is_shortterm"],
    inplace=True
)

# Time keys
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.day_name()  # kept for possible later use

keys = ["counter_name", "year", "month"]

# --- Constant columns (from summary_df) ---
constant_cols_raw = summary_df.loc[
    summary_df["segments_varying"] == 0, "column"
].tolist()

# EXCLUDE geometry from Strava constants; we will take geometry from the geo dataset
constant_cols = [
    c for c in constant_cols_raw
    if c in df.columns and c != "geometry"
]

# --- Time-varying columns ---

sum_cols = [c for c in [
    "count","strava_total_trip_count","strava_ride_count","strava_ebike_ride_count",
    "strava_total_people_count","strava_total_commute_trip_count","strava_total_leisure_trip_count",
    "strava_total_morning_trip_count","strava_total_midday_trip_count",
    "strava_total_evening_trip_count","strava_total_overnight_trip_count",
    "strava_total_male_people_count","strava_total_female_people_count",
    "strava_total_18_34_people_count","strava_total_35_54_people_count",
    "strava_total_55_64_people_count","strava_total_65_plus_people_count",
    "strava_total_unspecified_people_count",
    "motorized_vehicle_count_all_vehicles_6km","motorized_vehicle_count_cars_6km","motorized_vehicle_count_trucks_6km",
    "motorized_vehicle_count_all_vehicles","motorized_vehicle_count_cars","motorized_vehicle_count_trucks"
] if c in df.columns]

mean_cols = [c for c in [
    "strava_total_average_speed_meters_per_second",
    "motorized_avg_speed_all_vehicles_6km","motorized_avg_speed_cars_6km","motorized_avg_speed_trucks_6km",
    "motorized_avg_speed_all_vehicles","motorized_avg_speed_cars","motorized_avg_speed_trucks",
    "infrastructure_distance_citycenter_km",
] + [c for c in df.columns if c.startswith("weather_")]
  + [c for c in df.columns if c.startswith("socioeconomic_")]
  if c in df.columns]

cat_cols = [c for c in ["strava_activity_type"] if c in df.columns]

# Varying + constant frames
vary_cols = keys + sum_cols + mean_cols + cat_cols
df_var = df[vary_cols].copy()

df_const = (
    df[["counter_name"] + constant_cols]
    .drop_duplicates("counter_name")
    .reset_index(drop=True)
)

# Cast keys to category to speed up groupby
for k in keys:
    df_var[k] = df_var[k].astype("category")


def fast_mode(s: pd.Series):
    """Return the most frequent non-missing value of `s`.

    Missing values are ignored. If `s` holds no non-missing value at all
    (including the empty series), `pd.NA` is returned.
    """
    counts = s.value_counts(dropna=True)
    if counts.empty:
        return pd.NA
    # value_counts sorts by frequency, most frequent first.
    return counts.index[0]


# Aggregation rule per column: every column in `sum_cols` is summed, every
# column in `mean_cols` is averaged, and every column in `cat_cols` is reduced
# to its most frequent value via `fast_mode`.
# NOTE: pandas' groupby "sum" yields 0 (not NaN) for a group whose values are
# all missing, so a 0 in a sum_* column can also mean "no data for that month".
agg_map = {
    **{c: "sum" for c in sum_cols},
    **{c: "mean" for c in mean_cols},
    **{c: fast_mode for c in cat_cols},
}

# Aggregate by segment–year–month
agg_segment_ym = (
    df_var
    .groupby(keys, sort=False, observed=True)
    .agg(agg_map)
    .reset_index()
)

# Prefix aggregated columns with the statistic that produced them
rename_map = {}
for prefix, cols in (("sum", sum_cols), ("mean", mean_cols), ("mode", cat_cols)):
    rename_map.update(
        {c: f"{prefix}_{c}" for c in cols if c in agg_segment_ym.columns}
    )

agg_segment_ym = agg_segment_ym.rename(columns=rename_map)

# Attach static segment-level attributes from Strava (WITHOUT geometry).
# validate="many_to_one" makes the merge fail loudly if counter_name is not
# unique on the right side, instead of silently multiplying panel rows.
final_exposure_ym = agg_segment_ym.merge(
    df_const,
    on="counter_name",
    how="left",
    validate="many_to_one",
)

# Attach canonical geometry (and optionally canonical street_name) from geo dataset
# segment_static must contain at least ["counter_name", "geometry"], with one
# row per counter_name (enforced by validate).
final_exposure_ym = final_exposure_ym.merge(
    segment_static[["counter_name", "geometry"]],
    on="counter_name",
    how="left",
    validate="many_to_one",
)

# Important: year/month back to integer (they were category)
final_exposure_ym["year"] = final_exposure_ym["year"].astype("int64")
final_exposure_ym["month"] = final_exposure_ym["month"].astype("int64")

print("Exposure panel (segment–year–month) shape:", final_exposure_ym.shape)
final_exposure_ym.head()


Exposure panel (segment–year–month) shape: (297480, 132)


Unnamed: 0,counter_name,year,month,sum_count,sum_strava_total_trip_count,sum_strava_ride_count,sum_strava_ebike_ride_count,sum_strava_total_people_count,sum_strava_total_commute_trip_count,sum_strava_total_leisure_trip_count,...,infrastructure_count_shops_within0.25km,infrastructure_count_industry_within0.25km,infrastructure_count_hotels_within0.25km,infrastructure_count_education_within0.5km,infrastructure_count_hospitals_within0.5km,infrastructure_count_industry_within0.1km,infrastructure_count_hotels_within0.5km,infrastructure_count_industry_within0.5km,infrastructure_count_shops_within0.5km,geometry
0,streetsegment_3572,2019,1,5866.0,5.0,5.0,0.0,5.0,0.0,5.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
1,streetsegment_3572,2019,2,8437.0,30.0,25.0,0.0,30.0,0.0,30.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
2,streetsegment_3572,2019,3,9993.0,40.0,30.0,0.0,40.0,0.0,40.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
3,streetsegment_3572,2019,4,14961.0,55.0,45.0,0.0,55.0,0.0,55.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."
4,streetsegment_3572,2019,5,16605.0,65.0,65.0,0.0,65.0,0.0,65.0,...,1,0,0,0,0,0,0,0,3,"LINESTRING (402133.284 5816807.644, 402120.978..."


## Aggregate accidents

In [15]:
# --- 2. Build rich accident panel from Unfallatlas+segments ---

# Work on a copy of the joined accidents (they carry the segment IDs) and
# apply the English column names; columns that are already renamed are
# unaffected by the mapping.
acc = joined_nearest_unique.copy().rename(columns=accident_columns_en)

# Ensure year/month are clean integers
for time_col in ("year", "month"):
    acc[time_col] = acc[time_col].astype("int64")

keys = ["counter_name", "year", "month"]

# Restrict to coverage years of Strava exposure (both bounds inclusive)
exposure_years = final_exposure_ym["year"]
min_year = exposure_years.min()
max_year = exposure_years.max()
in_coverage = acc["year"].between(min_year, max_year)
acc = acc.loc[in_coverage].copy()

# Participant flags (0/1 indicators); only those present in `acc` are kept
flag_cols = [
    c
    for c in (
        "involved_bicycle",
        "involved_passenger_car",
        "involved_pedestrian",
        "involved_motorcycle",
        "involved_goods_vehicle",
        "involved_other_vehicle",
        "involved_road",
        "road_condition_flag",
    )
    if c in acc.columns
]

# Key categorical variables; only those present in `acc` are kept
cat_cols = [
    c
    for c in (
        "injury_severity",
        "accident_kind",
        "accident_type",
        "light_condition",
        "road_condition",
    )
    if c in acc.columns
]

# Base: total accidents per segment–year–month ("size" counts rows, so
# accidents with a missing accident_id are included)
accidents_by_key = acc.groupby(keys, observed=True)
acc_base = (
    accidents_by_key
    .agg(total_accidents=("accident_id", "size"))
    .reset_index()
)

# Flags: counts + shares
if flag_cols:
    acc_flags_counts = (
        acc
        .groupby(keys, observed=True)[flag_cols]
        .sum()  # 0/1 flags → counts
        .reset_index()
    )

    count_rename = {c: f"acc_{c}_count" for c in flag_cols}
    acc_flags_counts = acc_flags_counts.rename(columns=count_rename)

    acc_flags = acc_base[keys + ["total_accidents"]].merge(
        acc_flags_counts,
        on=keys,
        how="left",
    )

    # Zero denominators become NaN so the share is undefined rather than inf.
    # np.nan (not pd.NA) is used on purpose: replacing with pd.NA turns the
    # series into object dtype, and every share column would be object too.
    flag_denominator = acc_flags["total_accidents"].replace(0, np.nan)
    for c in flag_cols:
        acc_flags[f"acc_{c}_share"] = acc_flags[f"acc_{c}_count"] / flag_denominator

    acc_flags = acc_flags.drop(columns=["total_accidents"])
else:
    acc_flags = acc_base[keys].copy()

# Categoricals: counts + shares per category
cat_blocks = []

for col in cat_cols:
    # One column per observed category value, one row per segment–year–month.
    # NOTE(review): aggfunc="count" counts non-null accident_id values, whereas
    # total_accidents uses "size"; if accident_id can be missing, these counts
    # are lower than total_accidents — confirm against the source data.
    pivot_counts = (
        acc
        .pivot_table(
            index=keys,
            columns=col,
            values="accident_id",
            aggfunc="count",
            fill_value=0,
        )
    )

    pivot_counts.columns = [
        f"acc_{col}_count_{str(cat)}" for cat in pivot_counts.columns
    ]

    # Shares are relative to the accidents with a known category in that row.
    # Rows without any counted accident get NaN shares; np.nan keeps the
    # result float64 (pd.NA would make the share columns object dtype).
    row_sums = pivot_counts.sum(axis=1).replace(0, np.nan)
    pivot_shares = pivot_counts.div(row_sums, axis=0)

    pivot_shares.columns = [
        name.replace("_count_", "_share_") for name in pivot_counts.columns
    ]

    pivot_both = pd.concat([pivot_counts, pivot_shares], axis=1).reset_index()
    cat_blocks.append(pivot_both)

if cat_blocks:
    from functools import reduce

    # Outer merge: a key that is missing from one categorical block (no
    # accident with a known value for that variable) is still kept.
    acc_cats = reduce(
        lambda left, right: left.merge(right, on=keys, how="outer"),
        cat_blocks
    )
else:
    acc_cats = acc_base[keys].copy()

# Combine base, flags, and categorical summaries: start from the base counts
# and left-join each summary block on the segment–year–month keys.
accidents_agg_ym_rich = acc_base
for summary_block in (acc_flags, acc_cats):
    accidents_agg_ym_rich = accidents_agg_ym_rich.merge(
        summary_block,
        on=keys,
        how="left",
    )

rich_shape = accidents_agg_ym_rich.shape
print("Rich accident aggregate (segment–year–month) shape:", rich_shape)
accidents_agg_ym_rich.head()


Rich accident aggregate (segment–year–month) shape: (14085, 72)


Unnamed: 0,counter_name,year,month,total_accidents,acc_involved_bicycle_count,acc_involved_passenger_car_count,acc_involved_pedestrian_count,acc_involved_motorcycle_count,acc_involved_goods_vehicle_count,acc_involved_other_vehicle_count,...,acc_light_condition_count_2.0,acc_light_condition_share_0.0,acc_light_condition_share_1.0,acc_light_condition_share_2.0,acc_road_condition_count_0.0,acc_road_condition_count_1.0,acc_road_condition_count_2.0,acc_road_condition_share_0.0,acc_road_condition_share_1.0,acc_road_condition_share_2.0
0,streetsegment_0,2019,1,1,1,1,0,0,0.0,0.0,...,0,,,,0.0,0.0,0.0,,,
1,streetsegment_0,2019,4,1,1,1,0,0,0.0,0.0,...,0,,,,0.0,0.0,0.0,,,
2,streetsegment_0,2019,7,1,1,0,0,1,0.0,0.0,...,0,,,,0.0,0.0,0.0,,,
3,streetsegment_0,2019,9,1,1,1,0,0,0.0,0.0,...,0,,,,0.0,0.0,0.0,,,
4,streetsegment_0,2020,5,1,1,1,0,0,0.0,0.0,...,0,,,,0.0,0.0,0.0,,,


## Merge datasets

In [16]:
# --- 3. Merge exposure and accident panels into a risk panel ---

merge_keys = ["counter_name", "year", "month"]

# Sanity: make sure keys are unique on both sides
# (informational only; the merge below enforces it via validate="one_to_one")
print("Exposure duplicate keys:",
      final_exposure_ym.duplicated(merge_keys).any())
print("Accident duplicate keys:",
      accidents_agg_ym_rich.duplicated(merge_keys).any())

# Left join: exposure as base, so every segment–year–month with exposure is
# kept and accident groups without exposure are dropped.
merged_accidents_strava_ym = final_exposure_ym.merge(
    accidents_agg_ym_rich,
    on=merge_keys,
    how="left",
    validate="one_to_one",
)

# Fill NAs in all accident-related columns (segments with exposure but no accidents).
# Note that this also sets the *_share columns to 0 where there was no accident.
acc_cols = [c for c in accidents_agg_ym_rich.columns if c not in merge_keys]
merged_accidents_strava_ym[acc_cols] = (
    merged_accidents_strava_ym[acc_cols].fillna(0)
)

print("Merged risk panel (segment–year–month) shape:",
      merged_accidents_strava_ym.shape)

# quick look (display() is required: a bare expression is only rendered when
# it is the last statement of the cell)
display(merged_accidents_strava_ym.head())

# Save geodataframe to parquet file; create the output folder first so the
# cell also works on a fresh checkout.
merged_panel_path = Path("data/merged/berlin_bike_accident_strava_panel.parquet")
merged_panel_path.parent.mkdir(parents=True, exist_ok=True)

gpd.GeoDataFrame(
    merged_accidents_strava_ym,
    geometry="geometry",
    crs=segment_geo_gdf.crs,  # or set explicitly
).to_parquet(
    merged_panel_path,
    index=False
)



Exposure duplicate keys: False
Accident duplicate keys: False


  merged_accidents_strava_ym[acc_cols].fillna(0)


Merged risk panel (segment–year–month) shape: (297480, 201)


## Sanity check of the merge

In [17]:
# 0) Check required objects
if "merged_accidents_strava_ym" not in globals():
    raise RuntimeError("Run the merge cell (merged_accidents_strava_ym) before the sanity check.")
if "accidents_agg_ym_rich" not in globals() or "final_exposure_ym" not in globals():
    raise RuntimeError("Make sure both source aggregates (accidents_agg_ym_rich, final_exposure_ym) exist.")

merge_keys = ["counter_name", "year", "month"]

# 1) Verify Strava exposure has unique keys
exposure_duplicates = final_exposure_ym.duplicated(subset=merge_keys).sum()
if exposure_duplicates == 0:
    print("Strava exposure is unique per segment-year-month.")
else:
    raise AssertionError(f"Found {exposure_duplicates} duplicate keys in Strava exposure table.")

# 2) Coverage of accidents within Strava exposure universe (without materialising a full join)
# The key indices are rebuilt on every run. Reusing indices left over from an
# earlier execution would validate stale data after an upstream cell changed.
exposure_index = pd.MultiIndex.from_frame(final_exposure_ym[merge_keys])
accident_index = pd.MultiIndex.from_frame(accidents_agg_ym_rich[merge_keys])
unique_exposure_index = exposure_index.unique()
unique_accident_index = accident_index.unique()

segments_with_accidents = int(unique_exposure_index.isin(unique_accident_index).sum())
segments_without_accidents = int(len(unique_exposure_index) - segments_with_accidents)
print(f"Segment-year-month combos with accidents: {segments_with_accidents}")
print(f"Segment-year-month combos without accidents (remain with zeros): {segments_without_accidents}")

# Accident groups that have no matching exposure (dropped by the left join)
accidents_missing_mask = ~unique_accident_index.isin(unique_exposure_index)
missing_count = int(accidents_missing_mask.sum())
accident_only_df = None
if missing_count:
    missing_preview = list(unique_accident_index[accidents_missing_mask][:5])
    accident_only_df = (
        accidents_agg_ym_rich
        .set_index(merge_keys)
        .loc[unique_accident_index[accidents_missing_mask]]
        .reset_index()
    )
    print(
        "Warning: accident groups lacking Strava exposure coverage.",
        f"Count={missing_count}",
        f"Sample={missing_preview}",
    )
    display(accident_only_df.head())

# 3) Accident totals should be preserved after the merge (within coverage)
merged_total = merged_accidents_strava_ym["total_accidents"].sum()
source_total = accidents_agg_ym_rich["total_accidents"].sum()
total_diff = source_total - merged_total
print(f"Merged total accidents: {merged_total}")
print(f"Source total accidents: {source_total}")
if total_diff == 0:
    print("Accident totals preserved within merged panel.")
else:
    print(
        "Warning: accident totals differ.",
        f"Lost_in_merge={total_diff}",
        "These correspond to accident groups without exposure coverage.",
    )
    if accident_only_df is not None:
        # Break the lost accidents down by segment to see where coverage is missing
        lost_total = int(accident_only_df["total_accidents"].sum())
        print(f"Total accidents in uncovered groups: {lost_total}")
        display(
            accident_only_df
            .groupby("counter_name", as_index=False)["total_accidents"]
            .sum()
            .head()
        )


Strava exposure is unique per segment-year-month.
Segment-year-month combos with accidents: 14085
Segment-year-month combos without accidents (remain with zeros): 283395
Merged total accidents: 15398.0
Source total accidents: 15398
Accident totals preserved within merged panel.


# Playground: create and test smaller versions of the dataset. The smaller versions are derived from the completely merged panel.

In [18]:
# Reduced "core" panel: keys, geometry, exposure, accident totals and the
# accident category distributions, plus simple risk ratios.
# (numpy is used as `np`; it is imported in the notebook's import cell.)
full = merged_accidents_strava_ym.copy()


def _existing(candidates):
    """Return the candidates that are columns of `full`, preserving order."""
    return [c for c in candidates if c in full.columns]


def _with_prefix(*prefixes):
    """Return all columns of `full` starting with one of `prefixes`, in frame order."""
    return [c for c in full.columns if c.startswith(prefixes)]


# --- Keys ---
key_cols = ["counter_name", "year", "month"]

# --- Segment / geometry / context ---
segment_cols = _existing([
    "geometry",
    # "street_name",
    # "latitude",
    # "longitude",
])

# --- Exposure (traffic volume) ---
exposure_cols = _existing([
    "sum_strava_total_trip_count",        # exposure: Strava trips / month
    "sum_count",                          # sensor-based volume
])

# --- Core accident metrics ---
acc_core_cols = _existing([
    "total_accidents",
])

# --- Severity mix (all codes) ---
severity_cols = _with_prefix(
    "acc_injury_severity_count_", "acc_injury_severity_share_"
)

# --- Accident type distributions ---
accident_type_cols = _with_prefix(
    "acc_accident_type_count_", "acc_accident_type_share_"
)

# --- Accident kind distributions ---
accident_kind_cols = _with_prefix(
    "acc_accident_kind_count_", "acc_accident_kind_share_"
)

# --- Light condition distributions ---
light_cols = _with_prefix(
    "acc_light_condition_count_", "acc_light_condition_share_"
)

# --- Road condition distributions ---
road_cols = _with_prefix(
    "acc_road_condition_count_", "acc_road_condition_share_"
)

# --- Collect all columns to keep and de-duplicate (first occurrence wins) ---
cols_keep = list(dict.fromkeys(
    key_cols
    + segment_cols
    + exposure_cols
    + acc_core_cols
    + severity_cols
    + accident_type_cols
    + accident_kind_cols
    + light_cols
    + road_cols
))

core_panel = full[cols_keep].copy()

# --- Derived metrics / risk columns ---

# alias for clarity
if "sum_strava_total_trip_count" in core_panel.columns:
    core_panel["monthly_strava_trips"] = core_panel["sum_strava_total_trip_count"]

# basic risk ratio: accidents per Strava trip
if "total_accidents" in core_panel.columns and "monthly_strava_trips" in core_panel.columns:
    # Months without any Strava trip get NaN risk instead of a division by zero
    denom = core_panel["monthly_strava_trips"].replace(0, np.nan)
    core_panel["risk_accidents_per_trip"] = core_panel["total_accidents"] / denom
    core_panel["risk_accidents_per_10k_trips"] = core_panel["risk_accidents_per_trip"] * 10_000

print("Core panel shape:", core_panel.shape)
# display() is required: a bare expression is only rendered when it is the
# last statement of the cell.
display(core_panel.head(20))

# Save geodataframe to parquet file; create the output folder first so the
# cell also works on a fresh checkout.
core_panel_path = Path("data/merged/berlin_bike_accident_strava_risk_core_panel.parquet")
core_panel_path.parent.mkdir(parents=True, exist_ok=True)

gpd.GeoDataFrame(
    core_panel,
    geometry="geometry",
    crs=segment_geo_gdf.crs,  # or set explicitly
).to_parquet(
    core_panel_path,
    index=False
)



Core panel shape: (297480, 62)


In [19]:
# Which monthly accident counts occur in the core panel, and how many
# distinct values are there?
accident_count_values = core_panel["total_accidents"]

unique_accident_counts = accident_count_values.nunique()
print(f"Unique accident counts in core panel: {unique_accident_counts}")

# display those unique counts (in order of first appearance)
print("Unique accident counts:", accident_count_values.unique())




Unique accident counts in core panel: 7
Unique accident counts: [0. 2. 1. 3. 4. 5. 6.]


In [20]:
# Sum up the accidents per segment to verify the total accidents per segment
accidents_per_segment = (
    core_panel
    .groupby("counter_name", as_index=False)
    .agg(total_accidents_segment=("total_accidents", "sum"))
)

# Find the segments with the highest total accidents
top_segments = (
    accidents_per_segment
    .sort_values(by="total_accidents_segment", ascending=False)
    .iloc[:10]
)
print("Top 10 segments by total accidents:")
display(top_segments)

Top 10 segments by total accidents:


Unnamed: 0,counter_name,total_accidents_segment
1662,streetsegment_2494,57.0
2861,streetsegment_3573,55.0
2740,streetsegment_3464,44.0
1038,streetsegment_1932,44.0
1362,streetsegment_2223,41.0
1376,streetsegment_2236,41.0
2552,streetsegment_3295,41.0
1268,streetsegment_2139,37.0
2417,streetsegment_3173,34.0
4103,streetsegment_4691,33.0


# Crossing (junction) risk

### Build nodes (junction candidates) from segment endpoints

In [21]:
# Working copy of the segment IDs and geometries; the endpoint columns added
# below are written to this copy, not to segment_geo_gdf.
segments = segment_geo_gdf[["counter_name", "geometry"]].copy()  # canonical segments in CANONICAL_CRS

def get_endpoints(geom):
    """Return the first and last coordinate of a line geometry.

    Parameters
    ----------
    geom : geometry object or None
        A LineString or MultiLineString. For a MultiLineString the endpoints
        of its longest non-empty part are used.

    Returns
    -------
    tuple
        ``(start, end)`` coordinate tuples, or ``(None, None)`` when `geom`
        is missing, empty, or not a (Multi)LineString.
    """
    # Missing or empty geometries have no endpoints.
    if geom is None or geom.is_empty:
        return None, None

    if geom.geom_type == "LineString":
        line = geom
    elif geom.geom_type == "MultiLineString":
        parts = [part for part in geom.geoms if not part.is_empty]
        if not parts:
            return None, None
        line = max(parts, key=lambda part: part.length)
    else:
        return None, None

    coords = list(line.coords)
    if not coords:
        return None, None
    return coords[0], coords[-1]

# Start and end coordinate of every segment, stored as coordinate tuples.
segments[["start_pt", "end_pt"]] = segments["geometry"].apply(
    lambda g: pd.Series(get_endpoints(g))
)

# get_endpoints returns (None, None) for geometries it cannot handle; such
# segments cannot contribute nodes and would break the coordinate unpacking
# below, so they are dropped here.
has_endpoints = segments["start_pt"].notna() & segments["end_pt"].notna()
if not has_endpoints.all():
    print(f"Skipping {int((~has_endpoints).sum())} segments without usable line endpoints.")
    segments = segments.loc[has_endpoints].copy()


def _endpoint_gdf(segment_frame, point_col, role):
    """Build a point GeoDataFrame (counter_name, geometry, role) from one endpoint column."""
    points = segment_frame[point_col]
    gdf = gpd.GeoDataFrame(
        segment_frame[["counter_name"]],
        geometry=gpd.points_from_xy(
            [p[0] for p in points],
            [p[1] for p in points],
        ),
        crs=segment_frame.crs,
    )
    gdf["role"] = role
    return gdf


start_gdf = _endpoint_gdf(segments, "start_pt", "start")
end_gdf = _endpoint_gdf(segments, "end_pt", "end")

# One row per segment endpoint: two rows for every remaining segment.
nodes_raw = pd.concat([start_gdf, end_gdf], ignore_index=True)

### Cluster endpoints into nodes (snap grid)

In [22]:
# Snapping tolerance: endpoints are assigned to cells of a tol x tol grid
# (in CRS units; meters according to the original note).
tol = 2

# Grid cell index per endpoint and axis. Endpoints that round to the same
# (x, y) cell are treated as one node.
for axis in ("x", "y"):
    axis_coords = getattr(nodes_raw.geometry, axis)
    nodes_raw[f"{axis}_rounded"] = (axis_coords / tol).round().astype(int)

grid_cell_cols = ["x_rounded", "y_rounded"]
nodes_raw["node_id"] = nodes_raw.groupby(grid_cell_cols).ngroup()

# Node geometry: dissolve merges the geometries of all endpoints that share a
# node_id, so a node whose endpoints do not coincide exactly is stored as the
# union of those points rather than as a single point.
dissolved_nodes = nodes_raw.dissolve(by="node_id", as_index=False)
node_points = dissolved_nodes[["node_id", "geometry"]]

# Segment–node mapping (each segment has start & end)
segment_node_map = nodes_raw.loc[:, ["counter_name", "node_id", "role"]].drop_duplicates()

### Define crossings (nodes with degree $\geq$ 3)

In [23]:
# Node degree = number of distinct segments with an endpoint at that node
deg = nodes_raw.groupby("node_id")["counter_name"].nunique()

# A node counts as a crossing when at least 3 segments meet there
# (raise the threshold to 4 to keep only full intersections)
crossing_ids = deg.loc[deg >= 3].index

is_crossing = node_points["node_id"].isin(crossing_ids)
crossings_gdf = node_points.loc[is_crossing].copy()

### Assign accidents to nearest crossing

In [24]:
# Accident points as a GeoDataFrame in the same CRS as the segments
acc_gdf = gpd.GeoDataFrame(
    joined_nearest_unique.copy(),
    geometry="geometry",
    crs=segments.crs,  # CANONICAL_CRS
)

# Clean up any leftover join columns to avoid GeoPandas name clashes
for df_ in (acc_gdf, crossings_gdf):
    df_.drop(columns=["index_right", "index_left"], errors="ignore", inplace=True)

# Fresh 0..n-1 index: after the join, one index label identifies one accident
acc_gdf = acc_gdf.reset_index(drop=True)
crossings_clean = crossings_gdf[["node_id", "geometry"]].reset_index(drop=True)

acc_node = gpd.sjoin_nearest(
    acc_gdf,
    crossings_clean,
    how="left",
    max_distance=20,          # 20 m radius for crossing assignment
    distance_col="dist_node",
    rsuffix="node",
)

# sjoin_nearest returns one row per equidistant nearest crossing, so an
# accident tied between several crossings would be counted several times.
# Keep a single match per accident.
acc_node = acc_node[~acc_node.index.duplicated(keep="first")]

# Keep only accidents assigned to a crossing
acc_node = acc_node.dropna(subset=["node_id"]).copy()
acc_node["node_id"] = acc_node["node_id"].astype(int)

# Aggregate accidents at node × year × month
acc_node_ym = (
    acc_node
    .groupby(["node_id", "year", "month"], observed=True)
    .agg(total_accidents=("acc_id", "size"))
    .reset_index()
)

### Build node-level exposure from segment flows

In [25]:
# Restrict segment-node mapping to crossings
segment_node_map_cross = segment_node_map[
    segment_node_map["node_id"].isin(crossing_ids)
].copy()

# One row per (segment, node). The mapping holds a row per endpoint role, so a
# segment whose start and end snap to the same node appears twice; without
# this de-duplication its trips would be added twice to that node.
segment_node_pairs = (
    segment_node_map_cross[["counter_name", "node_id"]]
    .drop_duplicates()
)

# Attach node_id to segment exposure (final_exposure_ym is segment×year×month).
# Inner join: segments that do not touch any crossing are dropped.
segment_exposure_nodes = final_exposure_ym.merge(
    segment_node_pairs,
    on="counter_name",
    how="inner",
)

# Aggregate exposure per node × year × month: sum of the monthly Strava trips
# over the segments attached to the node
node_exposure_ym = (
    segment_exposure_nodes
    .groupby(["node_id", "year", "month"], observed=True)
    .agg(monthly_strava_trips=("sum_strava_total_trip_count", "sum"))
    .reset_index()
)

### Combine into node-level risk panel

In [26]:
# Exposure is the base table: every crossing × year × month with exposure is
# kept; accident groups without exposure are dropped by the left join.
node_panel_ym = node_exposure_ym.merge(
    acc_node_ym,
    on=["node_id", "year", "month"],
    how="left",
)

# No matching accident row means zero accidents in that month
node_panel_ym["total_accidents"] = node_panel_ym["total_accidents"].fillna(0)

# Attach node geometry (crossing point)
node_panel_ym = node_panel_ym.merge(
    crossings_gdf[["node_id", "geometry"]],
    on="node_id",
    how="left",
)

# Risk metrics; months without any Strava trip get NaN risk instead of a
# division by zero
denom = node_panel_ym["monthly_strava_trips"].replace(0, np.nan)
node_panel_ym["risk_accidents_per_trip"] = node_panel_ym["total_accidents"] / denom
node_panel_ym["risk_accidents_per_10k_trips"] = node_panel_ym["risk_accidents_per_trip"] * 10_000

print("Node panel (crossing x year x month) shape:", node_panel_ym.shape)
# display() is required: a bare expression is only rendered when it is the
# last statement of the cell.
display(node_panel_ym.head())

# Save geodataframe to parquet file; create the output folder first so the
# cell also works on a fresh checkout.
node_panel_path = Path("data/merged/berlin_bike_accident_node_panel.parquet")
node_panel_path.parent.mkdir(parents=True, exist_ok=True)

gpd.GeoDataFrame(
    node_panel_ym,
    geometry="geometry",
    crs=crossings_gdf.crs,  # or set explicitly
).to_parquet(
    node_panel_path,
    index=False
)



Node panel (crossing x year x month) shape: (175440, 8)


In [27]:
# Look at the overall description of the dataset.
# include="all" summarises every column, not only the numeric ones, so the
# geometry column is reported through count / unique / top / freq.
node_panel_ym.describe(include="all")

Unnamed: 0,node_id,year,month,monthly_strava_trips,total_accidents,geometry,risk_accidents_per_trip,risk_accidents_per_10k_trips
count,175440.0,175440.0,175440.0,175440.0,175440.0,175440,166995.0,166995.0
unique,,,,,,2924,,
top,,,,,,POINT (371406.10562136833 5808770.067929188),,
freq,,,,,,60,,
mean,1564.939808,2021.0,6.5,1415.950467,0.027611,,5.7e-05,0.56754
std,892.598709,1.414218,3.452062,2222.060432,0.172861,,0.001675,16.74964
min,1.0,2019.0,1.0,0.0,0.0,,0.0,0.0
25%,798.75,2020.0,3.75,165.0,0.0,,0.0,0.0
50%,1559.5,2021.0,6.5,660.0,0.0,,0.0,0.0
75%,2327.25,2022.0,9.25,1760.0,0.0,,0.0,0.0
