# Assigning accidents to segments from strava dataset

This notebook tests different ways of assigning accidents to Strava street segments and visualizes the results.

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
import geopandas as gpd
import shapely


In [18]:
# Set the project root so that `src` can be imported and data paths can be
# built relative to it.
# NOTE: the root is derived from the current working directory, i.e. this
# assumes the kernel was started from a direct sub-folder of the repository.
from pathlib import Path
import sys

PROJECT_ROOT = Path.cwd().parent

# Only register the root once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

## Read data from csv-files 

In [19]:
# Load the accident records through the project's preprocessing helper and
# preview the first rows of the resulting frame.
from src.preprocess_data import preprocess_accident_data

df_accidents = preprocess_accident_data()
df_accidents.head()

Loaded 9 files -> combined shape: (2098019, 32)
Dropped irrelevant columns -> shape: (2098019, 24)
Filtered to bicycle accidents -> shape: (626844, 24)
Filtered to bicycle accidents in Berlin -> shape: (33181, 24)


Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,IstKrad,IstSonstige,IstStrassenzustand,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84,source_file,ULICHTVERH,IstGkfz
307975,11,0,3,3,2018,1,15,4,3,6,...,0,0,1,798261.3849,5835047.0,13.403228,52.583472,Unfallorte2018_LinRef.csv,1,0
308120,11,0,3,3,2018,1,11,5,3,5,...,0,0,1,800551.721,5829808.0,13.432186,52.535255,Unfallorte2018_LinRef.csv,0,0
308130,11,0,2,2,2018,1,8,2,3,5,...,0,0,0,803320.7292,5827627.0,13.470897,52.514173,Unfallorte2018_LinRef.csv,0,0
308149,11,0,1,1,2018,1,19,4,3,5,...,0,0,1,798174.6913,5826940.0,13.394673,52.510848,Unfallorte2018_LinRef.csv,2,0
308175,11,0,9,9,2018,1,18,4,2,5,...,0,0,1,806109.6297,5821644.0,13.506372,52.458993,Unfallorte2018_LinRef.csv,2,0


# Spatial Join

## Clean the dataset: remove accidents located outside the Strava street segments
### Attempt 1: Use spatial join to assign segments to the accident locations
Challenges:
* need buffer on the accident locations -> what's the right size?
* because of the buffering, some accident locations have more than one segment assigned to them

Notice how the joined dataset for a 15 m buffer is bigger than the original one. This is because accidents that are assigned to more than one segment are duplicated in the spatial join.

### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [20]:
# This code uses sjoin_nearest (attempt 2)

WGS84 = "EPSG:4326"        # lon/lat, the CRS of the source columns and of the folium maps
METRIC_CRS = "EPSG:32633"  # WGS 84 / UTM zone 33N, so distances are measured in metres
# hyperparameter that can be tuned to make assignments as accurate as possible
# (expressed in units of METRIC_CRS)
MAX_ASSIGN_DISTANCE = 17

# load data
strava_segments = pd.read_parquet(path=PROJECT_ROOT / "data/strava/berlin_graph_geometry.parquet")
strava_segments["geometry"] = strava_segments["geometry"].apply(shapely.wkt.loads)
df_accidents = df_accidents.reset_index(drop=True)

# Build the lon/lat GeoDataFrames exactly once. These are the frames the
# plotting cells further down work with.
# The accident geometry is built from XGCSWGS84 / YGCSWGS84; LINREFX / LINREFY
# hold values such as 798261 / 5835047 and therefore must not be labelled as
# EPSG:4326 coordinates.
strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs=WGS84)
accident_locations_gdf = gpd.GeoDataFrame(
    df_accidents,
    geometry=gpd.points_from_xy(df_accidents.XGCSWGS84, df_accidents.YGCSWGS84),
    crs=WGS84,
)

# Projected copies used only for the distance-based join
segments_metric = strava_segments_gdf.to_crs(METRIC_CRS)
accidents_metric = accident_locations_gdf.to_crs(METRIC_CRS)

# Add identifier to accidents (needed to de-duplicate the join result)
accidents_metric = accidents_metric.reset_index(drop=True)
accidents_metric["acc_id"] = accidents_metric.index

# Compute nearest segment; accidents farther away than MAX_ASSIGN_DISTANCE
# get no match (NaN in index_right) because of how="left"
joined = gpd.sjoin_nearest(
    accidents_metric,
    segments_metric,
    how="left",
    max_distance=MAX_ASSIGN_DISTANCE,
    distance_col="dist",
)

# drop accidents without assigned segment (NaN in index_right)
joined = joined.dropna(subset=["index_right"])

# An accident that is equally close to several segments appears once per
# segment. Keep a single row per accident; the stable sort makes the choice
# among equidistant segments reproducible (first one in join order wins).
joined_nearest_unique = (
    joined
    .sort_values("dist", kind="stable")
    .drop_duplicates(subset=["acc_id"], keep="first")
)

print(f"Total accidents: {len(accident_locations_gdf)}")
print(f"Total segments: {len(strava_segments_gdf)}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print(f"Accidents with ambiguous nearest segment: {len(joined) - len(joined_nearest_unique)}")
print(f"ratio of assigned accidents: {len(joined_nearest_unique) / len(accident_locations_gdf):.2%}")

joined_nearest_unique.head()

Total accidents: 33181
Total segments: 4958
Accidents assigned to segments: 25125
Accidents with ambiguous nearest segment: 37
ratio of assigned accidents: 75.72%


Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,source_file,ULICHTVERH,IstGkfz,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,11,0,9,9,2021,4,9,4,2,1,...,Unfallorte_2021_LinRef.csv,0,0,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,11,0,5,5,2024,3,9,1,3,5,...,Unfallorte2024_LinRef.csv,0,0,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,11,0,9,9,2019,9,6,4,3,5,...,Unfallorte2019_LinRef.csv,1,0,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,11,0,12,12,2019,5,16,5,3,5,...,Unfallorte2019_LinRef.csv,0,0,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,11,0,9,9,2019,5,18,3,1,0,...,Unfallorte2019_LinRef.csv,0,0,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


### Check how well the spatial join works: the function below plots all segments inside a bounding box together with their assigned accidents

In [21]:
import folium; import numpy as np; import random

def plot_segments_accidents(segment_df, joined_df, accident_locations_df, coordinates, lat_tol, lon_tol, zoom, save_path=PROJECT_ROOT / "outputs/maps/join_map.html", save=False):
    """Plot the segments inside a bounding box together with their accidents.

    Every segment whose ``latitude``/``longitude`` lies inside the box gets a
    randomly chosen colour; the accidents joined to that segment are drawn as
    small dots in the same colour. Accidents inside the box that were not
    drawn for any of these segments are shown in black.

    Parameters
    ----------
    segment_df : frame with ``latitude``, ``longitude`` and a ``geometry``
        column whose ``coords`` yield (lon, lat) pairs. Its index is compared
        against ``joined_df["index_right"]``.
    joined_df : join result with columns ``index_right``, ``YGCSWGS84``
        (latitude) and ``XGCSWGS84`` (longitude).
    accident_locations_df : all accidents, with ``YGCSWGS84`` / ``XGCSWGS84``.
    coordinates : (lat, lon) of the box centre, also used as map centre.
    lat_tol, lon_tol : half height / half width of the box.
    zoom : initial folium zoom level.
    save_path : target HTML file, only used when ``save`` is true.
    save : write the map to ``save_path`` when true.

    Returns
    -------
    folium.Map
    """
    m = folium.Map(location=coordinates, zoom_start=zoom, tiles=None)
    folium.TileLayer(
        tiles='https://basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png',
        attr='©CartoDB',
        name='CartoDB Light No Labels').add_to(m)

    colors = ["red", "blue", "green", "orange", "purple", "pink"]

    min_lat, max_lat = coordinates[0] - lat_tol, coordinates[0] + lat_tol
    min_lon, max_lon = coordinates[1] - lon_tol, coordinates[1] + lon_tol

    # (lat, lon) of every accident already drawn in a segment colour. Pairs
    # are stored together: checking latitude and longitude separately would
    # hide any accident that merely shares one coordinate with a drawn one.
    drawn_points = set()

    # 1. get all segments in the specified area
    # 2. add segment line for segment and corresponding accidents in same color
    segments_in_box = segment_df[segment_df["latitude"].between(min_lat, max_lat) &
                                 segment_df["longitude"].between(min_lon, max_lon)]
    for segment_idx, segment_row in segments_in_box.iterrows():
        color = random.choice(colors)
        # geometry coords are (lon, lat); folium expects (lat, lon)
        segment_coords = [(lat, lon) for lon, lat in segment_row.geometry.coords]
        folium.PolyLine(
            locations=segment_coords,
            color=color,
            weight=5,
            opacity=0.5).add_to(m)

        segment_accidents = joined_df[joined_df["index_right"] == float(segment_idx)]
        for lat, lon in zip(segment_accidents["YGCSWGS84"], segment_accidents["XGCSWGS84"]):
            drawn_points.add((lat, lon))
            folium.CircleMarker(
                location=[lat, lon],
                radius=1,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=1.0).add_to(m)

    # plot all accidents in the box that were not drawn for any segment in black
    in_box = (accident_locations_df["YGCSWGS84"].between(min_lat, max_lat)
              & accident_locations_df["XGCSWGS84"].between(min_lon, max_lon))
    accidents_in_box = accident_locations_df[in_box]
    for lat, lon in zip(accidents_in_box["YGCSWGS84"], accidents_in_box["XGCSWGS84"]):
        if (lat, lon) in drawn_points:
            continue
        folium.CircleMarker(
            location=[lat, lon],
            radius=1,
            color="black",
            fill=True,
            # must not reuse the segment loop's `color`: it is undefined when
            # no segment is in the box and random otherwise
            fill_color="black",
            fill_opacity=1.0).add_to(m)

    if save:
        # pass a plain string so the call does not depend on Path support
        m.save(str(save_path))
        print(f"Map saved to {save_path}")
    return m

# --- using the function ---
coordinates = (52.518589, 13.376665) # coordinates for berlin mitte
lat_tol = 1.5e-2; lon_tol = 3e-2
zoom = 14
map_obj = plot_segments_accidents(strava_segments_gdf, joined_nearest_unique, accident_locations_gdf,
                                  coordinates, lat_tol, lon_tol, zoom, save=False)
map_obj

### How well can we identify junctions?

In [22]:
# Load the junction panel and the accident-to-junction assignment.
# Paths are built from PROJECT_ROOT like everywhere else in this notebook
# instead of a hard-coded "../data" relative path.
MERGED_DIR = PROJECT_ROOT / "data" / "merged"
junction_df = gpd.read_parquet(MERGED_DIR / "berlin_bike_accident_node_panel.parquet")
acc_node_df = gpd.read_parquet(MERGED_DIR / "acc_node.parquet")

# Ensure expected CRS for folium (lat/lon)
junction_df = junction_df.to_crs(epsg=4326)
acc_node_df = acc_node_df.to_crs(epsg=4326)

# If these columns aren't present yet, create them from geometry
# (uses geometry.x / geometry.y, i.e. presumes point geometries)
for _frame in (junction_df, acc_node_df):
    if "longitude" not in _frame.columns:
        _frame["longitude"] = _frame.geometry.x
    if "latitude" not in _frame.columns:
        _frame["latitude"] = _frame.geometry.y

# Backwards compatibility: if you didn't add has_crossing, derive it
# from whether a node_id was assigned to the accident
if "has_crossing" not in acc_node_df.columns:
    acc_node_df["has_crossing"] = acc_node_df["node_id"].notna()


In [23]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
import folium

def sanity_check_junctions(
    junction_panel_gdf,
    acc_node_gdf,
    zoom,
    coordinates,
    lat_tol,
    lon_tol,
    *,
    show_unassigned=True,
    show_assigned=True,
):
    """
    Draw junctions and accidents inside a bounding box on a folium map.

    Shown on the map:
      - one marker per ``node_id`` of the junction panel (rows of the same
        node are collapsed; accidents and trips are summed for the tooltip)
      - if ``show_assigned``: accidents whose ``node_id`` equals that junction,
        as small dots in the junction's randomly chosen colour
      - if ``show_unassigned``: accidents in the box with a missing
        ``node_id``, as black circles

    The box is ``coordinates`` (lat, lon) +/- ``lat_tol`` / ``lon_tol``.
    Both frames need ``latitude`` and ``longitude`` columns in the same
    lat/lon system as ``coordinates`` (EPSG:4326). Returns the folium map.
    """
    center_lat, center_lon = coordinates[0], coordinates[1]
    lat_range = (center_lat - lat_tol, center_lat + lat_tol)
    lon_range = (center_lon - lon_tol, center_lon + lon_tol)

    def _clip_to_bbox(frame):
        # Rows whose latitude/longitude columns fall inside the box.
        inside = frame["latitude"].between(*lat_range) & frame["longitude"].between(*lon_range)
        return frame[inside].copy()

    tab20 = plt.get_cmap("tab20")
    palette = [mcolors.rgb2hex(tab20(i)) for i in range(20)]

    # Base map without default tiles, then a label-free CartoDB layer
    m = folium.Map(location=coordinates, zoom_start=zoom, tiles=None)
    base_layer = folium.TileLayer(
        tiles="https://basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png",
        attr="©CartoDB",
        name="CartoDB Light No Labels",
    )
    base_layer.add_to(m)

    junctions_in_view = _clip_to_bbox(junction_panel_gdf)
    accidents_in_view = _clip_to_bbox(acc_node_gdf)

    # The panel holds several rows per node_id; collapse them to one row
    # per junction for display.
    per_junction = junctions_in_view.groupby("node_id", as_index=False).agg(
        {
            "total_accidents": "sum",
            "monthly_strava_trips": "sum",
            "longitude": "first",
            "latitude": "first",
        }
    )

    for _, junction in per_junction.iterrows():
        node_id = junction["node_id"]
        shade = random.choice(palette)
        junction_location = [float(junction["latitude"]), float(junction["longitude"])]

        # Larger dot for the junction itself
        folium.CircleMarker(
            location=junction_location,
            radius=3,
            color=shade,
            fill=True,
            fill_color=shade,
            fill_opacity=1.0,
            tooltip=f"Junction ID: {node_id} | accidents={int(junction['total_accidents'])} | trips={int(junction['monthly_strava_trips'])}",
        ).add_to(m)

        if not show_assigned:
            continue

        # Small dots in the same colour for the accidents of this junction
        members = accidents_in_view[accidents_in_view["node_id"] == node_id]
        for _, accident in members.iterrows():
            accident_location = [float(accident["latitude"]), float(accident["longitude"])]
            folium.CircleMarker(
                location=accident_location,
                radius=1,
                color=shade,
                fill=True,
                fill_color=shade,
                fill_opacity=1.0,
                tooltip=f"acc_id={accident.get('acc_id', 'NA')} | node_id={node_id} | dist={accident.get('dist_node', 'NA')}",
            ).add_to(m)

    if show_unassigned:
        # Accidents in the box that have no junction: black, unfilled
        without_node = accidents_in_view[accidents_in_view["node_id"].isna()]
        for _, accident in without_node.iterrows():
            accident_location = [float(accident["latitude"]), float(accident["longitude"])]
            folium.CircleMarker(
                location=accident_location,
                radius=0.8,
                color="black",
                fill=False,
                fill_opacity=1.0,
                tooltip=f"UNASSIGNED | acc_id={accident.get('acc_id', 'NA')} | dist={accident.get('dist_node', 'NA')}",
            ).add_to(m)

    return m


# Box centred on Berlin Mitte; the tolerances are the half extents of the box.
coordinates = (52.518589, 13.376665)
lat_tol = 0.015
lon_tol = 0.03
zoom = 14

m = sanity_check_junctions(
    junction_df,
    acc_node_df,
    zoom,
    coordinates,
    lat_tol,
    lon_tol,
)
m


### Map displaying all accidents and strava segments

In [24]:
# create map displaying accidents and strava segments
# (named `accident_map`, not `map`, so the builtin `map` is not shadowed for
# the rest of the kernel session)
accident_map = folium.Map(location=[52.52, 13.40], zoom_start=12)
strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")

# add strava segments to map
layer_segments = folium.FeatureGroup("Strava Segments", show=True)
folium.GeoJson(
    strava_segments_gdf,
    style_function=lambda x: {"color": "green"}
).add_to(layer_segments)
accident_map.add_child(layer_segments)

# add accidents to map (only those that were assigned to a segment)
layer_accidents = folium.FeatureGroup("Bicycle Accidents", show=True)
for _, row in joined_nearest_unique.iterrows():
    lat = row['YGCSWGS84']
    lon = row['XGCSWGS84']
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.6,
        popup=f"Accident ID: {row['acc_id']}, Nearest Segment Index: {row['index_right']}"
    ).add_to(layer_accidents)
accident_map.add_child(layer_accidents)

folium.LayerControl().add_to(accident_map)

# make sure the target folder exists before writing, and hand folium a plain
# string path
output_path = PROJECT_ROOT / "outputs/maps/bicycle_accidents_with_segments_berlin.html"
output_path.parent.mkdir(parents=True, exist_ok=True)
accident_map.save(str(output_path))