# Assigning accidents to segments from strava dataset

testing different ways of doing it and visualizing the results

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
import geopandas as gpd
import shapely

In [15]:
# set project root
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

## Read data from csv-files 

In [8]:
from src.preprocess_data import preprocess_accident_data as preprocess_accident_data
df_accidents = preprocess_accident_data()
df_accidents.head()

Loaded 9 files -> combined shape: (2098019, 32)
Filtered to bicycle accidents -> shape: (626844, 24)
Filtered to bicycle accidents in Berlin -> shape: (33181, 24)


Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,IstKrad,IstSonstige,IstStrassenzustand,LINREFX,LINREFY,XGCSWGS84,YGCSWGS84,source_file,ULICHTVERH,IstGkfz
307975,11,0,3,3,2018,1,15,4,3,6,...,0,0,1,798261.3849,5835047.0,13.403228,52.583472,Unfallorte2018_LinRef.csv,1,0
308120,11,0,3,3,2018,1,11,5,3,5,...,0,0,1,800551.721,5829808.0,13.432186,52.535255,Unfallorte2018_LinRef.csv,0,0
308130,11,0,2,2,2018,1,8,2,3,5,...,0,0,0,803320.7292,5827627.0,13.470897,52.514173,Unfallorte2018_LinRef.csv,0,0
308149,11,0,1,1,2018,1,19,4,3,5,...,0,0,1,798174.6913,5826940.0,13.394673,52.510848,Unfallorte2018_LinRef.csv,2,0
308175,11,0,9,9,2018,1,18,4,2,5,...,0,0,1,806109.6297,5821644.0,13.506372,52.458993,Unfallorte2018_LinRef.csv,2,0


# Spatial Join

## Clean Dataset for Accidents with locations outside the Strava Streetsegments
### Attempt 1: Use spatial join to assign segments to the accident locations
Challenges:
* need buffer on the accident locations -> what's the right size?
* because of the buffering, some accident locations have more than one segments assigned to them

notice how the joined dataset for 15m buffer is bigger than the original one. This is due to duplicate accidents that are assigned to more than one segment and therefore duplicated in the spatial join.

### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [17]:
# This code uses sjoin_nearest (attempt 2)
# load data
strava_segments = pd.read_parquet(path= PROJECT_ROOT / "data/strava/berlin_graph_geometry.parquet")
strava_segments["geometry"] = strava_segments["geometry"].apply(shapely.wkt.loads)
df_accidents = df_accidents.reset_index(drop=True)

# transform strava segments and accident locations to GeoDataFrames
strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")
accident_locations_gdf = gpd.GeoDataFrame(df_accidents, geometry=gpd.points_from_xy(df_accidents.XGCSWGS84, df_accidents.YGCSWGS84), crs="EPSG:4326")
accident_locations_gdf = accident_locations_gdf.to_crs("EPSG:32633")
strava_segments_gdf = strava_segments_gdf.to_crs("EPSG:32633")

# Add identifier to accidents
accident_locations_gdf = accident_locations_gdf.reset_index(drop=True)
accident_locations_gdf["acc_id"] = accident_locations_gdf.index

# Compute nearest segment
joined = gpd.sjoin_nearest(
    accident_locations_gdf,
    strava_segments_gdf,
    how="left",
    max_distance=17, # hyperparameter that can be tuned to make assignments as accurate as possible
    distance_col="dist"
)

# drop accidents without assigned segment (NaN in index_right)
joined = joined.dropna(subset=["index_right"])

# drop duplicate accidents (by distance to segment)
joined_nearest_unique = (
    joined
    .sort_values("dist")
    .drop_duplicates(subset=["acc_id"], keep="first")
)


print(f"Total accidents: {len(accident_locations_gdf)}")
print(f"Total segments: {len(strava_segments_gdf)}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print (f"Accidents with ambiguous nearest segment: {len(joined) - len(joined_nearest_unique)}")
print(f"ratio of assigned accidents: {len(joined_nearest_unique) / len(accident_locations_gdf):.2%}")

strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")
accident_locations_gdf = gpd.GeoDataFrame(df_accidents, geometry=gpd.points_from_xy(df_accidents.LINREFX, df_accidents.LINREFY), crs="EPSG:4326")

joined_nearest_unique.head()

Total accidents: 33181
Total segments: 4958
Accidents assigned to segments: 25125
Accidents with ambiguous nearest segment: 37
ratio of assigned accidents: 75.72%


Unnamed: 0,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,UKATEGORIE,UART,...,source_file,ULICHTVERH,IstGkfz,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,11,0,9,9,2021,4,9,4,2,1,...,Unfallorte_2021_LinRef.csv,0,0,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,11,0,5,5,2024,3,9,1,3,5,...,Unfallorte2024_LinRef.csv,0,0,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,11,0,9,9,2019,9,6,4,3,5,...,Unfallorte2019_LinRef.csv,1,0,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,11,0,12,12,2019,5,16,5,3,5,...,Unfallorte2019_LinRef.csv,0,0,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,11,0,9,9,2019,5,18,3,1,0,...,Unfallorte2019_LinRef.csv,0,0,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


### CHECK HOW WELL SPATIAL JOIN WORKS:  This function specifies a box within which all segments and their accidents are plotted

In [20]:
import folium; import numpy as np; import random

def plot_segments_accidents(segment_df, joined_df, accident_locations_df, coordinates, lat_tol, lon_tol, zoom, save_path=PROJECT_ROOT / "outputs/maps/join_map.html", save=False):

    m = folium.Map(location=coordinates, zoom_start=zoom, tiles=None)
    folium.TileLayer(
    tiles='https://basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png',
    attr='©CartoDB',
    name='CartoDB Light No Labels').add_to(m)
    
    colors = ["red", "blue", "green", "orange", "purple", "pink"]
    
    lat_list = []
    lon_list = []
    
    # 1. get all segments in the specified area
    # 2. add segment line for segment and correponding accidents in same color  
    
    min_lat, max_lat = coordinates[0] - lat_tol, coordinates[0] + lat_tol
    min_lon, max_lon = coordinates[1] - lon_tol, coordinates[1] + lon_tol
    sliced_df = segment_df[segment_df["latitude"].between(min_lat, max_lat) &
                segment_df["longitude"].between(min_lon, max_lon)]
    for segment_idx, segment_row in sliced_df.iterrows():
        color = random.choice(colors)
        segment_coords = [(lat, lon) for lon, lat in segment_row.geometry.coords]
        folium.PolyLine(
            locations=segment_coords,
            color=color,
            weight=5,
            opacity=0.5).add_to(m)
        for _, row in joined_df[joined_df["index_right"] == float(segment_idx)].iterrows():
            lat = row['YGCSWGS84']; lat_list.append(lat)
            lon = row['XGCSWGS84']; lon_list.append(lon)
            folium.CircleMarker(
                location=[lat, lon],
                radius=1,
                color=color,
                fill=True,
                fill_color=color,
                fill_opacity=1.0).add_to(m)
    
    # plot all accidents that are not added to any segment in black
    mask = (~accident_locations_df['YGCSWGS84'].isin(lat_list) & accident_locations_df['YGCSWGS84'].between(min_lat, max_lat)) & (~accident_locations_df['XGCSWGS84'].isin(lon_list) & accident_locations_df['XGCSWGS84'].between(min_lon, max_lon))
    filtered_rows = accident_locations_df[mask]
    print(len(filtered_rows))
    for _, row in filtered_rows.iterrows():
        lat = row['YGCSWGS84']
        lon = row['XGCSWGS84']
        folium.CircleMarker(
            location=[lat, lon],
            radius=1,
            color="black",
            fill=True,
            fill_color=color,
            fill_opacity=1.0).add_to(m)
    if save:
        m.save(save_path)
        print(f"Map saved to {save_path}")
    return m

# --- using the function ---
coordinates = (52.518589, 13.376665) # coordinates for berlin mitte
lat_tol = 1.5e-2; lon_tol = 3e-2
zoom = 14
map_obj = plot_segments_accidents(strava_segments_gdf, joined_nearest_unique, accident_locations_gdf,
                                  coordinates, lat_tol, lon_tol, zoom, save=True)
map_obj

972
Map saved to c:\Users\grasl\Documents\Studium\Master\WS_25_26\Data_Literacy_Project\data_literacy\outputs\maps\join_map.html


### This function displays a segment and surrounding accidents. In red the accidents that are assigned to this segment and in black all other accidents. (no longer needed)

In [22]:
import folium; import numpy as np

def plot_segment_on_map(df, segment, segment_idx, save_path=PROJECT_ROOT / "outputs/maps/map.html", save=False):

    # 2. Extract coordinates
    # Shapely (and WKT) uses (Longitude, Latitude) -> (x, y), Folium requires [Latitude, Longitude] -> [y, x]
    # We must swap them for the map to render correctly.
    segment_coords = [(lat, lon) for lon, lat in segment.coords]

    # 3. Initialize the Map
    # Center the map the mean point of segment
    lat_arr = [lat for lat, long in segment_coords]; long_arr = [long for lat, long in segment_coords]
    start_loc = (np.mean(np.array(lat_arr)), np.mean(np.array(long_arr)))
    m = folium.Map(location=start_loc, zoom_start=16, tiles=None)
    folium.TileLayer(
    tiles='https://basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png',
    attr='©CartoDB',
    name='CartoDB Light No Labels').add_to(m)

    # 4. Add the segment Line
    folium.PolyLine(
        locations=segment_coords,
        color="blue",
        weight=5,
        opacity=0.8,
        tooltip="Your Path"
    ).add_to(m)
    
    # 6. Add accidents (first define how big the radius around segment should be, we dont want to plot all accidents)
    lat_tol = 1e-2; lon_tol = 2e-2
    min_lat, max_lat = start_loc[0] - lat_tol, start_loc[0] + lat_tol
    min_lon, max_lon = start_loc[1] - lon_tol, start_loc[1] + lon_tol
    
    for _, row in df.iterrows():
        lat = row['YGCSWGS84']
        lon = row['XGCSWGS84']
        
        # if accident belongs to given segment color==red
        if row["index_right"] == float(segment_idx):
            folium.CircleMarker(
                location=[lat, lon],
                radius=1,
                color='red',
                fill=True, 
                fill_color='black',
                fill_opacity=0.6,
                popup=f"Accident ID: {row['acc_id']}, Nearest Segment Index: {row['index_right']}"
                ).add_to(m)
        # if accident does not belong to given segment color==black and is still in reasonable proximity to segment
        elif (min_lat < lat < max_lat) and (min_lon < lon < max_lon):
            folium.CircleMarker(
                location=[lat, lon],
                radius=1,
                color='black',
                fill=True, 
                fill_color='black',
                fill_opacity=0.6,
                popup=f"Accident ID: {row['acc_id']}, Nearest Segment Index: {row['index_right']}"
                ).add_to(m)

    if save:
        m.save(save_path)
        print(f"Map saved to {output_file}")
    return m


# --- using the function ---
# 1. select strava segment by idx
segment_idx = 4000
segment_points = strava_segments_gdf.geometry[segment_idx]


# 2. run the function and display the map
map_obj = plot_segment_on_map(joined_nearest_unique, segment_points, segment_idx)
map_obj

### Map displaying all accidents and strava segments

In [23]:
# create map displaying accidents and strava segments
map = folium.Map(location=[52.52, 13.40], zoom_start=12) 
strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")

# add strava segements to map
layer_segments = folium.FeatureGroup("Strava Segments", show=True)
folium.GeoJson(
    strava_segments_gdf,
    style_function=lambda x: {"color": "green"}
).add_to(layer_segments)
map.add_child(layer_segments)


# add accidents to map
layer_accidents = folium.FeatureGroup("Bicycle Accidents", show=True)
for _, row in joined_nearest_unique.iterrows():
    lat = row['YGCSWGS84']  
    lon = row['XGCSWGS84']  
    folium.CircleMarker(
        location=[lat, lon],
        radius=1,
        color='black',
        fill=True, 
        fill_color='black',
        fill_opacity=0.6,
        popup=f"Accident ID: {row['acc_id']}, Nearest Segment Index: {row['index_right']}"
).add_to(layer_accidents)   
map.add_child(layer_accidents)

folium.LayerControl().add_to(map)
map.save(PROJECT_ROOT / "outputs/maps/bicycle_accidents_with_segments_berlin.html")