In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
import geopandas as gpd
import shapely

  import pkg_resources


# Accidents data (for Berlin and bike-related only)

In [None]:
csv_dir = Path("data/csv")
csv_files = sorted(csv_dir.glob("*.csv"))

if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {csv_dir.resolve()}")

dfs = []
for fp in csv_files:
    df = pd.read_csv(fp, low_memory=False, delimiter=";")
    df["source_file"] = fp.name 
    dfs.append(df)

accidents = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(csv_files)} files -> combined shape: {accidents.shape}")

#print all column names
print("Columns:", accidents.columns.tolist())

accidents.head()

Loaded 9 files -> combined shape: (2098019, 35)
Columns: ['OBJECTID', 'UIDENTSTLA', 'ULAND', 'UREGBEZ', 'UKREIS', 'UGEMEINDE', 'UJAHR', 'UMONAT', 'USTUNDE', 'UWOCHENTAG', 'UKATEGORIE', 'UART', 'UTYP1', 'IstRad', 'IstPKW', 'IstFuss', 'IstKrad', 'IstSonstig', 'LICHT', 'STRZUSTAND', 'LINREFX', 'LINREFY', 'XGCSWGS84', 'YGCSWGS84', 'source_file', 'OBJECTID_1', 'ULICHTVERH', 'IstGkfz', 'IstSonstige', 'UIDENTSTLAE', 'IstStrassenzustand', 'OID_', 'PLST', 'FID', 'IstStrasse']


Unnamed: 0,OBJECTID,UIDENTSTLA,ULAND,UREGBEZ,UKREIS,UGEMEINDE,UJAHR,UMONAT,USTUNDE,UWOCHENTAG,...,OBJECTID_1,ULICHTVERH,IstGkfz,IstSonstige,UIDENTSTLAE,IstStrassenzustand,OID_,PLST,FID,IstStrasse
0,1.0,1170113152013852017,1,0,55,12,2017,1,5,6,...,,,,,,,,,,
1,2.0,1170113171013912017,1,0,60,53,2017,1,6,6,...,,,,,,,,,,
2,3.0,1170106105132242017,1,0,61,11,2017,1,18,6,...,,,,,,,,,,
3,4.0,1170114152013542018,1,0,55,42,2017,1,12,7,...,,,,,,,,,,
4,5.0,1170106161013732017,1,0,62,60,2017,1,8,6,...,,,,,,,,,,


## Renaming columns in English

In [None]:
accident_columns_en = {
    # IDs & metadata
    "OBJECTID": "object_id",
    "OBJECTID_1": "object_id_alt",
    "OID_": "oid",
    "FID": "fid",
    "source_file": "source_file",

    # Unique accident identifiers
    "UIDENTSTLA": "accident_id",
    "UIDENTSTLAE": "accident_id_extended",

    # Administrative divisions
    "ULAND": "land_code",
    "UREGBEZ": "admin_region_code",
    "UKREIS": "district_code",
    "UGEMEINDE": "municipality_code",

    # Time
    "UJAHR": "year",
    "UMONAT": "month",
    "USTUNDE": "hour",
    "UWOCHENTAG": "weekday",

    # Accident classification
    "UKATEGORIE": "injury_severity",
    "UART": "accident_kind",
    "UTYP1": "accident_type",

    # Participants involved (0 or 1)
    "IstRad": "involved_bicycle",
    "IstPKW": "involved_passenger_car",
    "IstFuss": "involved_pedestrian",
    "IstKrad": "involved_motorcycle",
    "IstSonstig": "involved_other_vehicle_old",     # older variant
    "IstGkfz": "involved_goods_vehicle",
    "IstSonstige": "involved_other_vehicle",
    "IstStrasse": "involved_road",
    "IstStrassenzustand": "road_condition_flag",

    # Environmental conditions
    "LICHT": "light_condition_old",
    "ULICHTVERH": "light_condition",                # official variable
    "STRZUSTAND": "road_condition",

    # Data quality
    "PLST": "plausibility_level",
}

accidents = accidents.rename(columns=accident_columns_en)

# drop all accidents that did not involve bicycles (column 'IstRad' != 1)
accidents_bike_berlin = accidents[accidents['involved_bicycle'] == 1].copy()
print(f"Filtered to bicycle accidents -> shape: {accidents_bike_berlin.shape}")

# only keep accidents in Berlin (column 'ULAND' == 11)
accidents_bike_berlin = accidents_bike_berlin[accidents_bike_berlin['land_code'] == 11].copy()
print(f"Filtered to bicycle accidents in Berlin -> shape: {accidents_bike_berlin.shape}")

# Fix decimal commas → floats in coordinate columns for Berlin future analysis
accidents_bike_berlin["XGCSWGS84"] = (
    accidents_bike_berlin["XGCSWGS84"]
    .astype(str).str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)
accidents_bike_berlin["YGCSWGS84"] = (
    accidents_bike_berlin["YGCSWGS84"]
    .astype(str).str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)

accidents_bike_berlin["LINREFX"] = (
    accidents_bike_berlin["LINREFX"]
    .astype(str).str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)
accidents_bike_berlin["LINREFY"] = (
    accidents_bike_berlin["LINREFY"]
    .astype(str).str.replace(",", ".", regex=False)
    .pipe(pd.to_numeric, errors="coerce")
)

display(accidents_bike_berlin.describe())
accidents_bike_berlin.head()

for col in accidents_bike_berlin.columns:
    uniq_cnt = accidents_bike_berlin[col].nunique(dropna=True)
    first_vals = accidents_bike_berlin[col].head(5).tolist()
    print(f"{col}: uniques={uniq_cnt}; first5={first_vals}")


Filtered to bicycle accidents -> shape: (626844, 35)
Filtered to bicycle accidents in Berlin -> shape: (33181, 35)


Unnamed: 0,object_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,injury_severity,...,YGCSWGS84,object_id_alt,light_condition,involved_goods_vehicle,involved_other_vehicle,road_condition_flag,oid,plausibility_level,fid,involved_road
count,14772.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,33181.0,...,33181.0,5192.0,33181.0,33181.0,27989.0,17875.0,13217.0,8924.0,0.0,0.0
mean,173918.322299,11.0,0.0,5.011724,5.011724,2020.887375,6.806938,13.481179,4.076731,2.866158,...,52.506832,192975.22188,0.341008,0.021036,0.089428,0.197762,219381.787622,1.013559,,
std,58196.42271,0.0,0.0,3.466895,3.466895,2.009997,2.911945,4.595099,1.741427,0.346628,...,0.044248,27899.278571,0.714326,0.143507,0.285366,0.41524,37183.087676,0.115657,,
min,3331.0,11.0,0.0,1.0,1.0,2018.0,1.0,0.0,1.0,1.0,...,52.366052,112747.0,0.0,0.0,0.0,0.0,112180.0,1.0,,
25%,140264.75,11.0,0.0,2.0,2.0,2019.0,5.0,10.0,3.0,3.0,...,52.481865,199302.5,0.0,0.0,0.0,0.0,213555.0,1.0,,
50%,194507.5,11.0,0.0,4.0,4.0,2021.0,7.0,14.0,4.0,3.0,...,52.5095,202189.5,0.0,0.0,0.0,0.0,236988.0,1.0,,
75%,231123.5,11.0,0.0,8.0,8.0,2023.0,9.0,17.0,5.0,3.0,...,52.534254,205066.75,0.0,0.0,0.0,0.0,241075.0,1.0,,
max,241014.0,11.0,0.0,12.0,12.0,2024.0,12.0,23.0,7.0,3.0,...,52.660146,208851.0,2.0,1.0,1.0,2.0,268445.0,2.0,,


object_id: uniques=14772; first5=[nan, nan, nan, nan, nan]
accident_id: uniques=0; first5=[nan, nan, nan, nan, nan]
land_code: uniques=1; first5=[11, 11, 11, 11, 11]
admin_region_code: uniques=1; first5=[0, 0, 0, 0, 0]
district_code: uniques=12; first5=[3, 3, 2, 1, 9]
municipality_code: uniques=12; first5=[3, 3, 2, 1, 9]
year: uniques=7; first5=[2018, 2018, 2018, 2018, 2018]
month: uniques=12; first5=[1, 1, 1, 1, 1]
hour: uniques=24; first5=[15, 11, 8, 19, 18]
weekday: uniques=7; first5=[4, 5, 2, 4, 4]
injury_severity: uniques=3; first5=[3, 3, 3, 3, 2]
accident_kind: uniques=10; first5=[6, 5, 5, 5, 5]
accident_type: uniques=7; first5=[7, 2, 2, 7, 3]
involved_bicycle: uniques=1; first5=[1, 1, 1, 1, 1]
involved_passenger_car: uniques=2; first5=[0, 1, 1, 1, 1]
involved_pedestrian: uniques=2; first5=[1, 0, 0, 0, 0]
involved_motorcycle: uniques=2; first5=[0, 0, 0, 0, 0]
involved_other_vehicle_old: uniques=2; first5=[0.0, 0.0, 0.0, 0.0, 0.0]
light_condition_old: uniques=0; first5=[nan, nan, 

# Spatial Join: Accidents with Strava data (code from Luise and Eric)


### Attempt 2: Use sjoin_nearest to assign exactly one (the nearest) segment to each accident
Challenges:
* need to find the right maximum distance so accidents that are not on a segment are not assigned to one.
* assigns two segments if their distance is equal

In [31]:
# This code uses sjoin_nearest (attempt 2)
# load data
strava_segments = pd.read_parquet(path="data/strava/berlin_graph_geometry.parquet")
strava_segments["geometry"] = strava_segments["geometry"].apply(shapely.wkt.loads)
accidents_bike_berlin = accidents_bike_berlin.reset_index(drop=True)

# transform strava segments and accident locations to GeoDataFrames
strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")
accident_locations_gdf = gpd.GeoDataFrame(accidents_bike_berlin, geometry=gpd.points_from_xy(accidents_bike_berlin.XGCSWGS84, accidents_bike_berlin.YGCSWGS84), crs="EPSG:4326")
accident_locations_gdf = accident_locations_gdf.to_crs("EPSG:32633")
strava_segments_gdf = strava_segments_gdf.to_crs("EPSG:32633")

# Add identifier to accidents
accident_locations_gdf = accident_locations_gdf.reset_index(drop=True)
accident_locations_gdf["acc_id"] = accident_locations_gdf.index

# Compute nearest segment
joined = gpd.sjoin_nearest(
    accident_locations_gdf,
    strava_segments_gdf,
    how="left",
    max_distance=10, # hyperparameter that can be tuned to make assignments as accurate as possible
    distance_col="dist"
)

# drop accidents without assigned segment (NaN in index_right)
joined = joined.dropna(subset=["index_right"])

# drop duplicate accidents (by distance to segment)
joined_nearest_unique = (
    joined
    .sort_values("dist")
    .drop_duplicates(subset=["acc_id"], keep="first")
)


print(f"Total accidents: {len(accident_locations_gdf)}")
print(f"Total Bike network Strava segments: {len(strava_segments_gdf)}")
print(f"Unique Bike network Strava segments in matched dataset: {joined_nearest_unique['counter_name'].nunique()}")
print(f"Accidents assigned to segments: {len(joined_nearest_unique)}")
print (f"Accidents with ambiguous nearest segment: {len(joined) - len(joined_nearest_unique)}")
print(f"Ratio of assigned accidents: {len(joined_nearest_unique) / len(accident_locations_gdf):.2%}")

strava_segments_gdf = gpd.GeoDataFrame(strava_segments, geometry="geometry", crs="EPSG:4326")
accident_locations_gdf = gpd.GeoDataFrame(accidents_bike_berlin, geometry=gpd.points_from_xy(accidents_bike_berlin.LINREFX, accidents_bike_berlin.LINREFY), crs="EPSG:4326")

joined_nearest_unique.head()

Total accidents: 33181
Total Bike network Strava segments: 4958
Unique Bike network Strava segments in matched dataset: 3570
Accidents assigned to segments: 21666
Accidents with ambiguous nearest segment: 21
Ratio of assigned accidents: 65.30%


Unnamed: 0,object_id,accident_id,land_code,admin_region_code,district_code,municipality_code,year,month,hour,weekday,...,plausibility_level,fid,involved_road,geometry,acc_id,index_right,counter_name,latitude,longitude,dist
29872,,,11,0,9,9,2021,4,9,4,...,,,,POINT (397322.52 5813776.685),29872,4661.0,streetsegment_4661,52.461939,13.492277,4.7e-05
25448,,,11,0,5,5,2024,3,9,1,...,1.0,,,POINT (377489.976 5821932.897),25448,4436.0,streetsegment_4436,52.533763,13.194189,9.8e-05
9074,199348.0,,11,0,9,9,2019,9,6,4,...,,,,POINT (405293.008 5812309.772),9074,2567.0,streetsegment_2567,52.452859,13.606787,0.000193
6943,194582.0,,11,0,12,12,2019,5,16,5,...,,,,POINT (385705.621 5826533.591),6943,1210.0,streetsegment_1210,52.577151,13.310916,0.000222
7115,194994.0,,11,0,9,9,2019,5,18,3,...,,,,POINT (410373.006 5803066.518),7115,4637.0,streetsegment_4637,52.371061,13.677376,0.000236


# Strava data (bicycle network traffic, other features - daily)

1. We need to aggregate this df to the same granularity as in Accidents data (segment, year, month, weekday) to join. 
2. We can not join only by geo data, as Accidents don't have date column, but Strava contains daily info (eg specific traffic volume or weather on specific day)
3. We can not just calculate mean of all columns in Strava data by year-month-etc... as we also have categorial features (for example `infrastructure_bicyclelane_type`) and some features are constant over time for segment (eg `infrastructure_max_speed` in dataset is constant for segment for all dates)

In [18]:
strava_berlin_data = pd.read_parquet(path="data/strava/berlin_data.parquet")
strava_berlin_data.columns.tolist()


['counter_name',
 'date',
 'count',
 'year',
 'latitude',
 'longitude',
 'geometry',
 'socioeconomic_total_population',
 'socioeconomic_share_residents_5plus_years_same_address',
 'socioeconomic_net_migration_per_100',
 'socioeconomic_migration_volume_per_100',
 'socioeconomic_share_under_18',
 'socioeconomic_share_65_and_older',
 'socioeconomic_youth_dependency_ratio',
 'socioeconomic_old_age_dependency_ratio',
 'socioeconomic_average_age',
 'socioeconomic_greying_index',
 'socioeconomic_share_with_migration_background',
 'socioeconomic_share_foreign_nationals',
 'socioeconomic_share_foreign_eu_nationals',
 'socioeconomic_share_foreign_non_eu_nationals',
 'socioeconomic_gender_distribution',
 'socioeconomic_total_fertility_rate',
 'socioeconomic_unemployment_rate_age_15_to_65',
 'infrastructure_count_education_within0.05km',
 'infrastructure_count_hospitals_within0.05km',
 'infrastructure_count_shops_within0.05km',
 'infrastructure_count_industry_within0.05km',
 'infrastructure_count_

In [None]:
#TODO 
# We need to analyze all features in strava_berlin_data and decide how to aggregate them for segment-year-month-weekday format



## To see which data types we have as features

Results:

1. Mostly we have numerical features, but also categorical ones like `'infrastructure_bicyclelane_type'` - we will check if we need to aggregate them somehow or they are contstant over time.
2. Analysis shows:
    - **Numeric columns (111)**: Traffic counts, speeds, socioeconomic indicators, weather data
    - **Categorical columns**: Infrastructure types, activity types, street properties
    - **Boolean columns (8)**: Holiday flags, weekend indicators, data quality flags
3. **Key finding**: All connectivity and infrastructure columns are constant per segment, so they only need to be taken once per segment. Socioeconomic, motorized, strava, and weather columns vary over time and require aggregation by year-month-weekday.

In [33]:
import pandas as pd

df = strava_berlin_data.copy()

# Basic split by dtype
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=["number"]).columns.tolist()
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()

print("Numeric:", len(numeric_cols))
print(numeric_cols)
print("\nCategorical:", len(categorical_cols))
print(categorical_cols)
print("\nBool:", len(bool_cols))
print(bool_cols)


Numeric: 111
['count', 'latitude', 'longitude', 'socioeconomic_total_population', 'socioeconomic_share_residents_5plus_years_same_address', 'socioeconomic_net_migration_per_100', 'socioeconomic_migration_volume_per_100', 'socioeconomic_share_under_18', 'socioeconomic_share_65_and_older', 'socioeconomic_youth_dependency_ratio', 'socioeconomic_old_age_dependency_ratio', 'socioeconomic_average_age', 'socioeconomic_greying_index', 'socioeconomic_share_with_migration_background', 'socioeconomic_share_foreign_nationals', 'socioeconomic_share_foreign_eu_nationals', 'socioeconomic_share_foreign_non_eu_nationals', 'socioeconomic_gender_distribution', 'socioeconomic_total_fertility_rate', 'socioeconomic_unemployment_rate_age_15_to_65', 'infrastructure_count_education_within0.05km', 'infrastructure_count_hospitals_within0.05km', 'infrastructure_count_shops_within0.05km', 'infrastructure_count_industry_within0.05km', 'infrastructure_count_hotels_within0.05km', 'infrastructure_count_education_withi

### Let's check which features we have contstant for one segment over time, so we don't need to aggregate them futher

In [34]:
import pandas as pd

df = strava_berlin_data
grp = df.groupby("counter_name", sort=False)

summary = []
for col in df.columns:
    if col == "counter_name":
        continue
    nunique = grp[col].nunique(dropna=True)
    varying = nunique.gt(1)
    summary.append({
        "column": col,
        "segments_total": len(nunique),
        "segments_varying": int(varying.sum()),
        "max_unique_within_any_segment": int(nunique.max()),
    })

summary_df = pd.DataFrame(summary).sort_values("segments_varying", ascending=True)
summary_df.head(10)  # view top constant columns


Unnamed: 0,column,segments_total,segments_varying,max_unique_within_any_segment
3,latitude,4958,0,1
4,longitude,4958,0,1
5,geometry,4958,0,1
27,infrastructure_count_hotels_within0.05km,4958,0,1
28,infrastructure_count_education_within0.1km,4958,0,1
29,infrastructure_count_hospitals_within0.1km,4958,0,1
30,infrastructure_count_shops_within0.1km,4958,0,1
31,infrastructure_count_industry_within0.1km,4958,0,1
24,infrastructure_count_hospitals_within0.05km,4958,0,1
25,infrastructure_count_shops_within0.05km,4958,0,1


In [39]:
import numpy as np

# tag dtypes
col_dtype = strava_berlin_data.dtypes
summary_df["dtype_bucket"] = summary_df["column"].map(
    lambda c: "bool" if col_dtype[c].name == "bool"
    else "numeric" if np.issubdtype(col_dtype[c], np.number)
    else "categorical"
)

# constant/varying summary
group_stats = (
    summary_df
    .groupby("group")
    .agg(
        total_cols=("column", "count"),
        constant_cols=("segments_varying", lambda s: (s == 0).sum()),
        varying_cols=("segments_varying", lambda s: (s > 0).sum()),
    )
)

# dtype counts per group
dtype_counts = (
    summary_df
    .groupby(["group", "dtype_bucket"])
    .size()
    .unstack(fill_value=0)
)

# merge everything into one table
group_stats = group_stats.join(dtype_counts, how="left")
group_stats["percent_constant"] = (
    group_stats["constant_cols"] / group_stats["total_cols"] * 100
).round(1)

display(group_stats.sort_values("percent_constant", ascending=False))

print("\nOther group details:")
display(summary_df[summary_df["group"] == "other"])


Unnamed: 0_level_0,total_cols,constant_cols,varying_cols,bool,categorical,numeric,percent_constant
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
connectivity,7,7,0,2,0,5,100.0
infrastructure,58,58,0,1,10,47,100.0
other,14,5,9,5,6,3,35.7
motorized,12,0,12,0,0,12,0.0
socioeconomic,17,0,17,0,0,17,0.0
strava,19,0,19,0,1,18,0.0
weather,9,0,9,0,0,9,0.0



Other group details:


Unnamed: 0,column,segments_total,segments_varying,max_unique_within_any_segment,group,dtype_bucket
3,latitude,4958,0,1,other,numeric
4,longitude,4958,0,1,other,numeric
5,geometry,4958,0,1,other,categorical
123,street_name,4958,0,1,other,categorical
135,is_shortterm,4958,0,1,other,bool
134,is_count_missing,4958,30,2,other,bool
1,count,4958,34,1684,other,numeric
2,year,4958,4958,5,other,categorical
0,date,4958,4958,1826,other,categorical
121,is_publicholiday,4958,4958,2,other,bool


### Result: All connectivity and infrastructure columns are constant per segment. Socioeconomic, Motorized and weather columns vary, so we need to aggregate them.

- Connectivity (7/7 constant, 2 bool, 5 numeric): treat as static attributes per segment; just carry a single value (e.g., first).
- Infrastructure (58/58 constant, 1 bool, 10 categorical, 47 numeric): fully static; keep one value per segment, no temporal aggregation needed.
**- Other (14 cols, 5 constant/9 varying; 5 bool/6 cat/3 num): mixed bag—decide column by column; reassign misfiled cols if any.**
- Motorized (12/12 varying, all numeric): fully time-varying; aggregate over your time buckets (sum for counts, mean for speeds).
- Socioeconomic (17/17 varying, numeric): varies across time in the data; aggregate over your time buckets (sum for counts, mean for speeds).
**- Strava (19/19 varying; 1 categorical, 18 numeric): counts/speeds should be summed/averaged per time bucket; handle the single categorical (strava_activity_type) via ????**
- Weather (9/9 varying, numeric): time-varying; aggregate with mean (or min/max if useful).



# Aggregation of Berlin Strava data 
1. Aggregation keys: counter_name (segment), year, month, weekday (to align with accidents).
2. Constant features stay as-is (no aggregation) since they don’t vary over time.

# For me this code runs 13 minutes :(

In [None]:
# Keys and minimal copy
df = strava_berlin_data.copy()
df.dropna(subset=["latitude", "longitude", "geometry", "street_name", "is_shortterm"], inplace=True)
df["date"] = pd.to_datetime(df["date"])
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["weekday"] = df["date"].dt.day_name()
keys = ["counter_name", "year", "month", "weekday"]

# Constant columns that don't vary over time for one segment, we don't need to aggregate them(from summary_df)
constant_cols = summary_df.loc[summary_df["segments_varying"] == 0, "column"].tolist()

# Time-varying columns
sum_cols = [c for c in [
    "count","strava_total_trip_count","strava_ride_count","strava_ebike_ride_count",
    "strava_total_people_count","strava_total_commute_trip_count","strava_total_leisure_trip_count",
    "strava_total_morning_trip_count","strava_total_midday_trip_count",
    "strava_total_evening_trip_count","strava_total_overnight_trip_count",
    "strava_total_male_people_count","strava_total_female_people_count",
    "strava_total_18_34_people_count","strava_total_35_54_people_count",
    "strava_total_55_64_people_count","strava_total_65_plus_people_count",
    "strava_total_unspecified_people_count",
    "motorized_vehicle_count_all_vehicles_6km","motorized_vehicle_count_cars_6km","motorized_vehicle_count_trucks_6km",
    "motorized_vehicle_count_all_vehicles","motorized_vehicle_count_cars","motorized_vehicle_count_trucks"
] if c in df.columns]

mean_cols = [c for c in [
    "strava_total_average_speed_meters_per_second",
    "motorized_avg_speed_all_vehicles_6km","motorized_avg_speed_cars_6km","motorized_avg_speed_trucks_6km",
    "motorized_avg_speed_all_vehicles","motorized_avg_speed_cars","motorized_avg_speed_trucks",
    "infrastructure_distance_citycenter_km",
] + [c for c in df.columns if c.startswith("weather_")]
  + [c for c in df.columns if c.startswith("socioeconomic_")]
  if c in df.columns]

# Only varying categorical
cat_cols = [c for c in ["strava_activity_type"] if c in df.columns]

# Keep just the columns we need
vary_cols = keys + sum_cols + mean_cols + cat_cols
df_var = df[vary_cols]
df_const = df[["counter_name"] + constant_cols].drop_duplicates("counter_name")

# Cast keys to category to speed up groupby and reduce memory
for k in keys:
    df_var[k] = df_var[k].astype("category")

def fast_mode(s):
    vc = s.value_counts(dropna=True)
    return vc.index[0] if not vc.empty else pd.NA

agg_map = {**{c: "sum" for c in sum_cols},
           **{c: "mean" for c in mean_cols},
           **{c: fast_mode for c in cat_cols}}

# Group with observed=True to avoid cartesian combos of unused categories
agg_segment_ymw = (
    df_var
    .groupby(keys, sort=False, observed=True)
    .agg(agg_map)
    .reset_index()
)

# Rename aggregated columns with prefixes
rename_map = {}
rename_map.update({c: f"sum_{c}" for c in sum_cols if c in agg_segment_ymw.columns})
rename_map.update({c: f"mean_{c}" for c in mean_cols if c in agg_segment_ymw.columns})
rename_map.update({c: f"mode_{c}" for c in cat_cols if c in agg_segment_ymw.columns})
agg_segment_ymw = agg_segment_ymw.rename(columns=rename_map)

final_agg = agg_segment_ymw.merge(df_const, on="counter_name", how="left")
print(final_agg.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_var[k] = df_var[k].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_var[k] = df_var[k].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_var[k] = df_var[k].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

(2082360, 133)


In [None]:
final_agg.head()
# after the rename block
agg_segment_ymw = agg_segment_ymw.rename(columns=rename_map)

# merge and set the final name you’ll inspect
final_df = agg_segment_ymw.merge(df_const, on="counter_name", how="left")

# quick check
print([c for c in final_df.columns if c.startswith(("sum_", "mean_", "mode_"))][:10])
print(final_df.shape)


Unnamed: 0,counter_name,year,month,weekday,count,strava_total_trip_count,strava_ride_count,strava_ebike_ride_count,strava_total_people_count,strava_total_commute_trip_count,strava_total_leisure_trip_count,strava_total_morning_trip_count,strava_total_midday_trip_count,strava_total_evening_trip_count,strava_total_overnight_trip_count,strava_total_male_people_count,strava_total_female_people_count,strava_total_18_34_people_count,strava_total_35_54_people_count,strava_total_55_64_people_count,strava_total_65_plus_people_count,strava_total_unspecified_people_count,motorized_vehicle_count_all_vehicles_6km,motorized_vehicle_count_cars_6km,motorized_vehicle_count_trucks_6km,motorized_vehicle_count_all_vehicles,motorized_vehicle_count_cars,motorized_vehicle_count_trucks,strava_total_average_speed_meters_per_second,motorized_avg_speed_all_vehicles_6km,motorized_avg_speed_cars_6km,motorized_avg_speed_trucks_6km,motorized_avg_speed_all_vehicles,motorized_avg_speed_cars,motorized_avg_speed_trucks,infrastructure_distance_citycenter_km_x,weather_temp_avg,weather_temp_min,weather_temp_max,weather_precipitation,weather_snowfall,weather_wind_speed_avg,weather_wind_speed_gust,weather_pressure,weather_sunshine_duration,socioeconomic_total_population,socioeconomic_share_residents_5plus_years_same_address,socioeconomic_net_migration_per_100,socioeconomic_migration_volume_per_100,socioeconomic_share_under_18,...,infrastructure_bicyclelane_type,infrastructure_count_hotels_within0.5km,infrastructure_count_industry_within0.5km,infrastructure_count_shops_within0.5km,infrastructure_count_hospitals_within0.5km,infrastructure_count_education_within0.5km,infrastructure_count_hotels_within0.25km,infrastructure_count_industry_within0.25km,infrastructure_count_shops_within0.25km,infrastructure_count_hospitals_within0.25km,infrastructure_count_education_within0.25km,infrastructure_count_hotels_within0.1km,infrastructure_sum_fla_percent,infrastructure_str_flges_percent,infrastructure_arable_land_percent,infrastructure_horticulture_percent,infrastructure_cyclability,infrastructure_cyclability_commute,infrastructure_cyclability_touring,infrastructure_groesse,infrastructure_baustelle_percent,infrastructure_brach1_percent,infrastructure_brach2_percent,infrastructure_brach3_percent,connectivity_degree,connectivity_closeness,connectivity_betweenness,infrastructure_is_within_cyclingroute,infrastructure_flaeche_gross_percent,infrastructure_residential_use_percent,infrastructure_weekend_house_area_percent,infrastructure_forest_area_percent,infrastructure_traffic_area_percent,infrastructure_waste_disposal_percent,infrastructure_city_square_percent,infrastructure_park_area_percent,infrastructure_misch_percent,infrastructure_allotment_gardens_percent,infrastructure_kerngebiet_percent,infrastructure_grassland_percent,infrastructure_commercial_area_percent,infrastructure_water_bodies_percent,infrastructure_public_facilities_percent,infrastructure_cemetery_percent,street_name,connectivity_pagerank,connectivity_is_cycling_main_network,connectivity_clustering,is_shortterm,connectivity_is_cycling_minor_network
0,streetsegment_3572,2019,1,Tuesday,939.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43647.5,40904.75,2742.75,59787.97341,56635.105134,3152.818948,0.0,45.260417,46.35625,32.452083,45.771685,46.455645,32.430798,11.894904,2.1,-0.54,4.46,2.88,2.0,24.12,56.24,1006.52,98.4,7250.0,67.1,2.2,18.5,17.6,...,no bicycle lane,0,0,3,0,0,0,0,1,0,0,0,87.750202,12.519863,0.0,0.0,infrastructure_cyclability_isunknown,infrastructure_cyclability_commuteunknown,infrastructure_cyclability_touringunknown,5186910,0.0,10.155698,0.0,0.0,4.0,0.032436,0.023557,True,48.722318,48.722318,2.454209,4.211928,1.701624,0.0,0.0,8.693872,1.345265,2.514189,0.0,0.572873,3.06619,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True
1,streetsegment_3572,2019,1,Wednesday,1055.0,5.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,48056.5,44799.5,3256.75,64771.950968,60740.175115,4031.501957,1.058,44.947917,46.052083,33.652083,45.113291,45.884082,33.561315,11.894904,0.44,-2.02,2.3,1.06,0.0,21.32,54.64,1006.68,114.0,7250.0,67.1,2.2,18.5,17.6,...,no bicycle lane,0,0,3,0,0,0,0,1,0,0,0,87.750202,12.519863,0.0,0.0,infrastructure_cyclability_isunknown,infrastructure_cyclability_commuteunknown,infrastructure_cyclability_touringunknown,5186910,0.0,10.155698,0.0,0.0,4.0,0.032436,0.023557,True,48.722318,48.722318,2.454209,4.211928,1.701624,0.0,0.0,8.693872,1.345265,2.514189,0.0,0.572873,3.06619,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True
2,streetsegment_3572,2019,1,Thursday,1180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49646.5,46393.0,3254.25,66929.695825,62920.865118,4008.766533,0.0,44.289583,45.327083,33.922917,45.523875,46.276095,33.990308,11.894904,-0.04,-1.92,2.14,1.16,0.0,14.54,36.38,1011.36,63.6,7250.0,67.1,2.2,18.5,17.6,...,no bicycle lane,0,0,3,0,0,0,0,1,0,0,0,87.750202,12.519863,0.0,0.0,infrastructure_cyclability_isunknown,infrastructure_cyclability_commuteunknown,infrastructure_cyclability_touringunknown,5186910,0.0,10.155698,0.0,0.0,4.0,0.032436,0.023557,True,48.722318,48.722318,2.454209,4.211928,1.701624,0.0,0.0,8.693872,1.345265,2.514189,0.0,0.572873,3.06619,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True
3,streetsegment_3572,2019,1,Friday,735.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39097.0,36684.5,2412.25,52944.066952,49995.958866,2948.894316,0.0,45.747396,46.929688,34.190104,46.532172,47.279617,34.265049,11.894904,-0.175,-2.35,2.075,0.6,0.0,16.225,43.725,1016.425,130.5,7250.0,67.1,2.2,18.5,17.6,...,no bicycle lane,0,0,3,0,0,0,0,1,0,0,0,87.750202,12.519863,0.0,0.0,infrastructure_cyclability_isunknown,infrastructure_cyclability_commuteunknown,infrastructure_cyclability_touringunknown,5186910,0.0,10.155698,0.0,0.0,4.0,0.032436,0.023557,True,48.722318,48.722318,2.454209,4.211928,1.701624,0.0,0.0,8.693872,1.345265,2.514189,0.0,0.572873,3.06619,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True
4,streetsegment_3572,2019,1,Saturday,461.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30726.0,29063.5,1662.5,40930.444237,39061.925883,1868.406794,0.0,46.203125,47.260417,32.963542,46.852041,47.515043,32.76079,11.894904,2.675,-0.475,5.0,2.65,2.5,16.1,39.175,1011.0,94.5,7250.0,67.1,2.2,18.5,17.6,...,no bicycle lane,0,0,3,0,0,0,0,1,0,0,0,87.750202,12.519863,0.0,0.0,infrastructure_cyclability_isunknown,infrastructure_cyclability_commuteunknown,infrastructure_cyclability_touringunknown,5186910,0.0,10.155698,0.0,0.0,4.0,0.032436,0.023557,True,48.722318,48.722318,2.454209,4.211928,1.701624,0.0,0.0,8.693872,1.345265,2.514189,0.0,0.572873,3.06619,1.371846,1.807918,1.132279,ALB,0.000178,False,0.333333,False,True


In [50]:
strava_berlin_data[strava_berlin_data['counter_name'] == 'streetsegment_3572'][["date", "count", "strava_total_trip_count", "strava_ride_count", "strava_total_people_count"]]

Unnamed: 0,date,count,strava_total_trip_count,strava_ride_count,strava_total_people_count
0,2019-01-01,93.0,0.0,0.0,0.0
1,2019-01-02,123.0,0.0,0.0,0.0
2,2019-01-03,151.0,0.0,0.0,0.0
3,2019-01-04,149.0,0.0,0.0,0.0
4,2019-01-05,146.0,0.0,0.0,0.0
...,...,...,...,...,...
1821,2023-12-27,272.0,10.0,10.0,5.0
1822,2023-12-28,281.0,10.0,5.0,10.0
1823,2023-12-29,164.0,5.0,5.0,5.0
1824,2023-12-30,204.0,5.0,5.0,5.0
