# Exploring Bike Accident Data for Berlin

A first visualization and exploration of Destatis accident data, filtered for bicycle accidents.

## About the Data Set
* source: [Destatis Unfallatlas](https://unfallatlas.statistikportal.de/)
* [Licence](http://www.govdata.de/dl-de/by-2-0)
* Time span covered: 2016–2024

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

: 

In [None]:
# Gather every CSV file from the local data directory in a stable
# (alphabetical) order.
csv_dir = Path("data/csv")
csv_files = sorted(csv_dir.glob("*.csv"))

# Stop right away when nothing was found; the resolved path in the message
# makes a wrong working directory easy to spot.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {csv_dir.resolve()}")

# Read each semicolon-separated file and tag its rows with the name of the
# file they came from, so records stay traceable after concatenation.
# NOTE(review): only the separator is configured here; confirm that the
# files' decimal mark and encoding match pandas' defaults.
dfs = []
for fp in csv_files:
    df = pd.read_csv(fp, sep=";", low_memory=False).assign(source_file=fp.name)
    dfs.append(df)

# Stack all per-file frames into a single frame with a fresh 0..n-1 index.
df_all = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(csv_files)} files -> combined shape: {df_all.shape}")

# List the combined column names for orientation.
print("Columns:", df_all.columns.tolist())

# Preview the first rows (rendered as the cell output).
df_all.head()

In [None]:
# Step 1: keep only rows whose 'IstRad' flag equals 1, i.e. accidents that
# involved a bicycle. The copy decouples the result from df_all.
is_bicycle = df_all['IstRad'].eq(1)
df_bike = df_all.loc[is_bicycle].copy()
print(f"Filtered to bicycle accidents -> shape: {df_bike.shape}")

# Step 2: of those, keep only rows with 'ULAND' equal to 11 (Berlin).
in_berlin = df_bike['ULAND'].eq(11)
df_bike_berlin = df_bike.loc[in_berlin].copy()
print(f"Filtered to bicycle accidents in Berlin -> shape: {df_bike_berlin.shape}")

# Preview the first rows (rendered as the cell output).
df_bike_berlin.head()

In [None]:
# Bar chart: number of Berlin bicycle accidents per weekday ('UWOCHENTAG').
# Counts are ordered by weekday code rather than by frequency.
weekday_counts = df_bike_berlin['UWOCHENTAG'].value_counts(sort=False).sort_index()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig_weekday, ax_weekday = plt.subplots(figsize=(5, 3))
weekday_counts.plot.bar(ax=ax_weekday, rot=0)  # rot=0 keeps tick labels horizontal
ax_weekday.set_xlabel('Weekday (1=Sunday, 7=Saturday)')
ax_weekday.set_ylabel('Number of Accidents')
ax_weekday.set_title('Bicycle Accidents by Weekday')
fig_weekday.tight_layout()
plt.show()

In [None]:
# Bar chart: number of Berlin bicycle accidents per hour of day ('USTUNDE').
# Counts are ordered by hour value rather than by frequency.
hour_counts = df_bike_berlin['USTUNDE'].value_counts(sort=False).sort_index()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig_hour, ax_hour = plt.subplots(figsize=(8, 3))
hour_counts.plot.bar(ax=ax_hour, rot=0)  # rot=0 keeps tick labels horizontal
ax_hour.set_xlabel('Hour of Day')
ax_hour.set_ylabel('Number of Accidents')
ax_hour.set_title('Bicycle Accidents by Hour of Day')
fig_hour.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: months on the x-axis, one bar series per year.

# Count accidents per (month, year) pair, then pivot the years into columns;
# combinations that never occur are filled with 0.
accidents_per_month_year = df_bike_berlin.groupby(['UMONAT', 'UJAHR']).size()
month_year_counts = accidents_per_month_year.unstack(fill_value=0)
month_year_counts = month_year_counts.sort_index()

# Short month labels used in place of the numeric month index.
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']


def _month_label(m):
    """Return the short month name for 1..12, else the value as a string."""
    if 1 <= m <= 12:
        return month_names[m-1]
    return str(m)


month_year_counts.index = month_year_counts.index.map(_month_label)

# DataFrame.plot creates its own figure; keep the returned axes for labelling.
ax_month = month_year_counts.plot.bar(figsize=(10, 5))
ax_month.set_xlabel('Month')
ax_month.set_ylabel('Number of Accidents')
ax_month.set_title('Bicycle Accidents by Month (years as series)')
# Anchor the legend outside the plotting area so it does not cover the bars.
ax_month.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
ax_month.figure.tight_layout()
plt.show()
  