# Road Accident Exploration

Explore road accident data and their distribution across municipalities.

In [None]:
def load_data(path="../../data/raw/features/46241-01-04-5-road-accidents.csv"):
    """Load the road accident table and derive an 8-character AGS key.

    The file is read as a semicolon-separated table. Its first 7 and last 4
    lines are skipped and fixed column names are assigned instead of reading
    a header row. Cells containing "-" are treated as missing values.

    Args:
        path: Location of the CSV file to read.

    Returns:
        DataFrame with ``MU_ID``, ``MU_name``, the five accident columns and
        an ``AGS`` column (``MU_ID`` right-padded with "0" to 8 characters).
    """
    import pandas as pd

    accident_columns = [
        "accident_count",
        "injury_accidents",
        "property_damage_accidents",
        "fatalities",
        "injured",
    ]

    df = pd.read_csv(
        path,
        delimiter=";",
        na_values=["-"],
        skiprows=7,
        skipfooter=4,  # skipfooter is only supported by the python engine
        engine="python",
        header=None,
        names=["MU_ID", "MU_name"] + accident_columns,
        # Read the IDs as text: numeric inference would drop leading zeros and
        # make the ``.str`` accessor below fail on an all-numeric ID column.
        dtype={"MU_ID": str},
    )
    # add AGS column by right-padding MU_ID with zeros to 8 characters (adds trailing zeros if necessary)
    df["AGS"] = df["MU_ID"].str.ljust(8, "0")

    return df

In [None]:
# Load the accident table from the default path and preview its first rows
raw_df = load_data()
raw_df.head()

The data contain 5 columns of interest:
- `accident_count` (accidents, total): Total number of road accidents in the municipality.
- `injury_accidents` (accidents with injuries): Number of road accidents that resulted in injuries.
- `property_damage_accidents` (accidents with property damage): Number of road accidents that resulted in property damage only.
- `fatalities`: Number of fatalities resulting from road accidents.
- `injured`: Number of people injured in road accidents.

In [None]:
# Load the municipality table and keep only accident rows whose AGS appears in it
from geoscore_de.data_flow.features.municipality import MunicipalityFeature

municipality_path = "../../data/raw/municipalities_2022.csv"
muni_df = MunicipalityFeature(municipality_path).load()

is_known_municipality = raw_df["AGS"].isin(muni_df["AGS"])
filtered_df = raw_df[is_known_municipality]

In [None]:
import plotnine as gg

# Histogram of total accident counts (30 bins) with a log-scaled y axis
base_plot = gg.ggplot(filtered_df, gg.aes(x="accident_count"))
axis_labels = gg.labs(title="Distribution of Total Accidents", x="Total Accidents", y="Count")
total_accidents_hist = base_plot + gg.geom_histogram(bins=30) + axis_labels + gg.scale_y_log10()

# Leave the plot as the final expression so the notebook renders it
total_accidents_hist

Municipalities in Germany differ greatly in size, so we should calculate something like accidents per capita.

In [None]:
# Display the municipality table; its AGS and Persons columns are used for the per-capita merge
muni_df

In [None]:
# merge muni_df with filtered_df to get Persons column
# (left join: accident rows without a matching AGS keep a missing Persons value)
merged_df = filtered_df.merge(muni_df[["AGS", "Persons"]], on="AGS", how="left")

# define accident columns to weight
accident_columns = [
    "accident_count",
    "injury_accidents",
    "property_damage_accidents",
    "fatalities",
    "injured",
]

# A per-capita rate is undefined without inhabitants: mask non-positive
# populations so the division yields NaN instead of inf.
# NOTE(review): assumes Persons is numeric -- confirm against the municipality loader.
population = merged_df["Persons"].where(merged_df["Persons"] > 0)

# weight all accident columns by Persons (per capita)
for col in accident_columns:
    merged_df[f"{col}_per_capita"] = merged_df[col] / population

merged_df

In [None]:
import plotnine as gg


def get_dist_plot(df, col):
    """Build a 40-bin histogram of column ``col`` from ``df``.

    Args:
        df: Data passed to the plot.
        col: Name of the column mapped to the x axis; also used in the title
            and as the x-axis label.

    Returns:
        The plotnine plot object (not rendered here).
    """
    histogram_layer = gg.geom_histogram(bins=40)
    plot_labels = gg.labs(title=f"Distribution of {col}", x=col, y="Count")
    canvas = gg.ggplot(df, gg.aes(x=col))
    return canvas + histogram_layer + plot_labels

In [None]:
# Render one distribution plot for each per-capita accident column
per_capita_columns = [f"{col}_per_capita" for col in accident_columns]
for per_capita_col in per_capita_columns:
    display(get_dist_plot(merged_df, per_capita_col))

These per-capita distributions look more like a beta distribution.