In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Directory with the processed datasets (relative path, resolved against the
# kernel's current working directory).
DATA_PROCESSED = Path("../data/processed")

# Load the merged parquet table that every later cell reads from `df`.
df = pd.read_parquet(DATA_PROCESSED.joinpath("city_day_merged.parquet"))

# Trailing expression: preview the first rows as the cell output.
df.head()


In [None]:
# Distribution overview: three side-by-side histograms built from columns of `df`.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

sns.histplot(df["AccidentCount"], bins=30, ax=axes[0])
axes[0].set_title("Daily Accident Count")

sns.histplot(df["AverageSeverity"], bins=20, ax=axes[1])
axes[1].set_title("Average Severity")

if "AccidentsPer100k" in df.columns:
    sns.histplot(df["AccidentsPer100k"], bins=30, ax=axes[2])
    axes[2].set_title("Accidents per 100k")
else:
    # The per-capita column is optional; without it the third panel would be
    # drawn as an empty framed box, so switch that axis off instead.
    axes[2].set_axis_off()

fig.tight_layout()
plt.show()


In [None]:
# Mean of AccidentCount within each Month value, flattened back to two columns
# so seaborn can address them by name.
acc_by_month = (
    df.groupby("Month")["AccidentCount"]
    .mean()
    .reset_index()
)

# Bar chart of the monthly means on an explicit Axes.
month_fig, month_ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=acc_by_month, x="Month", y="AccidentCount", ax=month_ax)
month_ax.set_title("Mean Daily Accident Count by Month")
plt.show()

# Trailing expression: mean AccidentCount per is_weekend value, shown as the
# cell output underneath the chart.
(
    df.groupby("is_weekend")["AccidentCount"]
    .mean()
)


In [None]:
# Summary statistics of AccidentCount grouped by the optional is_rainy column.
#
# IPython renders only a trailing top-level expression of a cell. A bare
# expression inside an `if` body is evaluated and then discarded, so the table
# is built conditionally and then named on the last line to make it visible.
rain_summary = (
    df.groupby("is_rainy")["AccidentCount"].describe()
    if "is_rainy" in df.columns
    else None  # column absent: the cell renders no output
)
rain_summary


In [None]:
# Columns we would like in the correlation matrix; some may be absent from `df`.
candidate_cols = [
    "AccidentCount",
    "AverageSeverity",
    "AccidentsPer100k",
    "Temperature", "Precipitation", "Snow",
    "WindSpeed", "Visibility",
]

# Keep only the columns actually present. Built from `candidate_cols` rather
# than by rebinding `num_cols` from itself, so the wish-list stays inspectable.
num_cols = [c for c in candidate_cols if c in df.columns]
corr = df[num_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    # Anchor the diverging colormap to the full correlation range so the
    # neutral colour means "no correlation" and the scale does not shift
    # with whatever min/max the data happens to contain.
    vmin=-1,
    vmax=1,
    center=0,
    ax=corr_ax,
)
corr_ax.set_title("Correlation Matrix")
plt.show()
