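This notebook ranks the riskiest location-and-hour combinations (state, city, and hour of day) across the country. Each combination gets a risk score that blends its share of severity-4 crashes (weight 0.6) with its normalized median crash duration (weight 0.4), so the top of the list shows where and when crashes tend to be both severe and slow to clear.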

In [2]:
import pandas as pd
import numpy as np

In [None]:
# Read the processed accident records into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/processed/accidents.csv")

In [4]:
# Rank (state, city, hour) groups by a blended risk score:
#   risk_score = SEVERITY_WEIGHT * severe_rate + DURATION_WEIGHT * duration_norm
# and show the TOP_N highest-scoring groups.

# --- Tunable parameters -------------------------------------------------
MAX_DURATION_MIN = 360   # durations are capped at this many minutes
SEVERE_LEVEL = 4         # `severity` value that counts as a severe crash
MIN_ACCIDENTS = 20       # groups with fewer accidents are not ranked
SEVERITY_WEIGHT = 0.6    # weight of the severe-crash share in the score
DURATION_WEIGHT = 0.4    # weight of the normalized median duration
TOP_N = 20               # number of rows to display

# errors="coerce" turns unparseable timestamps into NaT instead of raising.
df["start_time"] = pd.to_datetime(df["start_time"], errors="coerce")
df["end_time"] = pd.to_datetime(df["end_time"], errors="coerce")

# Crash duration in minutes, bounded to [0, MAX_DURATION_MIN]: negative
# durations (end before start) become 0 and long ones are capped.
# NOTE(review): the cap saturates the score -- groups whose median sits at the
# cap get duration_norm == 1.0 and can reach the top of the ranking with a
# severe_rate of 0. Confirm this is the intended behavior.
df["duration_min"] = (
    (df["end_time"] - df["start_time"]).dt.total_seconds() / 60
).clip(lower=0, upper=MAX_DURATION_MIN)

# Rows with a NaT start_time get a missing hour; groupby drops missing keys by
# default, so those rows are excluded from every group below.
df["hour"] = df["start_time"].dt.hour
df["is_severe"] = (df["severity"] == SEVERE_LEVEL).astype(int)

risk = (
    df.groupby(["state", "city", "hour"])
    .agg(
        n_accidents=("severity", "count"),  # counts non-null severity values
        severe_rate=("is_severe", "mean"),
        median_duration=("duration_min", "median"),
    )
    .reset_index()
)

# Take an explicit copy of the filtered rows: assigning new columns to a
# boolean-indexed slice is chained assignment and can trigger
# SettingWithCopyWarning.
risk = risk[risk["n_accidents"] >= MIN_ACCIDENTS].copy()

# Scale median durations to [0, 1] by the largest group median. If that
# maximum is 0, divide by 1 instead so the result is 0 rather than 0/0 = NaN.
longest_median = risk["median_duration"].max()
duration_scale = longest_median if longest_median != 0 else 1.0
risk["duration_norm"] = risk["median_duration"] / duration_scale

risk["risk_score"] = (
    SEVERITY_WEIGHT * risk["severe_rate"]
    + DURATION_WEIGHT * risk["duration_norm"]
)

top20 = risk.sort_values("risk_score", ascending=False).head(TOP_N)
top20


Unnamed: 0,state,city,hour,n_accidents,severe_rate,median_duration,duration_norm,risk_score
44430,tx,andrews,11,20,0.75,179.0,0.497222,0.648889
16998,fl,melbourne,7,26,0.192308,359.983333,0.999954,0.515366
17398,fl,ocala,0,20,0.1,360.0,1.0,0.46
1050,az,flagstaff,1,22,0.090909,360.0,1.0,0.454545
39903,pa,mercer,15,32,0.46875,146.566667,0.40713,0.444102
30030,nc,wilmington,16,25,0.32,194.166667,0.539352,0.407741
45015,tx,houston,5,84,0.011905,360.0,1.0,0.407143
29756,nc,raleigh,0,113,0.00885,360.0,1.0,0.40531
28359,mt,deer_lodge,18,20,0.0,360.0,1.0,0.4
22754,la,shreveport,13,23,0.0,360.0,1.0,0.4
