## Imports and Setup

In [28]:
import pandas as pd
import numpy as np
import os

# Make sure the folder that later cells write the summary CSVs into exists.
summary_dir = "../data/summary_stats"
os.makedirs(summary_dir, exist_ok=True)

## Summary Table

### Check Missing Values

In [24]:
# === Load the crash records and normalise the column names ===
df = pd.read_csv("../data/crash_data_2014_to_2024.csv")
df.columns = df.columns.str.strip().str.lower()

# === Step 1: require a value in every key variable ===
key_cols = [
    "county_id",
    "crash_severity_id",
    "light_condition_id",
    "weather_condition_id",
    "roadway_surf_condition_id",
]
rows_without_nan = df.dropna(subset=key_cols)

# === Step 2: drop rows that carry an "invalid" category code ===
# NOTE(review): this list treats 8 and 97 as invalid for every coded column,
# but the label maps defined further down give 8 a label for weather and road
# surface and give 97 the label "Other" for road surface. Confirm against the
# data dictionary whether those rows should really be removed here.
invalid_codes = [8, 89, 96, 97, 98, 99, 88, "U", "Unknown", "Not Provided", "Invalid"]
coded_cols = key_cols[1:]  # county_id is skipped

# Accumulate one mask over all coded columns, then filter once.
has_invalid_code = pd.Series(False, index=rows_without_nan.index)
for col in coded_cols:
    has_invalid_code = has_invalid_code | rows_without_nan[col].isin(invalid_codes)
df_clean = rows_without_nan[~has_invalid_code]

print(f"Original rows: {len(df)}")
print(f"Rows after full filtering: {len(df_clean)}")


Original rows: 272524
Rows after full filtering: 267888


### Create Summary CSV

In [25]:
# === County ID → name lookup for the five counties kept in the summary ===
county_map = {
    11: "Davis",
    35: "Salt Lake",
    49: "Utah",
    53: "Washington",
    57: "Weber",
}

# Label every crash with its county name. County IDs that are not keys of
# county_map become NaN, so dropping those rows keeps only the five counties.
df = (
    df.assign(county_name=df["county_id"].map(county_map))
      .dropna(subset=["county_name"])
)

# === Variable label mappings ===
# Each dict maps a numeric category code (key) to the label written to the
# summary table (value). make_summary() applies them with Series.map, so any
# code that is not a key here becomes NaN instead of receiving a label.

# Labels for crash_severity_id.
severity_map = {
    1: "No injury / PDO",
    2: "Possible injury",
    3: "Suspected Minor Injury",
    4: "Suspected Serious Injury",
    5: "Fatal"
}

# Labels for light_condition_id.
light_map = {
    1: "Daylight",
    2: "Dark - Lighted",
    3: "Dark - Not Lighted",
    4: "Dark - Unknown Lighting",
    5: "Dawn",
    6: "Dusk"
}

# Labels for weather_condition_id.
# NOTE(review): code 8 is labelled here, but the cleaning cell lists 8 in
# `invalid_codes` — confirm which treatment is intended.
weather_map = {
    1: "Clear",
    2: "Cloudy",
    3: "Rain",
    4: "Snowing",
    5: "Blowing Snow",
    6: "Sleet, Hail",
    7: "Fog, Smog",
    8: "Severe Crosswinds",
    9: "Blowing Sand / Dirt"
}

# Labels for roadway_surf_condition_id.
# NOTE(review): codes 8 and 97 are labelled here, but the cleaning cell lists
# both in `invalid_codes` — confirm which treatment is intended.
road_map = {
    1: "Dry",
    2: "Wet",
    3: "Snow",
    4: "Slush",
    5: "Ice / Frost",
    6: "Water",
    7: "Mud",
    8: "Sand / Dirt / Gravel",
    9: "Oil",
    10: "Dirt",
    11: "Gravel",
    12: "Sand",
    97: "Other"
}

# === Helper: county-level proportion table for one coded variable ===
def make_summary(df, var, mapping, var_label):
    """Return the per-county share of each category of ``var``.

    Parameters
    ----------
    df : DataFrame
        Must contain the column ``var`` and a ``county_name`` column.
    var : str
        Name of the coded column to summarise.
    mapping : dict
        Code -> label lookup applied to ``df[var]``.
    var_label : str
        Accepted for call-site readability; not used inside the function.

    Returns
    -------
    DataFrame
        One row per observed category with the columns ``Code``, ``Category``
        and one column per county. Each county column sums to 1.

    Notes
    -----
    Codes missing from ``mapping`` become NaN labels. With pandas' default
    groupby behaviour those rows are left out of the counts, so they do not
    enter the denominators and the final ``"Unknown"`` fill normally has
    nothing to fill.
    """
    # Swap the numeric codes for their labels on a two-column copy.
    labelled = df[["county_name", var]].assign(
        **{var: lambda frame: frame[var].map(mapping)}
    )

    # Rows = counties, columns = category labels, cells = crash counts.
    per_county = labelled.groupby(["county_name", var]).size().unstack(fill_value=0)

    # Divide each county row by its own total, then flip so categories are rows.
    shares = per_county.div(per_county.sum(axis=1), axis=0)
    table = shares.T.reset_index()

    # After the reset the first column holds the category labels.
    table = table.rename(columns={table.columns[0]: "Category"})

    # Recover the numeric code of each label by inverting the mapping.
    code_for_label = dict(zip(mapping.values(), mapping.keys()))
    table.insert(0, "Code", table["Category"].map(code_for_label))

    # Safeguard: any label that is still missing is shown as 'Unknown'.
    table["Category"] = table["Category"].fillna("Unknown")

    return table


# === Build one proportion table per variable, tagged with its display name ===
severity_summary = make_summary(
    df, "crash_severity_id", severity_map, "Crash Severity"
).assign(Variable="Crash Severity")
light_summary = make_summary(
    df, "light_condition_id", light_map, "Light Condition"
).assign(Variable="Light Condition")
weather_summary = make_summary(
    df, "weather_condition_id", weather_map, "Weather Condition"
).assign(Variable="Weather Condition")
road_summary = make_summary(
    df, "roadway_surf_condition_id", road_map, "Roadway Surface Condition"
).assign(Variable="Road Surface")

# === Stack the four tables into a single DataFrame ===
combined = pd.concat(
    [severity_summary, light_summary, weather_summary, road_summary],
    ignore_index=True,
)

# Put the identifying columns first; the county columns keep their order.
leading = ["Variable", "Code", "Category"]
cols = leading + [c for c in combined.columns if c not in leading]
combined = combined[cols]

# === Write the combined table to a single CSV ===
combined.to_csv("../data/summary_stats/summary_all_variables.csv", index=False)


### Add Standard Errors

In [27]:
# === Load the combined summary file ===
df = pd.read_csv("../data/summary_stats/summary_all_variables.csv")

# === Sample size behind each Salt Lake proportion ===
# Each "Salt Lake" proportion was computed within Salt Lake County, over the
# crashes whose code for that variable has a label. So n must be that count,
# taken separately for each variable. The previously hard-coded 267888 was the
# all-county "rows after full filtering" figure printed by the cleaning cell,
# which is neither a Salt Lake count nor the denominator of these proportions.
SALT_LAKE_COUNTY_ID = 35  # key of "Salt Lake" in county_map

# Display name used in the summary's "Variable" column -> raw column name.
variable_columns = {
    "Crash Severity": "crash_severity_id",
    "Light Condition": "light_condition_id",
    "Weather Condition": "weather_condition_id",
    "Road Surface": "roadway_surf_condition_id",
}

# Re-read the raw crashes so this cell does not depend on kernel state
# (the name `df` is reused above for the summary table).
crashes = pd.read_csv("../data/crash_data_2014_to_2024.csv")
crashes.columns = crashes.columns.str.strip().str.lower()
salt_lake = crashes[crashes["county_id"] == SALT_LAKE_COUNTY_ID]

# For each variable, count the Salt Lake crashes whose code appears in the
# summary table; those are the rows that entered the proportion denominators.
n_by_variable = {}
for variable, rows in df.groupby("Variable"):
    labelled_codes = rows["Code"].dropna()
    raw_column = variable_columns[variable]
    n_by_variable[variable] = int(salt_lake[raw_column].isin(labelled_codes).sum())

n_per_row = df["Variable"].map(n_by_variable)
if not (n_per_row > 0).all():
    raise ValueError(f"No Salt Lake observations for some variable: {n_by_variable}")

# === Compute binomial standard error for each proportion ===
# Formula: SE = sqrt(p * (1 - p) / n), with n specific to the row's variable
df["SE"] = np.sqrt(df["Salt Lake"] * (1 - df["Salt Lake"]) / n_per_row)

# === Optional: round for readability ===
df["Salt Lake"] = df["Salt Lake"].round(6)
df["SE"] = df["SE"].round(6)

# === Save new version ===
df.to_csv("../data/summary_stats/summary_all_variables_with_SE.csv", index=False)
