## Data Exploration

In [3]:
import pandas as pd

In [5]:
# Read the crash CSV and clean the header in one chained step:
# every column name is stripped of surrounding whitespace and lower-cased.
df = pd.read_csv("crash_data_2014_to_2024.csv").rename(
    columns=lambda name: name.strip().lower()
)

# Sanity check: show the cleaned header list.
print("Columns:", list(df.columns))

Columns: ['crash_id', 'crash_datetime', 'year', 'month', 'day', 'hour', 'weekday', 'date', 'region_id', 'county_id', 'lat', 'long', 'route', 'route_direction', 'roadway_type', 'exit_number', 'ramp_id', 'milepoint', 'location_description', 'crash_severity_id', 'light_condition_id', 'weather_condition_id', 'manner_collision_id', 'pavement_id', 'roadway_surf_condition_id', 'roadway_junct_feature_id', 'road_jurisdiction_id', 'work_zone_related_ynu', 'work_zone_worker_present_ynu', 'work_zone_id', 'work_zone_location_id', 'horizontal_alignment_id', 'vertical_alignment_id', 'roadway_contrib_circum_id', 'number_vehicles_involved', 'total_number_roadway_lanes', 'first_harmful_event_id', 'first_harmful_evt_loc_id', 'officer_department_code', 'motor_carrier_involved_yn', 'city', 'main_road_name', 'at_intersection_with', 'ni_distance_feet', 'ni_direction', 'ni_nearest_intersection', '2014-2014']


In [9]:
import pandas as pd

# === Load data ===
# The CSV is re-read here so this cell can be re-run on its own: the steps
# below overwrite columns of `df` in place.
df = pd.read_csv("crash_data_2014_to_2024.csv")

# === Normalize columns first ===
# Strip surrounding whitespace and lower-case every header so the name
# lookups below are not sensitive to formatting in the file.
df.columns = df.columns.str.strip().str.lower()
print("Normalized columns:", df.columns.tolist())

# === Target columns (double-check names) ===
# cols[0] is the timestamp; the remaining entries are the columns that get a
# frequency table further down in the notebook.
cols = [
    "crash_datetime",
    "light_condition_id",
    "weather_condition_id",
    "manner_collision_id",
    "roadway_surf_condition_id",
    "roadway_junct_feature_id",
]

# === Identify which columns are missing ===
missing = [c for c in cols if c not in df.columns]
if missing:
    print(f"⚠️ Missing columns: {missing}")
else:
    print("✅ All expected columns present.")

# === Convert datetime ===
if "crash_datetime" in df.columns:
    raw_datetime = df["crash_datetime"]
    df["crash_datetime"] = pd.to_datetime(raw_datetime, errors="coerce")

    # errors="coerce" replaces unparseable values with NaT instead of raising.
    # Count the values that were present but failed to parse so the loss is
    # visible: those rows get NaN for year/month below.
    n_unparsed = int((df["crash_datetime"].isna() & raw_datetime.notna()).sum())
    if n_unparsed:
        print(
            f"⚠️ {n_unparsed} crash_datetime values could not be parsed "
            "and were set to NaT"
        )

    # These overwrite any `year` / `month` columns that came with the file.
    df["year"] = df["crash_datetime"].dt.year
    df["month"] = df["crash_datetime"].dt.month

# === Yearly crash summary ===
if "year" in df.columns:
    # groupby drops NaN keys by default, so rows without a usable year are
    # not counted here.
    datetime_summary = (
        df.groupby("year")
        .size()
        .reset_index(name="crash_count")
        .sort_values("year")
    )
    print("\nCrash count by year:")
    print(datetime_summary.head(20))

# === Frequency summaries ===
def freq_summary(col, data=None):
    """Print and return a frequency table for one column.

    Parameters
    ----------
    col : str
        Name of the column to summarize.
    data : pandas.DataFrame, optional
        Frame to read ``col`` from. When omitted, the notebook-level ``df``
        is used, which matches the original single-argument behavior.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value with columns ``<col>_value``, ``count``
        and ``percent`` (share of all rows, rounded to 2 decimals), sorted
        ascending by value. Missing values are counted as their own row.
        An empty DataFrame is returned when ``col`` is not in the frame.
    """
    # Fall back to the global frame only when no explicit frame is supplied,
    # so existing calls such as freq_summary("weather_condition_id") still work.
    frame = df if data is None else data

    if col not in frame.columns:
        print(f"⚠️ Column not found: {col}")
        return pd.DataFrame()

    # The value column gets a suffix so it cannot collide with "count".
    value_col = col + "_value"
    summary = (
        frame[col]
        .value_counts(dropna=False)  # keep NaN as its own category
        .rename_axis(value_col)
        .reset_index(name="count")
        .sort_values(by=value_col, ascending=True)
    )
    total = summary["count"].sum()
    summary["percent"] = (summary["count"] / total * 100).round(2)

    print(f"\n--- {col} ---")
    print(summary)
    return summary


from pathlib import Path

# Directory that receives one CSV per summarized column.
summary_dir = Path("summary")

# Build a frequency table for every column after the first entry of `cols`
# (the timestamp); columns that produced an empty table are left out.
summaries = {}
for col in cols[1:]:
    table = freq_summary(col)
    if not table.empty:
        summaries[col] = table

# === Save summaries ===
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
summary_dir.mkdir(parents=True, exist_ok=True)
for name, table in summaries.items():
    table.to_csv(summary_dir / f"summary_{name}.csv", index=False)

# Report the folder the files were actually written to.
print(f"\n✅ Summary CSVs saved to {summary_dir}/")


Normalized columns: ['crash_id', 'crash_datetime', 'year', 'month', 'day', 'hour', 'weekday', 'date', 'region_id', 'county_id', 'lat', 'long', 'route', 'route_direction', 'roadway_type', 'exit_number', 'ramp_id', 'milepoint', 'location_description', 'crash_severity_id', 'light_condition_id', 'weather_condition_id', 'manner_collision_id', 'pavement_id', 'roadway_surf_condition_id', 'roadway_junct_feature_id', 'road_jurisdiction_id', 'work_zone_related_ynu', 'work_zone_worker_present_ynu', 'work_zone_id', 'work_zone_location_id', 'horizontal_alignment_id', 'vertical_alignment_id', 'roadway_contrib_circum_id', 'number_vehicles_involved', 'total_number_roadway_lanes', 'first_harmful_event_id', 'first_harmful_evt_loc_id', 'officer_department_code', 'motor_carrier_involved_yn', 'city', 'main_road_name', 'at_intersection_with', 'ni_distance_feet', 'ni_direction', 'ni_nearest_intersection', '2014-2014']
✅ All expected columns present.

Crash count by year:
    year  crash_count
0   2014       