
# Traffic Accident Analysis — Data Cleaning, EDA & Hotspot Visualization

This notebook performs data loading, full data cleaning, exploratory data analysis (EDA), and hotspot visualization for a traffic accident dataset.
It focuses on identifying patterns related to **road condition**, **weather**, and **time of day**, and visualizes accident hotspots and contributing factors.

**How to use**
1. Place your accident dataset CSV (for example, `US_Accidents_Dec20.csv` or `accidents.csv`) in the same folder as this notebook.
2. If no local file is found, the notebook falls back to downloading a small public **weather** sample (not accident data) so that the cells can still run as a demonstration. For a real analysis, supply the accident CSV yourself: if the dataset requires Kaggle authentication, upload it to the notebook environment or mount Google Drive in Colab.
3. Run cells step-by-step in Jupyter, VS Code (Jupyter), or Google Colab.

**Outputs**
- Cleaned dataset saved as `accidents_cleaned.csv`
- EDA plots (time-of-day, weather, road condition distributions)
- Spatial hotspot heatmap saved as `accident_hotspot_map.html` (requires folium and latitude/longitude columns in the dataset)


In [None]:

# === Setup ===
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Optional packages (uncomment to install in Colab)
# !pip install folium
# !pip install geopandas

import warnings
# NOTE(review): with no category argument this filter ignores every warning
# raised after this point, including deprecation and chained-assignment
# warnings from the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# Let DataFrame displays show up to 200 columns before truncating.
pd.set_option('display.max_columns', 200)
# Use seaborn's 'whitegrid' style for the plots drawn in later cells.
sns.set(style='whitegrid')
%matplotlib inline


In [None]:

# === Load dataset (local or attempt known sources) ===
# File names probed, in this order, relative to the current working directory.
local_candidates = [
    "US_Accidents_Dec20.csv",
    "US_Accidents_Dec21.csv",
    "US_Accidents.csv",
    "accidents.csv",
    "accidents_2016_2020.csv"
]

# `df` stays None until one candidate has been read successfully.
df = None
for p in local_candidates:
    if not os.path.exists(p):
        continue
    try:
        df = pd.read_csv(p)
    except Exception as e:
        # An unreadable file is reported and the next candidate is tried.
        print(f"Found file {p} but failed to read: {e}")
    else:
        print(f"Loaded local file: {p}")
        break

if df is None:
    print("No local file found. Please upload your CSV to the environment or mount Google Drive.")
    # Demonstration fallback: this URL is a weather table, not accident data.
    sample_url = "https://raw.githubusercontent.com/plotly/datasets/master/2016-weather-data-seattle.csv"
    try:
        print("Attempting to download a small sample (not actual accidents data) for demonstration...")
        df = pd.read_csv(sample_url)
        print("Downloaded sample dataset (weather sample). NOTE: Replace with actual accident CSV for real analysis.")
    except Exception as e:
        # Without any data the later cells cannot run, so stop here.
        raise RuntimeError("No dataset available. Please upload your accident CSV file.") from e

print('\nDataset shape:', df.shape)
df.head()


In [None]:

# === Data Cleaning (adaptive) ===
# Column roles are detected by case-insensitive substring matching on column
# names. This cell modifies and rebinds the global `df`, adds the derived
# columns hour / day_of_week / month / weather_simple / road_simple, and writes
# the result to accidents_cleaned.csv.

# Columns created by this cell; excluded from detection below so that
# re-running the cell never treats its own output as a source column.
DERIVED_COLS = ('weather_simple', 'road_simple')


def _normalize_text(series):
    """Return `series` lower-cased and stripped, keeping missing values as NaN.

    A bare ``astype(str)`` turns NaN into the literal string 'nan', which then
    appears as a category in plots and is invisible to ``isnull()``.
    """
    return series.astype(str).str.lower().str.strip().where(series.notna())


print("Columns found:", df.columns.tolist())

# Try to identify typical columns if present
possible_time_cols = [c for c in df.columns if 'time' in c.lower() or 'date' in c.lower()]
possible_lat = [c for c in df.columns if 'lat' in c.lower()]
possible_lng = [c for c in df.columns if 'lon' in c.lower() or 'lng' in c.lower()]

print("Detected time columns:", possible_time_cols)
print("Detected latitude columns:", possible_lat)
print("Detected longitude columns:", possible_lng)

# Parse Start_Time or equivalent into datetime, hour, day, month.
# A column with 'start' in its name wins; otherwise the first time/date-like
# column is used (None when there is no candidate at all).
start_like_cols = [c for c in possible_time_cols if 'start' in c.lower()]
time_col = next(iter(start_like_cols + possible_time_cols), None)

if time_col is not None:
    print("Using time column:", time_col)
    # Unparseable timestamps become NaT, so the derived columns are NaN there.
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    df['hour'] = df[time_col].dt.hour
    df['day_of_week'] = df[time_col].dt.day_name()
    df['month'] = df[time_col].dt.month_name()
else:
    print("No obvious time column found. You'll need to provide a time column for temporal analysis.")

# Standardize weather/road condition columns if present
weather_cols = [c for c in df.columns
                if 'weather' in c.lower() and c not in DERIVED_COLS]
road_cols = [c for c in df.columns
             if ('road' in c.lower() or 'surface' in c.lower()) and c not in DERIVED_COLS]

print("Weather-like cols:", weather_cols)
print("Road-like cols:", road_cols)

# Choose the weather column that holds category text: skip time-like columns
# (e.g. a weather timestamp) and prefer a name containing 'condition'.
weather_text_cols = [c for c in weather_cols if c not in possible_time_cols]
wc = next(
    (c for c in weather_text_cols if 'condition' in c.lower()),
    weather_text_cols[0] if weather_text_cols else None,
)

# Clean weather column: lower, strip, then fold exact-match synonyms into a
# few canonical labels. Values not listed in the mapping are kept unchanged.
if wc is not None:
    df[wc] = _normalize_text(df[wc])
    # simplify categories (basic rules)
    df['weather_simple'] = df[wc].replace({
        'clear':'clear', 'sunny':'clear', 'mostly clear':'clear',
        'rain':'rain', 'light rain':'rain', 'drizzle':'rain', 'heavy rain':'rain',
        'snow':'snow', 'light snow':'snow', 'sleet':'snow',
        'fog':'fog', 'haze':'fog', 'mist':'fog', 'overcast':'cloudy', 'cloudy':'cloudy'
    })
else:
    df['weather_simple'] = np.nan

# Clean road condition column (same approach as weather)
if road_cols:
    rc = road_cols[0]
    df[rc] = _normalize_text(df[rc])
    df['road_simple'] = df[rc].replace({
        'dry':'dry', 'wet':'wet', 'icy':'icy', 'snow':'snow', 'damp':'wet'
    })
else:
    df['road_simple'] = np.nan

# If lat/lng present, drop rows with missing coordinates for hotspot maps
lat_col = possible_lat[0] if possible_lat else None
lng_col = possible_lng[0] if possible_lng else None

if lat_col and lng_col:
    # Coerce before dropping: values that cannot be parsed as numbers become
    # NaN here and are then removed together with the genuinely missing ones.
    df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
    df[lng_col] = pd.to_numeric(df[lng_col], errors='coerce')
    df = df.dropna(subset=[lat_col, lng_col])

# Basic missing value report
print("\nMissing values summary:")
print(df.isnull().sum().sort_values(ascending=False).head(20))

# Save cleaned CSV
cleaned_path = "accidents_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print(f"Saved cleaned dataset to: {cleaned_path}")
df.head()


In [None]:

# === Exploratory Data Analysis (EDA) ===
# Every plot is guarded so the cell still runs when a column is absent or
# contains no usable values (the derived columns can exist but be all-NaN).


def _has_values(frame, col):
    """Return True when `col` exists in `frame` and has at least one non-null value."""
    return col in frame.columns and frame[col].notna().any()


# Time of day distribution
if _has_values(df, 'hour'):
    # `hour` is float whenever some timestamps failed to parse; cast the
    # non-null rows to int so the axis reads 0..23 instead of 0.0..23.0.
    hour_df = df.dropna(subset=['hour']).astype({'hour': int})
    plt.figure(figsize=(8,4))
    sns.countplot(x='hour', data=hour_df)
    plt.title('Accidents by Hour of Day')
    plt.xlabel('Hour')
    plt.ylabel('Count')
    plt.show()

# Day of week heatmap (hour vs day)
if _has_values(df, 'hour') and _has_values(df, 'day_of_week'):
    days_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    timed = df.dropna(subset=['hour', 'day_of_week'])
    # crosstab counts rows per (hour, day) directly instead of counting the
    # non-null values of whichever column happens to be first in the frame.
    pivot = pd.crosstab(timed['hour'].astype(int), timed['day_of_week'])
    # Reorder days; reindex fills weekdays absent from the data with zeros
    # where plain column selection would raise KeyError.
    pivot = pivot.reindex(columns=days_order, fill_value=0)
    plt.figure(figsize=(10,6))
    sns.heatmap(pivot, cmap='YlOrRd')
    plt.title('Accident counts: Hour vs Day of Week')
    plt.show()

# Weather and road condition plots (categories ordered by frequency)
if _has_values(df, 'weather_simple'):
    plt.figure(figsize=(8,4))
    order = df['weather_simple'].value_counts().index
    sns.countplot(y='weather_simple', data=df, order=order)
    plt.title('Accidents by Weather Condition (simplified)')
    plt.show()

if _has_values(df, 'road_simple'):
    plt.figure(figsize=(8,4))
    order = df['road_simple'].value_counts().index
    sns.countplot(y='road_simple', data=df, order=order)
    plt.title('Accidents by Road Surface Condition (simplified)')
    plt.show()

# Severity analysis if available (first column whose name contains 'severity')
sev_cols = [c for c in df.columns if 'severity' in c.lower()]
if sev_cols:
    sc = sev_cols[0]
    plt.figure(figsize=(6,4))
    sns.countplot(x=sc, data=df)
    plt.title('Accident Severity Distribution')
    plt.show()

# Top cities/states (if columns exist)
if 'City' in df.columns:
    top_cities = df['City'].value_counts().head(10)
    plt.figure(figsize=(10,4))
    sns.barplot(x=top_cities.values, y=top_cities.index)
    plt.title('Top 10 Cities by Accident Count')
    plt.show()

if 'State' in df.columns:
    top_states = df['State'].value_counts().head(10)
    plt.figure(figsize=(10,4))
    sns.barplot(x=top_states.values, y=top_states.index)
    plt.title('Top 10 States by Accident Count')
    plt.show()


In [None]:

# === Hotspot visualization (folium) ===
# Folium is optional; only run if lat/lng columns detected
try:
    import folium
    from folium.plugins import HeatMap
except Exception:
    # Best-effort: a failed import disables the map instead of stopping the notebook.
    folium = None
    print("Folium not installed; to enable map plotting, install folium in the environment.")

# Upper bound on the number of points drawn, for faster plotting.
MAX_HEATMAP_POINTS = 20000

# Lists of candidate coordinate columns, matched by name substring.
lat_col = [c for c in df.columns if 'lat' in c.lower()]
lng_col = [c for c in df.columns if 'lon' in c.lower() or 'lng' in c.lower()]

if lat_col and lng_col and folium is not None:
    latc = lat_col[0]; lngc = lng_col[0]
    coords = df[[latc,lngc]].dropna()
    if coords.empty:
        # With no points the mean centre would be NaN, so there is nothing to draw.
        print("No rows with valid coordinates. Skipping hotspot map.")
    else:
        # Size the sample from the rows left after dropna(): sizing it from
        # len(df) asks for more rows than exist whenever rows were dropped and
        # the frame is below the cap, which makes sample() raise ValueError.
        sample = coords.sample(min(len(coords), MAX_HEATMAP_POINTS), random_state=42)
        center = [sample[latc].mean(), sample[lngc].mean()]
        m = folium.Map(location=center, zoom_start=6)
        HeatMap(sample.values.tolist(), radius=8, blur=10).add_to(m)
        display(m)
        # Save map as HTML
        m.save('accident_hotspot_map.html')
        print("Saved hotspot map to accident_hotspot_map.html")
else:
    print("Latitude/Longitude columns not found or folium not installed. Skipping hotspot map.")


In [None]:

# === Summary & Next Steps ===
# Closing messages: completion notice, the output file name, and what the
# hotspot map needs. Printed one per line.
summary_lines = (
    "Accident data cleaning and EDA complete.",
    "Cleaned CSV: accidents_cleaned.csv",
    "If you want hotspot maps, ensure your dataset has latitude/longitude and install folium.",
)
print(*summary_lines, sep="\n")
