In [37]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import gradio as gr
from sklearn.preprocessing import LabelEncoder

In [38]:
# 2. Load Dataset
df = pd.read_csv("US_Accidents_March23.csv")  # Replace with actual file name
print("Data loaded. Shape:", df.shape)

Data loaded. Shape: (7728394, 46)


In [39]:
# 3. Initial Exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [40]:
df.describe()

Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
count,7728394.0,7728394.0,7728394.0,4325632.0,4325632.0,7728394.0,7564541.0,5729375.0,7554250.0,7587715.0,7551296.0,7157161.0,5524808.0
mean,2.212384,36.20119,-94.70255,36.26183,-95.72557,0.5618423,61.66329,58.25105,64.83104,29.53899,9.090376,7.68549,0.00840721
std,0.4875313,5.076079,17.39176,5.272905,18.10793,1.776811,19.01365,22.38983,22.82097,1.00619,2.688316,5.424983,0.1102246
min,1.0,24.5548,-124.6238,24.56601,-124.5457,0.0,-89.0,-89.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,33.39963,-117.2194,33.46207,-117.7543,0.0,49.0,43.0,48.0,29.37,10.0,4.6,0.0
50%,2.0,35.82397,-87.76662,36.18349,-88.02789,0.03,64.0,62.0,67.0,29.86,10.0,7.0,0.0
75%,2.0,40.08496,-80.35368,40.17892,-80.24709,0.464,76.0,75.0,84.0,30.03,10.0,10.4,0.0
max,4.0,49.0022,-67.11317,49.075,-67.10924,441.75,207.0,207.0,100.0,58.63,140.0,1087.0,36.47


In [41]:
df.isnull().sum()

ID                             0
Source                         0
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  3402762
End_Lng                  3402762
Distance(mi)                   0
Description                    5
Street                     10869
City                         253
County                         0
State                          0
Zipcode                     1915
Country                        0
Timezone                    7808
Airport_Code               22635
Weather_Timestamp         120228
Temperature(F)            163853
Wind_Chill(F)            1999019
Humidity(%)               174144
Pressure(in)              140679
Visibility(mi)            177098
Wind_Direction            175206
Wind_Speed(mph)           571233
Precipitation(in)        2203586
Weather_Condition         173459
Amenity                        0
Bump      

In [42]:
# 4. Drop irrelevant or mostly null columns
df = df.drop(columns=['ID', 'Source', 'End_Lat', 'End_Lng'], errors='ignore')

In [43]:
# 5. Handle missing values
df['Distance(mi)'].fillna(df['Distance(mi)'].median(), inplace=True)
df = df.dropna()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Distance(mi)'].fillna(df['Distance(mi)'].median(), inplace=True)


In [44]:
# 6. Convert time columns safely
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
df['End_Time'] = pd.to_datetime(df['End_Time'], format='mixed', errors='coerce')

# Remove rows where conversion failed
df = df.dropna(subset=['Start_Time', 'End_Time'])

# Calculate Duration in minutes
df['Duration'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 60

In [45]:
# 7. Feature Engineering: Extracting hour, day, etc.
df['Hour'] = df['Start_Time'].dt.hour
df['Weekday'] = df['Start_Time'].dt.weekday
df['Month'] = df['Start_Time'].dt.month

In [46]:
# 8. Encode categorical features safely

# Convert boolean columns to integers (0/1) if present
bool_cols = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
             'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
             'Traffic_Signal', 'Turning_Loop']

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(int)

# One-hot encode twilight-related time features if they exist
time_cols = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
available_time_cols = [col for col in time_cols if col in df.columns]

if available_time_cols:
    df = pd.get_dummies(df, columns=available_time_cols, drop_first=True)
else:
    print("No twilight-related time columns found.")


In [47]:
print(df.columns.tolist())

['Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Duration', 'Hour', 'Weekday', 'Month', 'Sunrise_Sunset_Night', 'Civil_Twilight_Night', 'Nautical_Twilight_Night', 'Astronomical_Twilight_Night']


In [52]:
# Select non-numeric columns (typically object or bool types)
cat_cols = df.select_dtypes(exclude='number').columns

# Initialize LabelEncoder
le = LabelEncoder()

# Apply label encoding to each categorical column
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [55]:
# 9. Define features and target
X = df.drop(columns=['Severity', 'Start_Time', 'End_Time'])
y = df['Severity']

In [56]:
# 10. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
# 11. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [58]:
# 12. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [59]:
# 13. Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[  9624   2425    428      4]
 [  1144 867544  11080   7622]
 [   173  19897  96425    173]
 [    39  14398   1771   8854]]
              precision    recall  f1-score   support

           1       0.88      0.77      0.82     12481
           2       0.96      0.98      0.97    887390
           3       0.88      0.83      0.85    116668
           4       0.53      0.35      0.42     25062

    accuracy                           0.94   1041601
   macro avg       0.81      0.73      0.77   1041601
weighted avg       0.94      0.94      0.94   1041601



In [60]:
# 14. Prediction Function for Gradio
def predict_severity(Distance, Duration, Hour, Weekday, Month,
                     Amenity, Bump, Crossing, Give_Way, Junction,
                     No_Exit, Railway, Roundabout, Station, Stop,
                     Traffic_Calming, Traffic_Signal, Turning_Loop,
                     SS_Night, CT_Night, NT_Night, AT_Night):
    input_data = np.array([[Distance, Duration, Hour, Weekday, Month,
                            Amenity, Bump, Crossing, Give_Way, Junction,
                            No_Exit, Railway, Roundabout, Station, Stop,
                            Traffic_Calming, Traffic_Signal, Turning_Loop,
                            SS_Night, CT_Night, NT_Night, AT_Night]])
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)
    return f"🚨 Predicted Severity Level: {int(prediction[0])}"


In [61]:
# 15. Gradio Interface
inputs = [
    gr.Number(label="Distance (miles)"),
    gr.Number(label="Duration (minutes)"),
    gr.Slider(0, 23, label="Hour of Day"),
    gr.Slider(0, 6, label="Day of Week (0=Monday)"),
    gr.Slider(1, 12, label="Month"),
    *[gr.Checkbox(label=col) for col in [
        "Amenity", "Bump", "Crossing", "Give_Way", "Junction",
        "No_Exit", "Railway", "Roundabout", "Station", "Stop",
        "Traffic_Calming", "Traffic_Signal", "Turning_Loop"
    ]],
    *[gr.Checkbox(label=col) for col in [
        "SS_Night", "CT_Night", "NT_Night", "AT_Night"
    ]]
]

output = gr.Textbox(label="Prediction")

gr.Interface(fn=predict_severity, inputs=inputs, outputs=output,
             title="🧠 AI Traffic Accident Severity Predictor",
             description="Enter traffic accident data to predict its severity using AI."
).launch(share=True)

* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://032773ed56a8f49a45.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


