In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import shap
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df_crashes = pd.read_csv('/content/drive/MyDrive/Phase4/Project/P4Dataset/Refined_Traffic_Crashes_-_Crashes_20250412.csv', low_memory=False)
df_people = pd.read_csv('/content/drive/MyDrive/Phase4/Project/P4Dataset/df_people_cleaned.csv', low_memory=False)
df_vehicles = pd.read_csv('/content/drive/MyDrive/Phase4/Project/P4Dataset/df_vehicles_cleaned.csv', low_memory=False)

# Data Understanding

In [5]:
df_people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2052949 entries, 0 to 2052948
Data columns (total 13 columns):
 #   Column               Dtype  
---  ------               -----  
 0   PERSON_TYPE          object 
 1   CRASH_RECORD_ID      object 
 2   SEX                  object 
 3   AGE                  float64
 4   DRIVER_ACTION        object 
 5   DRIVER_VISION        object 
 6   PHYSICAL_CONDITION   object 
 7   PEDPEDAL_ACTION      object 
 8   PEDPEDAL_VISIBILITY  object 
 9   PEDPEDAL_LOCATION    object 
 10  BAC_RESULT           object 
 11  BAC_RESULT VALUE     float64
 12  CELL_PHONE_USE       object 
dtypes: float64(2), object(11)
memory usage: 203.6+ MB


In [6]:
df_people["PERSON_TYPE"].unique()

array(['DRIVER', 'PASSENGER', 'PEDESTRIAN', 'BICYCLE',
       'NON-MOTOR VEHICLE', 'NON-CONTACT VEHICLE'], dtype=object)

In [7]:
#lets focus only on drivers, so i have filtered the driver value from the people dataframe
df_people1=df_people[df_people["PERSON_TYPE"]=="DRIVER"]

In [8]:
df_vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1906958 entries, 0 to 1906957
Data columns (total 6 columns):
 #   Column                Dtype  
---  ------                -----  
 0   CRASH_RECORD_ID       object 
 1   VEHICLE_YEAR          float64
 2   VEHICLE_DEFECT        object 
 3   VEHICLE_TYPE          object 
 4   MANEUVER              object 
 5   EXCEED_SPEED_LIMIT_I  object 
dtypes: float64(1), object(5)
memory usage: 87.3+ MB


# Data Cleaning

In [9]:
df_driver = pd.merge(df_vehicles, df_people1, on='CRASH_RECORD_ID', how='inner')

In [10]:
df_driver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3340730 entries, 0 to 3340729
Data columns (total 18 columns):
 #   Column                Dtype  
---  ------                -----  
 0   CRASH_RECORD_ID       object 
 1   VEHICLE_YEAR          float64
 2   VEHICLE_DEFECT        object 
 3   VEHICLE_TYPE          object 
 4   MANEUVER              object 
 5   EXCEED_SPEED_LIMIT_I  object 
 6   PERSON_TYPE           object 
 7   SEX                   object 
 8   AGE                   float64
 9   DRIVER_ACTION         object 
 10  DRIVER_VISION         object 
 11  PHYSICAL_CONDITION    object 
 12  PEDPEDAL_ACTION       object 
 13  PEDPEDAL_VISIBILITY   object 
 14  PEDPEDAL_LOCATION     object 
 15  BAC_RESULT            object 
 16  BAC_RESULT VALUE      float64
 17  CELL_PHONE_USE        object 
dtypes: float64(3), object(15)
memory usage: 458.8+ MB


In [11]:
#joining the crash dataframe to the new dataframe
merged_df = pd.merge(df_crashes, df_driver, on='CRASH_RECORD_ID', how='inner')


In [12]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337561 entries, 0 to 3337560
Data columns (total 36 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CRASH_RECORD_ID          object 
 1   CRASH_DATE               object 
 2   POSTED_SPEED_LIMIT       int64  
 3   TRAFFIC_CONTROL_DEVICE   object 
 4   DEVICE_CONDITION         object 
 5   WEATHER_CONDITION        object 
 6   LIGHTING_CONDITION       object 
 7   FIRST_CRASH_TYPE         object 
 8   TRAFFICWAY_TYPE          object 
 9   ROADWAY_SURFACE_COND     object 
 10  ROAD_DEFECT              object 
 11  PRIM_CONTRIBUTORY_CAUSE  object 
 12  SEC_CONTRIBUTORY_CAUSE   object 
 13  BEAT_OF_OCCURRENCE       float64
 14  NUM_UNITS                int64  
 15  CRASH_HOUR               int64  
 16  CRASH_DAY_OF_WEEK        int64  
 17  CRASH_MONTH              int64  
 18  LOCATION                 object 
 19  VEHICLE_YEAR             float64
 20  VEHICLE_DEFECT           object 
 21  VEHICLE_

## Target Grouping

In [13]:
merged_df["PRIM_CONTRIBUTORY_CAUSE"].unique()

array(['FAILING TO YIELD RIGHT-OF-WAY', 'UNABLE TO DETERMINE',
       'IMPROPER TURNING/NO SIGNAL', 'DISREGARDING TRAFFIC SIGNALS',
       'IMPROPER BACKING', 'FOLLOWING TOO CLOSELY',
       'FAILING TO REDUCE SPEED TO AVOID CRASH',
       'IMPROPER OVERTAKING/PASSING', 'TURNING RIGHT ON RED',
       'IMPROPER LANE USAGE', 'NOT APPLICABLE',
       'DRIVING ON WRONG SIDE/WRONG WAY', 'DISREGARDING STOP SIGN',
       'DISREGARDING ROAD MARKINGS',
       'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
       'WEATHER', 'EQUIPMENT - VEHICLE CONDITION',
       'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE',
       'PHYSICAL CONDITION OF DRIVER',
       'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)',
       'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST',
       'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
       'DISTRACTION - FROM INSIDE VEHICLE', 'ANIMAL',
       'DISTRACTION - FROM OUTSIDE VEHICLE',
       'CELL PHONE USE OTH

In [14]:
#changing the y to fewer levels
cause_map = {
    #  Driver Behavior
    'FOLLOWING TOO CLOSELY': 'Driver Behavior',
    'FAILING TO REDUCE SPEED TO AVOID CRASH': 'Driver Behavior',
    'IMPROPER BACKING': 'Driver Behavior',
    'IMPROPER TURNING/NO SIGNAL': 'Driver Behavior',
    'IMPROPER LANE USAGE': 'Driver Behavior',
    'IMPROPER OVERTAKING/PASSING': 'Driver Behavior',
    'DRIVING ON WRONG SIDE/WRONG WAY': 'Driver Behavior',
    'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER': 'Driver Behavior',
    'EXCEEDING AUTHORIZED SPEED LIMIT': 'Driver Behavior',
    'EXCEEDING SAFE SPEED FOR CONDITIONS': 'Driver Behavior',

    # Distraction
    'DISTRACTION - FROM INSIDE VEHICLE': 'Distraction',
    'DISTRACTION - FROM OUTSIDE VEHICLE': 'Distraction',
    'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)': 'Distraction',
    'TEXTING': 'Distraction',
    'CELL PHONE USE OTHER THAN TEXTING': 'Distraction',

    # Traffic Rule Violations
    'FAILING TO YIELD RIGHT-OF-WAY': 'Traffic Violations',
    'DISREGARDING OTHER TRAFFIC SIGNS': 'Traffic Violations',
    'DISREGARDING TRAFFIC SIGNALS': 'Traffic Violations',
    'DISREGARDING ROAD MARKINGS': 'Traffic Violations',
    'DISREGARDING STOP SIGN': 'Traffic Violations',
    'DISREGARDING YIELD SIGN': 'Traffic Violations',
    'TURNING RIGHT ON RED': 'Traffic Violations',

    # Environmental Factors
    'WEATHER': 'Environment',
    'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)': 'Environment',
    'ROAD ENGINEERING/SURFACE/MARKING DEFECTS': 'Environment',
    'ROAD CONSTRUCTION/MAINTENANCE': 'Environment',
    'OBSTRUCTED CROSSWALKS': 'Environment',

    # Substance Use
    'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)': 'Substance Use',
    'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)': 'Substance Use',

    #Vehicle/Driver Condition
    'EQUIPMENT - VEHICLE CONDITION': 'Vehicle or Driver Condition',
    'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE': 'Vehicle or Driver Condition',
    'PHYSICAL CONDITION OF DRIVER': 'Vehicle or Driver Condition',

    #Other / Unusual Situations
    'RELATED TO BUS STOP': 'Other or Unusual',
    'PASSING STOPPED SCHOOL BUS': 'Other or Unusual',
    'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST': 'Other or Unusual',
    'ANIMAL': 'Other or Unusual',
    'BICYCLE ADVANCING LEGALLY ON RED LIGHT': 'Other or Unusual',
    'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT': 'Other or Unusual',

    #Unknown / Not Applicable
    'UNABLE TO DETERMINE': 'Unknown/NA',
    'NOT APPLICABLE': 'Unknown/NA'
}

# Apply the mapping
merged_df["PRIM_CONTRIBUTORY_CAUSE"]= merged_df["PRIM_CONTRIBUTORY_CAUSE"].map(cause_map)

## Missing Values

In [15]:
merged_df.isna().mean()*100

Unnamed: 0,0
CRASH_RECORD_ID,0.0
CRASH_DATE,0.0
POSTED_SPEED_LIMIT,0.0
TRAFFIC_CONTROL_DEVICE,0.0
DEVICE_CONDITION,0.0
WEATHER_CONDITION,0.0
LIGHTING_CONDITION,0.0
FIRST_CRASH_TYPE,0.0
TRAFFICWAY_TYPE,0.0
ROADWAY_SURFACE_COND,0.0


In [16]:
merged_df = merged_df.loc[:, merged_df.isna().mean() <= 0.9]

In [17]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337561 entries, 0 to 3337560
Data columns (total 30 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CRASH_RECORD_ID          object 
 1   CRASH_DATE               object 
 2   POSTED_SPEED_LIMIT       int64  
 3   TRAFFIC_CONTROL_DEVICE   object 
 4   DEVICE_CONDITION         object 
 5   WEATHER_CONDITION        object 
 6   LIGHTING_CONDITION       object 
 7   FIRST_CRASH_TYPE         object 
 8   TRAFFICWAY_TYPE          object 
 9   ROADWAY_SURFACE_COND     object 
 10  ROAD_DEFECT              object 
 11  PRIM_CONTRIBUTORY_CAUSE  object 
 12  SEC_CONTRIBUTORY_CAUSE   object 
 13  BEAT_OF_OCCURRENCE       float64
 14  NUM_UNITS                int64  
 15  CRASH_HOUR               int64  
 16  CRASH_DAY_OF_WEEK        int64  
 17  CRASH_MONTH              int64  
 18  LOCATION                 object 
 19  VEHICLE_YEAR             float64
 20  VEHICLE_DEFECT           object 
 21  VEHICLE_

In [18]:
# Delete rows where values are missing
merged_df = merged_df[~(merged_df.isnull())]

In [19]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337561 entries, 0 to 3337560
Data columns (total 30 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CRASH_RECORD_ID          object 
 1   CRASH_DATE               object 
 2   POSTED_SPEED_LIMIT       int64  
 3   TRAFFIC_CONTROL_DEVICE   object 
 4   DEVICE_CONDITION         object 
 5   WEATHER_CONDITION        object 
 6   LIGHTING_CONDITION       object 
 7   FIRST_CRASH_TYPE         object 
 8   TRAFFICWAY_TYPE          object 
 9   ROADWAY_SURFACE_COND     object 
 10  ROAD_DEFECT              object 
 11  PRIM_CONTRIBUTORY_CAUSE  object 
 12  SEC_CONTRIBUTORY_CAUSE   object 
 13  BEAT_OF_OCCURRENCE       float64
 14  NUM_UNITS                int64  
 15  CRASH_HOUR               int64  
 16  CRASH_DAY_OF_WEEK        int64  
 17  CRASH_MONTH              int64  
 18  LOCATION                 object 
 19  VEHICLE_YEAR             float64
 20  VEHICLE_DEFECT           object 
 21  VEHICLE_

In [20]:
merged_df.isnull().sum()

Unnamed: 0,0
CRASH_RECORD_ID,0
CRASH_DATE,0
POSTED_SPEED_LIMIT,0
TRAFFIC_CONTROL_DEVICE,0
DEVICE_CONDITION,0
WEATHER_CONDITION,0
LIGHTING_CONDITION,0
FIRST_CRASH_TYPE,0
TRAFFICWAY_TYPE,0
ROADWAY_SURFACE_COND,0


In [21]:
#fixing the datatypes so it is easier for modelling

## Feature grouping & reduction

In [22]:
merged_df['TRAFFIC_CONTROL_DEVICE'].unique()

array(['STOP SIGN/FLASHER', 'NO CONTROLS', 'TRAFFIC SIGNAL', 'UNKNOWN',
       'OTHER', 'PEDESTRIAN CROSSING SIGN', 'RR CROSSING SIGN', 'YIELD',
       'RAILROAD CROSSING GATE', 'OTHER RAILROAD CROSSING',
       'FLASHING CONTROL SIGNAL', 'POLICE/FLAGMAN', 'DELINEATORS',
       'LANE USE MARKING', 'NO PASSING', 'BICYCLE CROSSING SIGN'],
      dtype=object)

In [23]:
#reducing the number of levels
control_map = {
    'PEDESTRIAN CROSSING SIGN': 'Pedestrian Control',
    'BICYCLE CROSSING SIGN': 'Pedestrian Control',
    'SCHOOL ZONE': 'Pedestrian Control',

    'RAILROAD CROSSING GATE': 'Railroad Control',
    'RR CROSSING SIGN': 'Railroad Control',
    'OTHER RAILROAD CROSSING': 'Railroad Control',

    'DELINEATORS': 'Other'
}
merged_df['TRAFFIC_CONTROL_DEVICE'] = merged_df['TRAFFIC_CONTROL_DEVICE'].map(control_map)

In [24]:
merged_df['DEVICE_CONDITION'].unique()

array(['NO CONTROLS', 'FUNCTIONING PROPERLY', 'UNKNOWN',
       'NOT FUNCTIONING', 'OTHER', 'FUNCTIONING IMPROPERLY',
       'WORN REFLECTIVE MATERIAL', 'MISSING'], dtype=object)

In [25]:
merged_df['FIRST_CRASH_TYPE'].unique()

array(['ANGLE', 'FIXED OBJECT', 'PARKED MOTOR VEHICLE', 'TURNING',
       'PEDALCYCLIST', 'REAR TO FRONT', 'SIDESWIPE OPPOSITE DIRECTION',
       'REAR END', 'SIDESWIPE SAME DIRECTION', 'HEAD ON',
       'OTHER NONCOLLISION', 'OTHER OBJECT', 'PEDESTRIAN', 'REAR TO SIDE',
       'REAR TO REAR', 'ANIMAL', 'OVERTURNED', 'TRAIN'], dtype=object)

In [26]:
crash_type_map = {
    # Vehicle-to-Vehicle
    'REAR END': 'Vehicle Collision',
    'HEAD ON': 'Vehicle Collision',
    'ANGLE': 'Vehicle Collision',
    'TURNING': 'Vehicle Collision',
    'SIDESWIPE SAME DIRECTION': 'Vehicle Collision',
    'SIDESWIPE OPPOSITE DIRECTION': 'Vehicle Collision',
    'REAR TO FRONT': 'Vehicle Collision',
    'REAR TO REAR': 'Vehicle Collision',
    'REAR TO SIDE': 'Vehicle Collision',

    # Vulnerable Road Users
    'PEDESTRIAN': 'Vulnerable User',
    'PEDALCYCLIST': 'Vulnerable User',

    # Stationary or Object
    'FIXED OBJECT': 'Fixed Object',
    'PARKED MOTOR VEHICLE': 'Fixed Object',
    'OTHER OBJECT': 'Fixed Object',
    'OTHER NONCOLLISION': 'Fixed Object',

    # Other or Special Cases
    'TRAIN': 'Special',
    'ANIMAL': 'Special',
    'OVERTURNED': 'Special'
}
merged_df['FIRST_CRASH_TYPE'] = merged_df['FIRST_CRASH_TYPE'].map(crash_type_map)

In [27]:
merged_df['TRAFFICWAY_TYPE'].unique()

array(['NOT DIVIDED', 'RAMP', 'DIVIDED - W/MEDIAN (NOT RAISED)',
       'FOUR WAY', 'DIVIDED - W/MEDIAN BARRIER', 'PARKING LOT', 'UNKNOWN',
       'OTHER', 'Y-INTERSECTION', 'ONE-WAY', 'ALLEY', 'ROUNDABOUT',
       'UNKNOWN INTERSECTION TYPE', 'T-INTERSECTION',
       'FIVE POINT, OR MORE', 'TRAFFIC ROUTE', 'NOT REPORTED', 'DRIVEWAY',
       'CENTER TURN LANE', 'L-INTERSECTION'], dtype=object)

In [28]:
intersection_map = {
    # Common intersections
    'FOUR WAY': 'Intersection',
    'T-INTERSECTION': 'Intersection',
    'Y-INTERSECTION': 'Intersection',
    'L-INTERSECTION': 'Intersection',
    'FIVE POINT, OR MORE': 'Intersection',
    'ROUNDABOUT': 'Intersection',

    # Divided roads
    'DIVIDED - W/MEDIAN BARRIER': 'Divided Road',
    'DIVIDED - W/MEDIAN (NOT RAISED)': 'Divided Road',

    # Non-divided
    'NOT DIVIDED': 'Undivided Road',
    'ONE-WAY': 'Undivided Road',
    'CENTER TURN LANE': 'Undivided Road',

    # Special or non-road spaces
    'PARKING LOT': 'Other Road Environment',
    'RAMP': 'Other Road Environment',
    'ALLEY': 'Other Road Environment',
    'DRIVEWAY': 'Other Road Environment',
    'TRAFFIC ROUTE': 'Other Road Environment',

    # Unknowns & misc
    'UNKNOWN': 'Unknown',
    'UNKNOWN INTERSECTION TYPE': 'Unknown',
    'NOT REPORTED': 'Unknown',
    'OTHER': 'Other'
}
merged_df['TRAFFICWAY_TYPE'] = merged_df['TRAFFICWAY_TYPE'].map(intersection_map)

In [29]:
merged_df['SEX'].unique()
merged_df['SEX'] = merged_df['SEX'].replace('X', 'Unspecified')

In [30]:
merged_df['DRIVER_ACTION'].unique()

array(['FAILED TO YIELD', 'NONE', 'UNKNOWN', 'IMPROPER TURN',
       'IMPROPER BACKING', 'DISREGARDED CONTROL DEVICES',
       'FOLLOWED TOO CLOSELY', 'IMPROPER PARKING', 'OTHER',
       'OVERCORRECTED', 'IMPROPER LANE CHANGE', 'WRONG WAY/SIDE',
       'IMPROPER PASSING', 'EVADING POLICE VEHICLE',
       'EMERGENCY VEHICLE ON CALL', 'TOO FAST FOR CONDITIONS',
       'STOPPED SCHOOL BUS', 'CELL PHONE USE OTHER THAN TEXTING',
       'TEXTING', 'LICENSE RESTRICTIONS'], dtype=object)

In [31]:
driver_action_map = {
    # Common driving violations
    'FAILED TO YIELD': 'Failure to Yield',
    'DISREGARDED CONTROL DEVICES': 'Traffic Violation',
    'WRONG WAY/SIDE': 'Traffic Violation',
    'STOPPED SCHOOL BUS': 'Traffic Violation',

    # Speed and spacing
    'TOO FAST FOR CONDITIONS': 'Unsafe Speed',
    'FOLLOWED TOO CLOSELY': 'Unsafe Speed',

    # Maneuver-related
    'IMPROPER TURN': 'Improper Maneuver',
    'IMPROPER LANE CHANGE': 'Improper Maneuver',
    'IMPROPER BACKING': 'Improper Maneuver',
    'IMPROPER PASSING': 'Improper Maneuver',
    'IMPROPER PARKING': 'Improper Maneuver',
    'OVERCORRECTED': 'Improper Maneuver',

    # Distraction
    'TEXTING': 'Distracted',
    'CELL PHONE USE OTHER THAN TEXTING': 'Distracted',

    # Legal / unusual edge cases
    'LICENSE RESTRICTIONS': 'Legal Issue',
    'EMERGENCY VEHICLE ON CALL': 'Special Vehicle Action',
    'EVADING POLICE VEHICLE': 'Evasive/Illegal Action',

    # General
    'NONE': 'None',
    'UNKNOWN': 'Unknown',
    'OTHER': 'Other'
}
merged_df['DRIVER_ACTION'] = merged_df['DRIVER_ACTION'].map(driver_action_map)

In [32]:
merged_df['DRIVER_VISION'].unique()

array(['NOT OBSCURED', 'UNKNOWN', 'PARKED VEHICLES', 'OTHER',
       'MOVING VEHICLES', 'BLINDED - SUNLIGHT', 'WINDSHIELD (WATER/ICE)',
       'SIGNBOARD', 'BUILDINGS', 'TREES, PLANTS', 'BLINDED - HEADLIGHTS',
       'BLOWING MATERIALS', 'EMBANKMENT', 'HILLCREST'], dtype=object)

In [33]:
driver_vision_map = {
    'NOT OBSCURED': 'Clear Vision',
    'UNKNOWN': 'Unknown',
    'OTHER': 'Other',

    # Light-related visibility issues
    'BLINDED - SUNLIGHT': 'Glare',
    'BLINDED - HEADLIGHTS': 'Glare',

    # Environmental/material obstruction
    'BLOWING MATERIALS': 'Environmental Obstruction',
    'WINDSHIELD (WATER/ICE)': 'Environmental Obstruction',

    # Physical/static object obstruction
    'MOVING VEHICLES': 'Vehicle Obstruction',
    'PARKED VEHICLES': 'Vehicle Obstruction',
    'HILLCREST': 'Physical Obstruction',
    'TREES, PLANTS': 'Physical Obstruction',
    'BUILDINGS': 'Physical Obstruction',
    'SIGNBOARD': 'Physical Obstruction',
    'EMBANKMENT': 'Physical Obstruction'
}
merged_df['DRIVER_VISION'] = merged_df['DRIVER_VISION'].map(driver_vision_map)

In [34]:
merged_df['BAC_RESULT'].unique()

array(['TEST NOT OFFERED', 'TEST REFUSED', 'TEST TAKEN',
       'TEST PERFORMED, RESULTS UNKNOWN'], dtype=object)

In [35]:
#joining the primary and secondary cause under one column
merged_df["PRIM_CONTRIBUTORY_CAUSE"].unique()

array(['Traffic Violations', 'Unknown/NA', 'Driver Behavior',
       'Environment', 'Vehicle or Driver Condition', 'Substance Use',
       'Other or Unusual', 'Distraction'], dtype=object)

In [36]:
merged_df["SEC_CONTRIBUTORY_CAUSE"].unique()

array(['FAILING TO YIELD RIGHT-OF-WAY', 'NOT APPLICABLE',
       'UNABLE TO DETERMINE', 'ROAD CONSTRUCTION/MAINTENANCE',
       'IMPROPER LANE USAGE', 'IMPROPER TURNING/NO SIGNAL',
       'FAILING TO REDUCE SPEED TO AVOID CRASH',
       'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE', 'WEATHER',
       'IMPROPER BACKING',
       'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER',
       'IMPROPER OVERTAKING/PASSING', 'EQUIPMENT - VEHICLE CONDITION',
       'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)',
       'FOLLOWING TOO CLOSELY', 'DISREGARDING TRAFFIC SIGNALS',
       'PHYSICAL CONDITION OF DRIVER',
       'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)',
       'DISREGARDING STOP SIGN', 'DISREGARDING ROAD MARKINGS',
       'DISREGARDING YIELD SIGN', 'DISTRACTION - FROM INSIDE VEHICLE',
       'ANIMAL', 'RELATED TO BUS STOP',
       'DISREGARDING OTHER TRAFFIC SIGNS',
       'DRIVING ON WRONG SIDE/WRONG WAY',
       'CELL PHONE USE

In [37]:
sec_cause_map = {
    'DISTRACTION - FROM INSIDE VEHICLE': 'Distraction',
    'DISTRACTION - FROM OUTSIDE VEHICLE': 'Distraction',
    'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)': 'Distraction',
    'TEXTING': 'Distraction',
    'CELL PHONE USE OTHER THAN TEXTING': 'Distraction',
    'UNABLE TO DETERMINE': 'Unknown/NA',
    'NOT APPLICABLE': 'Unknown/NA',
    'WEATHER': 'Environment',
    'ROAD CONSTRUCTION/MAINTENANCE': 'Environment',
    'ROAD ENGINEERING/SURFACE/MARKING DEFECTS': 'Environment',
    'FAILING TO YIELD RIGHT-OF-WAY': 'Traffic Violations',
    'FAILING TO REDUCE SPEED TO AVOID CRASH': 'Traffic Violations',
    'DISREGARDING STOP SIGN': 'Traffic Violations',
    'DISREGARDING TRAFFIC SIGNALS': 'Traffic Violations',
    'DISREGARDING YIELD SIGN': 'Traffic Violations',
    'DISREGARDING ROAD MARKINGS': 'Traffic Violations',
    'DISREGARDING OTHER TRAFFIC SIGNS': 'Traffic Violations',
    'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE': 'Driver Behavior',
    'IMPROPER TURNING/NO SIGNAL': 'Driver Behavior',
    'IMPROPER OVERTAKING/PASSING': 'Driver Behavior',
    'FOLLOWING TOO CLOSELY': 'Driver Behavior',
    'IMPROPER LANE USAGE': 'Driver Behavior',
    'DRIVING ON WRONG SIDE/WRONG WAY': 'Driver Behavior',
    'IMPROPER BACKING': 'Driver Behavior',
    'EXCEEDING SAFE SPEED FOR CONDITIONS': 'Driver Behavior',
    'EXCEEDING AUTHORIZED SPEED LIMIT': 'Driver Behavior',
    'TURNING RIGHT ON RED': 'Driver Behavior',
    'PASSING STOPPED SCHOOL BUS': 'Driver Behavior',
    'EQUIPMENT - VEHICLE CONDITION': 'Vehicle or Driver Condition',
    'PHYSICAL CONDITION OF DRIVER': 'Vehicle or Driver Condition',
    'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)': 'Substance Use',
    'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)': 'Substance Use',
    'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER': 'Driver Behavior',
    'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)': 'Environment',
    'EMERGENCY VEHICLE ON CALL': 'Other or Unusual',
    'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST': 'Other or Unusual',
    'ANIMAL': 'Other or Unusual',
    'RELATED TO BUS STOP': 'Other or Unusual',
    'BICYCLE ADVANCING LEGALLY ON RED LIGHT': 'Other or Unusual',
    'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT': 'Other or Unusual',
    'OBSTRUCTED CROSSWALKS': 'Other or Unusual',
}
merged_df['SEC_CONTRIBUTORY_CAUSE'] = merged_df['SEC_CONTRIBUTORY_CAUSE'].map(sec_cause_map)


In [38]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337561 entries, 0 to 3337560
Data columns (total 30 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   CRASH_RECORD_ID          object 
 1   CRASH_DATE               object 
 2   POSTED_SPEED_LIMIT       int64  
 3   TRAFFIC_CONTROL_DEVICE   object 
 4   DEVICE_CONDITION         object 
 5   WEATHER_CONDITION        object 
 6   LIGHTING_CONDITION       object 
 7   FIRST_CRASH_TYPE         object 
 8   TRAFFICWAY_TYPE          object 
 9   ROADWAY_SURFACE_COND     object 
 10  ROAD_DEFECT              object 
 11  PRIM_CONTRIBUTORY_CAUSE  object 
 12  SEC_CONTRIBUTORY_CAUSE   object 
 13  BEAT_OF_OCCURRENCE       float64
 14  NUM_UNITS                int64  
 15  CRASH_HOUR               int64  
 16  CRASH_DAY_OF_WEEK        int64  
 17  CRASH_MONTH              int64  
 18  LOCATION                 object 
 19  VEHICLE_YEAR             float64
 20  VEHICLE_DEFECT           object 
 21  VEHICLE_

In [39]:
df_clean = merged_df

## Feature Selection

In [40]:
# get the numeric cols and the cat cols
num_original_columns = df_clean.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df_clean.select_dtypes(exclude=np.number).columns.tolist()

print("Categorical columns:", categorical_cols)
num_original_columns

Categorical columns: ['CRASH_RECORD_ID', 'CRASH_DATE', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'LOCATION', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'MANEUVER', 'PERSON_TYPE', 'SEX', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION', 'BAC_RESULT']


['POSTED_SPEED_LIMIT',
 'BEAT_OF_OCCURRENCE',
 'NUM_UNITS',
 'CRASH_HOUR',
 'CRASH_DAY_OF_WEEK',
 'CRASH_MONTH',
 'VEHICLE_YEAR',
 'AGE']

In [41]:
target_col = "PRIM_CONTRIBUTORY_CAUSE"

numeric_features = num_original_columns

categorical_features = categorical_cols

final_features = numeric_features + categorical_features
df_model = df_clean[final_features + [target_col]].dropna()

# Encoding

In [42]:
from sklearn.preprocessing import LabelEncoder
encoded_df = df_model.copy()
le = LabelEncoder()

In [43]:
for col in categorical_features :
    try:
        encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
    except Exception as e:
        print(f"Could not encode {col}: {e}")

Could not encode PRIM_CONTRIBUTORY_CAUSE: y should be a 1d array, got an array of shape (5548, 2) instead.


In [44]:
#Target and Features
X = encoded_df.drop('PRIM_CONTRIBUTORY_CAUSE', axis=1)
y = encoded_df['PRIM_CONTRIBUTORY_CAUSE']

In [45]:
# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [46]:
#Scale Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)