In [59]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [60]:
pd.set_option('display.max_columns', None)

In [61]:
df= pd.read_parquet('../data/processed/us_accidents_polars.parquet')
df

Unnamed: 0,Severity,Start_Time,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,distance_to_airport(mi)
0,3,2016-02-08 05:46:00,OH,36.9,91.0,29.68,10.0,Calm,8.0,0.02,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night,2.752093
1,2,2016-02-08 06:07:59,OH,37.9,100.0,29.65,10.0,Calm,8.1,0.00,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day,5.800391
2,2,2016-02-08 06:49:27,OH,36.0,100.0,29.67,10.0,SW,3.5,0.00,Overcast,False,False,False,False,False,False,False,False,False,False,False,True,False,Night,Night,Day,Day,9.577639
3,3,2016-02-08 07:23:34,OH,35.1,96.0,29.64,9.0,SW,4.6,0.00,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Day,Day,Day,10.702810
4,2,2016-02-08 07:39:07,OH,36.0,89.0,29.65,6.0,SW,3.5,0.00,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day,3.308864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705754,2,2019-08-23 18:03:25,CA,86.0,40.0,28.92,10.0,W,13.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,5.130412
7705755,2,2019-08-23 19:11:30,CA,70.0,73.0,29.39,10.0,SW,6.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,3.397634
7705756,2,2019-08-23 19:00:21,CA,73.0,64.0,29.74,10.0,SSW,10.0,0.00,Partly Cloudy,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,Day,Day,Day,6.984253
7705757,2,2019-08-23 19:00:21,CA,71.0,81.0,29.62,10.0,SW,8.0,0.00,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,3.184000


In [62]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7705759 entries, 0 to 7705758
Data columns (total 29 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Severity                 int64  
 1   Start_Time               object 
 2   State                    object 
 3   Temperature(F)           float64
 4   Humidity(%)              float64
 5   Pressure(in)             float64
 6   Visibility(mi)           float64
 7   Wind_Direction           object 
 8   Wind_Speed(mph)          float64
 9   Precipitation(in)        float64
 10  Weather_Condition        object 
 11  Amenity                  bool   
 12  Bump                     bool   
 13  Crossing                 bool   
 14  Give_Way                 bool   
 15  Junction                 bool   
 16  No_Exit                  bool   
 17  Railway                  bool   
 18  Roundabout               bool   
 19  Station                  bool   
 20  Stop                     bool   
 21  Traffic_

In [63]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed')

df['start_month'] = df['Start_Time'].dt.month
df['start_year'] = df['Start_Time'].dt.year
df['start_hour'] = df['Start_Time'].dt.hour
df['start_day'] = df['Start_Time'].dt.day

In [64]:
df.drop(columns=['Start_Time'], inplace=True, axis=1, errors='ignore')

In [65]:
for col in df.select_dtypes(include=['object']).columns:
    print(f"Column: {col}, Unique Values: {df[col].nunique()}")

Column: State, Unique Values: 49
Column: Wind_Direction, Unique Values: 24
Column: Weather_Condition, Unique Values: 144
Column: Sunrise_Sunset, Unique Values: 2
Column: Civil_Twilight, Unique Values: 2
Column: Nautical_Twilight, Unique Values: 2
Column: Astronomical_Twilight, Unique Values: 2


In [66]:
dict(df['Weather_Condition'].value_counts())

{'Fair': np.int64(2560802),
 'Mostly Cloudy': np.int64(1016195),
 'Cloudy': np.int64(817082),
 'Clear': np.int64(808743),
 'Partly Cloudy': np.int64(698972),
 'Overcast': np.int64(382866),
 'Light Rain': np.int64(352957),
 'Scattered Clouds': np.int64(204829),
 'Light Snow': np.int64(128680),
 'Fog': np.int64(99238),
 'Rain': np.int64(84331),
 'Haze': np.int64(76223),
 'Fair / Windy': np.int64(35671),
 'Heavy Rain': np.int64(32309),
 'Light Drizzle': np.int64(22684),
 'Thunder in the Vicinity': np.int64(17611),
 'Cloudy / Windy': np.int64(17035),
 'T-Storm': np.int64(16810),
 'Mostly Cloudy / Windy': np.int64(16508),
 'Snow': np.int64(15537),
 'Thunder': np.int64(14202),
 'Light Rain with Thunder': np.int64(13597),
 'Smoke': np.int64(12668),
 'Wintry Mix': np.int64(11703),
 'Partly Cloudy / Windy': np.int64(10241),
 'Heavy T-Storm': np.int64(9671),
 'Light Rain / Windy': np.int64(7946),
 'Light Snow / Windy': np.int64(6826),
 'Heavy Snow': np.int64(5003),
 'Light Thunderstorms and Rain

In [67]:
def group_weather_conditions(condition):
    """
    Groups weather conditions into broader categories.
    """
    condition = str(condition).lower()
    if 't-storm' in condition or 'thunder' in condition:
        return 'Thunderstorm'
    elif 'snow' in condition or 'sleet' in condition or 'ice' in condition or 'wintry' in condition or 'freezing' in condition or 'hail' in condition:
        return 'Snow/Ice'
    elif 'rain' in condition or 'drizzle' in condition or 'shower' in condition:
        return 'Rain'
    elif 'cloudy' in condition or 'overcast' in condition:
        return 'Cloudy'
    elif 'scattered clouds' in condition or 'partly cloudy' in condition or 'mostly cloudy' in condition:
        return 'Partly Cloudy'
    elif 'fog' in condition or 'mist' in condition or 'haze' in condition:
        return 'Fog/Mist/Haze'
    elif 'clear' in condition or 'fair' in condition:
        return 'Clear/Fair'
    elif 'smoke' in condition or 'dust' in condition or 'sand' in condition or 'ash' in condition:
        return 'Smoke/Dust/Sand'
    elif 'squalls' in condition or 'tornado' in condition or 'funnel' in condition:
        return 'Severe'
    else:
        return 'Other'

In [68]:
df['Weather_Group'] = df['Weather_Condition'].apply(group_weather_conditions)

In [69]:
df['Weather_Group'].value_counts()

Weather_Group
Clear/Fair         3405216
Cloudy             2958899
Rain                512097
Partly Cloudy       204829
Fog/Mist/Haze       188634
Snow/Ice            179749
Other               154076
Thunderstorm         88603
Smoke/Dust/Sand      13533
Severe                 123
Name: count, dtype: int64

In [70]:
df[df['Weather_Group'] == 'Other']['Weather_Condition'].value_counts()

Weather_Condition
N/A Precipitation    3252
Name: count, dtype: int64

In [71]:
df.drop(columns=(['Weather_Condition']), inplace=True, axis=1, errors='ignore')

In [72]:
df

Unnamed: 0,Severity,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,distance_to_airport(mi),start_month,start_year,start_hour,start_day,Weather_Group
0,3,OH,36.9,91.0,29.68,10.0,Calm,8.0,0.02,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night,2.752093,2,2016,5,8,Rain
1,2,OH,37.9,100.0,29.65,10.0,Calm,8.1,0.00,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day,5.800391,2,2016,6,8,Rain
2,2,OH,36.0,100.0,29.67,10.0,SW,3.5,0.00,False,False,False,False,False,False,False,False,False,False,False,True,False,Night,Night,Day,Day,9.577639,2,2016,6,8,Cloudy
3,3,OH,35.1,96.0,29.64,9.0,SW,4.6,0.00,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Day,Day,Day,10.702810,2,2016,7,8,Cloudy
4,2,OH,36.0,89.0,29.65,6.0,SW,3.5,0.00,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day,3.308864,2,2016,7,8,Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705754,2,CA,86.0,40.0,28.92,10.0,W,13.0,0.00,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,5.130412,8,2019,18,23,Clear/Fair
7705755,2,CA,70.0,73.0,29.39,10.0,SW,6.0,0.00,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,3.397634,8,2019,19,23,Clear/Fair
7705756,2,CA,73.0,64.0,29.74,10.0,SSW,10.0,0.00,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,Day,Day,Day,6.984253,8,2019,19,23,Cloudy
7705757,2,CA,71.0,81.0,29.62,10.0,SW,8.0,0.00,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,3.184000,8,2019,19,23,Clear/Fair


In [73]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7705759 entries, 0 to 7705758
Data columns (total 32 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   Severity                 int64   
 1   State                    category
 2   Temperature(F)           float64 
 3   Humidity(%)              float64 
 4   Pressure(in)             float64 
 5   Visibility(mi)           float64 
 6   Wind_Direction           category
 7   Wind_Speed(mph)          float64 
 8   Precipitation(in)        float64 
 9   Amenity                  bool    
 10  Bump                     bool    
 11  Crossing                 bool    
 12  Give_Way                 bool    
 13  Junction                 bool    
 14  No_Exit                  bool    
 15  Railway                  bool    
 16  Roundabout               bool    
 17  Station                  bool    
 18  Stop                     bool    
 19  Traffic_Calming          bool    
 20  Traffic_Signal          

In [75]:
df['start_hour_sin'] = np.sin(2 * np.pi * df['start_hour'] / 24)
df['start_hour_cos'] = np.cos(2 * np.pi * df['start_hour'] / 24)

df['start_month_sin'] = np.sin(2 * np.pi * df['start_month'] / 12)
df['start_month_cos'] = np.cos(2 * np.pi * df['start_month'] / 12)

df['start_day_sin'] = np.sin(2 * np.pi * df['start_day'] / 31)
df['start_day_cos'] = np.cos(2 * np.pi * df['start_day'] / 31)

In [80]:
df.drop(columns=['start_hour', 'start_month', 'start_day'], inplace=True, axis=1, errors='ignore')

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7705759 entries, 0 to 7705758
Data columns (total 35 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   Severity                 int64   
 1   State                    category
 2   Temperature(F)           float64 
 3   Humidity(%)              float64 
 4   Pressure(in)             float64 
 5   Visibility(mi)           float64 
 6   Wind_Direction           category
 7   Wind_Speed(mph)          float64 
 8   Precipitation(in)        float64 
 9   Amenity                  bool    
 10  Bump                     bool    
 11  Crossing                 bool    
 12  Give_Way                 bool    
 13  Junction                 bool    
 14  No_Exit                  bool    
 15  Railway                  bool    
 16  Roundabout               bool    
 17  Station                  bool    
 18  Stop                     bool    
 19  Traffic_Calming          bool    
 20  Traffic_Signal          

In [82]:
for col in df.select_dtypes(include=['category']).columns:
    print(f"Column: {col}, Categories: {df[col].cat.categories}, Num Categories: {len(df[col].cat.categories)}")

Column: State, Categories: Index(['AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'IA', 'ID',
       'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
       'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR',
       'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV',
       'WY'],
      dtype='object'), Num Categories: 49
Column: Wind_Direction, Categories: Index(['CALM', 'Calm', 'E', 'ENE', 'ESE', 'East', 'N', 'NE', 'NNE', 'NNW',
       'NW', 'North', 'S', 'SE', 'SSE', 'SSW', 'SW', 'South', 'VAR',
       'Variable', 'W', 'WNW', 'WSW', 'West'],
      dtype='object'), Num Categories: 24
Column: Sunrise_Sunset, Categories: Index(['Day', 'Night'], dtype='object'), Num Categories: 2
Column: Civil_Twilight, Categories: Index(['Day', 'Night'], dtype='object'), Num Categories: 2
Column: Nautical_Twilight, Categories: Index(['Day', 'Night'], dtype='object'), Num Categories: 2
Column: Astronomical_Twilight, Categories: I

In [83]:
df[df.isna().any(axis=1)]

Unnamed: 0,Severity,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,distance_to_airport(mi),start_year,Weather_Group,start_hour_sin,start_hour_cos,start_month_sin,start_month_cos,start_day_sin,start_day_cos
601,3,OH,44.6,69.0,29.36,10.0,,10.4,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,18.624366,2016,Other,9.659258e-01,-0.258819,1.000000,6.123234e-17,0.790776,-0.612106
1957,2,CA,57.0,77.0,29.95,10.0,,6.0,0.0,False,False,True,False,False,False,False,False,False,False,False,True,False,Night,Night,Night,Night,16.795367,2016,Other,7.071068e-01,0.707107,-0.500000,-8.660254e-01,0.571268,0.820763
1968,2,CA,57.0,77.0,29.95,10.0,,6.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,10.396041,2016,Other,8.660254e-01,-0.500000,-0.500000,-8.660254e-01,0.571268,0.820763
1973,2,CA,57.0,77.0,29.95,10.0,,6.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,10.421936,2016,Other,2.588190e-01,-0.965926,-0.500000,-8.660254e-01,0.571268,0.820763
1978,2,CA,57.0,77.0,29.95,10.0,,6.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,9.432830,2016,Other,-2.588190e-01,-0.965926,-0.500000,-8.660254e-01,0.571268,0.820763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705628,2,OR,70.0,61.0,26.07,10.0,CALM,0.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,,2019,Clear/Fair,1.224647e-16,-1.000000,-0.866025,-5.000000e-01,-0.998717,-0.050649
7705639,2,OR,70.0,60.0,29.92,10.0,WNW,5.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,,2019,Clear/Fair,1.224647e-16,-1.000000,-0.866025,-5.000000e-01,-0.998717,-0.050649
7705697,2,CA,76.0,56.0,29.72,10.0,,7.0,0.0,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,Day,Day,Day,10.856916,2019,Other,5.000000e-01,-0.866025,-0.866025,-5.000000e-01,-0.998717,-0.050649
7705717,4,CA,90.0,17.0,24.91,10.0,VAR,7.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,,2019,Clear/Fair,-9.659258e-01,-0.258819,-0.866025,-5.000000e-01,-0.998717,-0.050649


## Deep Learning

In [84]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import joblib
import copy

# For xAI
from sklearn.inspection import permutation_importance

In [85]:
X = df.drop(columns=['Severity'])
y = df['Severity']

In [86]:
cat_cols = df.select_dtypes(include=['category']).columns.tolist()
bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
num_cols = X.select_dtypes(include=['int', 'float']).columns.tolist()


print("Categorical Columns:", cat_cols)
print("Boolean Columns:", bool_cols)
print("Numerical Columns:", num_cols)

Categorical Columns: ['State', 'Wind_Direction', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'Weather_Group']
Boolean Columns: ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']
Numerical Columns: ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)', 'distance_to_airport(mi)', 'start_year', 'start_hour_sin', 'start_hour_cos', 'start_month_sin', 'start_month_cos', 'start_day_sin', 'start_day_cos']


In [87]:
numerical_pipeline = Pipeline(
    steps=[
        ('imputation_mode', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('Scaler', MinMaxScaler())
    ]
)


# Categorical pipeline (excluding time-based features from encoding)

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])


boolean_pipeline = Pipeline(steps=[
    ('to_int', FunctionTransformer(lambda x: x.astype(np.int64)))
])

In [88]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, num_cols),
        ('cat', categorical_pipeline, cat_cols),
        ('bool', boolean_pipeline, bool_cols)
    ],
    remainder='passthrough'  # or 'passthrough' if you want unlisted columns kept
)

In [89]:
# Split data: train -> validation -> test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [90]:
X_train

Unnamed: 0,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,distance_to_airport(mi),start_year,Weather_Group,start_hour_sin,start_hour_cos,start_month_sin,start_month_cos,start_day_sin,start_day_cos
491283,CA,61.0,62.0,29.92,10.0,West,9.2,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night,5.012434,2017,Clear/Fair,0.258819,9.659258e-01,-5.000000e-01,-8.660254e-01,0.998717,-0.050649
3872625,CA,103.0,15.0,29.47,10.0,NW,9.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,3.940838,2022,Clear/Fair,-0.965926,-2.588190e-01,-8.660254e-01,-5.000000e-01,-0.937752,0.347305
5762370,VA,44.0,84.0,29.74,10.0,SSW,6.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night,16.126272,2021,Clear/Fair,0.707107,7.071068e-01,-2.449294e-16,1.000000e+00,-0.101168,-0.994869
7139710,CA,66.0,15.0,29.08,10.0,NW,10.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,,2020,Clear/Fair,0.965926,-2.588190e-01,1.000000e+00,6.123234e-17,0.571268,0.820763
3878020,SC,41.0,100.0,29.07,2.0,ENE,7.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day,8.510284,2022,Rain,-1.000000,-1.836970e-16,-2.449294e-16,1.000000e+00,0.299363,-0.954139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6946448,PA,48.0,82.0,29.00,6.0,VAR,5.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,8.043117,2020,Rain,-0.707107,-7.071068e-01,5.000000e-01,-8.660254e-01,0.998717,-0.050649
2359698,MN,24.8,100.0,29.92,3.0,ENE,5.8,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Day,Day,18.599170,2018,Cloudy,1.000000,6.123234e-17,-2.449294e-16,1.000000e+00,0.201299,0.979530
7381629,VA,46.9,25.0,29.94,10.0,West,11.5,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night,3.676551,2019,Clear/Fair,0.000000,1.000000e+00,8.660254e-01,5.000000e-01,-0.937752,0.347305
661771,MD,73.0,47.0,29.94,10.0,S,10.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,Day,Day,Day,1.107577,2022,Clear/Fair,-0.965926,-2.588190e-01,1.000000e+00,6.123234e-17,-0.485302,-0.874347


In [91]:
X_train_p = preprocessor.fit_transform(X_train)

In [92]:
X_val_p = preprocessor.transform(X_val)
X_test_p = preprocessor.transform(X_test)

In [93]:
print(f"Training data shape: {X_train_p.shape}")
print(f"Validation data shape: {X_val_p.shape}")
print(f"Test data shape: {X_test_p.shape}")

Training data shape: (4931685, 34)
Validation data shape: (1232922, 34)
Test data shape: (1541152, 34)


In [94]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)
y_test_encoded = le.transform(y_test)

In [96]:
import os
import dill


artifacts_dir = '../models/artifacts/'
os.makedirs(artifacts_dir, exist_ok=True)

dill.dump(preprocessor, open(os.path.join(artifacts_dir, 'preprocessor.pkl'), 'wb'))
dill.dump(le, open(os.path.join(artifacts_dir, 'label_encoder.pkl'), 'wb'))
print("Preprocessor and Label Encoder saved to disk.")

Preprocessor and Label Encoder saved to disk.


In [97]:
num_feature_count = len(num_cols)

X_train_num_p, X_train_cat_p = X_train_p[:, :num_feature_count], X_train_p[:, num_feature_count:]
X_val_num_p, X_val_cat_p = X_val_p[:, :num_feature_count], X_val_p[:, num_feature_count:]
X_test_num_p, X_test_cat_p = X_test_p[:, :num_feature_count], X_test_p[:, num_feature_count:]

# Convert to tensors
X_train_num_tensor = torch.tensor(X_train_num_p, dtype=torch.float32)
X_train_cat_tensor = torch.tensor(X_train_cat_p, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)

X_val_num_tensor = torch.tensor(X_val_num_p, dtype=torch.float32)
X_val_cat_tensor = torch.tensor(X_val_cat_p, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)

X_test_num_tensor = torch.tensor(X_test_num_p, dtype=torch.float32)
X_test_cat_tensor = torch.tensor(X_test_cat_p, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [98]:
X_train_num_tensor

tensor([[0.5068, 0.6162, 0.5103,  ..., 0.0670, 1.0000, 0.4733],
        [0.6486, 0.1414, 0.5026,  ..., 0.2500, 0.0305, 0.6728],
        [0.4493, 0.8384, 0.5072,  ..., 1.0000, 0.4494, 0.0000],
        ...,
        [0.4591, 0.2424, 0.5107,  ..., 0.7500, 0.0305, 0.6728],
        [0.5473, 0.4646, 0.5107,  ..., 0.5000, 0.2570, 0.0604],
        [0.5743, 0.3232, 0.5076,  ..., 0.0000, 0.8261, 0.1184]])

In [99]:
# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_num_tensor, X_train_cat_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_num_tensor, X_val_cat_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

In [100]:
# Define the Embedding NN Model
class EmbeddingNN(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, output_dim):
        super(EmbeddingNN, self).__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories + 1, min(50, (num_categories + 1) // 2))
            for num_categories in cat_cardinalities
        ])
        total_embed_dim = sum([emb.embedding_dim for emb in self.embeddings])
        self.fc = nn.Sequential(
            nn.Linear(num_numeric + total_embed_dim, 256), nn.ReLU(), nn.BatchNorm1d(256), nn.Dropout(0.3),
            nn.Linear(256, 128), nn.ReLU(), nn.BatchNorm1d(128), nn.Dropout(0.3),
            nn.Linear(128, output_dim)
        )

    def forward(self, x_num, x_cat):
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)
        return self.fc(x)

# Get cardinalities from the fitted OrdinalEncoder for the embedding layers
cat_cardinalities = [len(cats) for cats in preprocessor.named_transformers_['cat']['encoder'].categories_]

model = EmbeddingNN(
    num_numeric=X_train_num_tensor.shape[1],
    cat_cardinalities=cat_cardinalities,
    output_dim=len(le.classes_)
)

In [101]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early Stopping parameters
patience = 5
epochs_no_improve = 0
best_val_loss = np.inf
best_model_wts = copy.deepcopy(model.state_dict())
epochs = 50 # Set a higher max epoch count

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch_num, batch_cat, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_num, batch_cat)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_num, batch_cat, batch_y in val_loader:
            outputs = model(batch_num, batch_cat)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
    
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_no_improve += 1

    if epochs_no_improve == patience:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break

# Load the best model weights for evaluation
model.load_state_dict(best_model_wts)

Epoch 1/50, Train Loss: 0.5377, Val Loss: 0.4990
Epoch 2/50, Train Loss: 0.5028, Val Loss: 0.4917
Epoch 3/50, Train Loss: 0.4969, Val Loss: 0.4876
Epoch 4/50, Train Loss: 0.4935, Val Loss: 0.4850
Epoch 5/50, Train Loss: 0.4914, Val Loss: 0.4830
Epoch 6/50, Train Loss: 0.4898, Val Loss: 0.4816
Epoch 7/50, Train Loss: 0.4887, Val Loss: 0.4807
Epoch 8/50, Train Loss: 0.4877, Val Loss: 0.4801
Epoch 9/50, Train Loss: 0.4869, Val Loss: 0.4791
Epoch 10/50, Train Loss: 0.4862, Val Loss: 0.4789
Epoch 11/50, Train Loss: 0.4857, Val Loss: 0.4781
Epoch 12/50, Train Loss: 0.4851, Val Loss: 0.4792
Epoch 13/50, Train Loss: 0.4847, Val Loss: 0.4773
Epoch 14/50, Train Loss: 0.4843, Val Loss: 0.4774
Epoch 15/50, Train Loss: 0.4841, Val Loss: 0.4763
Epoch 16/50, Train Loss: 0.4836, Val Loss: 0.4768
Epoch 17/50, Train Loss: 0.4834, Val Loss: 0.4757
Epoch 18/50, Train Loss: 0.4830, Val Loss: 0.4764
Epoch 19/50, Train Loss: 0.4827, Val Loss: 0.4759
Epoch 20/50, Train Loss: 0.4826, Val Loss: 0.4759
Epoch 21/

<All keys matched successfully>

In [102]:
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_num_tensor, X_test_cat_tensor)
    _, predicted_tensor = torch.max(test_outputs, 1)
    y_pred = le.inverse_transform(predicted_tensor.numpy())

print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Test Accuracy: 0.8086


In [103]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

model.eval()
with torch.no_grad():
    # forward pass
    test_outputs = model(X_test_num_tensor, X_test_cat_tensor)

    # predicted classes
    _, predicted_tensor = torch.max(test_outputs, 1)

    # inverse transform to original labels (if you encoded labels)
    y_pred = le.inverse_transform(predicted_tensor.cpu().numpy())

# If your y_test is still encoded, decode it as well:
# y_test_decoded = le.inverse_transform(y_test)

# Accuracy
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# F1 Score (weighted handles class imbalance)
print(f"F1 Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")

# Precision & Recall (weighted)
print(f"Precision (weighted): {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall (weighted): {recall_score(y_test, y_pred, average='weighted'):.4f}")

# Full classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8086
F1 Score (weighted): 0.7556
Precision (weighted): 0.7734
Recall (weighted): 0.8086

Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.10      0.18     13440
           2       0.82      0.98      0.89   1227747
           3       0.59      0.16      0.25    259237
           4       0.60      0.03      0.06     40728

    accuracy                           0.81   1541152
   macro avg       0.66      0.32      0.34   1541152
weighted avg       0.77      0.81      0.76   1541152

