In [1]:
import pandas as pd
df = pd.read_csv("../data/US_Accidents_March23_balanced.csv")
df = df.drop('State', axis=1)
df.head(1)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Stop,Traffic_Calming,Traffic_Signal,Civil_Twilight,Year,Month,Weekday,Day,Hour,Minute
0,1,37.62278,-77.511887,0.0,Henrico,67.0,70.0,29.96,10.0,Variable,...,False,False,False,Day,2022,5,0,23,6,58


In [2]:
X = df

In [3]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
features = ['Temperature(F)','Distance(mi)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)','Precipitation(in)','Start_Lng','Start_Lat','Year', 'Month','Weekday','Day','Hour','Minute']
X[features] = scaler.fit_transform(X[features])
X.head(1)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Stop,Traffic_Calming,Traffic_Signal,Civil_Twilight,Year,Month,Weekday,Day,Hour,Minute
0,1,0.534242,0.829552,0.0,Henrico,0.681223,0.69697,0.964553,0.083333,Variable,...,False,False,False,Day,0.857143,0.363636,0.166667,0.733333,0.26087,0.983051


In [4]:
categorical_features = {"City", "Wind_Direction", "Weather_Condition", "Civil_Twilight"}

for cat in categorical_features:
    X[cat] = X[cat].astype("category")

In [5]:
print("Unique classes for each categorical feature:")
for cat in categorical_features:
    print("{:15s}".format(cat), "\t", len(X[cat].unique()))

Unique classes for each categorical feature:
Wind_Direction  	 10
Weather_Condition 	 11
City            	 8718
Civil_Twilight  	 2


In [6]:
X = X.replace([True, False], [1, 0])

X.head(1)

  X = X.replace([True, False], [1, 0])


Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Stop,Traffic_Calming,Traffic_Signal,Civil_Twilight,Year,Month,Weekday,Day,Hour,Minute
0,1,0.534242,0.829552,0.0,Henrico,0.681223,0.69697,0.964553,0.083333,Variable,...,0,0,0,Day,0.857143,0.363636,0.166667,0.733333,0.26087,0.983051


In [7]:
# Remove city because it will be encoded later
onehot_cols = list(categorical_features - {"City"})

X = pd.get_dummies(X, columns=onehot_cols, drop_first=True)

X.head(1)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),...,Weather_Condition_Fog,Weather_Condition_Hail,Weather_Condition_Rain,Weather_Condition_Sand,Weather_Condition_Smoke,Weather_Condition_Snow,Weather_Condition_Thunderstorm,Weather_Condition_Tornado,Weather_Condition_Windy,Civil_Twilight_Night
0,1,0.534242,0.829552,0.0,Henrico,0.681223,0.69697,0.964553,0.083333,0.012931,...,False,False,False,False,False,False,False,False,False,False


In [8]:
import category_encoders as ce

binary_encoder = ce.binary.BinaryEncoder()

city_binary_enc = binary_encoder.fit_transform(X["City"])

In [9]:
X = pd.concat([X, city_binary_enc], axis=1).drop("City", axis=1)
    
X.head(1)

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),...,City_4,City_5,City_6,City_7,City_8,City_9,City_10,City_11,City_12,City_13
0,1,0.534242,0.829552,0.0,0.681223,0.69697,0.964553,0.083333,0.012931,0.0,...,0,0,0,0,0,0,0,0,0,1


### Random Forest Model

In [10]:
from sklearn.model_selection import train_test_split

sample = X
y_sample = sample["Severity"]
X_sample = sample.drop("Severity", axis=1)

X_train, X_validate, y_train, y_validate = train_test_split(X_sample, y_sample, random_state=42)
print(X_train.shape, y_train.shape)
print(X_validate.shape, y_validate.shape)

(187722, 61) (187722,)
(62574, 61) (62574,)


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
parameters = [{"n_estimators": [50, 100, 200, 500], "max_depth": [5, 10, 15, 30]}]
grid = GridSearchCV(rfc, parameters, verbose=5, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [14]:
# Convert columns to DataFrame
columns_df = pd.DataFrame(X_train.columns, columns=['Column_Name'])

# Export to CSV
columns_df.to_csv('X_train_columns.csv', index=False, sep=',')

### Joblib Dump (scaler, binary_encoder, model)

In [20]:
import joblib

joblib.dump(scaler, "../models/scaler.joblib")
joblib.dump(binary_encoder, "../models/binary_encoder.joblib")
joblib.dump(grid.best_estimator_, "../models/random_forest_model.joblib")

['../models/scaler.joblib']