In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [30]:
# Read the CSV at datasets/us_accidents_expanded.csv into `df`.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('datasets/us_accidents_expanded.csv')

## Machine Learning 2
A different approach to ML models

### Dataset Processing
Before we can try out machine learning models, we need to first prepare our dataset.
The first thing is to drop features that we don't believe are necessary.

In [31]:
# Columns removed before modelling, grouped by the reason they are dropped.
dropped_columns = [
    # timestamps: dropped for now, relying on elapsed time instead
    'Start_Datetime', 'End_Datetime', 'Start_Date', 'End_Date', 'Start_Time', 'End_Time',
    # exact coordinates: relying on distance instead
    'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
    # description, address, and weather timestamp: no meaningful information
    'Description', 'Street', 'City', 'County', 'Zipcode', 'Country', 'Weather_Timestamp',
    # temp: elapsed time is bugged, so it is dropped as well
    'Elapsesd_Time',
]

# `drop` returns a new frame, so the raw `df` is left untouched.
df_ml = df.drop(columns=dropped_columns)

Next we need to adjust the day/night features to true/false respectively.

In [32]:
# change day/night to true/false respectively
day_night = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
df_ml[day_night] = df_ml[day_night].replace('Day', True)
df_ml[day_night] = df_ml[day_night].replace('Night', False)

Next, we need to one-hot encode the remaining categorical values.

In [33]:
# One-hot encode the remaining categorical features; all other columns pass through unchanged.
categorical_columns = ['Airport_Code', 'Wind_Direction']
df_ml = pd.get_dummies(data=df_ml, columns=categorical_columns)

We also need to drop records containing missing values. We've imputed what we can
during the dataset processing, so now we just drop the remaining ones.

In [34]:
# Remove every row that still has at least one missing value (pandas defaults:
# axis=0, how='any'). Done in place, so `df_ml` itself shrinks.
df_ml.dropna(inplace=True)

Finally, we need to convert the `Severity` target feature to string so it's
treated as a discrete class value.

In [35]:
# Cast the target to text so downstream estimators treat it as class labels
# rather than as a numeric quantity.
severity_labels = df_ml['Severity'].astype(str)
df_ml['Severity'] = severity_labels

We can now split into `X` (features) and `y` (target class) for training.

In [36]:
# Separate the target column from the feature matrix.
target_column = 'Severity'
y = df_ml[target_column]
X = df_ml.drop(columns=[target_column])

### Train/Val/Test Split
We initially wanted to use cross validation to test our models. However, due to 
the sheer size of our dataset, it would be very time intensive. Instead, our plan
is to split the dataset into training, validation, and testing. Training and validation
sets will be used to gauge performances of various models. Once we determine the best
model, we'll run it one last time on the test set. The split we use for train/val/test
will be 70/20/10.

In [37]:
# Stage 1: hold out 10% of all records as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    random_state=1234,
    test_size=0.1,
)

# Stage 2: carve the validation set out of the remaining 90%.
# 2/9 of 90% is 20% of the full dataset, which leaves 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    random_state=1234,
    test_size=2/9,
)

Before we start to train our models, we need to normalize our numerical data. We will
use sklearn's `MinMaxScaler` for this.

In [44]:
# Min-max scale the numeric feature columns; non-numeric columns are not touched.
scaler = MinMaxScaler()

# Take the numeric columns from the feature matrix that is actually scaled.
# `df_ml` still contains the `Severity` target, so selecting from it only works
# while the target has been cast to a non-numeric dtype in an earlier cell;
# otherwise indexing X_train with that list raises a KeyError.
numeric_columns = X_train.select_dtypes(include=['number']).columns
print('Normalizing the following columns:')
for col in numeric_columns:
    print(f' - {col}')

# Fit on the training split only, then reuse the fitted ranges for validation
# and test so no statistics from held-out data leak into the scaler.
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

Normalizing the following columns:
 - Distance(mi)
 - Temperature(F)
 - Wind_Chill(F)
 - Humidity(%)
 - Pressure(in)
 - Visibility(mi)
 - Wind_Speed(mph)
 - Precipitation(in)


Our final processed dataset looks something like this:

In [45]:
# Preview the first rows of the scaled training features.
X_train.head()

Unnamed: 0,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Amenity,Bump,...,Wind_Direction_SSE,Wind_Direction_SSW,Wind_Direction_SW,Wind_Direction_South,Wind_Direction_VAR,Wind_Direction_Variable,Wind_Direction_W,Wind_Direction_WNW,Wind_Direction_WSW,Wind_Direction_West
214985,0.014837,0.395556,0.451613,0.683673,0.919795,0.1,0.020576,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
317281,0.006614,0.297778,0.350806,0.602041,0.87884,0.1,0.020576,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
105583,0.004035,0.524444,0.568548,0.581633,0.912969,0.1,0.024691,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
405621,0.000534,0.244444,0.266129,0.77551,0.883106,0.005,0.057613,0.000991,False,False,...,False,False,False,False,False,False,False,False,False,False
484393,0.011163,0.328889,0.391129,0.479592,0.805461,0.1,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False


### ML Models
We will evaluate the following models:
- Logistic Regression
- Decision Trees
- Neural Networks

#### Logistic Regression
Our **Logistic Regression** model has been adjusted to best fit our dataset. One key
property of our dataset is its sparseness. The majority of its features are one-hot
encoded, leading to sparsity. Therefore, we choose `saga` for the solver since it's
best suited for large, sparse datasets.

In [47]:
# Logistic regression trained with the SAGA solver.
#
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 (it emits a FutureWarning and is scheduled for removal), and
# the default already selects the multinomial formulation for a non-liblinear
# solver when the target has three or more classes.
# NOTE(review): assumes `Severity` still has more than two classes after the
# dropna step; confirm with y_train.nunique().
model = LogisticRegression(
    solver='saga',
    random_state=1234,
    class_weight='balanced',  # weight classes inversely to their frequency
    max_iter=250,
    n_jobs=3
)
model.fit(X_train, y_train)




In [49]:
# Validation accuracy of the logistic regression model
# (the same quantity a classifier's `.score` reports).
accuracy_score(y_val, model.predict(X_val))

0.5112336953398839

#### Decision Tree
Next is a **Decision Tree** classifier.

In [50]:
# Decision tree with library-default growth settings; classes are reweighted
# inversely to their frequency and the seed is fixed for repeatable splits.
tree = DecisionTreeClassifier(random_state=1234, class_weight='balanced')
tree.fit(X_train, y_train)

In [51]:
# Validation accuracy of the decision tree
# (the same quantity a classifier's `.score` reports).
accuracy_score(y_val, tree.predict(X_val))

0.7919180601045797

#### Neural Network

In [52]:
# Multi-layer perceptron configuration (not fitted in this cell).
mlp = MLPClassifier(
    random_state=1234,
    hidden_layer_sizes=(50, 50),  # two hidden layers of 50 units each
    alpha=0.01,                   # L2 regularisation strength
    early_stopping=True,          # stop when the internal validation score stalls
)