<div style='background-color:orange'>
<a id='TableOfContents'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Table of Contents
    </h1></i></u></b>
    <ul>
    <li><a href='#imports'>Imports</a></li>
    <li><a href='#initial'>Initial Setup</a></li>
    <li><a href='#model'>Modeling</a></li>
    <li><a href='#bestmodel'>Best Model(s)</a></li>
    <li><a href='#misc'>Miscellaneous</a></li>
    </ul>
</div>

<div style='background-color:orange'>
<a id='imports'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Imports
    </h1></i></u></b>
    <ul><li><a href='#TableOfContents'>Table of Contents</a>
    </li></ul>
</div>

In [10]:
# Vectorization & Dataframe
import numpy as np
import pandas as pd

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Encoder
from sklearn.preprocessing import OrdinalEncoder

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Model metric evaluation
from sklearn.metrics import accuracy_score

# .py files
import wrangle as w

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

<div style='background-color:orange'>
<a id='initial'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Initial Setup
    </h1></i></u></b>
    <ul><li><a href='#TableOfContents'>Table of Contents</a>
    </li></ul>
</div>

In [11]:
# Load up master dataset
# w.wrangle() lives in the project-local `wrangle` module (imported as `w`).
# NOTE(review): presumably it acquires and cleans the raw data - confirm in wrangle.py.
master = w.wrangle()
# Last expression of the cell: (rows, columns) of the loaded frame.
master.shape

(14174, 132)

In [12]:
# Feature selection for modeling.
# Column order is significant: it becomes the column order of the x frames.
# The non-license columns come first, followed by the driver_license_* group.
features = [
    'city', 'contributing_factors', 'county',
    'physical_feature_1', 'physical_feature_2',
    'population_group',
    *[f'driver_license_{attr}'
      for attr in ('class', 'endorsements', 'restrictions', 'state', 'type')],
]

In [13]:
# Encode feature selected columns.
# A single OrdinalEncoder is fitted on all selected columns in one call.
# Each column is still encoded independently (same codes as fitting column by
# column), but the fitted encoder now keeps the category list of every column
# in `encoder.categories_`, so the codes can be mapped back with
# `encoder.inverse_transform`. Refitting one encoder per column would leave it
# remembering only the last column.
# NOTE(review): this overwrites the selected columns of `master` in place and
# is fitted before the train/validate/test split - confirm that is intended.
encoder = OrdinalEncoder()
master[features] = encoder.fit_transform(master[features])

In [14]:
# Split the data
# w.split comes from the project-local `wrangle` module; its result is unpacked
# into train / validate / test, and 'person_injury_severity' (the column used
# as the model target below) is passed as the `stratify` argument.
# NOTE(review): split proportions and any seeding live inside w.split - confirm there.
train, validate, test = w.split(master, stratify='person_injury_severity')
# Last expression of the cell: (rows, columns) of each split.
train.shape, validate.shape, test.shape

((9924, 132), (2832, 132), (1418, 132))

In [15]:
# Create x/y for each split dataset:
#   *_x -> the selected feature columns
#   *_y -> the 'person_injury_severity' target column
train_x, train_y = train[features], train['person_injury_severity']
validate_x, validate_y = validate[features], validate['person_injury_severity']
test_x, test_y = test[features], test['person_injury_severity']

<div style='background-color:orange'>
<a id='model'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Modeling
    </h1></i></u></b>
    <ul><li><a href='#TableOfContents'>Table of Contents</a>
    </li></ul>
</div>

In [19]:
# In preparation of GridSearchCV, create model dictionary.
# Maps a display name to (unfitted estimator, parameter grid). Every key in a
# grid must be a constructor parameter of its estimator; GridSearchCV rejects
# unknown parameter names when fitting.
models = {
    'Decision Tree' : (DecisionTreeClassifier(), {'max_depth' : [None] + list(range(3, 20)),
                                                  'min_samples_split' : list(range(2, 5)),
                                                  'min_samples_leaf' : list(range(1, 5)),
                                                  'random_state' : [1776]}),
    'Random Forest' : (RandomForestClassifier(), {'n_estimators' : [100, 200, 300],
                                                  'max_depth' : [None] + list(range(3, 20)),
                                                  'min_samples_split' : list(range(2, 5)),
                                                  'min_samples_leaf' : list(range(1, 5)),
                                                  'random_state' : [1776]}),
    # KNeighborsClassifier has no random_state parameter, so no seed is listed
    # here (including one makes the grid search fail for this model).
    'KNN' : (KNeighborsClassifier(), {'n_neighbors' : [5, 10, 50, 100],
                                      'weights' : ['uniform', 'distance'],
                                      'algorithm' : ['ball_tree', 'kd_tree', 'brute', 'auto']}),
    # NOTE(review): not every solver listed is available in every scikit-learn
    # release - confirm against the installed version.
    'Logistic Regression' : (LogisticRegression(), {'C' : [0.1, 1, 10],
                                                    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                                                    'random_state' : [1776]})
}

In [None]:
# Tune every candidate with 5-fold cross-validated grid search (scored on
# accuracy) and keep each fitted search, keyed by model name, so that all
# candidates can be compared afterwards instead of only the last one.
grid_searches = {}
for model_name, (model, param_grid) in models.items():
    # n_jobs=-1 runs the candidate fits in parallel on all available cores;
    # it does not change which parameters are tried or how they are scored.
    grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(train_x, train_y)
    grid_searches[model_name] = grid_search

# `grid_search` is still bound to the last search fitted, as before.
# Cell output: best cross-validated accuracy and parameters per model.
pd.DataFrame(
    {name: {'best_cv_accuracy': search.best_score_, 'best_params': search.best_params_}
     for name, search in grid_searches.items()}
).T

In [None]:
# Ran at 1545
# Displays the GridSearchCV object currently bound to `grid_search`. The loop
# in the previous cell rebinds that name on every iteration, so this is only
# the search for whichever model it processed last, not a summary of all models.
grid_search

<div style='background-color:orange'>
<a id='bestmodel'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Best Model(s)
    </h1></i></u></b>
    <ul><li><a href='#TableOfContents'>Table of Contents</a>
    </li></ul>
</div>

<div style='background-color:orange'>
<a id='misc'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Miscellaneous
    </h1></i></u></b>
    <ul><li><a href='#TableOfContents'>Table of Contents</a>
    </li></ul>
</div>