<div style='background-color:orange'>
<a id='TableOfContents'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Table of Contents
    </h1></i></u></b>
    <li><a href='#imports'>Imports</a>
    <li><a href='#initial'>Initial Setup</a>
    <li><a href='#model'>Modeling</a>
    <li><a href='#bestmodel'>Best Model(s)</a>
    <li><a href='#misc'>Miscellaneous</a>
    </li>
</div>

<div style='background-color:orange'>
<a id='imports'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Imports
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [1]:
# Vectorization & Dataframe
import numpy as np
import pandas as pd

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Encoder
from sklearn.preprocessing import OrdinalEncoder

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Model metric evaluation
from sklearn.metrics import accuracy_score

# .py files
import wrangle as w

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

<div style='background-color:orange'>
<a id='initial'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Initial Setup
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [2]:
# Load up master dataset
master = w.wrangle()
master.shape

(14174, 132)

In [3]:
# Feature selection for modeling
features = [
    'city',
    'contributing_factors',
    'county',
    'physical_feature_1',
    'physical_feature_2',
    'population_group',
    'driver_license_class',
    'driver_license_endorsements',
    'driver_license_restrictions',
    'driver_license_state',
    'driver_license_type'
]

In [4]:
# Encode feature selected columns
encoder = OrdinalEncoder()
for col in features:
    encoded_col = encoder.fit_transform(master[[col]])
    master[col] = encoded_col

In [5]:
# Test???
risk_list, risk_dict = w.risk_scores_iterate_columns(master, features)
master['agg_injury_risk'] = risk_list
features = ['agg_injury_risk']

In [6]:
# Split the data
train, validate, test = w.split(master, stratify='person_injury_severity')
train.shape, validate.shape, test.shape

((9924, 133), (2832, 133), (1418, 133))

In [7]:
# Create x/y for each split dataset
train_x = train[features]
train_y = train.person_injury_severity
validate_x = validate[features]
validate_y = validate.person_injury_severity
test_x = test[features]
test_y = test.person_injury_severity

<div style='background-color:orange'>
<a id='model'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Models
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [8]:
# In preparation of GridSearchCV, create model dictionary
models = {
    'Decision Tree' : (DecisionTreeClassifier(), {'max_depth' : [None] + list(range(3, 15)),
                                                  'min_samples_split' : list(range(2, 4)),
                                                  'min_samples_leaf' : list(range(1, 3)),
                                                  'random_state' : [1776]}),
    'Random Forest' : (RandomForestClassifier(), {'n_estimators' : [100, 200, 300],
                                                  'max_depth' : [None] + list(range(3, 15)),
                                                  'min_samples_split' : list(range(2, 4)),
                                                  'min_samples_leaf' : list(range(1, 3)),
                                                  'random_state' : [1776]}),
    'KNN' : (KNeighborsClassifier(), {'n_neighbors' : [5, 10, 50],
                                      'weights' : ['uniform', 'distance'],
                                      'algorithm' : ['ball_tree', 'kd_tree', 'brute', 'auto']}),
    'Logistic Regression' : (LogisticRegression(), {'C' : [0.1, 1, 10],
                                                    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                                                    'random_state' : [1776]})
}

In [10]:
model_results = []
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)
    grid_search.fit(train_x, train_y)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    train_accuracy = grid_search.best_score_
    validate_accuracy = best_model.score(validate_x, validate_y)
    model_results.append({
        'Model': model_name,
        'Best Estimator': best_model,
        'Best Parameters': best_params,
        'Train Accuracy': train_accuracy,
        'Validate Accuracy': validate_accuracy
    })
df_model_results = pd.DataFrame(model_results)

In [12]:
df_model_results.to_dict()

{'Model': {0: 'Decision Tree',
  1: 'Random Forest',
  2: 'KNN',
  3: 'Logistic Regression'},
 'Best Estimator': {0: DecisionTreeClassifier(max_depth=3, random_state=1776),
  1: RandomForestClassifier(max_depth=3, random_state=1776),
  2: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=50),
  3: LogisticRegression(C=10, random_state=1776, solver='sag')},
 'Best Parameters': {0: {'max_depth': 3,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'random_state': 1776},
  1: {'max_depth': 3,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 100,
   'random_state': 1776},
  2: {'algorithm': 'ball_tree', 'n_neighbors': 50, 'weights': 'uniform'},
  3: {'C': 10, 'random_state': 1776, 'solver': 'sag'}},
 'Train Accuracy': {0: 0.6117491569838304,
  1: 0.6123532847160152,
  2: 0.6088263792963353,
  3: 0.6039898533355001}}

In [13]:
testthingy = {'Model': {0: 'Decision Tree',
  1: 'Random Forest',
  2: 'KNN',
  3: 'Logistic Regression'},
 'Best Estimator': {0: DecisionTreeClassifier(max_depth=3, random_state=1776),
  1: RandomForestClassifier(max_depth=3, random_state=1776),
  2: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=50),
  3: LogisticRegression(C=10, random_state=1776, solver='sag')},
 'Best Parameters': {0: {'max_depth': 3,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'random_state': 1776},
  1: {'max_depth': 3,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'n_estimators': 100,
   'random_state': 1776},
  2: {'algorithm': 'ball_tree', 'n_neighbors': 50, 'weights': 'uniform'},
  3: {'C': 10, 'random_state': 1776, 'solver': 'sag'}},
 'Train Accuracy': {0: 0.6117491569838304,
  1: 0.6123532847160152,
  2: 0.6088263792963353,
  3: 0.6039898533355001}}

In [14]:
pd.DataFrame(testthingy)

Unnamed: 0,Model,Best Estimator,Best Parameters,Train Accuracy
0,Decision Tree,"DecisionTreeClassifier(max_depth=3, random_sta...","{'max_depth': 3, 'min_samples_leaf': 1, 'min_s...",0.611749
1,Random Forest,"RandomForestClassifier(max_depth=3, random_sta...","{'max_depth': 3, 'min_samples_leaf': 1, 'min_s...",0.612353
2,KNN,"KNeighborsClassifier(algorithm='ball_tree', n_...","{'algorithm': 'ball_tree', 'n_neighbors': 50, ...",0.608826
3,Logistic Regression,"LogisticRegression(C=10, random_state=1776, so...","{'C': 10, 'random_state': 1776, 'solver': 'sag'}",0.60399


<div style='background-color:orange'>
<a id='bestmodel'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Best Model(s)
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

<div style='background-color:orange'>
<a id='misc'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Miscellaneous
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>