<div style='background-color:orange'>
<a id='TableOfContents'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Table of Contents
    </h1></i></u></b>
    <ul>
    <li><a href='#imports'>Imports</a></li>
    <li><a href='#initial'>Initial Setup</a></li>
    <li><a href='#model'>Models</a></li>
    <li><a href='#bestmodel'>Best Model(s)</a></li>
    <li><a href='#misc'>Miscellaneous</a></li>
    </ul>
</div>

<div style='background-color:orange'>
<a id='imports'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Imports
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [1]:
# Vectorization & Dataframe
import numpy as np
import pandas as pd

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Classification Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Encoder
from sklearn.preprocessing import OrdinalEncoder

# Model metric evaluation
from sklearn.metrics import accuracy_score

# .py files
import wrangle as w

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

<div style='background-color:orange'>
<a id='initial'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Initial Setup
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [2]:
# Load the master dataset via the project-local `wrangle` module (imported as `w`).
# NOTE(review): what `w.wrangle()` reads and how it cleans the data is not
# visible in this notebook — see wrangle.py.
master = w.wrangle()
# Show (rows, columns) as the cell output for a quick sanity check.
master.shape

(14174, 132)

In [3]:
# Vehicle-related columns selected as candidate features for modeling.
vehicle_cols = [
    'license_plate_state', 'vehicle_body_style', 'vehicle_color',
    'vehicle_defect_1', 'vehicle_make', 'vehicle_model_name',
    'vehicle_model_year',
]

In [4]:
# Add the aggregated injury-risk column ('agg_injury_risk') to the master dataset.
# `w.risk_scores_iterate_columns` is project-local and returns a pair: the first
# element is stored as the new column; `risk_dict` is not used in the visible cells.
# NOTE(review): this runs on the full dataset BEFORE the train/validate/test split.
# If the risk scores are derived from the target (person_injury_severity), the
# feature leaks validate/test labels into training — TODO confirm in wrangle.py.
risk_list, risk_dict = w.risk_scores_iterate_columns(master, vehicle_cols)
master['agg_injury_risk'] = risk_list

In [5]:
# Append the new column to the feature selection list.
# Guarded so that re-running this cell does not add a duplicate entry, which
# would select the same column twice when the x frames are built.
if 'agg_injury_risk' not in vehicle_cols:
    vehicle_cols.append('agg_injury_risk')

In [6]:
# OPTIONAL: restrict the feature set to the aggregated risk score only.
# It appears that this column alone performs better.
# Running this cell replaces the feature list entirely; skipping it keeps the
# vehicle columns plus 'agg_injury_risk'.
vehicle_cols = ['agg_injury_risk']

In [7]:
# Ordinal-encode the selected feature columns that hold object (string) data.
# All such columns are encoded in ONE fit so that `encoder` retains the category
# mapping for every encoded column. Refitting the same encoder once per column
# would keep only the last column's mapping, making the others unrecoverable.
# NOTE(review): the encoder is fitted on the full dataset before the split.
encoder = OrdinalEncoder()
object_cols = [col for col in vehicle_cols if master[col].dtype == 'object']
if object_cols:  # nothing to encode when every selected feature is numeric
    master[object_cols] = encoder.fit_transform(master[object_cols])

In [8]:
# Split the master dataset into train / validate / test with the project-local
# `w.split` helper, passing the target column name as `stratify`.
# NOTE(review): split proportions and any random seed live inside `w.split`
# (not visible here) — confirm the split is seeded so results are reproducible.
train, validate, test = w.split(master, stratify='person_injury_severity')
# Show the shape of each split as the cell output.
train.shape, validate.shape, test.shape

((9924, 133), (2832, 133), (1418, 133))

In [9]:
# Separate features (x) from the target (y) for each split.
# Features are the columns listed in `vehicle_cols`; the target is the
# 'person_injury_severity' column.
train_x, train_y = train[vehicle_cols], train['person_injury_severity']
validate_x, validate_y = validate[vehicle_cols], validate['person_injury_severity']
test_x, test_y = test[vehicle_cols], test['person_injury_severity']

<div style='background-color:orange'>
<a id='model'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Models
    </h1></i></u></b>
    <ul>
    <li><a href='#TableOfContents'>Table of Contents</a></li>
    <li><a href='#modelcreatedict'>Create Model Dictionary</a></li>
    <li><a href='#modelbaseline'>Baseline</a></li>
    <li><a href='#modeldtc'>DecisionTreeClassifier</a></li>
    <li><a href='#modelrfc'>RandomForestClassifier</a></li>
    <li><a href='#modelknn'>K-Nearest Neighbors</a></li>
    <li><a href='#modellr'>Logistic Regression</a></li>
    </ul>
</div>

<a id='modelcreatedict'></a>
<h3><b><i>
    Create Model Dictionary
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [10]:
# Results accumulator: one (initially empty) list per table column.
# Each evaluated model appends exactly one entry to every list, so the dict
# can be turned into a DataFrame at any point.
models_dict = {
    column: []
    for column in (
        'model_name',
        'model_type',
        'model_params',
        'train_accuracy',
        'validate_accuracy',
        'accuracy_diff',
    )
}

---

<a id='modelbaseline'></a>
<h3><b><i>
    Baseline
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [11]:
# Baseline: always predict the most common injury severity found in train,
# then measure how often that single guess is right on each split.
# The `mode_percent_*` values are fractions in [0, 1], not percentages.
mode_value = train['person_injury_severity'].mode()[0]
mode_total_train = train['person_injury_severity'].eq(mode_value).sum()
mode_total_validate = validate['person_injury_severity'].eq(mode_value).sum()
mode_total_test = test['person_injury_severity'].eq(mode_value).sum()
mode_percent_train = mode_total_train / len(train)
mode_percent_validate = mode_total_validate / len(validate)
mode_percent_test = mode_total_test / len(test)

In [12]:
# Record the baseline as a row of 'models_dict'.
# Accuracies are stored as strings formatted to three decimals.
baseline_row = {
    'model_name': 'baseline',
    'model_type': 'baseline - mode',
    'model_params': 'none',
    'train_accuracy': f'{mode_percent_train:.3f}',
    'validate_accuracy': f'{mode_percent_validate:.3f}',
    'accuracy_diff': f'{mode_percent_validate - mode_percent_train:.3f}',
}
for column, entry in baseline_row.items():
    models_dict[column].append(entry)

In [13]:
# Check out the 'models_dict' as a table.
# When the cells are run in order, it holds only the baseline row at this point.
pd.DataFrame(models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,accuracy_diff
0,baseline,baseline - mode,none,0.575,0.575,0.0


---

<a id='modeldtc'></a>
<h3><b><i>
    DecisionTreeClassifier
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [14]:
# Fit DecisionTreeClassifier models with max_depth 1 through 10 and record
# train / validate accuracy for each in 'models_dict'.
for i in range(1, 11):
    dtc = DecisionTreeClassifier(max_depth=i, random_state=1776).fit(train_x, train_y)
    train_score = dtc.score(train_x, train_y)
    val_score = dtc.score(validate_x, validate_y)
    # Negative difference => the model scores lower on validate than on train.
    diff_score = val_score - train_score
    model_name = f'dtc{i}'
    results_row = {
        'model_name': model_name,
        'model_type': 'Decision Tree Classifier',
        'model_params': f'max_depth={i}, random_state=1776',
        'train_accuracy': f'{train_score:.3f}',
        'validate_accuracy': f'{val_score:.3f}',
        'accuracy_diff': f'{diff_score:.3f}',
    }
    for column, entry in results_row.items():
        models_dict[column].append(entry)

In [15]:
# Check out the 'models_dict' as a table.
# When the cells are run in order, it now includes the decision tree rows.
pd.DataFrame(models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,accuracy_diff
0,baseline,baseline - mode,none,0.575,0.575,0.0
1,dtc1,Decision Tree Classifier,"max_depth=1, random_state=1776",0.575,0.575,0.0
2,dtc2,Decision Tree Classifier,"max_depth=2, random_state=1776",0.586,0.585,-0.001
3,dtc3,Decision Tree Classifier,"max_depth=3, random_state=1776",0.588,0.584,-0.003
4,dtc4,Decision Tree Classifier,"max_depth=4, random_state=1776",0.588,0.584,-0.004
5,dtc5,Decision Tree Classifier,"max_depth=5, random_state=1776",0.59,0.579,-0.011
6,dtc6,Decision Tree Classifier,"max_depth=6, random_state=1776",0.592,0.582,-0.01
7,dtc7,Decision Tree Classifier,"max_depth=7, random_state=1776",0.597,0.579,-0.018
8,dtc8,Decision Tree Classifier,"max_depth=8, random_state=1776",0.6,0.576,-0.025
9,dtc9,Decision Tree Classifier,"max_depth=9, random_state=1776",0.605,0.574,-0.031


---

<a id='modelrfc'></a>
<h3><b><i>
    RandomForestClassifier
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [16]:
# Fit RandomForestClassifier models with max_depth 1 through 10 and record
# train / validate accuracy for each in 'models_dict'.
for i in range(1, 11):
    rfc = RandomForestClassifier(max_depth=i, random_state=1776).fit(train_x, train_y)
    train_score = rfc.score(train_x, train_y)
    val_score = rfc.score(validate_x, validate_y)
    # Negative difference => the model scores lower on validate than on train.
    diff_score = val_score - train_score
    model_name = f'rfc{i}'
    results_row = {
        'model_name': model_name,
        'model_type': 'Random Forest Classifier',
        'model_params': f'max_depth={i}, random_state=1776',
        'train_accuracy': f'{train_score:.3f}',
        'validate_accuracy': f'{val_score:.3f}',
        'accuracy_diff': f'{diff_score:.3f}',
    }
    for column, entry in results_row.items():
        models_dict[column].append(entry)

In [17]:
# Check out the 'models_dict' as a table.
# When the cells are run in order, it now includes the random forest rows.
pd.DataFrame(models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,accuracy_diff
0,baseline,baseline - mode,none,0.575,0.575,0.0
1,dtc1,Decision Tree Classifier,"max_depth=1, random_state=1776",0.575,0.575,0.0
2,dtc2,Decision Tree Classifier,"max_depth=2, random_state=1776",0.586,0.585,-0.001
3,dtc3,Decision Tree Classifier,"max_depth=3, random_state=1776",0.588,0.584,-0.003
4,dtc4,Decision Tree Classifier,"max_depth=4, random_state=1776",0.588,0.584,-0.004
5,dtc5,Decision Tree Classifier,"max_depth=5, random_state=1776",0.59,0.579,-0.011
6,dtc6,Decision Tree Classifier,"max_depth=6, random_state=1776",0.592,0.582,-0.01
7,dtc7,Decision Tree Classifier,"max_depth=7, random_state=1776",0.597,0.579,-0.018
8,dtc8,Decision Tree Classifier,"max_depth=8, random_state=1776",0.6,0.576,-0.025
9,dtc9,Decision Tree Classifier,"max_depth=9, random_state=1776",0.605,0.574,-0.031


---

<a id='modelknn'></a>
<h3><b><i>
    K-Nearest Neighbors
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [18]:
# Fit KNeighborsClassifier models with n_neighbors 11 through 20 and record
# train / validate accuracy for each in 'models_dict'.
for i in range(11, 21):
    knn = KNeighborsClassifier(n_neighbors=i).fit(train_x, train_y)
    train_score = knn.score(train_x, train_y)
    val_score = knn.score(validate_x, validate_y)
    # Negative difference => the model scores lower on validate than on train.
    diff_score = val_score - train_score
    model_name = f'knn{i}'
    results_row = {
        'model_name': model_name,
        'model_type': 'K-Nearest Neighbors',
        'model_params': f'n_neighbors={i}',
        'train_accuracy': f'{train_score:.3f}',
        'validate_accuracy': f'{val_score:.3f}',
        'accuracy_diff': f'{diff_score:.3f}',
    }
    for column, entry in results_row.items():
        models_dict[column].append(entry)

In [19]:
# Check out the 'models_dict' as a table.
# When the cells are run in order, it now includes the K-Nearest Neighbors rows.
pd.DataFrame(models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,accuracy_diff
0,baseline,baseline - mode,none,0.575,0.575,0.0
1,dtc1,Decision Tree Classifier,"max_depth=1, random_state=1776",0.575,0.575,0.0
2,dtc2,Decision Tree Classifier,"max_depth=2, random_state=1776",0.586,0.585,-0.001
3,dtc3,Decision Tree Classifier,"max_depth=3, random_state=1776",0.588,0.584,-0.003
4,dtc4,Decision Tree Classifier,"max_depth=4, random_state=1776",0.588,0.584,-0.004
5,dtc5,Decision Tree Classifier,"max_depth=5, random_state=1776",0.59,0.579,-0.011
6,dtc6,Decision Tree Classifier,"max_depth=6, random_state=1776",0.592,0.582,-0.01
7,dtc7,Decision Tree Classifier,"max_depth=7, random_state=1776",0.597,0.579,-0.018
8,dtc8,Decision Tree Classifier,"max_depth=8, random_state=1776",0.6,0.576,-0.025
9,dtc9,Decision Tree Classifier,"max_depth=9, random_state=1776",0.605,0.574,-0.031


---

<a id='modellr'></a>
<h3><b><i>
    Logistic Regression
</i></b></h3>
<li><a href='#model'>Models Top</a></li>

In [20]:
# Fit LogisticRegression models (liblinear solver) with intercept_scaling
# 1 through 10 and record train / validate accuracy for each in 'models_dict'.
for i in range(1, 11):
    model_name = f'lr{i}'
    lr = LogisticRegression(solver='liblinear', intercept_scaling=i)
    lr.fit(train_x, train_y)
    train_score = lr.score(train_x, train_y)
    val_score = lr.score(validate_x, validate_y)
    # Negative difference => the model scores lower on validate than on train.
    diff_score = val_score - train_score
    models_dict['model_name'].append(model_name)
    models_dict['model_type'].append('Logistic Regression')
    # The parameter label needs the '=' between name and value; without it the
    # table shows e.g. "intercept_scaling3", inconsistent with the other models.
    models_dict['model_params'].append(f"solver='liblinear', intercept_scaling={i}")
    models_dict['train_accuracy'].append(f'{train_score:.3f}')
    models_dict['validate_accuracy'].append(f'{val_score:.3f}')
    models_dict['accuracy_diff'].append(f'{diff_score:.3f}')

In [21]:
# Check out the 'models_dict' as a table.
# When the cells are run in order, it now includes the logistic regression rows.
pd.DataFrame(models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,accuracy_diff
0,baseline,baseline - mode,none,0.575,0.575,0.0
1,dtc1,Decision Tree Classifier,"max_depth=1, random_state=1776",0.575,0.575,0.0
2,dtc2,Decision Tree Classifier,"max_depth=2, random_state=1776",0.586,0.585,-0.001
3,dtc3,Decision Tree Classifier,"max_depth=3, random_state=1776",0.588,0.584,-0.003
4,dtc4,Decision Tree Classifier,"max_depth=4, random_state=1776",0.588,0.584,-0.004
5,dtc5,Decision Tree Classifier,"max_depth=5, random_state=1776",0.59,0.579,-0.011
6,dtc6,Decision Tree Classifier,"max_depth=6, random_state=1776",0.592,0.582,-0.01
7,dtc7,Decision Tree Classifier,"max_depth=7, random_state=1776",0.597,0.579,-0.018
8,dtc8,Decision Tree Classifier,"max_depth=8, random_state=1776",0.6,0.576,-0.025
9,dtc9,Decision Tree Classifier,"max_depth=9, random_state=1776",0.605,0.574,-0.031


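`dtc2` (DecisionTreeClassifier, `max_depth=2`) appears to perform the best: it has the highest validate accuracy shown in the table above (0.585 vs. the 0.575 baseline) with almost no gap to its train accuracy (0.586).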

<div style='background-color:orange'>
<a id='bestmodel'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Best Model(s)
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>

In [22]:
# Start the best-models table with the baseline row.
# Unlike 'models_dict', this table carries a 'test_accuracy' column and has no
# 'accuracy_diff' column. Accuracies are strings formatted to three decimals.
best_models_dict = dict(
    model_name=['baseline'],
    model_type=['baseline - mode'],
    model_params=['none'],
    train_accuracy=[format(mode_percent_train, '.3f')],
    validate_accuracy=[format(mode_percent_validate, '.3f')],
    test_accuracy=[format(mode_percent_test, '.3f')],
)

In [23]:
# Recreate dtc2 (DecisionTreeClassifier, max_depth=2) and evaluate it on the
# held-out test split in addition to train and validate.
dtc = DecisionTreeClassifier(max_depth=2, random_state=1776)
dtc.fit(train_x, train_y)
train_score = dtc.score(train_x, train_y)
val_score = dtc.score(validate_x, validate_y)
test_score = dtc.score(test_x, test_y)
best_models_dict['model_name'].append('dtc2')
best_models_dict['model_type'].append('Decision Tree Classifier')
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
best_models_dict['model_params'].append('max_depth=2, random_state=1776')
best_models_dict['train_accuracy'].append(f'{train_score:.3f}')
best_models_dict['validate_accuracy'].append(f'{val_score:.3f}')
best_models_dict['test_accuracy'].append(f'{test_score:.3f}')

In [24]:
# Check out the 'best_models_dict' as a table (baseline vs. the selected model)
pd.DataFrame(best_models_dict)

Unnamed: 0,model_name,model_type,model_params,train_accuracy,validate_accuracy,test_accuracy
0,baseline,baseline - mode,none,0.575,0.575,0.575
1,dtc2,Decision Tree Classifier,"max_depth=2, random_state=1776",0.586,0.585,0.587


<div style='background-color:orange'>
<a id='misc'></a>
    <b><u><i><h1 style='text-align:center ; padding-top:5px'>
        Miscellaneous
    </h1></i></u></b>
    <li><a href='#TableOfContents'>Table of Contents</a>
    </li>
</div>