In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [18]:
df = pd.read_csv("../data/data-clean/super-clean-vehicle-2020-2021.csv", index_col = 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23171 entries, 0 to 23170
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   DEATHS              23171 non-null  int64  
 1   DR_DRINK            23171 non-null  int64  
 2   DR_HGT              23171 non-null  int64  
 3   DR_WGT              23171 non-null  int64  
 4   HIT_RUNNAME         23171 non-null  int64  
 5   TRAV_SP             23171 non-null  int64  
 6   HOUR                23171 non-null  int64  
 7   MOD_YEAR            23171 non-null  int64  
 8   MODELNAME           23171 non-null  object 
 9   VSPD_LIM            23171 non-null  int64  
 10  VPICBODYCLASSNAME   23171 non-null  object 
 11  NUMOCCS             23171 non-null  int64  
 12  MAKENAME            23171 non-null  object 
 13  MINUTE              23171 non-null  int64  
 14  TIME                23171 non-null  float64
 15  APRX_WGT            23171 non-null  float64
 16  MOD_

In [20]:
# split to train and test
X = df.drop(columns = ['MODELNAME', 'DEATHOCCURRED', 'VPICBODYCLASSNAME',
                      'MAKENAME'])
y = df.VPICBODYCLASSNAME
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform (X_test)


In [21]:
# Comparing SVM and linear SVC (squared hinge loss) to classify body type
SVC = svm.SVC(kernel="linear", C=1)
SVC.fit(X_train, y_train)
y_score_svc = SVC.decision_function(X_test)
LSVC = svm.LinearSVC(C=1, max_iter=1000, dual=False)
LSVC.fit(X_train, y_train)
y_score = LSVC.decision_function(X_test)

In [22]:
svc_y_pred = SVC.predict(X_test)
lsvc_y_pred = LSVC.predict(X_test)

In [25]:
svc_recall = recall_score(y_test, svc_y_pred, average = 'weighted')
lsvc_recall = recall_score(y_test, lsvc_y_pred, average = 'weighted')

In [27]:
svc_f1 = f1_score(y_test, svc_y_pred, average='weighted')
lsvc_f1 = f1_score(y_test, lsvc_y_pred, average='weighted')

In [30]:
svc_percision = precision_score(y_test, svc_y_pred, average='weighted')
lsvc_percision = precision_score(y_test, lsvc_y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
svc_accuracy = accuracy_score(y_test, svc_y_pred, normalize=False)
lsvc_accuracy = accuracy_score(y_test, lsvc_y_pred, normalize=False)

#### Reducing dimensionality by eliminating makename as a feature and reapplying

In [32]:
# reapplying the same two models except leaving out make name
# reduces features from 55 to 11
df_red =pd.read_csv('../data/data-clean/super-clean-vehicle-2020-2021.csv', index_col = 0)
df_red.head()

Unnamed: 0,DEATHS,DR_DRINK,DR_HGT,DR_WGT,HIT_RUNNAME,TRAV_SP,HOUR,MOD_YEAR,MODELNAME,VSPD_LIM,VPICBODYCLASSNAME,NUMOCCS,MAKENAME,MINUTE,TIME,APRX_WGT,MOD_YEAR_5YR_CHUNK,RELATIVE_SPEED,BMI,DEATHOCCURRED
0,3,1,67,145,0,80,2,1997,ES-250/300/300h/330/ 350,45,Sedan/Saloon,4,Lexus,58,2.966667,3742.851635,1995,35,22.70773,1
1,1,0,67,215,0,70,14,1995,Camry,65,Sedan/Saloon,1,Toyota,55,14.916667,3683.158842,1995,5,33.670082,1
2,0,0,68,228,0,10,14,2016,Medium/Heavy - CBE,55,Truck-Tractor,1,Peterbilt,55,14.916667,21481.49057,2015,-45,34.663495,0
3,1,0,73,133,0,80,15,2007,"Azera (For 2018 on, code as vehicle model 398)",35,Sedan/Saloon,5,Hyundai,20,15.333333,3230.411839,2005,45,17.545318,1
4,1,0,64,115,0,70,0,2013,Suburban/Yukon XL (2004 on; see 431 for 1950- ...,45,Sport Utility Vehicle (SUV)/Multi-Purpose Vehi...,1,GMC,45,0.75,4824.798634,2015,25,19.737549,1


In [33]:
# split to train and test
X_red = df_red.drop(columns = ['MODELNAME', 'DEATHOCCURRED', 'VPICBODYCLASSNAME', 'MAKENAME'])
y_red = df_red.VPICBODYCLASSNAME
X_red_train, X_red_test, y_red_train, y_red_test = train_test_split(X_red, y_red, test_size=0.2, random_state=42)

sc = StandardScaler()
X_red_train = sc.fit_transform(X_red_train)
X_red_test = sc.transform (X_red_test)

In [34]:
# Comparing SVM and linear SVC (squared hinge loss) to classify body type
SVC_red = svm.SVC(kernel="linear", C=1)
SVC_red.fit(X_red_train, y_red_train)
y_score_svc_red = SVC_red.decision_function(X_red_test)

LSVC_red = svm.LinearSVC(C=1, max_iter=1000, dual="auto")
LSVC_red.fit(X_red_train, y_red_train)
y_score_lsvc_red = LSVC_red.decision_function(X_red_test)

InvalidParameterError: The 'dual' parameter of LinearSVC must be an instance of 'bool', an instance of 'numpy.bool_' or an instance of 'int'. Got 'auto' instead.

In [None]:
svc_y_pred_red = SVC_red.predict(X_red_test)
lsvc_y_pred_red = LSVC_red.predict(X_red_test)

In [None]:
#recall score for reduced model
svc_recall_red = recall_score(y_red_test, svc_y_pred_red, average = 'weighted')
lsvc_recall_red = recall_score(y_red_test, lsvc_y_pred_red, average = 'weighted')

In [66]:
# f1 score for reduced model
svc_f1_red = f1_score(y_red_test, svc_y_pred_red, average='weighted')
lsvc_f1_red = f1_score(y_red_test, lsvc_y_pred_red, average='weighted')

In [56]:
# accuracy score for reduced model
svc_accuracy_red = accuracy_score(y_red_test, svc_y_pred_red, normalize=False)
lsvc_accuracy_red = accuracy_score(y_red_test, lsvc_y_pred_red, normalize=False)

In [65]:
svc_percision_red = precision_score(y_red_test, svc_y_pred_red, average='weighted')
lsvc_percision_red = precision_score(y_red_test, lsvc_y_pred_red, average = 'weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [73]:
data = [
    ['SVC', svc_accuracy, svc_percision, svc_recall, svc_f1], 
    ['Linear SVC', lsvc_accuracy, lsvc_percision, lsvc_recall, lsvc_f1],
    ['Reduced SVC', svc_accuracy_red, svc_percision_red, svc_recall_red, svc_f1_red],
    ['Reduced Linear SVC', lsvc_accuracy_red, lsvc_percision_red, lsvc_recall_red, lsvc_f1_red]
]
model_comparison = pd.DataFrame(
    data,
    columns = ['Model Type', 'Accuracy Score', 'Precision Score', 'Recall Score', 'F1 Score']
)
model_comparison

Unnamed: 0,Model Type,Accuracy Score,Precision Score,Recall Score,F1 Score
0,SVC,2533.0,0.558209,0.579501,0.538466
1,Linear SVC,2538.0,0.569897,0.580645,0.536464
2,Reduced SVC,1541.0,0.230243,0.352551,0.247562
3,Reduced Linear SVC,1578.0,0.321388,0.361016,0.290559
