In [50]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score

In [51]:
# Load the raw accident sample; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='AccidentSample.csv')

In [52]:
# Keep only the rows in which every one of the five *_ID columns below holds a code
# smaller than 7. Rows with a missing code are dropped too, because a comparison
# against NaN evaluates to False.
# NOTE(review): presumably codes >= 7 mean "other/unknown" -- confirm against the data dictionary.
df = df[
    df[['ROADTYPE_ID', 'ROADSKIN_ID', 'ACDPOINT_ID', 'ATMOSPHERE_ID', 'LIGHT_ID']]
    .lt(7)
    .all(axis=1)
]
df.head()

Unnamed: 0,ACCIDENT_ID,ACCIDENT_MONTH,ACCIDENT_YEAR,ROADTYPE_ID,ROADSKIN_ID,ACDPOINT_ID,ATMOSPHERE_ID,LIGHT_ID,PERSON_AGE,PERSON_STATE
2,3,8,14,4,1,1,1,3,60,0
3,4,11,16,3,1,1,1,1,68,0
5,6,1,17,4,1,1,1,1,72,0
6,7,1,17,4,3,3,1,1,19,0
7,8,10,16,4,1,1,1,2,20,1


In [53]:
# Restrict the sample to rows whose ACCIDENT_YEAR value is below 16.
# NOTE(review): the year looks like a two-digit code -- confirm which calendar it uses.
df = df.loc[df['ACCIDENT_YEAR'].lt(16)]
df.head()

Unnamed: 0,ACCIDENT_ID,ACCIDENT_MONTH,ACCIDENT_YEAR,ROADTYPE_ID,ROADSKIN_ID,ACDPOINT_ID,ATMOSPHERE_ID,LIGHT_ID,PERSON_AGE,PERSON_STATE
2,3,8,14,4,1,1,1,3,60,0
59,60,7,13,1,1,1,1,1,74,1
60,61,7,13,1,1,1,1,1,78,1
62,63,10,12,1,1,1,1,3,46,1
63,64,10,12,1,1,1,1,1,44,1


In [54]:
# Keep plausible ages only (0 < age < 100). The explicit .copy() detaches the result
# from the frame it was sliced from, so the assignment below cannot raise
# SettingWithCopyWarning or write into a temporary object.
df = df[(df['PERSON_AGE'] > 0) & (df['PERSON_AGE'] < 100)].copy()

# Recode age into four ordinal groups in a single pass over the raw ages:
#   1: (0, 14]   2: (14, 24]   3: (24, 64]   4: (64, 100)
# pd.cut evaluates every row against its original value, unlike a chain of in-place
# .loc overwrites whose result depends on statement order. Fractional ages such as
# 14.5 also land in a group instead of being left unrecoded.
# NOTE: PERSON_AGE is overwritten, so re-running this cell without reloading the CSV
# would collapse every group into 1.
df['PERSON_AGE'] = pd.cut(
    df['PERSON_AGE'],
    bins=[0, 14, 24, 64, 100],
    labels=[1, 2, 3, 4],
).astype('int64')
df.head()

Unnamed: 0,ACCIDENT_ID,ACCIDENT_MONTH,ACCIDENT_YEAR,ROADTYPE_ID,ROADSKIN_ID,ACDPOINT_ID,ATMOSPHERE_ID,LIGHT_ID,PERSON_AGE,PERSON_STATE
2,3,8,14,4,1,1,1,3,3,0
59,60,7,13,1,1,1,1,1,4,1
60,61,7,13,1,1,1,1,1,4,1
62,63,10,12,1,1,1,1,3,3,1
63,64,10,12,1,1,1,1,1,3,1


In [55]:
# Persist the filtered, age-binned frame; the row index is written as the first, unnamed column.
df.to_csv(path_or_buf='filtered_sample.csv')

In [56]:
# Class-balance check: number of rows with a non-null ACCIDENT_ID for each PERSON_STATE value.
df.groupby(by='PERSON_STATE')['ACCIDENT_ID'].agg('count')

PERSON_STATE
0    108226
1     23978
2       679
Name: ACCIDENT_ID, dtype: int64

In [57]:
# Feature matrix: the six predictor columns, in this fixed order.
data_arr = df.loc[
    :,
    ["ROADTYPE_ID", "ROADSKIN_ID", "ACDPOINT_ID", "ATMOSPHERE_ID", "LIGHT_ID", "PERSON_AGE"],
].to_numpy()
# Target vector: PERSON_STATE, aligned row-for-row with data_arr.
label_arr = df.loc[:, "PERSON_STATE"].to_numpy()

In [58]:
# Balance the classes by random undersampling.
# A seeded Generator makes the selection reproducible: the global np.random state
# used previously was never seeded, so every kernel run drew a different subset
# and the metrics reported below could not be reproduced.
rng = np.random.default_rng(2374)

# Row positions of each PERSON_STATE class (np.where returns a 1-tuple of arrays).
index0 = np.where(label_arr == 0)
index1 = np.where(label_arr == 1)
index2 = np.where(label_arr == 2)
for class_index in (index0, index1, index2):
    rng.shuffle(class_index[0])  # in-place shuffle of the positions array

# Take as many rows per class as the rarest class offers (679 rows for PERSON_STATE 2
# in the counts shown above) so that no class dominates training. Using min() rather
# than assuming class 2 is the smallest keeps the sample balanced if the counts change.
size = min(len(class_index[0]) for class_index in (index0, index1, index2))
index_all = np.concatenate(
    (index0[0][:size], index1[0][:size], index2[0][:size]), axis=None
)
len(data_arr[index_all])

2037

In [59]:
# Hold out 30% of the balanced sample for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_arr[index_all],
    label_arr[index_all],
    test_size=0.3,
    random_state=2374,
)

In [60]:
# Gaussian naive Bayes: fit on the training split, then score the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Confusion matrix\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Optional 6-fold cross-validation on the whole balanced sample (disabled):
#print(cross_val_score(gnb, data_arr[index_all], label_arr[index_all], cv=6))
#print(np.average(cross_val_score(gnb, data_arr[index_all], label_arr[index_all], cv=6)))

Confusion matrix
 [[ 95 103   6]
 [ 45 132  14]
 [ 62 138  17]]
              precision    recall  f1-score   support

           0       0.47      0.47      0.47       204
           1       0.35      0.69      0.47       191
           2       0.46      0.08      0.13       217

    accuracy                           0.40       612
   macro avg       0.43      0.41      0.36       612
weighted avg       0.43      0.40      0.35       612



In [61]:
# Categorical naive Bayes on the integer-coded features.
# By default CategoricalNB sizes each feature's probability table from the codes seen
# during fit, so a code that occurs only in X_test would index past the table in
# predict(). min_categories (scikit-learn >= 0.24) reserves a slot for every code
# present in either split; only the feature value range is used, never the labels.
n_categories = (np.maximum(X_train.max(axis=0), X_test.max(axis=0)) + 1).astype(int)
cnb = CategoricalNB(min_categories=n_categories)
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_test)
result = confusion_matrix(y_test, y_pred)
print("Confusion matrix\n", result)
print(classification_report(y_test, y_pred))
# Optional 6-fold cross-validation on the whole balanced sample (disabled):
#print(cross_val_score(cnb, data_arr[index_all], label_arr[index_all], cv=6))
#print(np.average(cross_val_score(cnb, data_arr[index_all], label_arr[index_all], cv=6)))

Confusion matrix
 [[100  81  23]
 [ 37 123  31]
 [ 65 115  37]]
              precision    recall  f1-score   support

           0       0.50      0.49      0.49       204
           1       0.39      0.64      0.48       191
           2       0.41      0.17      0.24       217

    accuracy                           0.42       612
   macro avg       0.43      0.43      0.41       612
weighted avg       0.43      0.42      0.40       612

