In [2]:
# Standard Headers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

In [3]:
#read the data
data = pd.read_csv('vermont.csv', header='infer')
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(283285, 23)

In [4]:
#all the missing values for each column
pd.isnull(data).sum()

id                            0
state                         0
stop_date                     0
stop_time                     0
location_raw                694
county_name                 705
county_fips                 705
fine_grained_location       347
police_department             0
driver_gender              1712
driver_age_raw             1171
driver_age                 1286
driver_race_raw            3984
driver_race                4817
violation_raw              2178
violation                  2178
search_conducted              0
search_type_raw            2240
search_type              279866
contraband_found             34
stop_outcome               2325
is_arrested                   0
officer_id                   12
dtype: int64

In [5]:
# Location is needed to build an accurate model so all missing data will be dropped
data = data.dropna(subset = ['county_name'])

# Drivers gender is also needed so will drop records without gender
data = data.dropna(subset = ['driver_gender'])

# Drivers race is also needed so will drop records without race
data = data.dropna(subset = ['driver_race'])

# Violation is also needed so will drop records without violation
data = data.dropna(subset = ['violation'])

# Search Type Raw, Contraband Found, Stop Outcome and Officer ID only include 500 records for missing values so we will drop these fields too
data = data.dropna(subset = ['search_type_raw'])
data = data.dropna(subset = ['contraband_found'])
data = data.dropna(subset = ['stop_outcome'])
data = data.dropna(subset = ['officer_id'])

# Drivers age is important but to avoid dropping further data we will fill this in with average age
data['driver_age']=data['driver_age'].astype(float) #convert data from strings to floats
mean = data['driver_age'].mean()
print("mean=", mean)
data['driver_age'].fillna(mean, inplace=True)

pd.isnull(data).sum()

mean= 38.805525581327295


id                            0
state                         0
stop_date                     0
stop_time                     0
location_raw                  0
county_name                   0
county_fips                   0
fine_grained_location       202
police_department             0
driver_gender                 0
driver_age_raw              767
driver_age                    0
driver_race_raw               0
driver_race                   0
violation_raw                 0
violation                     0
search_conducted              0
search_type_raw               0
search_type              270902
contraband_found              0
stop_outcome                  0
is_arrested                   0
officer_id                    0
dtype: int64

In [7]:
data.shape


(274201, 23)

In [8]:
features_X = data.drop(['id','state','stop_date','stop_time','fine_grained_location','driver_age_raw','violation_raw','search_type','officer_id','county_name','driver_race','is_arrested'], axis = 1)
features_X.corr()

classLabel_Y = data['is_arrested']
features_X.head()

Unnamed: 0,location_raw,county_fips,police_department,driver_gender,driver_age,driver_race_raw,violation,search_conducted,search_type_raw,contraband_found,stop_outcome
0,East Montpelier,50023.0,MIDDLESEX VSP,M,22.0,White,Moving violation,False,No Search Conducted,False,Citation
3,Whiting,50001.0,NEW HAVEN VSP,F,18.0,White,Moving violation,False,No Search Conducted,False,Arrest for Violation
4,Hardwick,50005.0,ROYALTON VSP,M,18.0,White,Moving violation,False,No Search Conducted,False,Written Warning
5,Hardwick,50005.0,ROYALTON VSP,F,20.0,White,Equipment,False,No Search Conducted,False,Written Warning
8,Rochester,50027.0,ROCKINGHAM VSP,M,24.0,Black,Moving violation,False,No Search Conducted,False,Written Warning


In [9]:
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

# Converting all labels to numbers
features_X['location_raw']= label_encoder.fit_transform(features_X['location_raw']) 
features_X['county_fips']= label_encoder.fit_transform(features_X['county_fips'])
features_X['police_department']= label_encoder.fit_transform(features_X['police_department'])
features_X['driver_gender']= label_encoder.fit_transform(features_X['driver_gender'])
features_X['driver_race_raw']= label_encoder.fit_transform(features_X['driver_race_raw'])
features_X['violation']= label_encoder.fit_transform(features_X['violation'])
features_X['search_type_raw']= label_encoder.fit_transform(features_X['search_type_raw'])
features_X['contraband_found']= label_encoder.fit_transform(features_X['contraband_found'])
features_X['stop_outcome']= label_encoder.fit_transform(features_X['stop_outcome'])
#features_X['is_arrested']= label_encoder.fit_transform(features_X['is_arrested'])



In [10]:
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
#number of true/false in class label before balancing data
counter = Counter(classLabel_Y)
print(counter)
#the data needs to be balanced
#oversampeling the minority class with smote
oversample = SMOTE(sampling_strategy=0.0125)
data_X, data_Y = oversample.fit_resample(features_X, classLabel_Y)
#undersampeling the majority class
under = RandomUnderSampler(sampling_strategy=0.9)
data_X, data_Y = under.fit_resample(data_X, data_Y)
#number of true/false in class label afrer balancing data
counter = Counter(data_Y)
print(counter)

data_X = pd.DataFrame(data_X)

Counter({False: 270911, True: 3290})
Counter({False: 3762, True: 3386})


In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
#classifing with naive bayes
clf = GaussianNB()
scores = cross_val_score(clf, data_X, data_Y, cv=10) 
predicts = cross_val_predict(clf, data_X, data_Y, cv=10) 

print("Accuracy:", scores.mean()*100)
print(classification_report(data_Y, predicts))

Accuracy: 69.45836220959455
              precision    recall  f1-score   support

       False       0.63      0.99      0.77      3762
        True       0.97      0.37      0.53      3386

    accuracy                           0.69      7148
   macro avg       0.80      0.68      0.65      7148
weighted avg       0.79      0.69      0.66      7148



In [14]:
from sklearn.neighbors import KNeighborsClassifier

#for knn we need to scale data and use pca reduction
scaler = sk.preprocessing.MinMaxScaler()
pca = PCA()
knn = KNeighborsClassifier()

pipe = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('knn', knn)])

param_grid = {
    'knn__n_neighbors': list(range(2, 30)),  
    'pca__n_components': list(range(2, 11))
}

grid_search = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid_search.fit(data_X, data_Y)

print(grid_search.best_score_*100)
print(grid_search.best_params_)

99.79015109121433
{'knn__n_neighbors': 2, 'pca__n_components': 3}


In [None]:

#predicts = cross_val_predict(grid_search, data_X, data_Y, cv=10) 
#print(classification_report(data_Y, predicts))
nested_score = cross_val_score(grid_search, data_X, data_Y, cv=10)

print("Accuracy:", nested_score.mean()*100)