In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report

import tensorflow
tensorflow.keras.__version__
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
incidents = pd.read_csv("../incident_data/yearly_incident_data/all_incidents.csv")
incidents = incidents.dropna()
incidents.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010/01/01,00:30
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010/01/01,01:05
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010/01/01,04:18
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010/01/01,03:00
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010/01/01,02:23


In [3]:
incidents['incidentDate'] = pd.to_datetime(incidents['incidentDate'], format='%Y/%m/%d')
incidents['month']= incidents['incidentDate'].dt.month 
incidents['weekday'] =  incidents['incidentDate'].dt.dayofweek
incidents['day'] =  incidents['incidentDate'].dt.day
incidents.head()

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,neighborhood,incidentDate,incidentTime,month,weekday,day
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,Windom,2010-01-01,00:30,1,4,1
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,Windom,2010-01-01,01:05,1,4,1
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,Bryant,2010-01-01,04:18,1,4,1
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,Loring Park,2010-01-01,03:00,1,4,1
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,Folwell,2010-01-01,02:23,1,4,1


In [4]:
# One hot encode the neighborhoods
one_hot = pd.get_dummies(incidents['neighborhood'])
incidents = incidents.drop('neighborhood', axis=1)
incidents =incidents.join(one_hot)
incidents.head()

Unnamed: 0,publicAddress,caseNumber,precinct,offense,description,ucrCode,lat,lon,incidentDate,incidentTime,...,Ventura Village,Victory,Waite Park,Webber - Camden,Wenonah,West Calhoun,Whittier,Willard - Hay,Windom,Windom Park
0,0056Xx Lyndale Av S,MP2010000051,5,CSCR,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,2010-01-01,00:30,...,0,0,0,0,0,0,0,0,1,0
1,0001Xx 62 St W,MP2010000099,5,ROBPAG,Robbery Per Agg,4.0,44.890636,-93.280041,2010-01-01,01:05,...,0,0,0,0,0,0,0,0,1,0
2,0039Xx 5 Av S,MP2010000373,3,BURGD,Burglary Of Dwelling,6.0,44.931448,-93.268841,2010-01-01,04:18,...,0,0,0,0,0,0,0,0,0,0
3,0013Xx Nicollet Av S,MP2010000305,1,ASLT2,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,2010-01-01,03:00,...,0,0,0,0,0,0,0,0,0,0
4,0034Xx Oliver Av N,MP2010000229,4,BURGD,Burglary Of Dwelling,6.0,45.017746,-93.306988,2010-01-01,02:23,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# One hot encode the offenses
one_hot = pd.get_dummies(incidents['offense'])
incidents = incidents.drop('offense', axis=1)
incidents =incidents.join(one_hot)
incidents.head()

Unnamed: 0,publicAddress,caseNumber,precinct,description,ucrCode,lat,lon,incidentDate,incidentTime,month,...,TFMV,TFMV.1,TFPER,TFPER.1,THEFT,THEFT.1,THFTSW,THFTSW.1,TMVP,TMVP.1
0,0056Xx Lyndale Av S,MP2010000051,5,Crim Sex Cond-Rape,3.0,44.900291,-93.288239,2010-01-01,00:30,1,...,0,0,0,0,0,0,0,0,0,0
1,0001Xx 62 St W,MP2010000099,5,Robbery Per Agg,4.0,44.890636,-93.280041,2010-01-01,01:05,1,...,0,0,0,0,0,0,0,0,0,0
2,0039Xx 5 Av S,MP2010000373,3,Burglary Of Dwelling,6.0,44.931448,-93.268841,2010-01-01,04:18,1,...,0,0,0,0,0,0,0,0,0,0
3,0013Xx Nicollet Av S,MP2010000305,1,Asslt W/Dngrs Weapon,5.0,44.970506,-93.277714,2010-01-01,03:00,1,...,0,0,0,0,0,0,0,0,0,0
4,0034Xx Oliver Av N,MP2010000229,4,Burglary Of Dwelling,6.0,45.017746,-93.306988,2010-01-01,02:23,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# First run using the neighborhoods: .547 NB, .5402 DNN(100epochs)
# selected_features = incidents.drop(['publicAddress', 'caseNumber', 'offense', 'description', 'lat', 'lon', 'incidentDate', 'incidentTime', 'ucrCode'], axis=1)

# Second run using offenses and neighborhoods: .999 NB, .999 DNN(10epochs)
selected_features = incidents.drop(['publicAddress', 'caseNumber', 'description', 'lat', 'lon', 'incidentDate', 'incidentTime', 'ucrCode'], axis=1)

selected_features.head()

Unnamed: 0,precinct,month,weekday,day,Armatage,Audubon Park,Bancroft,Beltrami,Bottineau,Bryant,...,TFMV,TFMV.1,TFPER,TFPER.1,THEFT,THEFT.1,THFTSW,THFTSW.1,TMVP,TMVP.1
0,5,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,4,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X = selected_features
y = incidents['ucrCode']

## Start of Naive Bayes work

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB()

In [10]:
predictions = nb.predict(X_test)
print(predictions)

[7. 7. 8. ... 6. 8. 7.]


In [11]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.999889993839655

## Deep Learning NN Model

In [12]:
# Scale the x data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Encode the y data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [14]:
# Convert the y to categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [15]:
print(X_train_scaled.shape, y_train_categorical.shape)

(181807, 162) (181807, 8)


In [16]:
# Set up the model
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=162))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=8, activation='softmax'))

In [17]:
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               16300     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 808       
Total params: 27,208
Trainable params: 27,208
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.fit(X_train_scaled, y_train_categorical, epochs=10, shuffle=True, verbose=2)

Train on 181807 samples
Epoch 1/10
181807/181807 - 7s - loss: 0.0014 - accuracy: 0.9928
Epoch 2/10
181807/181807 - 6s - loss: 4.3585e-06 - accuracy: 1.0000
Epoch 3/10
181807/181807 - 6s - loss: 1.1146e-06 - accuracy: 1.0000
Epoch 4/10
181807/181807 - 6s - loss: 1.9904e-07 - accuracy: 1.0000
Epoch 5/10
181807/181807 - 6s - loss: 3.3982e-10 - accuracy: 1.0000
Epoch 6/10
181807/181807 - 6s - loss: 1.6005e-10 - accuracy: 1.0000
Epoch 7/10
181807/181807 - 6s - loss: 8.9624e-11 - accuracy: 1.0000
Epoch 8/10
181807/181807 - 6s - loss: 6.0938e-11 - accuracy: 1.0000
Epoch 9/10
181807/181807 - 6s - loss: 4.6338e-11 - accuracy: 1.0000
Epoch 10/10
181807/181807 - 6s - loss: 3.7430e-11 - accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x216b3eada90>

In [20]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

45452/45452 - 3s - loss: 3.8540e-06 - accuracy: 1.0000
Loss: 3.853987780982381e-06, Accuracy: 0.9999780058860779
