In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import pickle
import joblib
import csv
from sklearn import preprocessing

## Import and Check Chicago Crime Datasets

In [2]:
# Import 2016-2019 crime data
crime_2016 = os.path.join("..","Resources", "crime_clean_2016.csv") 
crime_2017 = os.path.join("..","Resources", "crime_clean_2017.csv")
crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv")
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

# 2019 is the test data
test_data = pd.read_csv(crime_2019)

# Join datasets for 2016, 2017, and 2018 for the training data
join1 = crime_2016_df_final.append(crime_2017_df_final)
training_data = join1.append(crime_2018_df_final)
X = training_data.append(test_data).copy()

In [4]:
number = preprocessing.LabelEncoder()
c = X['location_description'].unique()
b = X['primary_type'].unique()
encoded_loc = number.fit_transform(c).astype('int')
encoded_type = number.fit_transform(b).astype('int')
loc_dict = dict(zip(c,encoded_loc))
type_dict = dict(zip(b,encoded_type))

file = os.path.join("..","Resources", "loc_dict.csv") 

with open(file, "w", newline='') as outfile:
    w = csv.writer(outfile)
    w.writerow(['loc', 'val'])
    for key, val in loc_dict.items():
        w.writerow([key, val])

file = os.path.join("..","Resources", "type_dict.csv") 

with open(file, "w", newline='') as outfile:
    w = csv.writer(outfile)
    w.writerow(['type', 'val'])
    for key, val in type_dict.items():
        w.writerow([key, val])   

In [26]:
#function to Convert data to numbers
def convert(data):
    number = preprocessing.LabelEncoder()
    data['date'] = number.fit_transform(data.date)
    data['time'] = number.fit_transform(data.time)
    data['block'] = number.fit_transform(data.block)
    data['description'] = number.fit_transform(data.description)
    data['location_description'] = number.fit_transform(data.location_description)
    data['iucr'] = number.fit_transform(data.iucr)
    data['fbi_code'] = number.fit_transform(data.fbi_code)
    data['primary_type'] = number.fit_transform(data.primary_type)
    data['domestic'] = number.fit_transform(data.domestic)
    data['latitude'] = number.fit_transform(data.latitude)
    data['longitude'] = number.fit_transform(data.longitude)
    data['arrest'] = number.fit_transform(data.arrest)
    return data

In [27]:
#function to drop unneeded columns/keeping only features needed for model
def set_data(data):
    data = (data[[
              #'date'
              #'day'
              'month'
              #,'year'
              #,'time'
              ,'hour'
              #,'month_day'
              ,'day_of_week'
              #,'district'
              #,'block'
              #,'ward'
              #,'beat'
              #,'community_area'
              #,'description'
              ,'location_description'
              #,'x_coordinate'
              #,'y_coordinate'
              #,'iucr'
              #,'fbi_code'
              ,'primary_type'
              #,'domestic'
              #,'latitude'
              #,'longitude'
            ]])
    return data

In [28]:
#LabelEncode the data
X = convert(X)
y = X["arrest"]

In [29]:
X = set_data(X)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2,test_size=0.2, random_state=42)

In [32]:
#Scale the data for SVM, requires normalization
#reference: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html
from sklearn.preprocessing import StandardScaler
y_train = np.array(y_train).reshape((len(y_train), 1),order='C')
y_test = np.array(y_test).reshape((len(y_test), 1),order='C')

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [33]:
#model is faster with C-contiguous data
#reference: https://scikit-learn.org/stable/modules/svm.html
X_train_scaled = np.asarray(X_train_scaled, order='C')
X_test_scaled = np.asarray(X_test_scaled, order='C')

In [34]:
X_train_scaled.shape

(208961, 5)

In [35]:
y_train.ravel().shape

(208961,)

In [36]:
y_train

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [37]:
y_test.shape

(208962, 1)

In [38]:
# Support vector machine linear classifier
from sklearn.neural_network import MLPClassifier

model = MLPClassifier( max_iter=175,activation='tanh',alpha=.00081,hidden_layer_sizes=(50,100,50), solver= 'adam'
                      ,learning_rate='invscaling')

model = model.fit(X_train_scaled, y_train.ravel())

In [39]:
# save the model to disk
filename = 'MLP_model.pkl'

with open(filename, 'wb') as file:
    pickle.dump(model, file)

In [40]:
# save the model to disk
filename = 'MLP_model.joblib'

joblib.dump(model, filename) 

['MLP_model.joblib']

In [41]:
#Predicting y for X_val
y_pred = model.predict(X_test_scaled)

#Importing classification_report
from sklearn.metrics import classification_report

#Printing the accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))
print(classification_report(y_test, y_pred))

Test Acc: 0.874
              precision    recall  f1-score   support

           0       0.88      0.98      0.92    166376
           1       0.83      0.47      0.60     42586

    accuracy                           0.87    208962
   macro avg       0.86      0.72      0.76    208962
weighted avg       0.87      0.87      0.86    208962



In [31]:
import matplotlib.pyplot as plt
loss_values = model.loss_curve_
print (loss_values)
plt.plot(loss_values)
plt.show()

[0.3647481362652441, 0.32509049086755293, 0.3169068254465498, 0.3120769166524505, 0.3092045557147338, 0.30719124418026267, 0.3052612536726263, 0.30379741662418264, 0.30285946548081916, 0.3018359848485192, 0.3009467013722943, 0.30048011025995097, 0.29970846964696174, 0.2989561096050603, 0.2987986684045336, 0.29824317631337616, 0.2975749714510516, 0.2970053606544441, 0.29683140160374766, 0.29642338558921555, 0.2962407705149295, 0.295907192313715, 0.2952308772554499, 0.29523863710227566, 0.29482320874883605, 0.2945885672159632, 0.29428670354159203, 0.2938374488906878, 0.29362858712632073, 0.29333043572230666, 0.2930975205976804, 0.2928814049700022, 0.29248791357499077, 0.29226418702599494, 0.2921369894885515, 0.2918686556722558, 0.29146126931734956, 0.2912393758120024, 0.29092525767423766, 0.29087303123654856, 0.2905196115137932, 0.29022895866051335, 0.2900912459722708, 0.28989868256208395, 0.2894068526360305, 0.28933342677564844, 0.2891259729130389, 0.2887390204438351, 0.2886100721382923

<Figure size 640x480 with 1 Axes>

In [42]:
file = os.path.join("..","Resources", "loc_dict.csv")
reader = csv.reader(open(file, 'r'))
loc_dict = {}
for row in reader:
    #print(row)
    k, v = row
    loc_dict[k] = v

In [43]:
file = os.path.join("..","Resources", "type_dict.csv")
reader = csv.reader(open(file, 'r'))
type_dict = {}
for row in reader:
    #print(row)
    k, v = row
    type_dict[k] = v

In [47]:
month = 6
hour = 15
day_of_week = 4
location_description = int(loc_dict.get('AIRPORT BUILDING NON-TERMINAL - SECURE AREA'))
primary_type =int(type_dict.get('THEFT'))

filename='MLP_model.pkl'
# Load from file
with open(filename, 'rb') as file:
    model = pickle.load(file)
    
print(model.predict([[month, hour, day_of_week,location_description,primary_type]]))

[0]


In [48]:
month = 6
hour = 15
day_of_week = 4
location_description = int(loc_dict.get('AIRPORT BUILDING NON-TERMINAL - SECURE AREA'))
primary_type =int(type_dict.get('THEFT'))

filename='RF_model.sav'
# Load from file
model = joblib.load(filename)
    
print(model.predict([[month, hour, day_of_week,location_description,primary_type]]))




[0]


