In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

## Import and Check Chicago Crime Datasets

In [2]:
# Import 2016-2019 crime data
crime_2016 = os.path.join("..","Resources", "crime_clean_2016.csv") 
crime_2017 = os.path.join("..","Resources", "crime_clean_2017.csv")
crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv")
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

# 2019 is the test data
test_data = pd.read_csv(crime_2019)

# Join datasets for 2016, 2017, and 2018 for the training data
join1 = crime_2016_df_final.append(crime_2017_df_final)
training_data = join1.append(crime_2018_df_final)

In [3]:
#number training records 2016-2018
training_data.id.count()

787901

In [4]:
#data types
training_data.dtypes

id                        int64
date                     object
day                       int64
month                     int64
year                      int64
time                     object
hour                      int64
month_day                 int64
day_of_week               int64
district                  int64
block                    object
ward                      int64
beat                      int64
community_area            int64
description              object
location_description     object
x_coordinate              int64
y_coordinate              int64
iucr                     object
fbi_code                 object
primary_type             object
domestic                   bool
latitude                float64
longitude               float64
arrest                     bool
dtype: object

In [5]:
#number test records 2019
test_data.id.count()

256908

In [6]:
#function to Convert data to numbers
from sklearn import preprocessing
def convert(data):
    number = preprocessing.LabelEncoder()
    data['date'] = number.fit_transform(data.date)
    data['time'] = number.fit_transform(data.time)
    data['block'] = number.fit_transform(data.block)
    data['description'] = number.fit_transform(data.description)
    data['location_description'] = number.fit_transform(data.location_description)
    data['iucr'] = number.fit_transform(data.iucr)
    data['fbi_code'] = number.fit_transform(data.fbi_code)
    data['primary_type'] = number.fit_transform(data.primary_type)
    data['domestic'] = number.fit_transform(data.domestic)
    data['latitude'] = number.fit_transform(data.latitude)
    data['longitude'] = number.fit_transform(data.longitude)
    #data['arrest'] = number.fit_transform(data.arrest)
    return data

In [7]:
#function to drop unneeded columns/keeping only features needed for model
def set_data(data):
    data = (data[[
              #'date'
              #'day'
              'month'
              #,'year'
              #,'time'
              ,'hour'
              #,'month_day'
              ,'day_of_week'
              ,'district'
              #,'block'
              ,'ward'
              ,'beat'
              ,'community_area'
              ,'description'
              ,'location_description'
              #,'x_coordinate'
              #,'y_coordinate'
              ,'iucr'
              ,'fbi_code'
              ,'primary_type'
              ,'domestic'
              #,'latitude'
              #,'longitude'
            ]])
    return data

In [8]:
#LabelEncode the data
train_enc_data = convert(training_data)
test_enc_data = convert(test_data)

In [9]:
# set the training and test labels/features
X_train=set_data(train_enc_data)
X_test=set_data(test_enc_data)

In [10]:
X_train

Unnamed: 0,month,hour,day_of_week,district,ward,beat,community_area,description,location_description,iucr,fbi_code,primary_type,domestic
0,12,23,5,16,41,1651,76,236,3,80,7,32,0
1,12,23,5,5,34,522,49,46,124,32,5,2,0
2,12,23,5,19,32,1932,6,195,124,269,23,16,0
3,12,23,5,1,42,111,32,135,93,49,10,2,1
4,12,23,5,6,6,623,69,321,124,143,16,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
260100,1,0,0,9,15,924,61,2,124,211,22,23,0
260101,1,0,0,7,16,713,67,149,124,119,13,9,0
260102,1,0,0,8,18,835,70,176,124,118,13,9,0
260103,1,0,0,5,9,533,54,3,124,189,19,30,1


In [11]:
#set the target to be arrest with target name true/false
y_train = train_enc_data["arrest"]
y_test = test_enc_data["arrest"]
#target_names = ["True", "False"]

In [12]:
feature_names = X_train.columns
X_train.head()

Unnamed: 0,month,hour,day_of_week,district,ward,beat,community_area,description,location_description,iucr,fbi_code,primary_type,domestic
0,12,23,5,16,41,1651,76,236,3,80,7,32,0
1,12,23,5,5,34,522,49,46,124,32,5,2,0
2,12,23,5,19,32,1932,6,195,124,269,23,16,0
3,12,23,5,1,42,111,32,135,93,49,10,2,1
4,12,23,5,6,6,623,69,321,124,143,16,6,1


In [13]:
# features/labels
feature_names

Index(['month', 'hour', 'day_of_week', 'district', 'ward', 'beat',
       'community_area', 'description', 'location_description', 'iucr',
       'fbi_code', 'primary_type', 'domestic'],
      dtype='object')

In [14]:
#preview test data
X_test.head()

Unnamed: 0,month,hour,day_of_week,district,ward,beat,community_area,description,location_description,iucr,fbi_code,primary_type,domestic
0,12,23,1,7,6,731,69,327,134,148,17,31,0
1,12,23,1,7,16,724,68,54,130,30,5,2,0
2,12,23,1,15,29,1533,25,327,146,148,17,31,0
3,12,23,1,11,28,1133,27,327,134,148,17,31,0
4,12,23,1,22,34,2232,73,166,134,309,25,23,0


In [15]:
#shape training data
X_train.shape

(787901, 13)

In [16]:
#shape test data
X_test.shape

(256908, 13)

In [17]:
#Scale the data for SVM, requires normalization
#reference: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html
from sklearn.preprocessing import StandardScaler
y_train = np.array(y_train).reshape((len(y_train), 1),order='C')
y_test = np.array(y_test).reshape((len(y_test), 1),order='C')

X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
#model is faster with C-contiguous data
#reference: https://scikit-learn.org/stable/modules/svm.html
X_train_scaled = np.asarray(X_train_scaled, order='C')
X_test_scaled = np.asarray(X_test_scaled, order='C')

In [19]:
# Support vector machine Linear Support Vector Classification 
# reference: https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/modules/generated/sklearn.svm.LinearSVC.html
# Similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, 
# so it has more flexibility in the choice of penalties and loss functions and should scale better to
# large numbers of samples.
# dual: Select the algorithm to either solve the dual or primal optimization problem. 
# Prefer dual=False when n_samples > n_features.

from sklearn.svm import LinearSVC 
model = LinearSVC(dual=False)
model.fit(X_train_scaled, y_train.ravel())

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [20]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.787


In [21]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       0.79      1.00      0.88    201952
        True       0.72      0.01      0.01     54956

    accuracy                           0.79    256908
   macro avg       0.75      0.50      0.45    256908
weighted avg       0.77      0.79      0.69    256908



In [22]:
# Output to table
class_report = classification_report(y_test, predictions, output_dict=True)

class_report_df = pd.DataFrame(class_report).transpose()
class_report_df = class_report_df.round(2)
class_report_df["support"] = class_report_df["support"].astype(int)
class_report_df.to_csv("SVM_report")