In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import os

## Import and Check Chicago Crime Datasets

In [None]:
# Load the cleaned yearly crime CSVs (2016-2019) from ../Resources
crime_2016 = os.path.join("..", "Resources", "crime_clean_2016.csv")
crime_2017 = os.path.join("..", "Resources", "crime_clean_2017.csv")
crime_2018 = os.path.join("..", "Resources", "crime_clean_2018.csv")
crime_2019 = os.path.join("..", "Resources", "crime_clean_2019.csv")

crime_2016_df_final = pd.read_csv(crime_2016)
crime_2017_df_final = pd.read_csv(crime_2017)
crime_2018_df_final = pd.read_csv(crime_2018)

# 2019 is the test data
test_data = pd.read_csv(crime_2019)

# Stack 2016, 2017, and 2018 row-wise to form the training data.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# as with append, the row labels of each yearly frame are kept unchanged.
join1 = pd.concat([crime_2016_df_final, crime_2017_df_final])
training_data = pd.concat([join1, crime_2018_df_final])

In [None]:
# Number of non-null `id` values in the 2016-2018 training data
training_data["id"].count()

In [None]:
# Show the dtype pandas assigned to each column of the training data
training_data.dtypes

In [None]:
# Number of non-null `id` values in the 2019 test data
test_data["id"].count()

In [None]:
#function to Convert data to numbers
from sklearn import preprocessing
def convert(data):
    """Label-encode the text/categorical columns of a crime DataFrame.

    Every column listed in ``columns_to_encode`` is replaced by the integer
    codes produced by a freshly fitted ``LabelEncoder``. The ``arrest``
    column is deliberately left as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``columns_to_encode``.

    Returns
    -------
    pandas.DataFrame
        An encoded copy of ``data``; the caller's frame is not modified.

    Raises
    ------
    KeyError
        If one of the columns to encode is missing from ``data``.

    Notes
    -----
    NOTE(review): an encoder is fitted from scratch on each call, so the codes
    produced for two different frames (e.g. training vs. test) are not
    guaranteed to stand for the same original value - confirm this is
    acceptable for the downstream model.
    """
    columns_to_encode = (
        'date',
        'time',
        'block',
        'description',
        'location_description',
        'iucr',
        'fbi_code',
        'primary_type',
        'domestic',
        'latitude',
        'longitude',
    )

    # Work on a copy so re-running a cell never sees an already-encoded input.
    encoded = data.copy()
    for column in columns_to_encode:
        encoder = preprocessing.LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded

In [None]:
#function to drop unneeded columns/keeping only features needed for model
def set_data(data):
    """Return only the columns that are used as model features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains at least the columns in ``feature_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the feature columns, in the order listed below.
    """
    # Columns the original selection had commented out (not used as features):
    # date, day, year, time, month_day, block, x_coordinate, y_coordinate,
    # latitude, longitude.
    feature_columns = [
        'month',
        'hour',
        'day_of_week',
        'district',
        'ward',
        'beat',
        'community_area',
        'description',
        'location_description',
        'iucr',
        'fbi_code',
        'primary_type',
        'domestic',
    ]
    return data[feature_columns]

In [None]:
# Label-encode the training frame first, then the test frame
train_enc_data, test_enc_data = (
    convert(frame) for frame in (training_data, test_data)
)

In [None]:
# Reduce both encoded frames to the feature columns chosen by set_data
X_train, X_test = map(set_data, (train_enc_data, test_enc_data))

In [None]:
# Display the training feature frame returned by set_data
X_train

In [None]:
# Use the `arrest` column of each encoded frame as the prediction target
target_column = "arrest"
y_train = train_enc_data[target_column]
y_test = test_enc_data[target_column]

In [None]:
# Keep the feature column labels for later reference, then preview the first rows
feature_names = X_train.columns
X_train.head()

In [None]:
# Show the feature column labels captured from X_train
feature_names

In [None]:
# Preview the first rows of the test features
X_test.head()

In [None]:
# (rows, columns) of the training features
X_train.shape

In [None]:
# (rows, columns) of the test features
X_test.shape

In [None]:
# Standardise the features before fitting the SVM, which requires normalised input.
# reference: https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html
from sklearn.preprocessing import StandardScaler

# Turn both targets into (n_samples, 1) column arrays
y_train = np.array(y_train).reshape(len(y_train), 1)
y_test = np.array(y_test).reshape(len(y_test), 1)

# Fit the scaler on the training features only, then apply it to both splits
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

In [None]:
# Store both scaled matrices in C (row-major) order; the SVM docs note the
# model is faster with C-contiguous data.
# reference: https://scikit-learn.org/stable/modules/svm.html
X_train_scaled, X_test_scaled = (
    np.asarray(matrix, order='C') for matrix in (X_train_scaled, X_test_scaled)
)

In [None]:
# Linear Support Vector Classification (LinearSVC)
# reference: https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/modules/generated/sklearn.svm.LinearSVC.html
# Per that reference, LinearSVC is similar to SVC with kernel='linear' but is
# implemented with liblinear rather than libsvm, giving more flexibility in
# the choice of penalties and loss functions and better scaling to large
# numbers of samples.
# dual selects whether the dual or the primal optimization problem is solved;
# dual=False is preferred when n_samples > n_features.

from sklearn.svm import LinearSVC

model = LinearSVC(dual=False)
# Flatten the column-shaped target to 1-D before fitting
model.fit(X_train_scaled, np.ravel(y_train))

In [None]:
# Mean accuracy of the fitted model on the scaled test data
test_accuracy = model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % test_accuracy)

In [None]:
# Predict on the scaled test data and print the classification report
from sklearn.metrics import classification_report

predictions = model.predict(X_test_scaled)
report = classification_report(y_test, predictions)
print(report)