In [1]:
# Import Dependencies
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

## Data Import and Pre-Processing

In [2]:
# Import 2016, 2017 and 2018 crime data
crime_2016 = os.path.join("..","Resources", "crime_clean_2016.csv") 
crime_2016_data = pd.read_csv(crime_2016, index_col="id")

crime_2017 = os.path.join("..","Resources", "crime_clean_2017.csv") 
crime_2017_data = pd.read_csv(crime_2017, index_col="id")

crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv") 
crime_2018_data = pd.read_csv(crime_2018, index_col="id")

In [3]:
# Check if columns in dataframes are identical 
np.testing.assert_array_equal(crime_2016_data.columns.values, crime_2017_data.columns.values)
np.testing.assert_array_equal(crime_2016_data.columns.values, crime_2018_data.columns.values)

In [4]:
# Join datasets from 2016, 2017, and 2018
join1 = crime_2016_data.append(crime_2017_data)
training_data = join1.append(crime_2018_data)

# Check sum of lengths of individual datasets match combined length
length_combined = len(crime_2016_data) + len(crime_2017_data) + len(crime_2018_data)
if length_combined == len(training_data):
    print("Same length")
else: 
    print("Different length")

Same length


In [5]:
# Examine data features
for value in training_data.columns.values:
    print(f"Column name: {value}, Unique Values: {len(training_data[value].unique())}")

# Drop columns with high cardinality / many unique values or redundant ones
training_data_clean = training_data.drop(columns=["date", "day", "year", "time", "month_day", "block", "beat", "iucr", 
                                                "x_coordinate", "y_coordinate", "latitude", "longitude"], axis=1)

Column name: date, Unique Values: 349467
Column name: day, Unique Values: 31
Column name: month, Unique Values: 12
Column name: year, Unique Values: 3
Column name: time, Unique Values: 2276
Column name: hour, Unique Values: 24
Column name: month_day, Unique Values: 368
Column name: day_of_week, Unique Values: 7
Column name: district, Unique Values: 23
Column name: block, Unique Values: 32024
Column name: ward, Unique Values: 50
Column name: beat, Unique Values: 274
Column name: community_area, Unique Values: 77
Column name: description, Unique Values: 360
Column name: location_description, Unique Values: 168
Column name: x_coordinate, Unique Values: 62949
Column name: y_coordinate, Unique Values: 100331
Column name: iucr, Unique Values: 351
Column name: fbi_code, Unique Values: 26
Column name: primary_type, Unique Values: 34
Column name: domestic, Unique Values: 2
Column name: latitude, Unique Values: 259596
Column name: longitude, Unique Values: 259512
Column name: arrest, Unique Valu

In [6]:
# Check data type of each feature
training_data_clean.dtypes

month                    int64
hour                     int64
day_of_week              int64
district                 int64
ward                     int64
community_area           int64
description             object
location_description    object
fbi_code                object
primary_type            object
domestic                  bool
arrest                    bool
dtype: object

### Encoding

In [7]:
# Label encode categorical features
from sklearn.preprocessing import LabelEncoder

# Choose categorical features
objects_training_data = training_data_clean[["description", "location_description", 
                                             "fbi_code", "primary_type", "domestic", "arrest"]]
numerical_training_data = training_data_clean[["month", "hour", "day_of_week", "district", 
                                               "ward", "community_area"]]

# Encode categorical features
cat_objects_training_data = objects_training_data.apply(LabelEncoder().fit_transform)
cat_objects_training_data

Unnamed: 0_level_0,description,location_description,fbi_code,primary_type,domestic,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10819224,236,3,7,32,0,0
10801137,46,124,5,2,0,0
10801110,195,124,23,16,0,1
10802006,135,93,10,2,1,0
10801865,321,124,16,6,1,0
...,...,...,...,...,...,...
11459757,2,124,22,23,0,0
11315895,149,124,13,9,0,0
11196173,176,124,13,9,0,0
11533503,3,124,19,30,1,0


In [8]:
# One hot encode features
encoded_object_df = pd.get_dummies(cat_objects_training_data, columns=["description", "location_description", 
                                                                       "fbi_code", "primary_type"])
encoded_object_df2 = pd.get_dummies(numerical_training_data, columns=["month", "hour", "day_of_week",
                                                                      "district", "ward", "community_area"])

In [9]:
# Join encoded categorical and numerical dataframes
training_data_final = encoded_object_df.join(encoded_object_df2)
training_data_final

Unnamed: 0_level_0,domestic,arrest,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,community_area_68,community_area_69,community_area_70,community_area_71,community_area_72,community_area_73,community_area_74,community_area_75,community_area_76,community_area_77
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10819224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10801137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10801110,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10802006,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10801865,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11459757,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11315895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11196173,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11533503,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Import Testing Data

In [10]:
# Import 2019 crime data and prepare for testing
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv") 
crime_2019_data = pd.read_csv(crime_2019, index_col='id')

In [11]:
# Drop columns that will not be used as features / reflecting process during model building
crime_2019_df = crime_2019_data.drop(columns=["date", "day", "year", "time", "month_day", "block", "beat", "iucr", 
                                                "x_coordinate", "y_coordinate", "latitude", "longitude"], axis=1)
crime_2019_df.head()

Unnamed: 0_level_0,month,hour,day_of_week,district,ward,community_area,description,location_description,fbi_code,primary_type,domestic,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11938228,12,23,1,7,6,69,UNLAWFUL POSS OF HANDGUN,STREET,15,WEAPONS VIOLATION,False,True
11940078,12,23,1,7,16,68,AGGRAVATED:KNIFE/CUTTING INSTR,SIDEWALK,04B,BATTERY,False,False
11938240,12,23,1,15,29,25,UNLAWFUL POSS OF HANDGUN,VEHICLE NON-COMMERCIAL,15,WEAPONS VIOLATION,False,True
11937967,12,23,1,11,28,27,UNLAWFUL POSS OF HANDGUN,STREET,15,WEAPONS VIOLATION,False,False
11938124,12,23,1,22,34,73,GUN OFFENDER: DUTY TO REPORT CHANGE OF INFORMA...,STREET,26,OTHER OFFENSE,False,True


In [12]:
# Choose categorical features
objects_training_data_2019 = crime_2019_df[["description", "location_description", 
                                                  "fbi_code", "primary_type", "domestic", "arrest"]]
numerical_training_data_2019 = crime_2019_df[["month", "hour", "day_of_week", "district", 
                                                    "ward", "community_area"]]

# Encode categorical features
cat_objects_training_data_2019 = objects_training_data_2019.apply(LabelEncoder().fit_transform)

In [13]:
# One hot encode features
encoded_object_2019_df = pd.get_dummies(cat_objects_training_data_2019, columns=["description", "location_description", 
                                                                                 "fbi_code", "primary_type"])
encoded_object_2019_df2 = pd.get_dummies(numerical_training_data_2019, columns=["month", "hour", "day_of_week",
                                                                                "district", "ward", "community_area"])

In [14]:
# Join encoded categorical and numerical dataframes in the testing data
crime_2019_final = encoded_object_2019_df.join(encoded_object_2019_df2)
testing_data_cols = crime_2019_final.columns.values

crime_2019_final.head()

Unnamed: 0_level_0,domestic,arrest,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,community_area_68,community_area_69,community_area_70,community_area_71,community_area_72,community_area_73,community_area_74,community_area_75,community_area_76,community_area_77
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
11940078,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11938240,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11938124,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Standardize testing and training data

In [15]:
# Keep only columns in training data that also exist in testing data
training_data_final = training_data_final[testing_data_cols]

In [16]:
# Assert training and testing data equal
np.testing.assert_array_equal(crime_2019_final.columns.values, training_data_final.columns.values)

## Data Exploration

In [17]:
# Examine distribution of target variable
arrest_count = training_data_final["arrest"].value_counts(normalize=True)
arrest_count
# arrest count target has moderately imbalanced classes, can proceed with model building

0    0.801126
1    0.198874
Name: arrest, dtype: float64

In [18]:
# Determine which features to keep

# Separate data by arrest (1) and no arrest (0)
Arrest_1_df = training_data_final.loc[training_data_final["arrest"] == 1]
Arrest_0_df = training_data_final.loc[training_data_final["arrest"] == 0]

# Find means for each feature in both datasets
Arrest_1_mean = Arrest_1_df.mean()
Arrest_0_mean = Arrest_0_df.mean()

# Calculate normalized differences of the features for the two datasets
# Values that differ most between two datasets are more likely to be useful as a differentiator between the two classes 
differences = (Arrest_1_mean.subtract(Arrest_0_mean))
normalized_diffs = differences.abs().sort_values()

pd.set_option("max_rows", None)
df = pd.DataFrame(normalized_diffs)
df

Unnamed: 0,0
description_17,4.485072e-08
description_293,8.970145e-08
location_description_12,1.36001e-06
location_description_127,1.494562e-06
description_188,1.494562e-06
location_description_136,1.539413e-06
description_32,1.539413e-06
location_description_134,1.584264e-06
description_289,1.584264e-06
description_280,1.584264e-06


In [19]:
# Keep only features with differences greater than 1e-02 between arrests and non-arrests in training model
columns_to_keep = normalized_diffs.loc[lambda x: x > 1.0e-02].keys().tolist()
print(f"Count of columns kept: {len(columns_to_keep)}")

training_df_final = training_data_final[columns_to_keep]
training_df_final.head()

Count of columns kept: 100


Unnamed: 0_level_0,community_area_27,primary_type_1,month_2,location_description_79,location_description_119,month_3,ward_43,fbi_code_10,district_1,community_area_28,...,district_11,location_description_143,description_0,location_description_124,fbi_code_16,primary_type_6,fbi_code_7,fbi_code_20,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10819224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10801137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10801110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
10802006,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
10801865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0


In [20]:
# Examine pearson correlation matrix to see if any more variables should be excluded from analysis
training_df_final.corr(method='pearson').unstack().sort_values().drop_duplicates()
# Customary to keep variables with correlation coefficients smaller than absolute value of 0.8

fbi_code_7                fbi_code_10                -0.242142
fbi_code_16               fbi_code_7                 -0.196728
fbi_code_7                description_296            -0.195971
                          domestic                   -0.191390
                          fbi_code_25                -0.177953
location_description_124  location_description_17    -0.172403
primary_type_1            fbi_code_7                 -0.157318
primary_type_6            fbi_code_10                -0.152760
primary_type_9            fbi_code_7                 -0.147183
fbi_code_13               fbi_code_7                 -0.138717
fbi_code_10               fbi_code_25                -0.138181
description_0             fbi_code_10                -0.137799
description_325           fbi_code_7                 -0.136248
fbi_code_7                arrest                     -0.135146
location_description_143  location_description_124   -0.135133
location_description_124  fbi_code_7                 -0

In [21]:
# Keep only features with differences greater than 1e-02 in testing model
crime_2019_model1 = crime_2019_final[columns_to_keep]
crime_2019_model1.head()

Unnamed: 0_level_0,community_area_27,primary_type_1,month_2,location_description_79,location_description_119,month_3,ward_43,fbi_code_10,district_1,community_area_28,...,district_11,location_description_143,description_0,location_description_124,fbi_code_16,primary_type_6,fbi_code_7,fbi_code_20,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11940078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11938240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11937967,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11938124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# Assert arrays equal for model training data and 2019 testing data
np.testing.assert_array_equal(crime_2019_model1.columns.values, training_df_final.columns.values)

## Model Building

In [23]:
# Split training data into training and validation sets
X = training_df_final.drop("arrest", axis=1)
y = training_df_final["arrest"].values.reshape(-1,1)
print(X.shape, y.shape)

(787901, 99) (787901, 1)


In [24]:
# Split into training and testing
from sklearn.model_selection import train_test_split

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Fit data 
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier()
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
# Test model using validation dataset
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Evaluation Data Score: {classifier.score(X_eval, y_eval)}")

Training Data Score: 0.8754204213732707
Evaluation Data Score: 0.8757400955698974


In [35]:
# Take a look at roc auc score
from sklearn.metrics import roc_auc_score

predictions = classifier.predict(X_eval)
roc_auc_score(y_eval, predictions)

0.7005352565970304

## Model Evaluation

### Make Predictions on 2019 data

In [36]:
# Split 2019 into X(features) and y(target)
X_2019 = crime_2019_model1.drop("arrest", axis=1)
y_2019 = crime_2019_model1["arrest"].values.reshape(-1,1)
print(X_2019.shape, y_2019.shape)

(256908, 99) (256908, 1)


In [37]:
# Use model to predict 2019 arrests
predictions_2019 = classifier.predict(X_2019)
predictions_2019

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
# Take a look at roc auc score
roc_auc_score(y_2019, predictions_2019)

0.6647819451164567

In [39]:
# Examine confusion matrix
pd.crosstab(y_2019.ravel(), predictions_2019.ravel(), rownames=['True'], colnames=['Predicted'], margins=True) 

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,201252,700,201952
1,36654,18302,54956
All,237906,19002,256908


In [40]:
from sklearn.metrics import classification_report
print(classification_report(y_2019, predictions_2019))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92    201952
           1       0.96      0.33      0.49     54956

    accuracy                           0.85    256908
   macro avg       0.90      0.66      0.71    256908
weighted avg       0.87      0.85      0.83    256908



In [None]:
# Model good at not labeling non-arrests as arrests, but only labeled true arrests 33% of the time.