# SGD Classifier with Ensemble Methods: Bootstrap Aggregation

In [1]:
# Import Dependencies
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

In [2]:
# Import 2018 crime data
crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv") 
crime_2018_data = pd.read_csv(crime_2018, index_col="id")

In [3]:
# Process data to get desired format
training_2018 = crime_2018_data.drop(columns=["date", "day", "year", "time", "month_day", "block", "beat", "iucr", 
                                                "x_coordinate", "y_coordinate", "latitude", "longitude"], axis=1)

In [4]:
# Label encode categorical features
from sklearn.preprocessing import LabelEncoder

# Choose categorical features
objects_training_data_2018 = training_2018[["description", "location_description", 
                                             "fbi_code", "primary_type", "domestic", "arrest"]]
numerical_training_data_2018 = training_2018[["month", "hour", "day_of_week", "district", 
                                               "ward", "community_area"]]

# Encode categorical features
cat_objects_training_data_2018 = objects_training_data_2018.apply(LabelEncoder().fit_transform)
cat_objects_training_data_2018

Unnamed: 0_level_0,description,location_description,fbi_code,primary_type,domestic,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11556487,295,129,16,6,0,0
11552699,291,17,16,6,0,0
11552724,13,96,10,2,0,1
11552731,126,17,10,2,0,0
11552715,16,129,5,2,0,0
...,...,...,...,...,...,...
11459757,2,108,22,22,0,0
11315895,139,108,13,9,0,0
11196173,163,108,13,9,0,0
11533503,3,108,19,29,1,0


In [5]:
# One hot encode features
encoded_object_2018_df = pd.get_dummies(cat_objects_training_data_2018, columns=["description", "location_description", 
                                                                                 "fbi_code", "primary_type"])
encoded_object_2018_df2 = pd.get_dummies(numerical_training_data_2018, columns=["month", "hour", "day_of_week",
                                                                                "district", "ward", "community_area"])

In [6]:
# Join encoded categorical and numerical dataframes
training_data_2018_final = encoded_object_2018_df.join(encoded_object_2018_df2)
training_data_2018_final

Unnamed: 0_level_0,domestic,arrest,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,community_area_68,community_area_69,community_area_70,community_area_71,community_area_72,community_area_73,community_area_74,community_area_75,community_area_76,community_area_77
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11556487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
11552699,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11552724,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11552731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11552715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11459757,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11315895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11196173,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11533503,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Import testing data

In [7]:
# Import 2019 crime data and prepare for testing
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv") 
crime_2019_data = pd.read_csv(crime_2019, index_col='id')

In [8]:
# Drop columns that will not be used as features / reflecting process during model building
crime_2019_df = crime_2019_data.drop(columns=["date", "day", "year", "time", "month_day", "block", "beat", "iucr", 
                                                "x_coordinate", "y_coordinate", "latitude", "longitude"], axis=1)

In [9]:
# Choose categorical features
objects_training_data_2019 = crime_2019_df[["description", "location_description", 
                                                  "fbi_code", "primary_type", "domestic", "arrest"]]
numerical_training_data_2019 = crime_2019_df[["month", "hour", "day_of_week", "district", 
                                                    "ward", "community_area"]]

# Encode categorical features
cat_objects_training_data_2019 = objects_training_data_2019.apply(LabelEncoder().fit_transform)

In [10]:
# One hot encode features
encoded_object_2019_df = pd.get_dummies(cat_objects_training_data_2019, columns=["description", "location_description", 
                                                                                 "fbi_code", "primary_type"])
encoded_object_2019_df2 = pd.get_dummies(numerical_training_data_2019, columns=["month", "hour", "day_of_week",
                                                                                "district", "ward", "community_area"])

In [11]:
# Join encoded categorical and numerical dataframes in the testing data
crime_2019_final = encoded_object_2019_df.join(encoded_object_2019_df2)

crime_2019_final.head()

Unnamed: 0_level_0,domestic,arrest,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,...,community_area_68,community_area_69,community_area_70,community_area_71,community_area_72,community_area_73,community_area_74,community_area_75,community_area_76,community_area_77
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
11940078,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
11938240,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11938124,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Standardize Training and Testing Data

In [12]:
# Keep only columns in testing data that also exist in training data
training_data_2018_final = training_data_2018_final.drop(['primary_type_32'], axis=1)
training_data_cols = training_data_2018_final.columns.values
testing_data_final = crime_2019_final[training_data_cols]

In [13]:
# Assert training and testing data equal
np.testing.assert_array_equal(testing_data_final.columns.values, training_data_2018_final.columns.values)

## Data Exploration

In [14]:
# Determine which features to keep

# Separate data by arrest (1) and no arrest (0)
Arrest_1_df = training_data_2018_final.loc[training_data_2018_final["arrest"] == 1]
Arrest_0_df = training_data_2018_final.loc[training_data_2018_final["arrest"] == 0]

# Find means for each feature in both datasets
Arrest_1_mean = Arrest_1_df.mean()
Arrest_0_mean = Arrest_0_df.mean()

# Calculate normalized differences of the features for the two datasets
# Values that differ most between two datasets are more likely to be useful as a differentiator between the two classes 
differences = (Arrest_1_mean.subtract(Arrest_0_mean))
normalized_diffs = differences.abs().sort_values()

pd.set_option("max_rows", None)
df = pd.DataFrame(normalized_diffs)
df

Unnamed: 0,0
description_106,1.063464e-07
location_description_119,1.063464e-07
description_157,1.063464e-07
description_18,3.190392e-07
description_71,4.5984e-06
description_322,4.704746e-06
description_285,4.704746e-06
location_description_133,4.811092e-06
description_20,4.811092e-06
location_description_135,4.811092e-06


In [15]:
# Keep only features with differences greater than 1e-02 between arrests and non-arrests in training model
columns_to_keep = normalized_diffs.loc[lambda x: x > 1.0e-02].keys().tolist()
print(f"Count of columns kept: {len(columns_to_keep)}")

training_df_final = training_data_2018_final[columns_to_keep]
training_df_final.head()

Count of columns kept: 111


Unnamed: 0_level_0,ward_17,location_description_105,ward_2,district_24,ward_9,description_232,month_2,community_area_8,description_315,description_223,...,location_description_108,location_description_125,fbi_code_16,primary_type_6,description_0,primary_type_31,fbi_code_7,fbi_code_20,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11556487,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552699,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11552731,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11552715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Keep only features with differences greater than 1e-02 in testing model
testing_data_final = testing_data_final[columns_to_keep]
testing_data_final.head()

Unnamed: 0_level_0,ward_17,location_description_105,ward_2,district_24,ward_9,description_232,month_2,community_area_8,description_315,description_223,...,location_description_108,location_description_125,fbi_code_16,primary_type_6,description_0,primary_type_31,fbi_code_7,fbi_code_20,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
11940078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11938240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11938124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Build SGD Classifier with Bagging

In [17]:
# Split training dataset into features(X) and target(y) dataframes
X = training_df_final.drop("arrest", axis=1)
y = training_df_final["arrest"].values.reshape(-1,1)
print(X.shape, y.shape)

(260105, 110) (260105, 1)


In [18]:
# Split 2019 into features(X_2019) and target(y_2019) dataframes
X_2019 = testing_data_final.drop("arrest", axis=1)
y_2019 = testing_data_final["arrest"].values.reshape(-1,1)
print(X_2019.shape, y_2019.shape)

(256908, 110) (256908, 1)


In [19]:
# Fit data
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier

bagging_fit = BaggingClassifier(base_estimator=SGDClassifier(), n_estimators=50).fit(X, y.ravel())

In [20]:
# Score model
print(f"Training Model Score: {bagging_fit.score(X, y)}")

Training Model Score: 0.8779608235135811


In [21]:
# Use bagging to predict 2019 arrests
bagging_preds_2019 = bagging_fit.predict(X_2019)
bagging_preds_2019

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# Take a look at roc auc score
from sklearn.metrics import roc_auc_score

roc_auc_score(y_2019, bagging_preds_2019)

0.6563480001384897

In [23]:
# Examine confusion matrix
pd.crosstab(y_2019.ravel(), bagging_preds_2019.ravel(), rownames=['True'], colnames=['Predicted'], margins=True) 

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,197772,4180,201952
1,36634,18322,54956
All,234406,22502,256908


In [24]:
# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_2019, bagging_preds_2019))

              precision    recall  f1-score   support

           0       0.84      0.98      0.91    201952
           1       0.81      0.33      0.47     54956

    accuracy                           0.84    256908
   macro avg       0.83      0.66      0.69    256908
weighted avg       0.84      0.84      0.81    256908



Bagging model has worse overall accuracy than simple SGD classifier. Especially worse in labeling non arrests as arrests than SGD Classifer where 15 pts lower (field: precision 1).

## Ensemble Learning with Decision Tree and Random Forest Classifiers

In [25]:
# Perform bootstrap aggregation with different models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
clf2 = RandomForestClassifier()    

bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=50).fit(X, y.ravel())
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=50).fit(X, y.ravel())

In [26]:
# Show model performance
from sklearn.model_selection import cross_val_score

label = ['DecisionTree', 'RandomForest', 
         'BaggedDecisionTree', 'BaggedRandomForest']
clf_list = [clf1, clf2, bagging1, bagging2]

for clf, label in zip(clf_list, label):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    
    clf.fit(X, y)
    prediction = clf.predict(X_2019)
    print(pd.crosstab(y_2019.ravel(), prediction, rownames=['True'], colnames=['Predicted'], margins=True))
    print(classification_report(y_2019.ravel(), prediction))

Accuracy: 0.85 (+/- 0.00) [DecisionTree]
Predicted       0      1     All
True                            
0          201947      5  201952
1           40752  14204   54956
All        242699  14209  256908
              precision    recall  f1-score   support

           0       0.83      1.00      0.91    201952
           1       1.00      0.26      0.41     54956

    accuracy                           0.84    256908
   macro avg       0.92      0.63      0.66    256908
weighted avg       0.87      0.84      0.80    256908



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Accuracy: 0.88 (+/- 0.00) [RandomForest]


  if sys.path[0] == '':


Predicted       0      1     All
True                            
0          189881  12071  201952
1           31787  23169   54956
All        221668  35240  256908
              precision    recall  f1-score   support

           0       0.86      0.94      0.90    201952
           1       0.66      0.42      0.51     54956

    accuracy                           0.83    256908
   macro avg       0.76      0.68      0.71    256908
weighted avg       0.81      0.83      0.81    256908



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.85 (+/- 0.00) [BaggedDecisionTree]


  y = column_or_1d(y, warn=True)


Predicted       0      1     All
True                            
0          201947      5  201952
1           40752  14204   54956
All        242699  14209  256908
              precision    recall  f1-score   support

           0       0.83      1.00      0.91    201952
           1       1.00      0.26      0.41     54956

    accuracy                           0.84    256908
   macro avg       0.92      0.63      0.66    256908
weighted avg       0.87      0.84      0.80    256908



  y = column_or_1d(y, warn=True)
MemoryError: Unable to allocate array with shape (173403,) and data type float64

  y = column_or_1d(y, warn=True)
MemoryError: Unable to allocate array with shape (173403,) and data type int64

  y = column_or_1d(y, warn=True)
MemoryError: Unable to allocate array with shape (173404,) and data type float64



Accuracy: nan (+/- nan) [BaggedRandomForest]


  y = column_or_1d(y, warn=True)


Predicted       0      1     All
True                            
0          191394  10558  201952
1           31649  23307   54956
All        223043  33865  256908
              precision    recall  f1-score   support

           0       0.86      0.95      0.90    201952
           1       0.69      0.42      0.52     54956

    accuracy                           0.84    256908
   macro avg       0.77      0.69      0.71    256908
weighted avg       0.82      0.84      0.82    256908

