# SGD Classifier with Ensemble Methods: Bootstrap Aggregation

In [1]:
# Import Dependencies
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

In [2]:
# Import 2018 crime data
crime_2018 = os.path.join("..","Resources", "crime_clean_2018.csv") 
crime_2018_data = pd.read_csv(crime_2018, index_col="id")

In [5]:
# Process data to get desired format
training_2018 = crime_2018_data.drop(columns=["date", "day", "year", "time", "month_day", "district", "block", "ward", "beat",
                                              "community_area", "description","iucr", "x_coordinate", "y_coordinate", "fbi_code", 
                                              "domestic", "latitude", "longitude"], axis=1)

Unnamed: 0_level_0,month,hour,day_of_week,location_description,primary_type,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11556487,12,23,0,STREET,CRIMINAL DAMAGE,False
11552699,12,23,0,APARTMENT,CRIMINAL DAMAGE,False
11552724,12,23,0,OTHER,BATTERY,True
11552731,12,23,0,APARTMENT,BATTERY,False
11552715,12,23,0,STREET,BATTERY,False


In [7]:
# Label encode categorical features
from sklearn.preprocessing import LabelEncoder

# Choose categorical features
objects_training_data_2018 = training_2018[["location_description", "primary_type", "arrest"]]
numerical_training_data_2018 = training_2018[["month", "hour", "day_of_week"]]

# Encode categorical features
cat_objects_training_data_2018 = objects_training_data_2018.apply(LabelEncoder().fit_transform)
cat_objects_training_data_2018

Unnamed: 0_level_0,location_description,primary_type,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11556487,129,6,0
11552699,17,6,0
11552724,96,2,1
11552731,17,2,0
11552715,129,2,0
...,...,...,...
11459757,108,22,0
11315895,108,9,0
11196173,108,9,0
11533503,108,29,0


In [8]:
# One hot encode features
encoded_object_2018_df = pd.get_dummies(cat_objects_training_data_2018, columns=["location_description", "primary_type"])
encoded_object_2018_df2 = pd.get_dummies(numerical_training_data_2018, columns=["month", "hour", "day_of_week"])

In [9]:
# Join encoded categorical and numerical dataframes
training_data_2018_final = encoded_object_2018_df.join(encoded_object_2018_df2)
training_data_2018_final

Unnamed: 0_level_0,arrest,location_description_0,location_description_1,location_description_2,location_description_3,location_description_4,location_description_5,location_description_6,location_description_7,location_description_8,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11556487,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552699,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552724,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552731,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
11552715,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11459757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11315895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11196173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11533503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### Import testing data

In [10]:
# Import 2019 crime data and prepare for testing
crime_2019 = os.path.join("..","Resources", "crime_clean_2019.csv") 
crime_2019_data = pd.read_csv(crime_2019, index_col='id')

In [11]:
# Drop columns that will not be used as features / reflecting process during model building
crime_2019_df = crime_2019_data.drop(columns=["date", "day", "year", "time", "month_day", "district", "block", "ward", "beat",
                                              "community_area", "description","iucr", "x_coordinate", "y_coordinate", "fbi_code", 
                                              "domestic", "latitude", "longitude"], axis=1)

In [12]:
# Choose categorical features
objects_training_data_2019 = crime_2019_df[["location_description", "primary_type", "arrest"]]
numerical_training_data_2019 = crime_2019_df[["month", "hour", "day_of_week"]]

# Encode categorical features
cat_objects_training_data_2019 = objects_training_data_2019.apply(LabelEncoder().fit_transform)

In [13]:
# One hot encode features
encoded_object_2019_df = pd.get_dummies(cat_objects_training_data_2019, columns=["location_description", "primary_type"])
encoded_object_2019_df2 = pd.get_dummies(numerical_training_data_2019, columns=["month", "hour", "day_of_week"])

In [14]:
# Join encoded categorical and numerical dataframes in the testing data
crime_2019_final = encoded_object_2019_df.join(encoded_object_2019_df2)

crime_2019_final.head()

Unnamed: 0_level_0,arrest,location_description_0,location_description_1,location_description_2,location_description_3,location_description_4,location_description_5,location_description_6,location_description_7,location_description_8,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11940078,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11938240,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11938124,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


### Standardize Training and Testing Data

In [16]:
# Keep only columns in testing data that also exist in training data
training_data_2018_final = training_data_2018_final.drop(['primary_type_32'], axis=1)
training_data_cols = training_data_2018_final.columns.values
testing_data_final = crime_2019_final[training_data_cols]
testing_data_final

Unnamed: 0_level_0,arrest,location_description_0,location_description_1,location_description_2,location_description_3,location_description_4,location_description_5,location_description_6,location_description_7,location_description_8,...,hour_21,hour_22,hour_23,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11940078,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11938240,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
11938124,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11951364,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
11937662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
11556297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
11752915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [17]:
# Assert training and testing data equal (if no output, then equal)
np.testing.assert_array_equal(testing_data_final.columns.values, training_data_2018_final.columns.values)

## Data Exploration

In [18]:
# Determine which features to keep

# Separate data by arrest (1) and no arrest (0)
Arrest_1_df = training_data_2018_final.loc[training_data_2018_final["arrest"] == 1]
Arrest_0_df = training_data_2018_final.loc[training_data_2018_final["arrest"] == 0]

# Find means for each feature in both datasets
Arrest_1_mean = Arrest_1_df.mean()
Arrest_0_mean = Arrest_0_df.mean()

# Calculate normalized differences of the features for the two datasets
# Values that differ most between two datasets are more likely to be useful as a differentiator between the two classes 
differences = (Arrest_1_mean.subtract(Arrest_0_mean))
normalized_diffs = differences.abs().sort_values()

pd.set_option("max_rows", None)
df = pd.DataFrame(normalized_diffs)
df

Unnamed: 0,0
location_description_119,1.063464e-07
location_description_135,4.811092e-06
primary_type_23,4.811092e-06
location_description_58,4.811092e-06
location_description_70,4.811092e-06
location_description_85,4.811092e-06
location_description_93,4.811092e-06
location_description_89,4.811092e-06
location_description_133,4.811092e-06
location_description_138,4.811092e-06


In [19]:
# Keep only features with differences greater than 1e-02 between arrests and non-arrests in training model
columns_to_keep = normalized_diffs.loc[lambda x: x > 1.0e-02].keys().tolist()
print(f"Count of columns kept: {len(columns_to_keep)}")

training_df_final = training_data_2018_final[columns_to_keep]
training_df_final.head()

Count of columns kept: 30


Unnamed: 0_level_0,location_description_105,month_2,hour_9,location_description_69,location_description_115,hour_11,primary_type_27,primary_type_25,location_description_113,location_description_15,...,primary_type_3,location_description_17,primary_type_8,primary_type_9,location_description_108,location_description_125,primary_type_6,primary_type_31,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11556487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
11552699,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
11552724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11552731,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
11552715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Keep only features with differences greater than 1e-02 in testing model
testing_data_final = testing_data_final[columns_to_keep]
testing_data_final.head()

Unnamed: 0_level_0,location_description_105,month_2,hour_9,location_description_69,location_description_115,hour_11,primary_type_27,primary_type_25,location_description_113,location_description_15,...,primary_type_3,location_description_17,primary_type_8,primary_type_9,location_description_108,location_description_125,primary_type_6,primary_type_31,primary_type_18,arrest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11938228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
11940078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11938240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
11937967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
11938124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## SGD Classifier

In [45]:
# Split training data into training and validation sets
X_SGD = training_df_final.drop("arrest", axis=1)
y_SGD = training_df_final["arrest"].values.reshape(-1,1)
print(X_SGD.shape, y_SGD.shape)

(260105, 29) (260105, 1)


In [25]:
# Split into training and validation
from sklearn.model_selection import train_test_split

X_train, X_eval, y_train, y_eval = train_test_split(X_SGD, y_SGD, test_size=0.2, random_state=42)

In [54]:
# Fit data 
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier()
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [55]:
# Test model using validation dataset
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Evaluation Data Score: {classifier.score(X_eval, y_eval)}")

Training Data Score: 0.8591578401030353
Evaluation Data Score: 0.8574037407969858


In [56]:
# Take a look at roc auc score
from sklearn.metrics import roc_auc_score

predictions = classifier.predict(X_eval)
roc_auc_score(y_eval, predictions)

0.6741635062260868

### Make predictions on 2019 data

In [57]:
# Split 2019 into X(features) and y(target)
X_SGD2019 = testing_data_final.drop("arrest", axis=1)
y_SGD2019 = testing_data_final["arrest"].values.reshape(-1,1)
print(X_SGD2019.shape, y_SGD2019.shape)

(256908, 29) (256908, 1)


In [58]:
# Use model to predict 2019 arrests
predictions_SGD2019 = classifier.predict(X_SGD2019)
predictions_SGD2019

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
# Take a look at roc auc score
roc_auc_score(y_SGD2019, predictions_SGD2019)

0.6576947232870205

In [60]:
# Examine confusion matrix
pd.crosstab(y_SGD2019.ravel(), predictions_SGD2019.ravel(), rownames=['True'], colnames=['Predicted'], margins=True) 

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191558,10394,201952
1,34795,20161,54956
All,226353,30555,256908


In [61]:
from sklearn.metrics import classification_report
print(classification_report(y_SGD2019, predictions_SGD2019))

              precision    recall  f1-score   support

           0       0.85      0.95      0.89    201952
           1       0.66      0.37      0.47     54956

    accuracy                           0.82    256908
   macro avg       0.75      0.66      0.68    256908
weighted avg       0.81      0.82      0.80    256908



## Build SGD Classifier with Bagging

In [62]:
# Split training dataset into features(X) and target(y) dataframes
X = training_df_final.drop("arrest", axis=1)
y = training_df_final["arrest"].values.reshape(-1,1)
print(X.shape, y.shape)

(260105, 29) (260105, 1)


In [63]:
# Split 2019 into features(X_2019) and target(y_2019) dataframes
X_2019 = testing_data_final.drop("arrest", axis=1)
y_2019 = testing_data_final["arrest"].values.reshape(-1,1)
print(X_2019.shape, y_2019.shape)

(256908, 29) (256908, 1)


In [70]:
# Fit data
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier

bagging_fit = BaggingClassifier(base_estimator=SGDClassifier(), n_estimators=50).fit(X, y.ravel())

In [71]:
# Score model
print(f"Training Model Score: {bagging_fit.score(X, y)}")

Training Model Score: 0.8588070202418254


In [72]:
# Use bagging to predict 2019 arrests
bagging_preds_2019 = bagging_fit.predict(X_2019)
bagging_preds_2019

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [73]:
# Take a look at roc auc score
from sklearn.metrics import roc_auc_score

roc_auc_score(y_2019, bagging_preds_2019)

0.6576947232870205

In [74]:
# Examine confusion matrix
pd.crosstab(y_2019.ravel(), bagging_preds_2019.ravel(), rownames=['True'], colnames=['Predicted'], margins=True) 

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,191558,10394,201952
1,34795,20161,54956
All,226353,30555,256908


In [75]:
# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_2019, bagging_preds_2019))

              precision    recall  f1-score   support

           0       0.85      0.95      0.89    201952
           1       0.66      0.37      0.47     54956

    accuracy                           0.82    256908
   macro avg       0.75      0.66      0.68    256908
weighted avg       0.81      0.82      0.80    256908



Bagging with SGD model no different than simple SGD classifier. 

## Ensemble Learning: Bagging with Decision Tree and Random Forest Classifiers

In [43]:
# Perform bootstrap aggregation with different models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
clf2 = RandomForestClassifier()    

bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=50).fit(X, y.ravel())
bagging2 = BaggingClassifier(base_estimator=clf2, n_estimators=50).fit(X, y.ravel())

In [44]:
# Show model performance
from sklearn.model_selection import cross_val_score

label = ['DecisionTree', 'RandomForest', 
         'BaggedDecisionTree', 'BaggedRandomForest']
clf_list = [clf1, clf2, bagging1, bagging2]

for clf, label in zip(clf_list, label):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    
    clf.fit(X, y)
    prediction = clf.predict(X_2019)
    print(pd.crosstab(y_2019.ravel(), prediction, rownames=['True'], colnames=['Predicted'], margins=True))
    print(classification_report(y_2019.ravel(), prediction))

Accuracy: 0.85 (+/- 0.00) [DecisionTree]
Predicted       0      1     All
True                            
0          201947      5  201952
1           40752  14204   54956
All        242699  14209  256908
              precision    recall  f1-score   support

           0       0.83      1.00      0.91    201952
           1       1.00      0.26      0.41     54956

    accuracy                           0.84    256908
   macro avg       0.92      0.63      0.66    256908
weighted avg       0.87      0.84      0.80    256908



  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Accuracy: 0.86 (+/- 0.00) [RandomForest]


  if sys.path[0] == '':


Predicted       0      1     All
True                            
0          194905   7047  201952
1           35219  19737   54956
All        230124  26784  256908
              precision    recall  f1-score   support

           0       0.85      0.97      0.90    201952
           1       0.74      0.36      0.48     54956

    accuracy                           0.84    256908
   macro avg       0.79      0.66      0.69    256908
weighted avg       0.82      0.84      0.81    256908



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.85 (+/- 0.00) [BaggedDecisionTree]


  y = column_or_1d(y, warn=True)


Predicted       0      1     All
True                            
0          201947      5  201952
1           40752  14204   54956
All        242699  14209  256908
              precision    recall  f1-score   support

           0       0.83      1.00      0.91    201952
           1       1.00      0.26      0.41     54956

    accuracy                           0.84    256908
   macro avg       0.92      0.63      0.66    256908
weighted avg       0.87      0.84      0.80    256908



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.86 (+/- 0.00) [BaggedRandomForest]


  y = column_or_1d(y, warn=True)


Predicted       0      1     All
True                            
0          197099   4853  201952
1           35486  19470   54956
All        232585  24323  256908
              precision    recall  f1-score   support

           0       0.85      0.98      0.91    201952
           1       0.80      0.35      0.49     54956

    accuracy                           0.84    256908
   macro avg       0.82      0.67      0.70    256908
weighted avg       0.84      0.84      0.82    256908

