## Import Libraries

In [201]:
import boto3
import sagemaker
import time
import io
from sklearn.neighbors import KNeighborsClassifier
from time import gmtime, strftime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, plot_confusion_matrix
import pandas as pd
import numpy as np
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
from tqdm import tqdm  
from sklearn.metrics import accuracy_score

[0m

## Setup SageMaker Boto3 Connection

In [175]:
region = boto3.Session().region_name
session = boto3.session.Session()

ec2 = boto3.Session().client(service_name="ec2", region_name=region)
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [176]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

In [177]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


In [178]:
sess = sagemaker.Session()
bucket = '{}'.format(bucket)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket

'sagemaker-us-east-1-346023323361'

In [179]:
# Assign database name
database_name = "ads508"

In [180]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [181]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [182]:
# Create new database 'ads508'
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
create_db = pd.read_sql(statement, conn)
create_db

In [183]:
# Verify database creation
q = "SHOW DATABASES"
db_show = pd.read_sql(q, conn)
db_show

Unnamed: 0,database_name
0,ads508
1,default


In [184]:
# Set Athena parameters
database_name = "ads508"
model_table_name_csv = "modeling"
model_s3_path = "s3://{}/modeling_data".format(bucket)
model_s3_path="s3://sagemaker-studio-458903497716-h2kl4ff3dz/modeling/data_for_modeling.csv"
model_s3_path ="https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/data_for_modeling.csv"
print(model_s3_path)

https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/data_for_modeling.csv


In [185]:
#df = pd.read_csv("{}/data_for_modeling.csv".format(model_s3_path))
#df = pd.read_csv("s3://sagemaker-studio-458903497716-h2kl4ff3dz/modeling/data_for_modeling.csv")
df = pd.read_csv(model_s3_path)
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,latitude,longitude,n_guns_involved,target_class,group_Democrat,suspect_age,ohe_drug,ohe_officer,ohe_gang,ohe_accident,...,suspect_age_group_Adult,suspect_age_group_Senior,region_East South Central,region_Middle Atlantic,region_Mountain,region_New England,region_Pacific,region_South Atlantic,region_West North Central,region_West South Central
0,40.3467,-79.8559,1.0,1,1,Adult 18+,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,40.4555,-79.897,1.0,1,1,,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,33.909,-118.333,1.0,1,1,,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
3,33.8447,-118.307,1.0,1,1,Adult 18+,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,33.9454,-118.399,1.0,1,1,,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [186]:
# statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
#  latitude string,
#  longitude string,
#  n_guns_involved string,
#  target_class string,
#  group_Democrat string,
#  suspect_age string,
#  ohe_drug string,
#  ohe_officer string,
#  ohe_gang string,
#  ohe_accident string,
#  ohe_murder string,
#  ohe_suicide string,
#  ohe_arrest string,
#  ohe_brandishing string,
#  ohe_felon string,
#  ohe_drive string,
#  ohe_home_invasion string,
#  ohe_stolen string,
#  ohe_misc string,
#  ohe_drugs string,
#  ohe_car_jacking string,
#  ohe_defensive string,
#  ohe_robbery string,
#  ohe_family string,
#  ohe_institution string,
#  ohe_child string,
#  ohe_mass string,
#  ohe_domestic string,
#  suspect_age_group_Teen string,
#  suspect_age_group_Young_Adult string,
#  suspect_age_group_Mid-Adult string,
#  suspect_age_group_Adult string,
#  suspect_age_group_Senior string,
#  region_East_South_Central string,
#  region_Middle_Atlantic string,
#  region_Mountain string,
#  region_New_England string,
#  region_Pacific string,
#  region_South_Atlantic string,
#  region_West_North_Central string,
#  region_West_South_Central string
 
# ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
# TBLPROPERTIES ('skip.header.line.count'='1')""".format(
#     database_name, model_table_name_csv, model_s3_path
# )

# create_table = pd.read_sql(statement, conn)
# create_table

In [187]:
# drop all rows containing nan
df_knn = df.dropna()

### Splitting the train, test, and validation first so that when the balancing takes place next, the validation and test datasets are not affected.

In [188]:
from sklearn.model_selection import train_test_split

# Splitting all data into 90% train and 10% holdout
df_train, df_holdout = train_test_split(
        df,
        test_size=0.10,
    stratify=df['target_class'])

# Splitting holdout data into 50% validation and 50% test
df_validation, df_test = train_test_split(
        df_holdout,
        test_size=0.50,
        stratify=df_holdout['target_class'])

### Creating files for each subsection of the data: train, test, and output

In [189]:
# specifying the output file path
df_train_output = "../generated_data/df_train.csv"
df_test_output = "../generated_data/df_test.csv"
df_validation_output = "../generated_data/df_validation.csv"

# saving the DataFrame to a CSV file
df_train.to_csv(df_train_output, index=False)
df_test.to_csv(df_test_output, index=False)
df_validation.to_csv(df_validation_output, index=False)


### Displaying the initial count of the target class variable in the training dataset.
We find that the class of 1 - which indicates someone was either injured or killed - is the majority class.

In [190]:
# count the number of 0s and 1s in the 'outcome' column
value_counts = df_train['target_class'].value_counts()

# print the results
print(value_counts)

1    118401
0     79910
Name: target_class, dtype: int64


### Undersampling the majority target_class of '1' 

In [191]:
df_grouped_by = df_train.groupby(["target_class"])
df_balanced = df_grouped_by.apply(
    lambda x: x.sample(df_grouped_by.size().min())\
    .reset_index(drop=True)
)

In [192]:
# showing the balanced value counts of the newly created dataframe
value_counts_balanced = df_balanced['target_class'].value_counts()

# print the results
print(value_counts_balanced)

0    79910
1    79910
Name: target_class, dtype: int64


## Neural Network

In [193]:
# Splitting all data into 90% train and 10% holdout
df_knn_train, df_knn_holdout = train_test_split(
        df_knn,
        test_size=0.10,
    stratify=df_knn['target_class'])

# Splitting holdout data into 50% validation and 50% test
df_knn_validation, df_knn_test = train_test_split(
        df_knn_holdout,
        test_size=0.50,
        stratify=df_knn_holdout['target_class'])

df_knn_grouped_by = df_knn_train.groupby(["target_class"])
df_knn_balanced = df_knn_grouped_by.apply(
    lambda x: x.sample(df_knn_grouped_by.size().min())\
    .reset_index(drop=True)
)

In [194]:
# Transfer Training Data to S3 bucket - exclude header and index
s3_client = boto3.client("s3")
BUCKET='sagemaker-us-east-1-898900188658'
BUCKET='sagemaker-us-east-1-346023323361'
KEY='knn/train/train.csv'
#response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

df_knn_y_train = df_knn_train['target_class']
df_knn_features = df_knn_train.drop(columns=['target_class'])
df_knn_features = df_knn_features.drop(columns=['suspect_age'])

# rearrange before uploading for AWS training job format
df_knn_train = df_knn_train[['target_class', 'latitude', 'longitude', 'n_guns_involved', 'group_Democrat', 'ohe_drug', 'ohe_officer', 'ohe_gang',
       'ohe_accident', 'ohe_murder', 'ohe_suicide', 'ohe_arrest', 'ohe_brandishing', 'ohe_felon', 'ohe_drive', 'ohe_home_invasion', 'ohe_stolen', 'ohe_misc', 'ohe_drugs', 'ohe_car_jacking',
       'ohe_defensive', 'ohe_robbery', 'ohe_family', 'ohe_institution', 'ohe_child', 'ohe_mass', 'ohe_domestic', 'suspect_age_group_Teen', 'suspect_age_group_Young Adult', 'suspect_age_group_Mid-Adult',
       'suspect_age_group_Adult', 'suspect_age_group_Senior', 'region_East South Central', 'region_Middle Atlantic', 'region_Mountain', 'region_New England', 'region_Pacific',
       'region_South Atlantic', 'region_West North Central', 'region_West South Central']]

#df_y_test = df_test['target_class']
#df_test = df_test.drop(columns=['target_class'])
df_knn_test = df_knn_test.drop(columns=['suspect_age'])
df_knn_y_test = df_knn_test['target_class']
df_knn_test = df_knn_test.drop(columns=['target_class'])

with io.StringIO() as csv_buffer:
    df_knn_train.to_csv(csv_buffer, index=False, header=False)

    response = s3_client.put_object(
        Bucket=BUCKET, Key=KEY, Body=csv_buffer.getvalue()
    )

In [195]:
print(df_knn_features.shape)
print(df_knn_y_train.shape)
#print(df_features.columns)
print(df_knn_train.head(5))
#df_knn_features = df_knn_features.drop(columns=['suspect_age'])
print(df_knn_train.columns)

(100150, 39)
(100150,)
        target_class  latitude  longitude  n_guns_involved  group_Democrat  \
191170             0   42.8383   -77.6414              1.0               0   
202303             0   24.5559   -81.7916              1.0               0   
160917             0   28.5257   -81.4204              1.0               1   
37589              0   29.2360   -81.0531              1.0               0   
191073             1   31.0881   -97.7398              1.0               0   

        ohe_drug  ohe_officer  ohe_gang  ohe_accident  ohe_murder  ...  \
191170         0            0         0             0           0  ...   
202303         0            0         0             0           0  ...   
160917         0            0         0             0           0  ...   
37589          0            0         0             0           0  ...   
191073         0            0         0             0           0  ...   

        suspect_age_group_Adult  suspect_age_group_Senior  \
19

In [148]:
parameters = {'solver':('adam', 'sgd'), 'activation':('relu', 'tanh'), 
              'hidden_layer_sizes':[2, 4], 'max_iter': [200, 400]} 
nn = GridSearchCV(MLPClassifier(), parameters) #finetune t find best parameters
nn.fit(df_knn_features, df_knn_y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_gr

In [61]:
nn.best_params_ #best parameters

{'activation': 'relu',
 'hidden_layer_sizes': 4,
 'max_iter': 200,
 'solver': 'adam'}

In [62]:
nn.best_score_ #best score

0.7942386420369446

In [199]:
y = df_knn_validation['target_class']
X = df_knn_validation.drop(columns=['target_class', 'suspect_age'])

y_pred = nn.best_estimator_.predict(X)
print(confusion_matrix(y, y_pred))

[[2191  607]
 [ 528 2238]]


In [202]:
accuracy_score(y, y_pred)

0.7960100647016535

# KNN

In [79]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(df_knn_features, df_knn_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [80]:
print(knn.predict(df_knn_test))
print(knn.score(df_knn_test, df_knn_y_test))

[1 1 1 ... 0 1 0]
0.7735442127965493


In [None]:
y = df_knn_validation['target_class']
X = df_knn_validation.drop(columns=['target_class', 'suspect_age'])

y_pred = nn.best_estimator_.predict(X)
print(confusion_matrix(y, y_pred))

In [None]:
accuracy_score(y, y_pred)