# Set up 

## Check Environment 

In [52]:
import boto3

region = boto3.Session().region_name
session = boto3.session.Session()

ec2 = boto3.Session().client(service_name="ec2", region_name=region)
sm = boto3.Session().client(service_name="sagemaker", region_name=region)

## Update IAM Roles and Policies

In [53]:
import sagemaker
import time
from time import gmtime, strftime

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

from botocore.config import Config

config = Config(retries={"max_attempts": 10, "mode": "adaptive"})

iam = boto3.client("iam", config=config)

In [54]:
role_name = role.split("/")[-1]

print("Role name: {}".format(role_name))

Role name: LabRole


## Import Libraries

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
! pip install descartes
! pip install geopandas
!pip install wordcloud
import geopandas as gpd
from tqdm import tqdm  
from geopandas import GeoDataFrame, points_from_xy
from wordcloud import WordCloud

[0m

In [56]:
s3_public_path = "s3://sagemaker-studio-458903497716-h2kl4ff3dz/data"

In [57]:
%store s3_public_path

Stored 's3_public_path' (str)


In [58]:
s3_private_path = "s3://{}/gunData".format(bucket)
s3_private_path

's3://sagemaker-us-east-1-898900188658/gunData'

In [59]:
%store s3_private_path

Stored 's3_private_path' (str)


In [60]:
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --include "*" 

copy: s3://sagemaker-studio-458903497716-h2kl4ff3dz/data/sub_est2018_all.csv to s3://sagemaker-us-east-1-898900188658/gunData/sub_est2018_all.csv
copy: s3://sagemaker-studio-458903497716-h2kl4ff3dz/data/gun_violence.csv to s3://sagemaker-us-east-1-898900188658/gunData/gun_violence.csv


# Create Database Schema in Athena

In [61]:
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = '{}'.format(bucket)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
bucket

'sagemaker-us-east-1-898900188658'

In [62]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

[0m

In [63]:
#assign database name
database_name = "ads508"

In [64]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [65]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [66]:
#create new database 'ads508'
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
create_db = pd.read_sql(statement, conn)
create_db

In [67]:
#verify database creation
q = "SHOW DATABASES"
db_show = pd.read_sql(q, conn)
db_show

Unnamed: 0,database_name
0,ads508
1,default
2,dsoaws


## Create Athena Table

In [68]:
# Set Athena parameters
database_name = "ads508"
table_name_csv = "gun_violence"
s3_path = "s3://{}".format(bucket)
print(s3_path)

s3://sagemaker-us-east-1-898900188658


In [69]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         incident_id int,
         date date,
         state string,
         city_or_county string,
         address string,
         n_killed int,
         n_injured int,
         incident_url string,
         source_url string,
         incident_url_fields_missing string,
         congressional_district int,
         gun_stolen string,
         gun_type string,
         incident_characteristics string,
         latitude int,
         location_description string,
         longitude int,
         n_guns_involved int,
         notes string,
         participant_age string,
         participant_age_group string,
         participant_gender string,
         participant_name string,
         participant_relationship string,
         participant_status string,
         participant_type string,
         sources string,
         state_house_district int,
         state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_path
)

create_table = pd.read_sql(statement, conn)
create_table

In [70]:
table_name_csv_1 = "est2018"
s3_path_1 = "s3://{}/census2010_data".format(bucket)
print(s3_path_1)

s3://sagemaker-us-east-1-898900188658/census2010_data


In [71]:
# SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
    sumlev int,
    state int,
    county int,
    place int,
    cousub int,
    concit int,
    primgeo_flag int,
    funcstat int,
    name string,
    stname string,
    census2010pop int,
    estimatesbase2010 int,
    popestimates2010 int,
    popestimates2011 int,
    popestimates2012 int,
    popestimates2013 int,
    popestimates2014 int,
    popestimates2015 int,
    popestimates2016 int,
    popestimates2017 int,
    popestimates2018 int
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv_1, s3_path_1
)

create_table_1 = pd.read_sql(statement, conn)
create_table_1

In [72]:
statement = "SHOW TABLES in ads508"
tables = pd.read_sql(statement, conn)
tables

Unnamed: 0,tab_name
0,est2018
1,gun_violence


In [73]:
statement = "SELECT * from ads508.gun_violence LIMIT 100"
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district


In [74]:
statement = "SELECT * from ads508.est2018 LIMIT 100"
df_pop = pd.read_sql(statement, conn)
df_pop.head(5)

Unnamed: 0,sumlev,state,county,place,cousub,concit,primgeo_flag,funcstat,name,stname,...,estimatesbase2010,popestimates2010,popestimates2011,popestimates2012,popestimates2013,popestimates2014,popestimates2015,popestimates2016,popestimates2017,popestimates2018


# Insert Dataset into Pandas Dataframe

In [99]:
df = pd.read_csv("{}/preprocessed/df_project.csv".format(s3_path))
df.head()

Unnamed: 0.1,Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,...,state_house_district,state_senate_district,target_class,geometry,victim_count,suspect_count,n_total,suspect_age,suspect_age_value,suspect_gender
0,0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,...,,,1,POINT (-79.8559 40.3467),4.0,1.0,4,Adult 18+,,Female
1,1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,...,62.0,35.0,1,POINT (-118.333 33.909),4.0,1.0,4,,,Gender Unknown
2,2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,...,56.0,13.0,1,POINT (-82.1377 41.4455),3.0,2.0,4,Adult 18+,31.0,Male
3,3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,...,40.0,28.0,1,POINT (-104.802 39.6518),3.0,1.0,4,Adult 18+,33.0,Male
4,4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,...,62.0,27.0,1,POINT (-79.9569 36.114),3.0,1.0,4,Adult 18+,47.0,Female


In [100]:
print(df.columns)

Index(['Unnamed: 0', 'incident_id', 'date', 'state', 'city_or_county',
       'address', 'n_killed', 'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district', 'target_class', 'geometry', 'victim_count',
       'suspect_count', 'n_total', 'suspect_age', 'suspect_age_value',
       'suspect_gender'],
      dtype='object')


# Handle Null Values

In [101]:
100*(df.isnull().sum())/len(df.index)

Unnamed: 0                      0.000000
incident_id                     0.000000
date                            0.000000
state                           0.000000
city_or_county                  0.000000
address                         6.883013
n_killed                        0.000000
n_injured                       0.000000
incident_url                    0.000000
source_url                      0.195263
incident_url_fields_missing     0.000000
congressional_district          4.983373
gun_stolen                     41.513370
gun_type                       41.493760
incident_characteristics        0.136016
latitude                        3.305699
location_description           82.439283
longitude                       3.305699
n_guns_involved                41.493760
notes                          33.802576
participant_age                38.509327
participant_age_group          17.573234
participant_gender             15.171251
participant_name               51.007397
participant_rela

## Feature Engineer

In [102]:
"""
Break up the date timestamp into month, day and year columns for training
"""
df['year'] = ''
df['month'] = ''
df['day'] = ''

for i,row in df.iterrows():
    date_parts = str(row['date']).split('-')
    df.at[i, 'year'] = date_parts[0]
    df.at[i, 'month'] = date_parts[1]
    df.at[i, 'day'] = date_parts[2]

df = df.drop(columns=['date'])
#df_fitted['suspect_gender'] = df_fitted.suspect_gender.map({'Male':0, 'Female':1, 'Gender Unknown':2})
#df_fitted['suspect_age'] = df_fitted.suspect_age.map({'Adult 18+':0, 'Teen 12-17':1, 'Child 0-11':2, '':3})

## One Hot Encode Categorical Values

In [103]:
df = pd.get_dummies(df, columns=['state'], drop_first=True)
df = pd.get_dummies(df, columns=['suspect_gender'], drop_first=True)

In [104]:
"""
Bin the suspect age into custom age groups - provides more granularity than the original dataset's age group column
"""
labels=['Child','Teen', 'Young Adult', 'Mid-Adult', 'Adult', 'Senior']
df['suspect_age_group']  = pd.cut(df['suspect_age_value'], bins=[1,12,18,25,35,65,200],labels=labels)

## Impute Missing Values

In [105]:
print(df.columns)

drop_cols = ['Unnamed: 0', 'incident_id', 'city_or_county', 'address', 'incident_url', 'source_url', 'incident_url_fields_missing', 'gun_stolen', 'gun_type', 'incident_characteristics', 'latitude', 'location_description', 'longitude', 'notes', 'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status', 'participant_type', 'sources', 'geometry', 'suspect_age']
cols = ['n_killed', 'n_injured', 'state_house_district', 'state_senate_district', 'victim_count', 'suspect_count', 'n_total', 'suspect_age_value', 'suspect_gender', 'year', 'month', 'day']

Index(['Unnamed: 0', 'incident_id', 'city_or_county', 'address', 'n_killed',
       'n_injured', 'incident_url', 'source_url',
       'incident_url_fields_missing', 'congressional_district', 'gun_stolen',
       'gun_type', 'incident_characteristics', 'latitude',
       'location_description', 'longitude', 'n_guns_involved', 'notes',
       'participant_age', 'participant_age_group', 'participant_gender',
       'participant_name', 'participant_relationship', 'participant_status',
       'participant_type', 'sources', 'state_house_district',
       'state_senate_district', 'target_class', 'geometry', 'victim_count',
       'suspect_count', 'n_total', 'suspect_age', 'suspect_age_value', 'year',
       'month', 'day', 'state_Alaska', 'state_Arizona', 'state_Arkansas',
       'state_California', 'state_Colorado', 'state_Connecticut',
       'state_Delaware', 'state_District of Columbia', 'state_Florida',
       'state_Georgia', 'state_Hawaii', 'state_Idaho', 'state_Illinois',
       'stat

In [111]:
"""
suspect_age_value about 40% missing
"""
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

df_nona = df.dropna()
y = df_nona.pop("suspect_age_group")

X_train, X_test, y_train, y_test = train_test_split(df_nona, y, test_size=0.2, random_state=42)

X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

clf = HistGradientBoostingClassifier(min_samples_leaf=1,
                                       max_depth=2,
                                       learning_rate=1,
                                       max_iter=1).fit(X_train, y_train)
df_test = df[ df.notnull() ]
df_test['pred_suspect_age_group'] = clf.predict(df_test.drop(columns=['suspect_age_group']))
df_test.to_csv('result-histboost2.csv')
print("Training Score", clf.score(X_train, y_train))
print("Test Score", clf.score(X_test, y_test))

Index(['n_killed', 'n_injured', 'congressional_district', 'n_guns_involved',
       'state_house_district', 'state_senate_district', 'target_class',
       'victim_count', 'suspect_count', 'n_total', 'suspect_age_value', 'year',
       'month', 'day', 'state_Alaska', 'state_Arizona', 'state_Arkansas',
       'state_California', 'state_Colorado', 'state_Connecticut',
       'state_Delaware', 'state_District of Columbia', 'state_Florida',
       'state_Georgia', 'state_Hawaii', 'state_Idaho', 'state_Illinois',
       'state_Indiana', 'state_Iowa', 'state_Kansas', 'state_Kentucky',
       'state_Louisiana', 'state_Maine', 'state_Maryland',
       'state_Massachusetts', 'state_Michigan', 'state_Minnesota',
       'state_Mississippi', 'state_Missouri', 'state_Montana',
       'state_Nebraska', 'state_Nevada', 'state_New Hampshire',
       'state_New Jersey', 'state_New Mexico', 'state_New York',
       'state_North Carolina', 'state_North Dakota', 'state_Ohio',
       'state_Oklahoma', 'sta

ValueError: could not convert string to float: 'Mckeesport'

In [None]:
# drop columns used to feature engineer new columns

df.drop(columns=['participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type'])
df.drop(columns=['sources','notes', 'incident_characteristics'])

## Balance Dataset

## Train Test Split

## Training And Modeling

## Logistic Regression

## XGBoost