# Set up 

## Check Environment 

In [2]:
import boto3

# Build a single boto3 session and derive the region and both service clients
# from it, rather than constructing a fresh default session for every lookup.
session = boto3.session.Session()
region = session.region_name

ec2 = session.client(service_name="ec2", region_name=region)
sm = session.client(service_name="sagemaker", region_name=region)

## Update IAM Roles and Policies

In [3]:
import sagemaker
import time
from time import gmtime, strftime

from botocore.config import Config

# SageMaker session and the IAM execution role attached to this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket and AWS region used by the rest of the notebook.
bucket = 'sagemaker-studio-458903497716-h2kl4ff3dz'
region = boto3.Session().region_name

# IAM client that retries throttled calls adaptively, up to 10 attempts.
config = Config(
    retries={"max_attempts": 10, "mode": "adaptive"},
)
iam = boto3.client("iam", config=config)

In [4]:
# The role name is whatever follows the last "/" in the execution role string.
role_name = role.rsplit("/", 1)[-1]

print(f"Role name: {role_name}")

Role name: LabRole


## Import Libraries

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
! pip install descartes
! pip install geopandas
import geopandas as gpd
from geopandas import GeoDataFrame, points_from_xy

[0m

# Create Database Schema in Athena

In [6]:
import boto3
import sagemaker

# Context for the Athena section: region, execution role, data bucket, and a
# SageMaker session.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
bucket = 'sagemaker-studio-458903497716-h2kl4ff3dz'
sess = sagemaker.Session()

In [7]:
# Show which S3 bucket the Athena staging directory and data path are built from.
print(bucket)

sagemaker-studio-458903497716-h2kl4ff3dz


In [8]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

[0m

In [9]:
# Name of the Athena database; interpolated into the CREATE DATABASE statement.
database_name = "ads508"

In [10]:
# Temporary S3 location that Athena uses for query output.
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [11]:
# Open the PyAthena connection for this region, using s3_staging_dir as the staging location.
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [12]:
# Create the database named by database_name unless it already exists.
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
create_db = pd.read_sql(statement, conn)
create_db

Failed to execute query.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pyathena/common.py", line 305, in _execute
    **request
  File "/opt/conda/lib/python3.7/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 379, in __call__
    do = self.iter(retry_state=retry_state)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 314, in iter
    return fut.result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 428, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 382, in __call__
    result = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 530, in _api_call
    return

DatabaseError: Execution failed on sql: CREATE DATABASE IF NOT EXISTS ads508
An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: Unable to verify/create output bucket sagemaker-studio-458903497716-h2kl4ff3dz
unable to rollback

In [None]:
# Verify database creation by listing the databases returned over this connection.
q = "SHOW DATABASES"
db_show = pd.read_sql(q, conn)
db_show

## Create Athena Table

In [None]:
# Athena settings: target database, table name, and the S3 prefix used as the
# table's LOCATION.
database_name = "ads508"
table_name_csv = "gun_violence"
s3_path = f"s3://{bucket}/data"
print(s3_path)

In [None]:
# Drop any previous definition of the table so the CREATE statement that follows
# always applies the current schema.
# - IF EXISTS keeps this cell from failing when the table is not there yet.
# - The name is built from database_name / table_name_csv so it matches the
#   table this notebook creates; the previously hard-coded "gun_violence_csv"
#   is not created by any visible cell.
statement = "DROP TABLE IF EXISTS {}.{}".format(database_name, table_name_csv)
execute = pd.read_sql(statement, conn)

In [None]:
# SQL statement to execute: register the files under s3_path as an external
# Athena table.
# - `date` is escaped with backticks because DATE is a reserved keyword in
#   Athena DDL statements.
# - latitude / longitude are DOUBLE: they hold decimal degrees, which an INT
#   column cannot represent.
# NOTE(review): FIELDS TERMINATED BY ',' splits on every comma, including commas
# inside quoted text fields — confirm the CSV has none, or consider OpenCSVSerde.
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         incident_id int,
         `date` date,
         state string,
         city_or_county string,
         address string,
         n_killed int,
         n_injured int,
         incident_url string,
         source_url string,
         incident_url_fields_missing string,
         congressional_district int,
         gun_stolen string,
         gun_type string,
         incident_characteristics string,
         latitude double,
         location_description string,
         longitude double,
         n_guns_involved int,
         notes string,
         participant_age string,
         participant_age_group string,
         participant_gender string,
         participant_name string,
         participant_relationship string,
         participant_status string,
         participant_type string,
         sources string,
         state_house_district int,
         state_senate_district int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_path
)

create_table = pd.read_sql(statement, conn)
create_table

In [None]:
# List the tables registered in the ads508 database to confirm the table exists.
statement = "SHOW TABLES in ads508"
tables = pd.read_sql(statement, conn)
tables

In [None]:
# Preview the Athena table: fetch up to 100 rows and display the first 5.
# NOTE: `df` is reassigned later when the CSV is read directly from S3.
statement = "SELECT * from ads508.gun_violence LIMIT 100"
df = pd.read_sql(statement, conn)
df.head(5)

# Insert Dataset into Pandas Dataframe

In [13]:
# Load the raw CSV straight from S3 into pandas. The path is derived from
# `bucket` rather than repeating the literal bucket name.
df = pd.read_csv("s3://{}/data/gun_violence.csv".format(bucket))
df.head()

PermissionError: Forbidden

In [None]:
# (rows, columns) of the loaded DataFrame
print(df.shape)

### The dataset contains 239,677 rows and 29 attributes.

## Show null values

In [None]:
# Number of missing values in each column
df.isnull().sum()

## Show percentage of null values

In [None]:
# Percentage of missing values per column, computed from the DataFrame itself
# instead of hand-copied null counts, so the figures stay correct if the data
# changes. Only columns with at least one null are listed, in column order.
rows = len(df.index)
null_counts = df.isnull().sum()
for column, n_missing in null_counts[null_counts > 0].items():
    # float() keeps the printed value a plain Python number, e.g. "6.88%"
    print("{}: {}%".format(column, round(float(n_missing) / rows * 100, 2)))


## Show data types

In [None]:
# pandas dtype of each column
df.dtypes

# Use GeoPandas to map all gun violence incidents

In [None]:
# Load the "naturalearth_lowres" dataset located via gpd.datasets and preview it.
# NOTE(review): `countries` is not referenced by any later cell visible here.
countries = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
countries.head()

In [None]:
# Wrap df in a GeoDataFrame by building a point geometry from each row's
# longitude (x) and latitude (y).
# The CRS is declared as EPSG:4326 (WGS 84 longitude/latitude in degrees) so the
# layer carries its coordinate system when it is overlaid on other layers.
# NOTE(review): assumes the coordinates are WGS 84 degrees — confirm against the
# data source.
df_geo = GeoDataFrame(
    df,
    geometry=points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326",
)
df_geo.head()

In [None]:
# Set the default figure size to 25x25 via rcParams (this also applies to figures
# created afterwards), then draw every incident point in red.
plt.rcParams['figure.figsize'] = [25,25]
df_geo.plot(color='red')
plt.show()

### There are 4 incidents around latitude 40, longitude 100 that appear to fall outside the US. We will explore these further.

In [None]:
# Fetch up to 10 rows from the Athena table whose longitude is between 80 and 100
# and display the first 5.
statement = """SELECT * FROM ads508.gun_violence WHERE longitude BETWEEN 80 and 100 LIMIT 10"""
execute=pd.read_sql(statement, conn)
execute.head()

It seems that these entries did occur in the US (specifically in North Carolina, Nebraska, Kansas, and Missouri). Maybe something went wrong during mapping?

In [None]:
# Read the layer stored under the local 'north_america_shapefile/' directory and
# draw it in light grey.
north_america = GeoDataFrame.from_file('north_america_shapefile/')
north_america.plot(color='lightgrey')
plt.show()

In [None]:
# Draw the light-grey base layer, then overlay the incident points in red on the
# same axes.
axes = north_america.plot(color='lightgrey')
df_geo.plot(ax=axes, color='red')
plt.show()

In [None]:
### Parse gun_type column and display histogram of gun type frequency

In [None]:
###############
# Gun Type
###############
import re
import math
from collections import Counter

# One raw-string pattern covers both entry shapes that the earlier two-pass
# search handled:
#   "<n>::<word>"          -> <word>
#   "<n>::<digits> <word>" -> <word>   (numeric prefix skipped)
# Matching both in a single pass keeps every entry of a row. Before, a row with
# at least one prefixed entry never reached the fallback pattern, so its
# un-prefixed entries were silently dropped.
GUN_ENTRY = re.compile(r"\d*::(?:\d* )?(?P<gun>\w*)")

guns_list = []

# dropna() skips rows with no gun_type value
for item in df['gun_type'].dropna():
    guns_list.extend(GUN_ENTRY.findall(str(item)))

# Tally the types and print the counts instead of dumping the whole list.
gun_counts = Counter(guns_list)
ranked = gun_counts.most_common()
for gun, count in ranked:
    print("{}: {}".format(gun, count))

# One bar per gun type. plt.hist on a list of strings uses 10 equal-width bins
# by default, which merges neighbouring categories when there are more than 10.
plt.bar([gun for gun, _ in ranked], [count for _, count in ranked])
plt.xticks(rotation="vertical")
plt.xlabel("Gun type")
plt.ylabel("Count")
plt.title("Gun type frequency")
plt.show()