# Dependencies

In [84]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd

from pyathena import connect

In [3]:
sess = sagemaker.Session() #Intialzie session

bucket = sess.default_bucket()
RD_Bucket = 'policedatasetbucket' #Raw Data
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3 = boto3.client('s3') # Create an S3 client
s3_resource = boto3.resource('s3') # Create an S3 resource


print('Bucket - > ',bucket)
print('Role - > ',role)
print('Region - > ',region)
print('S3 - > ',s3)
print('S3 Resource - > ',s3_resource)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7f04fb63df90>
S3 Resource - >  s3.ServiceResource()


In [4]:
# Function to verify if bucket exist, if not create
def verify_create_bucket(bucket_name):
    response = s3.list_buckets()
    for bucket in response['Buckets']:
        if bucket['Name'] == bucket_name:
            print(f"The {bucket_name} bucket exists.")
            break
    else:
        print(f"The {bucket_name} bucket does not exist, creating")
        !aws s3 mb s3://{RD_Bucket}/

In [5]:
verify_create_bucket(RD_Bucket)


The policedatasetbucket bucket exists.


In [None]:
#function to download yearly data
def Get_Data(year):
    url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
    df = pd.read_csv(url)
    return df

In [None]:
# List of years since 2018
Years = list(range(2018,datetime.now().year+1))
Years

In [None]:
#iterate over years
for year in Years:
    year_df = Get_Data(year)
    csv_buffer = StringIO()
    year_df.to_csv(csv_buffer, index=False)
    file = 'SDPD_Calls_' + str(year) + '.csv'
    s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
    print(file,'loaded in',RD_Bucket,'bucket')

In [None]:
def URL_2_Bucket(url,file_name):
    df = pd.read_csv(url, low_memory=False)
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    file = file_name + '.csv'
    s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
    print(file,'loaded in',RD_Bucket,'bucket')
    

In [None]:
type_url = f"http://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"
dipo_url = f"http://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"
ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"
ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"

In [None]:
URL_2_Bucket(type_url,'Type')
URL_2_Bucket(dipo_url,'Dispo')
URL_2_Bucket(ripa_stops_url,'Ripa_Stops')
URL_2_Bucket(ripa_stops_dic,'Ripa_Stops_Dic')

In [6]:
ingest_create_athena_db_passed = False

# Create Athena Database

In [9]:
database_name = "sdpolicedb"

In [10]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [11]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [12]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS sdpolicedb


In [13]:
pd.read_sql(statement, conn)

In [14]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sdpolicedb


In [15]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [16]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


In [79]:
policedatasetsd_path = 's3://policedatasetbucket/SDPD_Calls_2021.csv'

In [80]:
database_name = "sdpolicedb"
table_name = "SDPD_Calls_2021"

In [83]:
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)


create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name} (
             incident_num string,
             date_time date,
             day_of_week int,
             address_number_primary int,
             address_dir_primary string,
             address_road_primary string,
             address_sfx_primary string,
             address_dir_intersecting int,
             address_road_intersecting string,
             address_sfx_intersecting int,
             call_type string,
             disposition string,
             beat int,
             priority int
             )
                
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            LOCATION 's3://policedatasetbucket/policedatasetsd/SDPD_Calls_2021.csv'
            TBLPROPERTIES ('skip.header.line.count'='1')
"""

pd.read_sql(create_table, conn)

pd.read_sql(f'SELECT * FROM {database_name}.{table_name}', conn)

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority


In [77]:
# SQL statement to execute police calls for 2021
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         incident_num string,
         date_time date,
         day_of_week int,
         address_number_primary int,
         address_dir_primary string,
         address_road_primary string,
         address_sfx_primary string,
         address_dir_intersecting int,
         address_road_intersecting string,
         address_sfx_intersecting int,
         call_type string,
         disposition string,
         beat int,
         priority int
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name, policedatasetsd_path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS sdpolicedb.SDPD_Calls_2021(
         incident_num string,
         date_time date,
         day_of_week int,
         address_number_primary int,
         address_dir_primary string,
         address_road_primary string,
         address_sfx_primary string,
         address_dir_intersecting int,
         address_road_intersecting string,
         address_sfx_intersecting int,
         call_type string,
         disposition string,
         beat int,
         priority int
         
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION 's3://policedatasetbucket/policedatasetsd/'
TBLPROPERTIES ('skip.header.line.count'='1')


# Varify The table has Been Created Succesfully

In [63]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name


In [56]:
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)


DatabaseError: Execution failed on sql: SELECT * FROM sdpolicedb.SDPD_Calls_2021 LIMIT 5
SYNTAX_ERROR: line 1:15: Table awsdatacatalog.sdpolicedb.sdpd_calls_2021 does not exist
unable to rollback