In [2]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd

from pyathena import connect

In [3]:
# Initialize the SageMaker session used throughout the notebook.
sess = sagemaker.Session()

# Buckets: the session's default bucket and the bucket holding the raw data.
bucket = sess.default_bucket()
RD_Bucket = "policedatasetbucket"

# Execution role and region for the AWS calls made below.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 client and S3 resource handles.
s3 = boto3.client("s3")
s3_resource = boto3.resource("s3")

# Echo every resolved setting, one labelled line each.
for label, value in (
    ("Bucket", bucket),
    ("Role", role),
    ("Region", region),
    ("S3", s3),
    ("S3 Resource", s3_resource),
):
    print(f"{label} - > ", value)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7f379c4258d0>
S3 Resource - >  s3.ServiceResource()


In [4]:
# Function to verify if bucket exist, if not create
def verify_create_bucket(bucket_name):
    response = s3.list_buckets()
    for bucket in response['Buckets']:
        if bucket['Name'] == bucket_name:
            print(f"The {bucket_name} bucket exists.")
            break
    else:
        print(f"The {bucket_name} bucket does not exist, creating")
        !aws s3 mb s3://{bucket}/

In [5]:
# Make sure the session's default bucket exists before it is used below.
verify_create_bucket(bucket)

The sagemaker-us-east-1-859074047513 bucket exists.


In [None]:
#function to download yearly data
# def Get_Data(year):
#     url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
#     df = pd.read_csv(url)
#     return df

In [None]:
# List of years since 2018
# Years = list(range(2018,datetime.now().year+1))
# Years

In [None]:
#iterate over years
# for year in Years:
#     year_df = Get_Data(year)
#     csv_buffer = StringIO()
#     year_df.to_csv(csv_buffer, index=False)
#     file = 'SDPD_Calls_' + str(year) + '.csv'
#     s3_resource.Object(bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',bucket,'bucket')

In [None]:
# def URL_2_Bucket(url,file_name):
#     df = pd.read_csv(url, low_memory=False)
#     csv_buffer = StringIO()
#     df.to_csv(csv_buffer, index=False)
#     file = file_name + '.csv'
#     s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',RD_Bucket,'bucket')
    

In [None]:
# type_url = f"http://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"
# dipo_url = f"http://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"
# ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"
# ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"

In [None]:
# URL_2_Bucket(type_url,'Type')
# URL_2_Bucket(dipo_url,'Dispo')
# URL_2_Bucket(ripa_stops_url,'Ripa_Stops')
# URL_2_Bucket(ripa_stops_dic,'Ripa_Stops_Dic')

In [None]:
# Flag recording whether the Athena database was found. It starts as False and
# is only set to True by a later cell once the database appears in the listing.
ingest_create_athena_db_passed = False

# Create Athena Database

In [11]:
# Name of the Athena database that the following cells create and query.
database_name = "sd_police_db"

In [12]:
# Temporary S3 location, inside the default bucket, that Athena queries use as
# their staging directory.
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [13]:
# Open the PyAthena connection for the notebook's region, pointing it at the
# staging directory defined above. Every query below goes through `conn`.
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)


In [14]:
# DDL that creates the database only when it is not already present.
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
print(statement)

CREATE DATABASE IF NOT EXISTS sd_police_db


In [15]:
# Execute the CREATE DATABASE statement built in the previous cell.
# pandas is imported once in the notebook's import cell, so it is not
# re-imported here.
pd.read_sql(statement, conn)

In [18]:
# Show databases
statement = "SHOW DATABASES"

# List the databases visible through this connection and preview the first rows.
df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sd_police_db


# Drop Database if Needed

In [17]:
# NOTE(review): the database listing captured in this notebook shows "dsoaws",
# not "dsoaw" — confirm which name is meant before relying on this cell.
drop_db_name = "dsoaw"
drop_db_query = f"DROP DATABASE IF EXISTS {drop_db_name}"

# Store the result under its own name so the SHOW DATABASES listing held in
# `df_show` is not overwritten by this statement's result.
df_drop_db = pd.read_sql(drop_db_query, conn)
df_drop_db.head()

# Record Whether the Database Exists

In [19]:
# Query the catalog right here instead of reusing a DataFrame left over from
# an earlier cell, so the check reflects the databases that exist at this point.
df_databases = pd.read_sql("SHOW DATABASES", conn)

# True only when the database name appears somewhere in the listing.
ingest_create_athena_db_passed = database_name in df_databases.values

In [20]:
# Persist the flag with IPython's %store magic so it can be restored later
# with `%store -r`.
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# S3 Bucket Folder Information

In [27]:
# S3 prefix, inside the default bucket, used as the table location below.
s3_private_path_csv = f"s3://{bucket}/policedatasetsd/csv"
print(s3_private_path_csv)

s3://sagemaker-us-east-1-859074047513/policedatasetsd/csv


In [39]:
# Name of the external table created inside `database_name` by the next cell.
table_name = "table_sd_2021"

In [40]:
# DDL for the external table: a comma-delimited layout stored under
# `s3_private_path_csv`, with the first (header) line of each file skipped.
# The statement is assembled from adjacent string literals, one per SQL line.
statement = (
    f"CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(\n"
    "incident_num string,\n"
    "date_time string,\n"
    "day_of_week int,\n"
    "address_number_primary int,\n"
    "address_dir_primary string,\n"
    "address_road_primary string,\n"
    "address_sfx_primary string,\n"
    "address_dir_intersecting string,\n"
    "address_road_intersecting string,\n"
    "address_sfx_intersecting string,\n"
    "call_type string,\n"
    "disposition string,\n"
    "beat int,\n"
    "priority int) \n"
    "\n"
    f"ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{s3_private_path_csv}'\n"
    "TBLPROPERTIES ('skip.header.line.count'='1')"
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2021(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [41]:
# Execute the CREATE EXTERNAL TABLE statement built in the previous cell.
# pandas is imported once in the notebook's import cell, so it is not
# re-imported here.
pd.read_sql(statement, conn)

# Verify Tables in a Specified Database

In [42]:
# List the tables registered in the database and preview the first rows.
database_name = "sd_police_db"
statement = f"SHOW TABLES in {database_name}"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021


In [46]:
# Preview the first ten rows of the external table. This is the only query in
# the cell: its result is the cell's displayed output.
pd.read_sql(f"SELECT * FROM {database_name}.{table_name} limit 10", conn)

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E21010000001,2021-01-01 00:00:11,6,5000,,CHAPARRAL,WAY,,,,AU1,W,326,1
1,E21010000002,2021-01-01 00:00:14,6,500,,05TH,AVE,,,,1186,CAN,523,3
2,E21010000003,2021-01-01 00:00:26,6,3600,,38TH,ST,,,,AU1,DUP,839,1
3,E21010000004,2021-01-01 00:00:31,6,2700,,WORDEN,ST,,,,AU1,W,613,1
4,E21010000006,2021-01-01 00:01:20,6,800,,SAN DIEGO,PL,,,,INFO,W,121,2
5,E21010000007,2021-01-01 00:01:41,6,8500,,LAKE MURRAY,BLV,,,,FIREHZRD,CAN,324,2
6,E21010000008,2021-01-01 00:02:19,6,5400,,CHURCHWARD,ST,,,,1131A,U,432,1
7,E21010000009,2021-01-01 00:03:06,6,0,,INFO LOG,,,,,503CAR,W,-1,2
8,E21010000010,2021-01-01 00:03:38,6,0,,SHOPS INFO LOG,,,,,SHOPS,W,-1,4
9,E21010000011,2021-01-01 00:03:44,6,0,,38TH,ST,,UNIVERSITY,,AU1,DUP,838,1


# Drop Table if Needed

In [37]:
# NOTE(review): this is the same table name that is created earlier in the
# notebook, so running every cell top to bottom ends by dropping that table —
# confirm this is intended before using "Run All".
drop_table_name = "table_sd_2021" 

# Fully qualified DROP statement; IF EXISTS is part of the statement text.
drop_table_query = f"DROP TABLE IF EXISTS {database_name}.{drop_table_name}"

# Execute the DROP statement and show whatever result rows come back.
df_show1 = pd.read_sql(drop_table_query, conn)
df_show1.head()