In [156]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd

from pyathena import connect

In [157]:
sess = sagemaker.Session() #Intialzie session

bucket = sess.default_bucket()
RD_Bucket = 'policedatasetbucket' #Raw Data
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3 = boto3.client('s3') # Create an S3 client
s3_resource = boto3.resource('s3') # Create an S3 resource


print('Bucket - > ',bucket)
print('Role - > ',role)
print('Region - > ',region)
print('S3 - > ',s3)
print('S3 Resource - > ',s3_resource)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7f379d9552d0>
S3 Resource - >  s3.ServiceResource()


In [158]:
# Function to verify if bucket exist, if not create
def verify_create_bucket(bucket_name):
    response = s3.list_buckets()
    for bucket in response['Buckets']:
        if bucket['Name'] == bucket_name:
            print(f"The {bucket_name} bucket exists.")
            break
    else:
        print(f"The {bucket_name} bucket does not exist, creating")
        !aws s3 mb s3://{bucket}/

In [159]:
verify_create_bucket(bucket)

The sagemaker-us-east-1-859074047513 bucket exists.


In [160]:
#function to download yearly data
# def Get_Data(year):
#     url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
#     df = pd.read_csv(url)
#     return df

In [161]:
# List of years since 2018
# Years = list(range(2018,datetime.now().year+1))
# Years

In [162]:
#iterate over years
# for year in Years:
#     year_df = Get_Data(year)
#     csv_buffer = StringIO()
#     year_df.to_csv(csv_buffer, index=False)
#     file = 'SDPD_Calls_' + str(year) + '.csv'
#     s3_resource.Object(bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',bucket,'bucket')

In [163]:
# def URL_2_Bucket(url,file_name):
#     df = pd.read_csv(url, low_memory=False)
#     csv_buffer = StringIO()
#     df.to_csv(csv_buffer, index=False)
#     file = file_name + '.csv'
#     s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',RD_Bucket,'bucket')
    

In [164]:
# type_url = f"http://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"
# dipo_url = f"http://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"
# ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"
# ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"

In [165]:
# URL_2_Bucket(type_url,'Type')
# URL_2_Bucket(dipo_url,'Dispo')
# URL_2_Bucket(ripa_stops_url,'Ripa_Stops')
# URL_2_Bucket(ripa_stops_dic,'Ripa_Stops_Dic')

In [166]:
ingest_create_athena_db_passed = False

# Create Athena Database

In [167]:
database_name = "sd_police_db"

In [168]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [169]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)


In [170]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS sd_police_db


In [171]:
import pandas as pd

pd.read_sql(statement, conn)

In [172]:
# Show databases
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sd_police_db


# Drop Database if Needed

In [173]:
# drop_db_name = "dsoaw"
# drop_db_query = f"DROP DATABASE IF EXISTS {drop_db_name}"

# df_show = pd.read_sql(drop_db_query, conn)
# df_show.head()

# Drop Table if Needed

In [174]:
# drop_table_name = "table_sd_ripa" 

# drop_table_query = f"DROP TABLE IF EXISTS {database_name}.{drop_table_name}"

# df_show1 = pd.read_sql(drop_table_query, conn)
# df_show1.head()

In [175]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [176]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# S3 Bucket Folder Information for 2021

In [177]:
s3_private_path_csv = "s3://{}/policedatasetsd/csv".format(bucket)
print(s3_private_path_csv)

s3://sagemaker-us-east-1-859074047513/policedatasetsd/csv


In [178]:
table_name = "table_sd_2021"

In [179]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name, s3_private_path_csv)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2021(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [180]:
import pandas as pd

pd.read_sql(statement, conn)

# Varify Tables in a Specified Database

In [181]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023


In [206]:
pd.read_sql(statement, conn)

df_2021 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name}", conn)

# S3 Bucket Folder Information for 2022

In [183]:
s3_private_path_csv1 = "s3://{}/policedatasetsd22/csv".format(bucket)
print(s3_private_path_csv1)

s3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv


In [184]:
table_name1 = "table_sd_2022"

In [185]:
# SQL statement to execute
statement_1 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name1, s3_private_path_csv1)

print(statement_1)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2022(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [186]:
import pandas as pd

pd.read_sql(statement_1, conn)

In [187]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023


In [208]:
pd.read_sql(statement_1, conn)

df_2022 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name1}", conn)

# S3 Bucket Folder Information for 2023

In [189]:
s3_private_path_csv2 = "s3://{}/policedatasetsd23/csv".format(bucket)
print(s3_private_path_csv2)

s3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv


In [190]:
table_name2 = "table_sd_2023"

In [191]:
# SQL statement to execute
statement_2 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name2, s3_private_path_csv2)

print(statement_2)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2023(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [192]:
import pandas as pd

pd.read_sql(statement_2, conn)

In [193]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023


In [209]:
pd.read_sql(statement_2, conn)

df_2023 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name2}", conn)

# S3 Bucket Folder Information for RIPA Dataset

In [195]:
s3_private_path_csv_ripa = "s3://{}/ripapolicedatasetsd/csv".format(bucket)
print(s3_private_path_csv_ripa)

s3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv


In [196]:
table_name3 = "table_sd_ripa"

In [197]:
# SQL statement to execute
statement_3 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
WITH SERDEPROPERTIES ("separatorChar" = ",") 
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name3, s3_private_path_csv_ripa)


print(statement_3)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_ripa(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
WITH SERDEPROPERTIES ("separatorChar" = ",") 
LOCATION 's3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [198]:
import pandas as pd

pd.read_sql(statement_3, conn)

In [199]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [210]:
pd.read_sql(statement_3, conn)

ripa_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name3}", conn)

# Join The Three datasets 2021, 2022, 2023

In [213]:
# shape of the dataframes
print(f'shape of the dataframe police dataframe {df_2021.shape}')
print(f'shape of the dataframe police dataframe {df_2022.shape}')
print(f'shape of the dataframe police dataframe {df_2023.shape}')

shape of the dataframe police dataframe (568947, 14)
shape of the dataframe police dataframe (499256, 14)
shape of the dataframe police dataframe (99811, 14)


In [231]:
join_df = pd.concat([df_2021, df_2022, df_2023], axis=0)
join_df

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E21060046805,2021-06-28 07:36:28,2,5200,,52ND,PL,,,,SELENF,O,826,3
1,E21060046806,2021-06-28 07:36:33,2,3400,,SPORTS ARENA,BLV,,,,459A,K,611,3
2,E21060046807,2021-06-28 07:37:42,2,6200,,MADELINE,ST,,,,586,O,821,2
3,E21060046808,2021-06-28 07:38:22,2,1700,,02ND,AVE,,,,FU,O,529,2
4,E21060046809,2021-06-28 07:40:14,2,9200,,AERO,DR,,,,COURT,K,311,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [232]:
# Data types 
join_df.dtypes

incident_num                 object
date_time                    object
day_of_week                   int64
address_number_primary        int64
address_dir_primary          object
address_road_primary         object
address_sfx_primary          object
address_dir_intersecting     object
address_road_intersecting    object
address_sfx_intersecting     object
call_type                    object
disposition                  object
beat                          int64
priority                      int64
dtype: object

In [233]:
from datetime import datetime

join_df["date_time"] = pd.to_datetime(join_df["date_time"])
join_df

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E21060046805,2021-06-28 07:36:28,2,5200,,52ND,PL,,,,SELENF,O,826,3
1,E21060046806,2021-06-28 07:36:33,2,3400,,SPORTS ARENA,BLV,,,,459A,K,611,3
2,E21060046807,2021-06-28 07:37:42,2,6200,,MADELINE,ST,,,,586,O,821,2
3,E21060046808,2021-06-28 07:38:22,2,1700,,02ND,AVE,,,,FU,O,529,2
4,E21060046809,2021-06-28 07:40:14,2,9200,,AERO,DR,,,,COURT,K,311,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [234]:
join_df.dtypes

incident_num                         object
date_time                    datetime64[ns]
day_of_week                           int64
address_number_primary                int64
address_dir_primary                  object
address_road_primary                 object
address_sfx_primary                  object
address_dir_intersecting             object
address_road_intersecting            object
address_sfx_intersecting             object
call_type                            object
disposition                          object
beat                                  int64
priority                              int64
dtype: object

In [236]:
join_df.sort_values(by="date_time", ascending = True)

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
8941,E21010000001,2021-01-01 00:00:11,6,5000,,CHAPARRAL,WAY,,,,AU1,W,326,1
8942,E21010000002,2021-01-01 00:00:14,6,500,,05TH,AVE,,,,1186,CAN,523,3
8943,E21010000003,2021-01-01 00:00:26,6,3600,,38TH,ST,,,,AU1,DUP,839,1
8944,E21010000004,2021-01-01 00:00:31,6,2700,,WORDEN,ST,,,,AU1,W,613,1
8945,E21010000006,2021-01-01 00:01:20,6,800,,SAN DIEGO,PL,,,,INFO,W,121,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [238]:
# get unique values of the address_road_primary
join_df["address_road_primary"].nunique()

12705