# Load data in Athena

***

## Libraries

In [4]:
import boto3
import sagemaker
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
import pandas as pd

[0m

## Variables

In [5]:
# Names of the Athena database and of the S3 bucket used throughout the notebook.
db_name = "sdpd"
Bucket = 'sdpd-bucket'

# Region of the default boto3 session; the Athena connection is opened in it.
region = boto3.Session().region_name

# S3 prefix handed to PyAthena as its staging directory.
s3_staging_dir = f"s3://{Bucket}/athena/staging"
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

# SageMaker execution role plus low-level and resource-style S3 handles.
role = sagemaker.get_execution_role()
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

# Echo the resolved settings so the cell output documents the run.
print(f'S3_staging_dir - >  {s3_staging_dir}')
print(f'Conn - >  {conn}')
print(f'Region - >  {region}')


S3_staging_dir - >  s3://sdpd-bucket/athena/staging
Conn - >  <pyathena.connection.Connection object at 0x7f88515c5cd0>
Region - >  us-east-1


## Database

### Create

In [6]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(db_name)
pd.read_sql(statement, conn)

statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
print(df_show.head(5))

if db_name in df_show.values:
    ingest_create_athena_db_passed = True
    
%store ingest_create_athena_db_passed

  database_name
0       default
1        dsoaws
2          sdpd
3       watersd
Stored 'ingest_create_athena_db_passed' (bool)


### DB Setup

In [7]:
# Storage clause appended to a CREATE EXTERNAL TABLE statement: tab-delimited
# rows under the bucket's SQL/ prefix, gzip compression, header row skipped.
# NOTE: the next cell assigns DB_location again before anything reads it, so
# this value is never used as the notebook stands.
# The '\t' and '\n' escapes are interpreted by Python, so the SQL text carries
# a literal tab and a literal newline between the quotes.
DB_location = "\n".join([
    "",
    "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ",
    "LINES TERMINATED BY '\n' ",
    "LOCATION 's3://{}/SQL'".format(Bucket),
    "TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')",
    "",
])

In [8]:
# Storage clause appended to the CREATE EXTERNAL TABLE statements below:
# tab-delimited rows, newline-terminated lines, data located at the bucket root.
# The '\t' and '\n' escapes are interpreted by Python, so the SQL text carries
# a literal tab and a literal newline between the quotes.
# NOTE(review): Header() splits the file header on ',' and the later
# Calls_SDPD_2023 table uses field.delim ',', so the source files look
# comma-separated; confirm that a tab delimiter is intended here.
# NOTE(review): the bucket root also contains the athena/staging prefix used as
# the query staging directory; confirm this LOCATION is intended.
DB_location = "\n".join([
    "",
    "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ",
    "LINES TERMINATED BY '\n' ",
    "LOCATION 's3://{}'".format(Bucket),
    "",
])

In [9]:
def Header(file, chunk_size=1000):
    """Return the column names from the first line of a CSV object in S3.

    The object is fetched from the notebook-level ``Bucket`` with the
    notebook-level ``s3`` client and read in ``chunk_size``-byte pieces until
    the first newline (or the end of the object) has been seen, so a header
    longer than one chunk is no longer cut short.

    Parameters
    ----------
    file : str
        Key of the object inside ``Bucket``.
    chunk_size : int, optional
        Number of bytes requested per read; defaults to 1000.

    Returns
    -------
    list of str
        Column names in file order. An empty first line yields ``[]``.
    """
    import csv  # local import: only this helper needs it

    obj = s3.get_object(Bucket=Bucket, Key=file)
    body = obj['Body']
    buffer = b''
    try:
        # Keep reading until the header line is complete or the data runs out.
        while b'\n' not in buffer:
            chunk = body.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
    finally:
        # Release the underlying stream instead of leaving it half-read.
        body.close()

    # Split on the raw newline byte before decoding so a multi-byte character
    # can never be cut in half; 'utf-8-sig' drops a leading BOM and the
    # rstrip removes the '\r' left behind by CRLF line endings.
    header = buffer.split(b'\n', 1)[0].decode('utf-8-sig').rstrip('\r')

    # csv.reader also handles quoted column names that plain split(',') would break.
    return next(csv.reader([header]), [])

In [10]:
def SQL_Table_Create(db_name,table_name,file):
    """Build the column part of a CREATE EXTERNAL TABLE statement from a file header.

    Every column returned by ``Header(file)`` is declared as ``string``. The
    result stops at the closing parenthesis of the column list; storage
    clauses (ROW FORMAT, LOCATION, ...) are left for the caller to append.

    Parameters
    ----------
    db_name : str
        Database that will own the table.
    table_name : str
        Name of the table to create.
    file : str
        S3 key passed to ``Header`` to obtain the column names.

    Returns
    -------
    str
        ``CREATE EXTERNAL TABLE IF NOT EXISTS <db>.<table> (<col> string,...)``.

    Raises
    ------
    ValueError
        If the header has no columns or contains a blank column name. The
        previous implementation returned malformed SQL in that case (with no
        columns, its trailing-comma slice removed the opening parenthesis).
    """
    columns = list(Header(file))
    if not columns or any(not col.strip() for col in columns):
        raise ValueError(f"No usable column names found in header of {file!r}: {columns!r}")

    # join() puts commas only between definitions, so nothing has to be trimmed afterwards.
    column_defs = ",".join(f"{col} string" for col in columns)
    return f"CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}.{table_name} ({column_defs})"

In [11]:
# Preview the DDL generated from the header of the 2023 calls file.
SQL_Table_Create(db_name=db_name, table_name='calls', file='SDPD_Calls_2023.csv')

'CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.calls (incident_num string,date_time string,day_of_week string,address_number_primary string,address_dir_primary string,address_road_primary string,address_sfx_primary string,address_dir_intersecting string,address_road_intersecting string,address_sfx_intersecting string,call_type string,disposition string,beat string,priority string)'

In [12]:
# Preview the DDL generated from the header of the RIPA stops file.
SQL_Table_Create(db_name=db_name, table_name='Ripa_Stops', file='Ripa_Stops.csv')

'CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.Ripa_Stops (stop_id string,ori string,agency string,exp_years string,date_stop string,time_stop string,stopduration string,stop_in_response_to_cfs string,officer_assignment_key string,assignment string,intersection string,address_block string,land_mark string,address_street string,highway_exit string,isschool string,school_name string,address_city string,beat string,beat_name string,pid string,isstudent string,perceived_limited_english string,perceived_age string,perceived_gender string,gender_nonconforming string,gend string,gend_nc string,perceived_lgbt string)'

In [13]:
# Column list of the sdpd.calls external table, one definition per line.
# Every column is a string except the trailing `year`, declared int.
# NOTE(review): `year` is not among the columns that SQL_Table_Create printed
# for SDPD_Calls_2023.csv; confirm where its values are expected to come from.
SQL_Calls = "CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.calls (" + ",\n".join(
    f"{name} {sql_type}"
    for name, sql_type in [
        ("incident_num", "string"),
        ("date_time", "string"),
        ("day_of_week", "string"),
        ("address_number_primary", "string"),
        ("address_dir_primary", "string"),
        ("address_road_primary", "string"),
        ("address_sfx_primary", "string"),
        ("address_dir_intersecting", "string"),
        ("address_road_intersecting", "string"),
        ("address_sfx_intersecting", "string"),
        ("call_type", "string"),
        ("disposition", "string"),
        ("beat", "string"),
        ("priority", "string"),
        ("year", "int"),
    ]
) + ")"

In [14]:
# Echo the calls DDL prefix so the SQL text can be inspected before it is run.
SQL_Calls

'CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.calls (incident_num string,\ndate_time string,\nday_of_week string,\naddress_number_primary string,\naddress_dir_primary string,\naddress_road_primary string,\naddress_sfx_primary string,\naddress_dir_intersecting string,\naddress_road_intersecting string,\naddress_sfx_intersecting string,\ncall_type string,\ndisposition string,\nbeat string,\npriority string,\nyear int)'

In [15]:
# Column list of the sdpd.Ripa_Stops external table; every column is a string.
# The statement text keeps its leading space.
SQL_Stop = " CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.Ripa_Stops (" + ",\n".join(
    f"{column} string"
    for column in (
        "stop_id",
        "ori",
        "agency",
        "exp_years",
        "date_stop",
        "time_stop",
        "stopduration",
        "stop_in_response_to_cfs",
        "officer_assignment_key",
        "assignment",
        "intersection",
        "address_block",
        "land_mark",
        "address_street",
        "highway_exit",
        "isschool",
        "school_name",
        "address_city",
        "beat",
        "beat_name",
        "pid",
        "isstudent",
        "perceived_limited_english",
        "perceived_age",
        "perceived_gender",
        "gender_nonconforming",
        "gend",
        "gend_nc",
        "perceived_lgbt",
    )
) + ")"

In [16]:
# Echo the RIPA stops DDL prefix so the SQL text can be inspected before it is run.
SQL_Stop

' CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.Ripa_Stops (stop_id string,\nori string,\nagency string,\nexp_years string,\ndate_stop string,\ntime_stop string,\nstopduration string,\nstop_in_response_to_cfs string,\nofficer_assignment_key string,\nassignment string,\nintersection string,\naddress_block string,\nland_mark string,\naddress_street string,\nhighway_exit string,\nisschool string,\nschool_name string,\naddress_city string,\nbeat string,\nbeat_name string,\npid string,\nisstudent string,\nperceived_limited_english string,\nperceived_age string,\nperceived_gender string,\ngender_nonconforming string,\ngend string,\ngend_nc string,\nperceived_lgbt string)'

In [17]:
# Show the complete statement (column list followed by the storage clause) exactly as it will be sent.
"".join([SQL_Calls, DB_location])

"CREATE EXTERNAL TABLE IF NOT EXISTS sdpd.calls (incident_num string,\ndate_time string,\nday_of_week string,\naddress_number_primary string,\naddress_dir_primary string,\naddress_road_primary string,\naddress_sfx_primary string,\naddress_dir_intersecting string,\naddress_road_intersecting string,\naddress_sfx_intersecting string,\ncall_type string,\ndisposition string,\nbeat string,\npriority string,\nyear int)\nROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' \nLINES TERMINATED BY '\n' \nLOCATION 's3://sdpd-bucket'\n"

### Create Tables

In [18]:
# Run the calls DDL through the Athena connection.
pd.read_sql(sql=SQL_Calls + DB_location, con=conn)

In [19]:
# Run the RIPA stops DDL through the Athena connection.
pd.read_sql(sql=SQL_Stop + DB_location, con=conn)

In [30]:
# DDL for the comma-delimited 2023 calls table: LazySimpleSerDe with ',' as the
# field delimiter, data under the 2023/ prefix, header row skipped.
qry = (
    "\n"
    "CREATE EXTERNAL TABLE IF NOT EXISTS `sdpd`.`Calls_SDPD_2023` (\n"
    + ",\n".join(
        f"  `{column}` string"
        for column in (
            "incident_num",
            "date_time",
            "day_of_week",
            "address_number_primary",
            "address_dir_primary",
            "address_road_primary",
            "address_sfx_primary",
            "address_dir_intersecting",
            "address_road_intersecting",
            "address_sfx_intersecting",
            "call_type",
            "disposition",
            "beat",
            "priority",
        )
    )
    + "\n)\n"
    "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'\n"
    "WITH SERDEPROPERTIES ('field.delim' = ',')\n"
    "STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' "
    "OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'\n"
    "LOCATION 's3://sdpd-bucket/2023/'\n"
    "TBLPROPERTIES ('classification' = 'csv', 'skip.header.line.count'='1');\n"
)

In [31]:
# Run the Calls_SDPD_2023 DDL through the Athena connection.
pd.read_sql(sql=qry, con=conn)

In [32]:
# List the tables registered in the database and preview the first five.
statement = f"SHOW TABLES in {db_name}"

df_show = pd.read_sql(sql=statement, con=conn)
df_show.head(n=5)

Unnamed: 0,tab_name
0,calls
1,calls_2023
2,calls_sdpd_2023
3,ripa_stops


In [33]:
# Sample query against the comma-delimited calls table, capped at 100 rows.
statement = "SELECT * FROM {database}.{table}\n     LIMIT 100".format(
    database=db_name,
    table='calls_sdpd_2023',
)

# Print the final SQL so the executed text is recorded in the cell output.
print(statement)

SELECT * FROM sdpd.calls_sdpd_2023
     LIMIT 100


In [34]:
# Execute the sample query and display its first five rows.
df = pd.read_sql(sql=statement, con=conn)
df.head(n=5)

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E23010000001,2023-01-01 00:00:05,1,0,,05TH,AVE,,G,,FD,CAN,523,2
1,E23010000002,2023-01-01 00:00:30,1,0,,SHOPS INFO LOG,,,,,SHOPS,W,-1,4
2,E23010000003,2023-01-01 00:00:57,1,0,,05TH,AVE,,G,,FD,DUP,523,2
3,E23010000004,2023-01-01 00:01:15,1,600,,FERGUS,ST,,,,AU1,K,433,1
4,E23010000005,2023-01-01 00:02:17,1,0,,CHP INFO LOG,,,,,INFOCHP,W,-1,4
