# Create Athena Tables

***

## Libraries

In [16]:
import boto3
import sagemaker
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect
import pandas as pd

[0m

## Variables

In [17]:
# Athena database that will hold the external tables.
db_name = "sdpd"
# S3 bucket that holds the source CSV files and the Athena staging prefix.
Bucket = 'sdpd-bucket2'
region = boto3.Session().region_name

# Staging location handed to the PyAthena connection.
s3_staging_dir = f"s3://{Bucket}/athena/staging"
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

role = sagemaker.get_execution_role()
s3 = boto3.client('s3')  # S3 client, used by Header() to read objects
s3_resource = boto3.resource('s3')  # S3 resource handle

# Echo the resolved settings so the run log shows what was used.
print('S3_staging_dir - > ', s3_staging_dir)
print('Conn - > ', conn)
print('Region - > ', region)


S3_staging_dir - >  s3://sdpd-bucket2/athena/staging
Conn - >  <pyathena.connection.Connection object at 0x7fc2daaaa2d0>
Region - >  us-east-1


## Functions

In [18]:
def Header(file):
    """Return the column names found on the first line of a CSV object in S3.

    Parameters
    ----------
    file : str
        Object key inside the module-level ``Bucket``.

    Returns
    -------
    list of str
        Column names in file order, with surrounding whitespace and
        surrounding double quotes removed.

    Raises
    ------
    ValueError
        If the object is empty or its first line is blank.
    """
    body = s3.get_object(Bucket=Bucket, Key=file)['Body']

    # Read chunk by chunk until the first newline shows up, so a header longer
    # than one chunk is not silently truncated.
    raw = b''
    while b'\n' not in raw:
        chunk = body.read(1000)
        if not chunk:
            break
        raw += chunk

    # Decode only the complete first line: cutting at b'\n' can never split a
    # multi-byte UTF-8 character. 'utf-8-sig' drops a leading BOM, and strip()
    # removes the '\r' left behind by CRLF line endings.
    first_line = raw.split(b'\n', 1)[0].decode('utf-8-sig').strip()
    if not first_line:
        raise ValueError(f"No header line found in s3://{Bucket}/{file}")

    # Plain comma split, matching the ',' field delimiter used for the tables.
    return [col.strip().strip('"') for col in first_line.split(',')]

In [19]:
def SQL_Tail(location):
    """Build the storage clause that follows the column list in the DDL.

    The clause declares a comma-delimited text table whose files live under
    ``location`` and whose first line is skipped as a header.

    Parameters
    ----------
    location : str
        Value interpolated verbatim into the ``LOCATION`` clause.

    Returns
    -------
    str
        The SERDE / storage format / LOCATION / TBLPROPERTIES portion of a
        ``CREATE EXTERNAL TABLE`` statement, terminated with ``;``.
    """
    return f"""
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
    WITH SERDEPROPERTIES ('field.delim' = ',')
    STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION '{location}'
    TBLPROPERTIES (
      'classification' = 'csv',
      'skip.header.line.count' = '1'
    );
    """


In [20]:
def SQL_Table_Create(db_name,table_name,location,file):
    """Build the CREATE EXTERNAL TABLE statement for one CSV-backed table.

    Column names are taken from the header line of ``file``; every column is
    declared as ``string``.

    Parameters
    ----------
    db_name : str
        Database the table is created in.
    table_name : str
        Name of the table.
    location : str
        Key prefix (folder) inside ``Bucket`` that holds the table's files.
        Leading/trailing slashes are optional.
    file : str
        File name under ``location`` whose first line supplies the columns.

    Returns
    -------
    str
        The complete DDL statement.

    Raises
    ------
    ValueError
        If no column names could be read from the header.
    """
    # Normalise once so both the header key and the table LOCATION are built
    # with exactly one '/' between parts. Appending '/' to a prefix that
    # already ends in '/' produced '.../prefix//', which is a different S3
    # prefix from the one the files are stored under.
    prefix = location.strip('/')
    key = f"{prefix}/{file}" if prefix else file
    table_uri = f"s3://{Bucket}/{prefix}/" if prefix else f"s3://{Bucket}/"

    columns = Header(key)
    if not columns:
        raise ValueError(f"No columns found in header of s3://{Bucket}/{key}")

    column_defs = ",".join(f"`{col}` string" for col in columns)
    create_table_sql = (
        f"CREATE EXTERNAL TABLE IF NOT EXISTS `{db_name}`.`{table_name}` ({column_defs})"
    )
    return create_table_sql + SQL_Tail(table_uri)

## Create

In [21]:
# Create the Athena database if it does not exist yet.
statement = "CREATE DATABASE IF NOT EXISTS {}".format(db_name)
pd.read_sql(statement, conn)

# List the databases so the creation can be verified.
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
print(df_show.head(5))

# Always assign the flag (True or False). Previously it was only set on
# success, so it stayed undefined when the database was missing.
ingest_create_athena_db_passed = bool(db_name in df_show.values)
    

  database_name
0       default
1          sdpd


## Tables

### Build CREATE TABLE statements

In [22]:
# One DDL statement per source CSV; each header line supplies the column names.
Call_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Call_Data', location='Call_Data/', file='SDPD_Calls.csv'
)
Type_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Type_Data', location='Type_Data/', file='SDPD_Type.csv'
)
Dispo_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Dispo_Data', location='Dispo_Data/', file='SDPD_Dispo.csv'
)
Stops_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Stops_Data', location='Stops_Data/', file='SDPD_Stops.csv'
)
Stops_Dic_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Stops_Dic_Data', location='Stops_Dic_Data/', file='SDPD_Stops_Dic.csv'
)

# Hourly tables live under nested prefixes.
Calls_Hr_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Calls_Hour', location='Calls/Calls_Hour/', file='Calls_Hour.csv'
)
Stops_HR_SQL = SQL_Table_Create(
    db_name=db_name, table_name='Stops_Hourly', location='Stops/Stops_Hourly/', file='Stops_Hourly.csv'
)

NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [None]:
# Submit each CREATE EXTERNAL TABLE statement through the Athena connection.
# The statements are independent of one another; they run in the order listed.
# NOTE(review): each *_SQL name must already have been built by the DDL cell;
# if that cell stopped part-way, the remaining names are undefined here.
pd.read_sql(Call_SQL, conn)
pd.read_sql(Type_SQL, conn)
pd.read_sql(Dispo_SQL, conn)
pd.read_sql(Stops_SQL, conn)
pd.read_sql(Stops_Dic_SQL, conn)
pd.read_sql(Calls_Hr_SQL, conn)
pd.read_sql(Stops_HR_SQL, conn)

In [None]:
# Preview query: 100 rows of Stops_Hourly ordered by date_time, descending.
statement = f"""SELECT * FROM {db_name}.Stops_Hourly
    order by date_time desc
     LIMIT 100
     """

print(statement)

In [None]:
# Run the preview query and show the first rows of the result.
df = pd.read_sql(sql=statement, con=conn)
df.head(n=5)

<div class="alert alert-block alert-success">
<b>End:</b> Athena Tables Created
</div>

In [None]:
# Row-count query for the Stops_Hourly table.
Stops_Hourly_count = f"""SELECT count(*) FROM {db_name}.Stops_Hourly
     """

print(Stops_Hourly_count)

In [None]:
# Execute the Stops_Hourly row-count query; the result is kept in `calls`.
calls = pd.read_sql(sql=Stops_Hourly_count, con=conn)

In [None]:
# Display the row-count result (despite the name, this is the Stops_Hourly count).
calls