## Part IV - Merging Cleaned Data

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [3]:
!pip install awswrangler
!pip install pyathena

Collecting awswrangler
  Using cached awswrangler-3.9.1-py3-none-any.whl.metadata (17 kB)
Using cached awswrangler-3.9.1-py3-none-any.whl (381 kB)
Installing collected packages: awswrangler
Successfully installed awswrangler-3.9.1
[0mCollecting pyathena
  Using cached pyathena-3.9.0-py3-none-any.whl.metadata (6.3 kB)
Using cached pyathena-3.9.0-py3-none-any.whl (75 kB)
Installing collected packages: pyathena
Successfully installed pyathena-3.9.0
[0m

In [4]:
# Import libraries
import boto3
import sagemaker
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr
from pyathena import connect

In [18]:
# Initialize Sagemaker session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [19]:
# Store the datalake path to csv data
%store -r s3_datalake_path_csv

# Check the path to the data is initialized
print(s3_datalake_path_csv)

# Store the datalake path to Parquet data
%store -r s3_datalake_path_parquet

# Check the path to the data is initialized
print(s3_datalake_path_parquet)

s3://sagemaker-us-east-1-757929513207/store-sales-forecasting/csv
s3://sagemaker-us-east-1-757929513207/store-sales-forecasting/parquet


In [20]:
# Set S3 staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# Set Athena parameters
database_name = "aai540finalprojectdb"

# Connect to the Athena staging directory
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [21]:
# View the tables in the database
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,holidays
1,oil
2,sales
3,stores
4,transactions


In [45]:
# Test the query with a small number of results
statement = f"""
    SELECT 
        s.date,
        s.store_nbr,
        family,
        sales,
        onpromotion,
        dcoilwtico, 
        city, 
        state, 
        st.type 
        AS store_type, 
        cluster, 
        CASE
            WHEN locale = 'National' THEN 1
            WHEN locale_name = city THEN 1
            WHEN locale_name = state THEN 1
            ELSE 0
        END AS is_holiday,
        year
    FROM {database_name}.sales AS s
    LEFT JOIN {database_name}.oil AS o ON s.date = o.date
    LEFT JOIN {database_name}.stores AS st ON s.store_nbr = st.store_nbr
    LEFT JOIN {database_name}.holidays AS h ON s.date=h.date
    ORDER BY s.date ASC
    LIMIT 10"""

df = pd.read_sql(statement, conn)
df

  df = pd.read_sql(statement, conn)


Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,store_type,cluster,is_holiday,year
0,2013-01-01,1,CELEBRATION,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
1,2013-01-01,1,DELI,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
2,2013-01-01,1,BEVERAGES,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
3,2013-01-01,1,BREAD/BAKERY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
4,2013-01-01,1,CLEANING,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
5,2013-01-01,1,DAIRY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
6,2013-01-01,1,BABY CARE,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
7,2013-01-01,1,BEAUTY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
8,2013-01-01,1,BOOKS,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
9,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Quito,Pichincha,D,13,1,2013


In [46]:
# Create a new table in the database with the merged data
statement = f"""
    CREATE TABLE {database_name}.cleaned_data
    WITH (
        format = 'PARQUET', 
        partitioned_by = ARRAY['year'],
        external_location = '{s3_datalake_path_parquet}/cleaned_data_parquet_table/'
    ) AS 
    SELECT 
        s.date,
        s.store_nbr,
        family,
        sales,
        onpromotion,
        dcoilwtico, 
        city, 
        state, 
        st.type 
        AS store_type, 
        cluster, 
        CASE
            WHEN locale = 'National' THEN 1
            WHEN locale_name = city THEN 1
            WHEN locale_name = state THEN 1
            ELSE 0
        END AS is_holiday,
        year
    FROM {database_name}.sales AS s
    LEFT JOIN {database_name}.oil AS o ON s.date = o.date
    LEFT JOIN {database_name}.stores AS st ON s.store_nbr = st.store_nbr
    LEFT JOIN {database_name}.holidays AS h ON s.date=h.date"""

query_exec_id = wr.athena.start_query_execution(sql=statement, database=database_name, s3_output=s3_staging_dir, wait=True)

In [47]:
# Check that the operation succeeded
query_exec_id

{'QueryExecutionId': 'bc844487-4263-4ac0-b904-92578b179967',
 'Query': "CREATE TABLE aai540finalprojectdb.cleaned_data\n    WITH (\n        format = 'PARQUET', \n        partitioned_by = ARRAY['year'],\n        external_location = 's3://sagemaker-us-east-1-757929513207/store-sales-forecasting/parquet/cleaned_data_parquet_table/'\n    ) AS \n    SELECT \n        s.date,\n        s.store_nbr,\n        family,\n        sales,\n        onpromotion,\n        dcoilwtico, \n        city, \n        state, \n        st.type \n        AS store_type, \n        cluster, \n        CASE\n            WHEN locale = 'National' THEN 1\n            WHEN locale_name = city THEN 1\n            WHEN locale_name = state THEN 1\n            ELSE 0\n        END AS is_holiday,\n        year\n    FROM aai540finalprojectdb.sales AS s\n    LEFT JOIN aai540finalprojectdb.oil AS o ON s.date = o.date\n    LEFT JOIN aai540finalprojectdb.stores AS st ON s.store_nbr = st.store_nbr\n    LEFT JOIN aai540finalprojectdb.hol

In [48]:
# Check that data from the table loads correctly
table_name = "cleaned_data"

# Define SQL query
statement = """SELECT * FROM {}.{}
    LIMIT 5""".format(
    database_name, table_name
)

# Display the results
df = pd.read_sql(statement, conn)
df

  df = pd.read_sql(statement, conn)


Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,store_type,cluster,is_holiday,year
0,2015-01-01,1,AUTOMOTIVE,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
1,2014-01-01,1,AUTOMOTIVE,0.0,0,95.14,Quito,Pichincha,D,13,1,2014
2,2015-01-01,1,BEVERAGES,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
3,2015-01-01,1,BOOKS,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
4,2015-01-01,1,BABY CARE,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
