## Part IV - Merging Cleaned Data

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [1]:
# setup environment
%run 0-Environment_Setup.ipynb

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [2]:
# Check the path to the data is initialized
print(s3_datalake_path_csv)

# Check the path to the data is initialized
print(s3_datalake_path_parquet)

s3://sagemaker-us-east-1-343218227212/store-sales-forecasting/csv
s3://sagemaker-us-east-1-343218227212/store-sales-forecasting/parquet


In [3]:
# View the tables in the database
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,holidays
1,oil
2,sales
3,stores
4,transactions


In [4]:
# Test the query with a small number of results
statement = f"""
    SELECT 
        s.date,
        s.store_nbr,
        family,
        sales,
        onpromotion,
        dcoilwtico, 
        city, 
        state, 
        st.type 
        AS store_type, 
        cluster, 
        CASE
            WHEN locale = 'National' THEN 1
            WHEN locale_name = city THEN 1
            WHEN locale_name = state THEN 1
            ELSE 0
        END AS is_holiday,
        year
    FROM {database_name}.sales AS s
    LEFT JOIN {database_name}.oil AS o ON s.date = o.date
    LEFT JOIN {database_name}.stores AS st ON s.store_nbr = st.store_nbr
    LEFT JOIN {database_name}.holidays AS h ON s.date=h.date
    ORDER BY s.date ASC
    LIMIT 10"""

df = pd.read_sql(statement, conn)
df

  df = pd.read_sql(statement, conn)


Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,store_type,cluster,is_holiday,year
0,2013-01-01,1,CELEBRATION,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
1,2013-01-01,1,DELI,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
2,2013-01-01,1,BEVERAGES,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
3,2013-01-01,1,BREAD/BAKERY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
4,2013-01-01,1,CLEANING,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
5,2013-01-01,1,DAIRY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
6,2013-01-01,1,BABY CARE,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
7,2013-01-01,1,BEAUTY,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
8,2013-01-01,1,BOOKS,0.0,0,93.14,Quito,Pichincha,D,13,1,2013
9,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Quito,Pichincha,D,13,1,2013


In [5]:
# Create a new table in the database with the merged data
statement = f"""
    CREATE TABLE {database_name}.cleaned_data
    WITH (
        format = 'PARQUET', 
        partitioned_by = ARRAY['year'],
        external_location = '{s3_datalake_path_parquet}/cleaned_data_parquet_table/'
    ) AS 
    SELECT 
        s.date,
        s.store_nbr,
        family,
        sales,
        onpromotion,
        dcoilwtico, 
        city, 
        state, 
        st.type 
        AS store_type, 
        cluster, 
        CASE
            WHEN locale = 'National' THEN 1
            WHEN locale_name = city THEN 1
            WHEN locale_name = state THEN 1
            ELSE 0
        END AS is_holiday,
        year
    FROM {database_name}.sales AS s
    LEFT JOIN {database_name}.oil AS o ON s.date = o.date
    LEFT JOIN {database_name}.stores AS st ON s.store_nbr = st.store_nbr
    LEFT JOIN {database_name}.holidays AS h ON s.date=h.date"""

query_exec_id = wr.athena.start_query_execution(sql=statement, database=database_name, s3_output=s3_staging_dir, wait=True)

In [6]:
# Check that the operation succeeded
query_exec_id

{'QueryExecutionId': '2e47a2f8-2451-4184-8276-41c8da5309af',
 'Query': "CREATE TABLE aai540finalprojectdb.cleaned_data\n    WITH (\n        format = 'PARQUET', \n        partitioned_by = ARRAY['year'],\n        external_location = 's3://sagemaker-us-east-1-343218227212/store-sales-forecasting/parquet/cleaned_data_parquet_table/'\n    ) AS \n    SELECT \n        s.date,\n        s.store_nbr,\n        family,\n        sales,\n        onpromotion,\n        dcoilwtico, \n        city, \n        state, \n        st.type \n        AS store_type, \n        cluster, \n        CASE\n            WHEN locale = 'National' THEN 1\n            WHEN locale_name = city THEN 1\n            WHEN locale_name = state THEN 1\n            ELSE 0\n        END AS is_holiday,\n        year\n    FROM aai540finalprojectdb.sales AS s\n    LEFT JOIN aai540finalprojectdb.oil AS o ON s.date = o.date\n    LEFT JOIN aai540finalprojectdb.stores AS st ON s.store_nbr = st.store_nbr\n    LEFT JOIN aai540finalprojectdb.hol

In [7]:
# Check that data from the table loads correctly
table_name = "cleaned_data"

# Define SQL query
statement = """SELECT * FROM {}.{}
    LIMIT 5""".format(
    database_name, table_name
)

# Display the results
df = pd.read_sql(statement, conn)
df

  df = pd.read_sql(statement, conn)


Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,store_type,cluster,is_holiday,year
0,2015-01-01,1,AUTOMOTIVE,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
1,2015-01-01,1,BABY CARE,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
2,2015-01-01,1,BEAUTY,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
3,2015-01-01,1,BEVERAGES,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
4,2015-01-01,1,BOOKS,0.0,0,52.72,Quito,Pichincha,D,13,1,2015
