## Part 0 - Setup Environment and Common Functions


University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

### Install and import libraries

In [None]:
!pip install awswrangler
!pip install pyathena
!pip install seaborn

In [1]:
# Import libraries
import boto3
import sagemaker
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import awswrangler as wr
from pyathena import connect
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Initialize Variables

In [None]:
# Build ONE boto3 session and derive the region and every AWS client from it,
# so all clients share the same credentials and region instead of each
# creating its own throwaway session.
boto_session = boto3.Session()
region = boto_session.region_name
account_id = boto_session.client("sts").get_caller_identity().get("Account")

# SageMaker SDK session, its default bucket, and the notebook execution role
sess = sagemaker.Session(boto_session=boto_session)
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Low-level service clients
sm = boto_session.client(service_name="sagemaker", region_name=region)
s3_client = boto_session.client("s3", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

# SageMaker session wired with the Feature Store runtime client; used for
# feature group operations
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sm,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [None]:
# Key prefix used for feature store data and query results inside the bucket
prefix = "sagemaker-featurestore-store-sales"
# Default bucket of the feature store session (offline feature store bucket)
default_s3_bucket_name = feature_store_session.default_bucket()

In [None]:
# Name of the default store sales feature group; used to build the
# FeatureGroup handle for the store sales data
store_sales_feature_group_name = "store-sales-feature-group-offline"

In [None]:
# Handle to the default store sales feature group, bound to the
# feature store session
store_sales_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=store_sales_feature_group_name,
)

In [None]:
# initialize our private bucket path
s3_datalake_path_csv = "s3://{}/store-sales-forecasting/csv".format(bucket)
%store s3_datalake_path_csv

In [None]:
# initialize local data path
local_data_path_csv = os.getcwd() + '/store-sales-time-series-forecasting/'
%store local_data_path_csv

In [None]:
# Set the datalake path to Parquet data
s3_datalake_path_parquet = "s3://{}/store-sales-forecasting/parquet".format(bucket)
%store s3_datalake_path_parquet

In [None]:
# S3 directory handed to the Athena connection as its staging directory
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [None]:
# Athena database name for this project
# (presumably the `db` argument of get_cleaned_dataset_from_athena — confirm at call sites)
database_name = "aai540finalprojectdb"

In [None]:
# Open a PyAthena connection in our region, using the S3 staging directory
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

### Global Functions

In [4]:
def get_cleaned_dataset_from_athena(db, table_name="cleaned_data"):
    """Read every row of an Athena table into a pandas DataFrame.

    Args:
        db: Name of the Athena database. It qualifies the table in the SQL
            text and is also passed as the database for the query.
        table_name: Name of the table to read. Defaults to "cleaned_data",
            so existing single-argument calls behave as before.

    Returns:
        The DataFrame produced by ``wr.athena.read_sql_query``.
    """
    # NOTE: db and table_name are interpolated straight into the SQL text,
    # so only pass trusted identifiers.
    statement = f"SELECT * FROM {db}.{table_name}"

    # Keyword arguments make it explicit which value is the SQL and which is
    # the database.
    return wr.athena.read_sql_query(sql=statement, database=db)


In [None]:
def load_feature_group(feature_group_name):
    """Return a handle to an existing feature group, or None if it cannot be loaded.

    Args:
        feature_group_name: Name of the feature group to load.

    Returns:
        A ``FeatureGroup`` bound to the notebook's SageMaker session ``sess``,
        or ``None`` (after printing the error) when the lookup fails.
    """
    try:
        feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sess)
        # Per the SageMaker SDK, constructing FeatureGroup only builds a local
        # handle. describe() performs the actual service lookup, so a missing
        # or inaccessible group is reported here instead of surfacing later.
        feature_group.describe()
    except Exception as e:
        print(f"Error loading feature group: {e}")
        return None
    return feature_group

In [None]:
def get_raw_dataset_from_offline_feature_group(feature_group):
    """Load every record of a feature group's Athena table as a DataFrame.

    Runs ``SELECT *`` against the table exposed by
    ``feature_group.athena_query()``, waits for the query to finish and
    returns the result of ``as_dataframe()``. Query output is written under
    ``s3://<default_s3_bucket_name>/<prefix>/dataset_query``.
    """
    athena_query = feature_group.athena_query()
    sql = f'''
    SELECT *
    FROM
        "{athena_query.table_name}"
    '''
    print(f"Running {sql}")

    results_location = f"s3://{default_s3_bucket_name}/{prefix}/dataset_query"
    athena_query.run(query_string=sql, output_location=results_location)
    athena_query.wait()
    return athena_query.as_dataframe()

In [None]:
def get_store_dataset_from_offline_feature_group(feature_group):
    """Load a feature group's Athena table as a DataFrame, ordered by store and date.

    Runs ``SELECT *`` against the table exposed by
    ``feature_group.athena_query()``, ordered by ``store_nbr`` then ``date``
    (both ascending). Waits for the query to finish and returns the result
    of ``as_dataframe()``. Query output is written under
    ``s3://<default_s3_bucket_name>/<prefix>/dataset_query``.
    """
    athena_query = feature_group.athena_query()
    sql = f'''
    SELECT *
    FROM
        "{athena_query.table_name}"
    ORDER BY
        store_nbr ASC, date ASC
    '''
    print(f"Running {sql}")

    results_location = f"s3://{default_s3_bucket_name}/{prefix}/dataset_query"
    athena_query.run(query_string=sql, output_location=results_location)
    athena_query.wait()
    return athena_query.as_dataframe()