## Part V - Feature Engineering and Feature Store

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [23]:
# setup environment
%run 0-Environment_Setup.ipynb

Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [95]:
import json
from time import gmtime, strftime

In [2]:
# get features stacked by store then date from the offline feature group
# NOTE(review): the helper and `store_sales_feature_group` are not defined in the cells
# shown here — presumably provided by 0-Environment_Setup.ipynb; confirm
sales_features_store = get_store_dataset_from_offline_feature_group(store_sales_feature_group)
sales_features_store.head()

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1727227039"
    ORDER BY
        store_nbr ASC, date ASC
    


Unnamed: 0,date,store_nbr,sales,oil,is_holiday,city,state,cluster,onpromotion,year,month,day,dow,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.0,93.14,1,18,12,13,0,2013,1,1,1,2013-01-01:1,1727227000.0,2024-09-25 01:23:03.697,2024-09-25 01:18:05.000,False
1,2013-01-02,1,7417.15,93.14,0,18,12,13,0,2013,1,2,2,2013-01-02:1,1727227000.0,2024-09-25 01:23:07.224,2024-09-25 01:18:06.000,False
2,2013-01-03,1,5873.24,92.97,0,18,12,13,0,2013,1,3,3,2013-01-03:1,1727227000.0,2024-09-25 01:23:03.964,2024-09-25 01:18:07.000,False
3,2013-01-04,1,5919.88,93.12,0,18,12,13,0,2013,1,4,4,2013-01-04:1,1727227000.0,2024-09-25 01:23:03.859,2024-09-25 01:18:07.000,False
4,2013-01-05,1,6318.79,93.12,1,18,12,13,0,2013,1,5,5,2013-01-05:1,1727227000.0,2024-09-25 01:23:03.726,2024-09-25 01:18:08.000,False


In [34]:
# capture the unique store numbers present in the feature data
# NOTE(review): not referenced by any later cell shown here — confirm it is still needed
unique_store_nbrs = sales_features_store['store_nbr'].unique()

In [92]:
# helper function to build the time series of one column for each store
def build_store_timeseries(store_sales, target_col):
    """Build one daily time series of ``target_col`` per store.

    Parameters
    ----------
    store_sales : pandas.DataFrame
        Must contain the columns ``store_nbr``, ``date`` and ``target_col``.
        The frame is not modified.
    target_col : str
        Name of the column to turn into a series.

    Returns
    -------
    list of pandas.Series
        One series per store, in order of first appearance of the store in
        ``store_sales``. Each series is indexed by day (values on the same day
        are summed, days without rows become 0) and has its leading zeros
        removed. Because the trim is applied per series, series for different
        stores or different columns may start on different dates.
    """
    store_timeseries = []
    for store_nbr in store_sales['store_nbr'].unique():
        # Select rows and columns in one .loc call and derive the datetime
        # column with assign(): this works on a fresh frame instead of writing
        # into a slice of the caller's data (avoids SettingWithCopyWarning).
        store_data = (
            store_sales.loc[store_sales['store_nbr'] == store_nbr, ['date', target_col]]
            .assign(date=lambda frame: pd.to_datetime(frame['date']))
            .set_index('date')
        )

        # regular 1-day timestep; gaps are filled with 0 by the sum
        daily = store_data.resample('D').sum()

        # drop the zeros before the first non-zero observation
        store_ts = np.trim_zeros(daily.iloc[:, 0], trim='f')

        store_timeseries.append(store_ts)
    return store_timeseries

# helper function to write ts datasets as JSON Lines (one JSON object per line)
def write_dicts_to_file(path, data):
    """Serialize each item of ``data`` with ``json.dumps`` and write it to
    ``path`` as UTF-8, one item per line (each line newline-terminated).

    An existing file at ``path`` is overwritten.
    """
    with open(path, "wb") as out_file:
        out_file.writelines(
            (json.dumps(record) + "\n").encode("utf-8") for record in data
        )

# helper function to write to S3
s3 = boto3.resource("s3")
def copy_to_s3(local_file, s3_path, override=False):
    """Upload ``local_file`` to ``s3_path`` unless that object already exists.

    Parameters
    ----------
    local_file : str
        Path of the local file to upload.
    s3_path : str
        Destination in the form ``s3://<bucket>/<key>``.
    override : bool, default False
        When False and an object with exactly this key already exists, print a
        notice and return without uploading. When True, upload regardless.

    Raises
    ------
    AssertionError
        If ``s3_path`` does not start with ``s3://``.
    """
    assert s3_path.startswith("s3://"), "s3_path must start with s3://"
    # "s3://bucket/a/b" splits into ["s3:", "", "bucket", "a", "b"]
    split = s3_path.split("/")
    bucket = split[2]
    path = "/".join(split[3:])
    buk = s3.Bucket(bucket)

    # A Prefix filter also returns keys that merely start with `path`
    # (e.g. "train.json.bak"), so compare keys exactly.
    already_exists = any(obj.key == path for obj in buk.objects.filter(Prefix=path))
    if already_exists:
        if not override:
            # report the parsed bucket/key; the full s3_path already carries
            # the "s3://" scheme and must not be formatted in again
            print(
                "File s3://{}/{} already exists.\nSet override to upload anyway.\n".format(
                    bucket, path
                )
            )
            return
        print("Overwriting existing file")
    with open(local_file, "rb") as data:
        print("Uploading file to {}".format(s3_path))
        buk.put_object(Key=path, Body=data)

In [44]:
# build the per-store daily series for the target (sales) and for each
# covariate (oil, is_holiday, onpromotion); all four lists share the same store order
# NOTE: build_store_timeseries trims leading zeros from every series independently,
# so a covariate series can start on a different date than the matching sales series
timeseries_stores_sales = build_store_timeseries(sales_features_store, 'sales')
timeseries_stores_oil = build_store_timeseries(sales_features_store, 'oil')
timeseries_stores_holidays = build_store_timeseries(sales_features_store, 'is_holiday')
timeseries_stores_promotions = build_store_timeseries(sales_features_store, 'onpromotion')

### Build Train and Test Datasets

In [47]:
# set timeseries parameters
# we use a daily (1D) frequency for the time series
freq = "1D"

# prediction window is 7 time steps (7 days at the daily frequency)
prediction_length = 7

# window size/context length is 15 time steps (15 days)
context_length = 15

In [53]:
# inspect date range of the series (uses the first store's sales series only)
print("Dataset date range: {} - {}".format(timeseries_stores_sales[0].index.min(), timeseries_stores_sales[0].index.max()))


Dataset date range: 2013-01-02 00:00:00 - 2017-08-15 00:00:00


In [76]:
# calculate the total days in the date range so we can split at 80% mark
# (the range is taken from the first store's sales series)
import datetime
series_start_date = timeseries_stores_sales[0].index.min()
series_end_date = timeseries_stores_sales[0].index.max()
# delta.days is the gap between the first and last date, i.e. one less than
# the number of daily observations in that span
delta = series_end_date - series_start_date

# set training cutoff parameters
# int() truncates, so the training span is at most 80% of delta.days
training_series_day_count = int(delta.days * .8)
start_training = series_start_date
# NOTE: label slices such as ts[start_training:end_training] include both
# endpoints, so they return up to training_series_day_count + 1 observations
end_training = series_start_date + datetime.timedelta(days=training_series_day_count)

# set test cutoff parameters
# NOTE(review): start_test and end_test are not referenced by the cells shown here
start_test = end_training + datetime.timedelta(days=1)
end_test = series_end_date
test_days = delta.days - training_series_day_count
# number of complete 7-day weeks in the test span (partial week is dropped)
test_weeks = int((delta.days - training_series_day_count) / 7)

print("Total days in series: {}".format(delta.days)) 
print("Days in training dataset: {}".format(training_series_day_count))
print("Days in test data: {}".format(test_days))
print("Weeks in test data: {}".format(test_weeks))

Total days in series: 1686
Days in training dataset: 1348
Days in test data: 338
Weeks in test data: 48


In [87]:
# generate training data
# Every series was trimmed of its own leading zeros, so the sales series of a
# store can start later than start_training and a covariate series (e.g.
# onpromotion, which is 0 in the earliest rows) can start later than the sales
# series. Slicing each one by the same labels would then yield dynamic
# features shorter than the target and a "start" that does not match the first
# target value. Each record is therefore anchored on the target's own index.
training_feature_sets = (
    timeseries_stores_oil,
    timeseries_stores_holidays,
    timeseries_stores_promotions,
)

training_data = []
for i, ts in enumerate(timeseries_stores_sales):
    target = ts[start_training:end_training]
    training_data.append(
        {
            # timestamp of the first target value; fall back to the global
            # start when the store has no observations in the window
            "start": str(target.index[0]) if len(target) else str(start_training),
            "target": target.tolist(),
            # reindex onto the target's dates: the values removed by the trim
            # were exactly 0, so fill_value=0 restores them, and anything
            # outside the target window is dropped
            "dynamic_feat": [
                feature_series[i].reindex(target.index, fill_value=0).tolist()
                for feature_series in training_feature_sets
            ],
        }
    )
print(len(training_data))

54


In [88]:
# Rolling weekly evaluations until end of test set
num_test_windows = test_weeks - 1

# Every series was trimmed of its own leading zeros, so a covariate series can
# start later than the sales series of the same store, and a store's sales can
# start later than start_training. Each record is therefore anchored on the
# target's own index so that "start" matches the first target value and every
# dynamic feature has exactly the target's length.
test_feature_sets = (
    timeseries_stores_oil,
    timeseries_stores_holidays,
    timeseries_stores_promotions,
)

test_data = []
for k in range(1, num_test_windows + 1):
    # window k extends the training range by k prediction horizons
    window_end = end_training + datetime.timedelta(days=k * prediction_length)
    for i, ts in enumerate(timeseries_stores_sales):
        target = ts[start_training:window_end]
        test_data.append(
            {
                # fall back to the global start when the store has no
                # observations in the window
                "start": str(target.index[0]) if len(target) else str(start_training),
                "target": target.tolist(),
                # the values removed by the trim were exactly 0, so
                # fill_value=0 restores them on the target's dates
                "dynamic_feat": [
                    feature_series[i].reindex(target.index, fill_value=0).tolist()
                    for feature_series in test_feature_sets
                ],
            }
        )

print(len(test_data))

2538


In [90]:
# write datasets to json files in the current working directory
# (write_dicts_to_file emits one JSON object per line)
write_dicts_to_file("train.json", training_data)
write_dicts_to_file("test.json", test_data)

In [93]:
# display the default bucket name
# NOTE(review): not defined in the cells shown here — presumably set by 0-Environment_Setup.ipynb; confirm
default_s3_bucket_name

'sagemaker-us-east-1-053585949834'

In [96]:
# setup S3 path to store train and test datasets
# The UTC timestamp suffix gives every run its own prefix. The bucket comes
# from default_s3_bucket_name rather than a global `bucket`, which no cell
# shown here defines at notebook level (it only exists as a local inside
# copy_to_s3).
s3_dataset_path = "s3://{}/store-sales-forecasting/prepared/{}".format(
    default_s3_bucket_name, strftime("%Y-%m-%d-%H-%M-%S", gmtime())
)
s3_dataset_path

's3://sagemaker-us-east-1-053585949834/store-sales-forecasting/prepared/2024-09-29-05-27-53'

In [97]:
# copy the train/test files to S3 under separate train/ and test/ prefixes
# (copy_to_s3 defaults to override=False, so an existing object is not replaced)
copy_to_s3("train.json", s3_dataset_path + "/train/train.json")
copy_to_s3("test.json", s3_dataset_path + "/test/test.json")

Uploading file to s3://sagemaker-us-east-1-053585949834/store-sales-forecasting/prepared/2024-09-29-05-27-53/train/train.json
Uploading file to s3://sagemaker-us-east-1-053585949834/store-sales-forecasting/prepared/2024-09-29-05-27-53/test/test.json
