## Part VI.2 - Prepare dataset for DeepAR model

University of San Diego - MS Applied AI

AAI-540 Team 5

October 21, 2024

In [6]:
# setup environment
# Executes the shared setup notebook. The cells below use names they never
# import or define themselves (pd, datetime, joblib, bucket, copy_to_s3,
# deepar_prediction_length, ...) -- presumably all provided by this run;
# TODO confirm against 0-Environment_Setup.ipynb.
%run 0-Environment_Setup.ipynb

Stored 's3_datalake_path_csv' (str)
Stored 'local_data_path_csv' (str)
Stored 's3_datalake_path_parquet' (str)


In [2]:
import json
from time import gmtime, strftime
from sklearn.preprocessing import StandardScaler

In [3]:
# get features stacked by store then date
# (the query echoed in the cell output orders rows by store_nbr, then date).
# Both the helper and store_sales_feature_group are defined outside this notebook.
sales_features_store = get_store_dataset_from_offline_feature_group(store_sales_feature_group)
sales_features_store.head()

Running 
    SELECT *
    FROM
        "store_sales_feature_group_offline_1728336748"
    ORDER BY
        store_nbr ASC, date ASC
    


Unnamed: 0,date,store_nbr,sales,oil,onpromotion,is_holiday,city,state,cluster,year,...,month_sin,day_cos,day_sin,dow_cos,dow_sin,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:1,1728337000.0,2024-10-07 21:38:10.967,2024-10-07 21:33:11.000,False
1,2013-01-02,1,7417.15,93.14,0,0,18,12,13,2013,...,0.5,0.918958,0.394356,-0.222521,0.974928,2013-01-02:1,1728337000.0,2024-10-07 21:38:11.435,2024-10-07 21:33:12.000,False
2,2013-01-03,1,5873.24,92.97,0,0,18,12,13,2013,...,0.5,0.820763,0.571268,-0.900969,0.433884,2013-01-03:1,1728337000.0,2024-10-07 21:38:10.869,2024-10-07 21:33:13.000,False
3,2013-01-04,1,5919.88,93.12,0,0,18,12,13,2013,...,0.5,0.688967,0.724793,-0.900969,-0.433884,2013-01-04:1,1728337000.0,2024-10-07 21:38:11.111,2024-10-07 21:33:14.000,False
4,2013-01-05,1,6318.79,93.12,0,1,18,12,13,2013,...,0.5,0.528964,0.848644,-0.222521,-0.974928,2013-01-05:1,1728337000.0,2024-10-07 21:38:11.141,2024-10-07 21:33:14.000,False


#### Set dataset split parameters


In [4]:
# Date-ordered copy of the features, with 'date' parsed to datetime, used to
# derive the split dates and to fit the scaler. The store-ordered frame is not
# modified. The copy keeps its original index labels, so its rows are no longer
# positionally aligned with sales_features_store.
sales_features_date = (
    sales_features_store
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by=['date'])
)
sales_features_date.head()

Unnamed: 0,date,store_nbr,sales,oil,onpromotion,is_holiday,city,state,cluster,year,...,month_sin,day_cos,day_sin,dow_cos,dow_sin,sales_record_id,event_time,write_time,api_invocation_time,is_deleted
0,2013-01-01,1,0.0,93.14,0,1,18,12,13,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:1,1728337000.0,2024-10-07 21:38:10.967,2024-10-07 21:33:11.000,False
52204,2013-01-01,32,0.0,93.14,0,1,8,6,3,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:32,1728337000.0,2024-10-07 21:38:10.967,2024-10-07 21:33:12.000,False
53888,2013-01-01,33,0.0,93.14,0,1,17,9,3,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:33,1728337000.0,2024-10-07 21:38:11.047,2024-10-07 21:33:12.000,False
55572,2013-01-01,34,0.0,93.14,0,1,8,6,6,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:34,1728337000.0,2024-10-07 21:38:10.856,2024-10-07 21:33:12.000,False
57256,2013-01-01,35,0.0,93.14,0,1,15,6,3,2013,...,0.5,0.97953,0.201299,0.62349,0.781831,2013-01-01:35,1728337000.0,2024-10-07 21:38:11.069,2024-10-07 21:33:12.000,False


In [7]:
# Span of the whole series in days; the first 80% of it becomes the training window.
series_start_date = pd.to_datetime(sales_features_date['date'].min())
series_end_date = pd.to_datetime(sales_features_date['date'].max())
delta = series_end_date - series_start_date

# training window: series start through the 80% mark (truncated to whole days)
training_series_day_count = int(delta.days * .8)
start_training = series_start_date
end_training = start_training + datetime.timedelta(days=training_series_day_count)

# hold-out window: starts the day after training ends.
# NOTE: test_days is the length of the WHOLE hold-out span; that span is cut
# into whole weeks and halved, giving equal validation and test week counts.
start_test = end_training + datetime.timedelta(days=1)
test_days = delta.days - training_series_day_count
holdout_weeks = int(test_days / 7)
val_weeks = int(holdout_weeks / 2)
test_weeks = val_weeks
# NOTE(review): measured from start_test with the halved week count, so this is
# the same date the validation cell computes as its end -- confirm intent.
end_test = start_test + datetime.timedelta(days=(test_weeks * 7))

split_summary = [
    ("Total days in series: {}", delta.days),
    ("Days in training dataset: {}", training_series_day_count),
    ("Days in test data: {}", test_days),
    ("Weeks in test data: {}", test_weeks),
    ("Weeks in validation data: {}", val_weeks),
]
for summary_template, summary_value in split_summary:
    print(summary_template.format(summary_value))

Total days in series: 1687
Days in training dataset: 1349
Days in test data: 338
Weeks in test data: 24
Weeks in validation data: 24


#### Scale/Normalize the numeric features

In [8]:
# Rows dated before the first hold-out day; only these are used to fit the scaler.
is_training_row = sales_features_date['date'] < start_test
scale_train_set = sales_features_date.loc[is_training_row]

In [9]:
# Fit a StandardScaler on the training-period rows only, so the scaling
# statistics are not computed from hold-out data.
scaler_columns = ['sales', 'oil', 'onpromotion']
scaler = StandardScaler()
scaler.fit(scale_train_set[scaler_columns])

In [53]:
# save the scaler locally
# (joblib.dump returns the list of file names it wrote, shown as the cell output)
joblib.dump(scaler, 'deepAR-scaler.joblib')

['deepAR-scaler.joblib']

In [6]:
# Optional shortcut: replace the scaler fit above with the one already
# published to S3. Only enable when that artifact exists.
# NOTE(review): the recorded output shows a download although the flag is
# False here -- it appears to come from an earlier run with the flag enabled.
LOAD_SCALER = False
if LOAD_SCALER:
    scaler_s3_uri = s3_deepar_gold_dataset_path + "/scaler/deepAR-scaler.joblib"
    scaler_filename = copy_file_from_s3(scaler_s3_uri)
    # joblib.load unpickles the file: only load artifacts from a trusted bucket
    scaler = joblib.load(scaler_filename)

download: s3://sagemaker-us-east-1-053585949834/store-sales-forecasting/deepar/gold-dataset/scaler/deepAR-scaler.joblib to ./deepAR-scaler.joblib


In [7]:
# apply scaling to our full dataset
# Transform the store-ordered frame's OWN columns. The scaler's output is
# assigned back by row position, so feeding it the date-sorted copy
# (sales_features_date) would write every scaled value onto a different
# store/date row than the one it was computed from.
# NOTE: this overwrites the raw columns in place; re-running the cell without
# reloading sales_features_store would scale the data a second time.
scaled_columns = ['sales', 'oil', 'onpromotion']
sales_features_store[scaled_columns] = scaler.transform(sales_features_store[scaled_columns])

In [10]:
# capture the unique stores
# (order of first appearance in the store-ordered frame). The record-building
# cells below index this array by position next to the per-store series lists.
unique_store_nbrs = sales_features_store['store_nbr'].unique()
unique_store_nbrs

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54])

In [11]:
# helper function to build the target time series for each store
def build_store_timeseries(store_sales, target_col):
    """Build one daily time series per store for a single column.

    Parameters
    ----------
    store_sales : pandas.DataFrame
        Must contain 'store_nbr', 'date' and `target_col` columns.
    target_col : str
        Name of the column whose values form each series.

    Returns
    -------
    list of pandas.Series
        One series per store, in the order the stores first appear in
        `store_sales`. Each is indexed by a daily DatetimeIndex running from
        the store's first to its last date. Rows sharing a date are summed,
        and days with no rows are filled with 0.

    The input frame is not modified.
    """
    store_timeseries = []
    for store_nbr in store_sales['store_nbr'].unique():
        # a single .loc selection avoids assigning into a chained slice
        # (the pattern that triggers pandas' SettingWithCopyWarning)
        store_rows = store_sales.loc[
            store_sales['store_nbr'] == store_nbr, ['date', target_col]
        ]

        # parse dates, index by them, and regularize to a 1-day frequency
        daily_frame = (
            store_rows
            .assign(date=pd.to_datetime(store_rows['date']))
            .set_index('date')
            .resample('D')
            .sum()
        )

        # keep only the value column as a Series
        store_timeseries.append(daily_frame.iloc[:, 0])
    return store_timeseries

# helper function to write ts datasets to json
def write_dicts_to_file(path, data):
    """Write each item of `data` to `path` as one UTF-8 encoded JSON document per line.

    An existing file at `path` is overwritten. Every line, including the
    last, ends with a newline character.
    """
    with open(path, "wb") as out_file:
        out_file.writelines(
            (json.dumps(record) + "\n").encode("utf-8") for record in data
        )

In [12]:
# Build the per-store daily series: the sales target plus the three covariates
# (oil, holiday flag, promotions) used as dynamic features below.
(
    timeseries_stores_sales,
    timeseries_stores_oil,
    timeseries_stores_holidays,
    timeseries_stores_promotions,
) = (
    build_store_timeseries(sales_features_store, column_name)
    for column_name in ('sales', 'oil', 'is_holiday', 'onpromotion')
)

In [13]:
# Sanity check: the first store's target and covariate series should all have the same length.
for store_series_list in (
    timeseries_stores_sales,
    timeseries_stores_oil,
    timeseries_stores_holidays,
    timeseries_stores_promotions,
):
    print(len(store_series_list[0]))

1688
1688
1688
1688


### Build Train, Validation, and Test Datasets

In [14]:
# inspect date range of the series
# (only the first store's series is checked here)
print("Dataset date range: {} - {}".format(timeseries_stores_sales[0].index.min(), timeseries_stores_sales[0].index.max()))


Dataset date range: 2013-01-01 00:00:00 - 2017-08-15 00:00:00


In [15]:
# Assemble one training record per store: the sales series as the target, the
# store number minus one as the categorical feature, and the oil / holiday /
# promotion series over the same dates as dynamic features.
training_window = slice(start_training, end_training)  # label slice: both ends included
training_data = [
    {
        "start": str(start_training),
        "target": sales_ts[training_window].tolist(),
        "cat": [int(unique_store_nbrs[store_idx]) - 1],
        "dynamic_feat": [
            covariate_series[store_idx][training_window].tolist()
            for covariate_series in (
                timeseries_stores_oil,
                timeseries_stores_holidays,
                timeseries_stores_promotions,
            )
        ],
    }
    for store_idx, sales_ts in enumerate(timeseries_stores_sales)
]
print(len(training_data))

54


In [16]:
# Validation window: the first `val_weeks` whole weeks of the hold-out period.
# Label slices on a DatetimeIndex include BOTH endpoints, so the last validation
# day must be one day before start_test + val_weeks * 7 days (the date the
# rolling test windows start from). Without the "- 1" the slice would hold
# val_weeks * 7 + 1 days and share its final day with the first test window.
val_end = start_test + datetime.timedelta(days=(val_weeks * 7) - 1)

# generate validation data (one record per store, same layout as the training records)
val_data = [
    {
        "start": str(start_test),
        "target": ts[start_test:val_end].tolist(),
        "cat": [int(unique_store_nbrs[i]) - 1],
        "dynamic_feat": [
            timeseries_stores_oil[i][start_test:val_end].tolist(),
            timeseries_stores_holidays[i][start_test:val_end].tolist(),
            timeseries_stores_promotions[i][start_test:val_end].tolist(),
        ],
    }
    for i, ts in enumerate(timeseries_stores_sales)
]
print(len(val_data))

54


In [17]:
# Rolling weekly evaluation windows, starting where the validation period ends.
# Each window contributes one record per store whose target covers `cw` days,
# while the dynamic features run `deepar_prediction_length` days further.
test_windows = test_weeks - 2

gen_test_start = start_test + datetime.timedelta(days=(val_weeks*7))
gen_test_end = gen_test_start + datetime.timedelta(days=(test_weeks*7))  # not used below
cw = 7

test_data = []
for window_idx in range(1, test_windows + 1):
    # window bounds depend only on the window index, so compute them once per window
    window_start = gen_test_start + datetime.timedelta(days=((window_idx-1) * cw))
    target_window = slice(
        window_start,
        gen_test_start + datetime.timedelta(days=((window_idx * cw) - 1)),
    )
    feature_window = slice(
        window_start,
        gen_test_start + datetime.timedelta(days=((window_idx * cw) + deepar_prediction_length - 1)),
    )
    for store_idx, sales_ts in enumerate(timeseries_stores_sales):
        test_data.append(
            {
                "start": str(window_start),
                "target": sales_ts[target_window].tolist(),
                "cat": [int(unique_store_nbrs[store_idx]) - 1],
                "dynamic_feat": [
                    timeseries_stores_oil[store_idx][feature_window].tolist(),
                    timeseries_stores_holidays[store_idx][feature_window].tolist(),
                    timeseries_stores_promotions[store_idx][feature_window].tolist(),
                ],
            }
        )

print(len(test_data))

1188


In [18]:
# Serialize each split to a local file, one JSON record per line.
for json_path, split_records in (
    ("train.json", training_data),
    ("val.json", val_data),
    ("test.json", test_data),
):
    write_dicts_to_file(json_path, split_records)

In [19]:
# Destination switch: True writes to the shared "gold" dataset prefix,
# False writes to a fresh timestamped prefix under experiments/.
publish_to_gold = True

# S3 prefix under which the train / val / test files are stored
s3_dataset_path = (
    s3_deepar_gold_dataset_path
    if publish_to_gold
    else "s3://{}/store-sales-forecasting/deepar/experiments/{}".format(bucket, strftime("%Y-%m-%d-%H-%M-%S", gmtime()))
)
s3_dataset_path

's3://sagemaker-us-east-1-343218227212/store-sales-forecasting/deepar/gold-dataset'

In [20]:
# Upload the train / val / test files, each under its own sub-prefix of s3_dataset_path.
# The third argument is passed as True for every file; the recorded output
# suggests it permits overwriting an existing object.
for local_name, s3_suffix in (
    ("train.json", "/train/train.json"),
    ("val.json", "/val/val.json"),
    ("test.json", "/test/test.json"),
):
    copy_to_s3(local_name, s3_dataset_path + s3_suffix, True)

Overwriting existing file
Uploading file to s3://sagemaker-us-east-1-343218227212/store-sales-forecasting/deepar/gold-dataset/train/train.json
Overwriting existing file
Uploading file to s3://sagemaker-us-east-1-343218227212/store-sales-forecasting/deepar/gold-dataset/val/val.json
Overwriting existing file
Uploading file to s3://sagemaker-us-east-1-343218227212/store-sales-forecasting/deepar/gold-dataset/test/test.json


In [None]:
# copy scaler to S3
# Requires the local deepAR-scaler.joblib written by the joblib.dump cell above.
# NOTE(review): unlike the dataset uploads, no third argument is passed here --
# presumably an existing scaler object is then not overwritten; confirm
# against copy_to_s3.
copy_to_s3("deepAR-scaler.joblib", s3_dataset_path + "/scaler/deepAR-scaler.joblib")

In [21]:
# cleanup files
!rm train.json
!rm val.json
!rm test.json
!rm deepAR-scaler.joblib

rm: cannot remove 'deepAR-scaler.joblib': No such file or directory
