# Import Necessary Packages

In [1]:
# !pip install --upgrade category_encoders

In [2]:
# 
import numpy as np
import pandas as pd
import category_encoders as ce
import pickle as pkl
pd.options.mode.chained_assignment = None


# Data Loader
import sys

sys.path.extend(['..'])

import torch
import torch.utils.data
import torch.utils.data as data_utils
import pickle as pkl

# from utils.data_utils import *

# Functions Implementation

### Data

In [3]:
def read_data(input_data_dir='./m5-forecasting-uncertainty', output_dir='.'):
    """
    Builds the model-ready arrays from the raw M5 csv files and pickles them.

    Reads `sales_train_evaluation.csv`, `sell_prices.csv` and `calendar.csv` from `input_data_dir`,
    derives calendar features, label-encodes the categorical id columns, reshapes the sales to long
    format, attaches sell prices and previous day sales, and finally writes a dictionary of numpy
    arrays (plus the two fitted encoders) to `{output_dir}/data.pickle`.

    Arguments:
    input_data_dir : directory containing the three csv files
    output_dir : directory in which `data.pickle` is written

    Returns:
    None (the result is only written to disk)

    Saved dictionary keys:
    sales_data_ids : values of 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' for each series
    calendar_index : the 'd' column of the calendar features (d_1 removed)
    X_prev_day_sales : previous day sales, reshaped to (num_timesteps, num_samples)
    X_enc_only_feats, X_enc_dec_feats : feature arrays reshaped to (num_timesteps, num_samples, -1)
    enc_only_feat_names, enc_dec_feat_names : column names of the two feature arrays
    X_calendar, X_calendar_cols : calendar features (without 'wm_yr_wk' and 'd') and their names
    Y : sales, reshaped to (num_samples, num_timesteps)
    cal_label_encoder, label_encoder : the fitted category_encoders OrdinalEncoder objects
    """
    train_data = pd.read_csv(f'{input_data_dir}/sales_train_evaluation.csv')
    sell_prices = pd.read_csv(f'{input_data_dir}/sell_prices.csv')
    calendar = pd.read_csv(f'{input_data_dir}/calendar.csv')

    # ---- process calendar features ---- #
    print('* Processing calendar features')

    calendar.date = pd.to_datetime(calendar.date)
    calendar['relative_year'] = 2016 - calendar.year

    # convert month, day and weekday to cyclic encodings
    calendar['month_sin'] = np.sin(2 * np.pi * calendar.month/12.0)
    calendar['month_cos'] = np.cos(2 * np.pi * calendar.month/12.0)
    calendar['day_sin'] = np.sin(2 * np.pi * calendar.date.dt.day/calendar.date.dt.days_in_month)
    calendar['day_cos'] = np.cos(2 * np.pi * calendar.date.dt.day/calendar.date.dt.days_in_month)
    calendar['weekday_sin'] = np.sin(2 * np.pi * calendar.wday/7.0)
    calendar['weekday_cos'] = np.cos(2 * np.pi * calendar.wday/7.0)

    # use same encoded labels for both the event name columns
    # (the mapping fitted for event_name_1 overwrites the one fitted for event_name_2)
    cal_label = ['event_name_1', 'event_name_2']
    cal_label_encoded_cols = ['event_name_1_enc', 'event_name_2_enc']
    calendar[cal_label_encoded_cols] = calendar[cal_label]
    cal_label_encoder = ce.OrdinalEncoder(cols=cal_label_encoded_cols)
    cal_label_encoder.fit(calendar)
    cal_label_encoder.mapping[1]['mapping'] = cal_label_encoder.mapping[0]['mapping']
    calendar = cal_label_encoder.transform(calendar)

    # subtract one from label encoded as pytorch uses 0-indexing
    for col in cal_label_encoded_cols:
        calendar[col] = calendar[col] - 1

    # NOTE: the first two columns ('wm_yr_wk', 'd') are only join keys; they are dropped with
    # iloc[:, 2:] when X_calendar is built below, so the column order here matters
    calendar_df = calendar[['wm_yr_wk', 'd', 'snap_CA', 'snap_TX', 'snap_WI', 'relative_year',
                            'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos']
                           + cal_label_encoded_cols]

    # ---- Merge all dfs, keep calendar_df features separate and just concat them for each batch ---- #
    # drop the last 11 characters of the id (presumably the '_evaluation' suffix - TODO confirm)
    # so that it matches the 'item_id' + '_' + 'store_id' key built for the sell prices
    train_data.id = train_data.id.str[:-11]
    sell_prices['id'] = sell_prices['item_id'] + '_' + sell_prices['store_id']

    # add empty columns for future data (d_1942 ... d_1969, i.e. 28 days, filled with NaN)
    train_data = pd.concat([train_data, pd.DataFrame(columns=['d_'+str(i) for i in range(1942, 1970)])])

    # Encode categorical features using either one-hot or label encoding (for embeddings)
    print('* Encoding categorical features')
    label = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    label_encoded_cols = [str(i)+'_enc' for i in label]

    train_data[label_encoded_cols] = train_data[label]
    label_encoder = ce.OrdinalEncoder(cols=[str(i)+'_enc' for i in label])
    label_encoder.fit(train_data)
    train_data = label_encoder.transform(train_data)

    # subtract one from label encoded as pytorch uses 0-indexing
    for col in label_encoded_cols:
        train_data[col] = train_data[col] - 1

    # Reshape, change dtypes and add previous day sales
    print('* Add previous day sales and merge sell prices')
    # wide -> long: one row per (series, day); melt emits all series for d_1, then all for d_2, ...
    data_df = pd.melt(train_data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                                           'item_id_enc', 'dept_id_enc', 'cat_id_enc', 'store_id_enc', 'state_id_enc'],
                      var_name='d', value_vars=['d_'+str(i) for i in range(1, 1970)], value_name='sales')

    # change dtypes to reduce memory usage
    data_df[['sales']] = data_df[['sales']].fillna(-2).astype(np.int16)  # fill future sales as -2
    calendar_df[['snap_CA', 'snap_TX', 'snap_WI', 'relative_year']] = calendar_df[
        ['snap_CA', 'snap_TX', 'snap_WI', 'relative_year']].astype(np.int8)
    calendar_df[cal_label_encoded_cols] = calendar_df[cal_label_encoded_cols].astype(np.int16)

    data_df[label_encoded_cols] = data_df[label_encoded_cols].astype(np.int16)

    # merge sell prices (the calendar merge only supplies the 'wm_yr_wk' key needed for the price join)
    data_df = data_df.merge(right=calendar_df[['d', 'wm_yr_wk']], on=['d'], how='left')
    data_df = data_df.merge(right=sell_prices[['id', 'wm_yr_wk', 'sell_price']], on=['id', 'wm_yr_wk'], how='left')

    # weeks without a listed price get a sell price of 0.0
    data_df.sell_price = data_df.sell_price.fillna(0.0)
    # previous day sales per series; the first day of each series has no predecessor and becomes NaN
    data_df['prev_day_sales'] = data_df.groupby(['id'])['sales'].shift(1)

    # remove data for d_1
    # (dropna removes every row that still contains a NaN; at this point that is expected to be
    #  only the d_1 rows, whose prev_day_sales is NaN - TODO confirm no other column can hold NaN)
    data_df.dropna(axis=0, inplace=True)
    calendar_df = calendar_df[calendar_df.d != 'd_1']

    # change dtypes
    data_df[['prev_day_sales']] = data_df[['prev_day_sales']].astype(np.int16)

    # ---- Add previous day totals of aggregated series as features ---- #
    # print('* Add previous day totals of aggregated series as features')
    # # total
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d'])[['prev_day_sales']].sum().astype(
    #                             np.int32).add_suffix('_all').reset_index(),
    #                         on=['d'], how='left')
    # # category level
    # data_df = data_df.merge(right=data_df.groupby(['d', 'cat_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='cat_id', values='prev_day_sales').add_prefix('prev_d_cat_'),
    #                         on=['d'], how='left')
    # # state level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'state_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='state_id', values='prev_day_sales').add_prefix('prev_d_state_'),
    #                         on=['d'], how='left')
    # # store level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'store_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='store_id', values='prev_day_sales').add_prefix('prev_d_store_'),
    #                         on=['d'], how='left')
    # # department level
    # data_df = data_df.merge(right=
    #                         data_df.groupby(['d', 'dept_id'])[['prev_day_sales']].sum().astype(
    #                             np.int32).reset_index().pivot(
    #                             index='d', columns='dept_id', values='prev_day_sales').add_prefix('prev_d_dept_'),
    #                         on=['d'], how='left')

    # remove category columns
    del data_df['wm_yr_wk']
    del data_df['item_id']
    del data_df['dept_id']
    del data_df['cat_id']
    del data_df['store_id']
    del data_df['state_id']

    num_samples = data_df.id.nunique()
    num_timesteps = data_df.d.nunique()
    data_df = data_df.set_index(['id', 'd'])
    
    ids = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # features available for both the training and prediction period
    enc_dec_feats = ['sell_price'] + label_encoded_cols
    # every remaining column; with the aggregate features above commented out no column appears to
    # be left over, so this index is presumably empty - verify before relying on X_enc_only_feats
    enc_only_feats = data_df.columns.difference(['sales', 'sell_price', 'prev_day_sales'] + enc_dec_feats)

    sales_data_ids = train_data[ids].values
    # NOTE: the reshapes below assume the rows are ordered day by day with all series of a day
    # together (the melt order, assumed to be preserved by the left merges above)
    Y = data_df.sales.values.reshape(num_timesteps, num_samples).T
    X_enc_only_feats = np.array(data_df[enc_only_feats]).reshape(num_timesteps, num_samples, -1)
    X_enc_dec_feats = np.array(data_df[enc_dec_feats]).reshape(num_timesteps, num_samples, -1)
    X_prev_day_sales = data_df.prev_day_sales.values.reshape(num_timesteps, num_samples)
    calendar_index = calendar_df.d
    # skip the two join-key columns ('wm_yr_wk', 'd')
    X_calendar = np.array(calendar_df.iloc[:, 2:])
    X_calendar_cols = list(calendar_df.columns[2:])

    # # for prev_day_sales and sales (y), set value as -1 for the period the product was not actively sold
    # for idx, first_non_zero_idx in enumerate((X_prev_day_sales != 0).argmax(axis=0)):
    #     X_prev_day_sales[:first_non_zero_idx, idx] = -1
    # for idx, first_non_zero_idx in enumerate((Y != 0).argmax(axis=1)):
    #     Y[idx, :first_non_zero_idx] = -1

    # ---- Save processed data ---- #
    print('* Save processed data')
    data_dict = {'sales_data_ids': sales_data_ids, 'calendar_index': calendar_index,
                 'X_prev_day_sales': X_prev_day_sales,
                 'X_enc_only_feats': X_enc_only_feats, 'X_enc_dec_feats' : X_enc_dec_feats,
                 'enc_dec_feat_names': enc_dec_feats, 'enc_only_feat_names': enc_only_feats,
                 'X_calendar': X_calendar, 'X_calendar_cols': X_calendar_cols,
                 'Y': Y,
                 'cal_label_encoder': cal_label_encoder, 'label_encoder': label_encoder}

    # pickle data
    with open(f'{output_dir}/data.pickle', 'wb') as f:
        pkl.dump(data_dict, f, protocol=pkl.HIGHEST_PROTOCOL)

### Data Utils

In [4]:
def get_aggregated_series(sales, sales_data_ids, agg_fn='sum'):
    """
    Aggregates the level 12 series to generate data for the series of all 12 hierarchy levels
    (30,490 level 12 series -> 42,840 series for the full M5 data)

    Input data format:
    sales: np array of shape (num_series, num_timesteps)
    sales_data_ids: np array of shape (num_series, 5)
                    with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns
    agg_fn: function to be used for getting aggregated series' values ('mean' or 'sum')

    Returns:
    agg_series: np array of shape (num_all_series, num_timesteps), levels stacked from 1 to 12
    agg_series_id: np array ('<U28') with the id of every row of agg_series
    affected_hierarchy_ids: np int32 array of shape (num_series, 12); row i holds, for each level,
                            the row of agg_series that contains level 12 series i

    Note: the number of level 12 series is taken from `sales` (it was previously hard-coded to
    30490, which produced a wrongly sized, partly uninitialised id array for any other input)
    """

    id_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # (level number, grouping columns) for the levels obtained through a group-by
    grouped_levels = [
        (2, ['state_id']),
        (3, ['store_id']),
        (4, ['cat_id']),
        (5, ['dept_id']),
        (6, ['state_id', 'cat_id']),
        (7, ['state_id', 'dept_id']),
        (8, ['store_id', 'cat_id']),
        (9, ['store_id', 'dept_id']),
        (10, ['item_id']),
        (11, ['state_id', 'item_id']),
    ]
    num_series = sales.shape[0]
    num_levels = len(grouped_levels) + 2  # + level 1 (total) and level 12 (the input series)

    df = pd.DataFrame({col: sales_data_ids[:, i] for i, col in enumerate(id_cols)})
    df = pd.concat([df, pd.DataFrame(sales)], axis=1)
    data_cols = list(range(sales.shape[1]))

    agg_indices, agg_series, agg_series_id = [], [], []

    # Level 1 - a single series over everything
    agg = np.sum(sales, 0) if agg_fn == 'sum' else np.mean(sales, 0)
    agg_series.append(agg.reshape(1, -1))
    agg_series_id.append(np.array(['Level1_Total_X']))

    # Levels 2 to 11
    for level, keys in grouped_levels:
        grouped = df.groupby(keys)[data_cols]
        # positional rows of the level 12 series belonging to each group
        agg_indices.append(grouped.indices)
        agg = grouped.agg(agg_fn)
        agg_series.append(agg.values)
        if len(keys) == 1:
            agg_series_id.append(f'Level{level}_' + agg.index.values + '_X')
        else:
            agg_series_id.append(f'Level{level}_' + agg.index.get_level_values(0)
                                 + '_' + agg.index.get_level_values(1))

    # Level 12 - the input series themselves, in their original order
    agg = df.set_index(['item_id', 'store_id'])[data_cols]
    agg_series.append(agg.values)
    agg_series_id.append('Level12_' + agg.index.get_level_values(0) + '_' + agg.index.get_level_values(1))

    # Get affected_hierarchy_ids - all the series affected on updating each Level 12 series
    affected_hierarchy_ids = np.empty((num_series, num_levels), np.int32)

    # Level 1 - every series contributes to the total, which is row 0
    affected_hierarchy_ids[:, 0] = 0
    fill_id = 1
    # Levels 2 to 11 - groups are numbered consecutively in the order they were stacked above
    # (relies on `indices` iterating in the same order as the aggregated result's index)
    for fill_col, level_indices in enumerate(agg_indices, start=1):
        for rows in level_indices.values():
            affected_hierarchy_ids[rows, fill_col] = fill_id
            fill_id += 1
    # Level 12 - one row per input series, directly after the last level 11 row
    affected_hierarchy_ids[:, num_levels - 1] = fill_id + np.arange(0, num_series)

    return np.concatenate(agg_series, axis=0), np.concatenate(agg_series_id, axis=0).\
        astype('<U28'), affected_hierarchy_ids


def get_weights_all_levels(sales, sell_price, sales_data_ids):
    """
    Generates weights for all 42,840 series

    Each weight is the series' dollar sales (sales * sell price, summed over the 28 timesteps)
    divided by the Level 1 (total) dollar sales, so the weights of every level sum to 1.

    Input data format:
    sales: np array of shape (30490, 28)
    sell_price: np array of shape (30490, 28)

    sales_data_ids: np array of shape (30490, 5)
                with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns

    Returns:
    weights: np array with one weight per aggregated series
    agg_series_id: np array with the id of the series each weight belongs to

    Raises:
    AssertionError if the two arrays differ in shape or do not cover exactly 28 timesteps
    """

    assert (sales.shape == sell_price.shape), "Sell price and Sales arrays have different sizes"
    assert (sales.shape[1] == 28), "Number of timesteps provided weight calculation is not equal to 28"

    # Get actual dollar sales for last 28 days for all 42,840 series
    dollar_sales = sales * sell_price
    agg_series, agg_series_id, _ = get_aggregated_series(dollar_sales, sales_data_ids)

    # Sum up the actual dollar sales for all 28 timesteps
    agg_series = agg_series.sum(1)

    # Calculate total sales for each level
    # (the trailing underscore keeps 'Level10_', 'Level11_' and 'Level12_' from matching;
    #  np.char is the public home of these string routines, np.core is private/deprecated)
    is_level1 = np.char.startswith(agg_series_id, 'Level1_')
    level_totals = agg_series[is_level1].sum()

    # Calculate weight for each series
    weights = agg_series / level_totals

    return weights, agg_series_id


def get_aggregated_encodings(encoded_feats, sales_data_ids):
    """
    Builds the categorical encodings for the series of every hierarchy level from the
    level 12 encodings (30,490 level 12 series -> 42,840 series)

    A feature that has been aggregated away at a level gets the value `number of distinct
    encodings of that feature` (i.e. one past the largest 0-indexed label); a feature the
    level is grouped by keeps the encoding of its group.

    Input data format:
    encoded_feats: np array of shape (30490, num_timesteps, 5)
                    with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns
    sales_data_ids: np array of shape (30490, 5)
                    with 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id' as the columns

    Returns:
    np array of shape (num_all_series, num_timesteps, 5) with the encodings and
    np array ('<U28') with the id of every series
    """

    id_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # (level number, grouping columns, encoded feature columns kept at that level)
    level_table = (
        (2, ['state_id'], [4]),
        (3, ['store_id'], [3]),
        (4, ['cat_id'], [2]),
        (5, ['dept_id'], [1]),
        (6, ['state_id', 'cat_id'], [4, 2]),
        (7, ['state_id', 'dept_id'], [4, 1]),
        (8, ['store_id', 'cat_id'], [3, 2]),
        (9, ['store_id', 'dept_id'], [3, 1]),
        (10, ['item_id'], [0]),
        (11, ['state_id', 'item_id'], [4, 0]),
    )

    n_steps = encoded_feats.shape[1]
    # Note - assuming the encoded value is same for all the timesteps of a series
    first_step = encoded_feats[:, 0]

    frame = pd.concat(
        [pd.DataFrame({name: sales_data_ids[:, pos] for pos, name in enumerate(id_names)}),
         pd.DataFrame(first_step)],
        axis=1)
    feat_cols = list(range(first_step.shape[1]))
    # value used for "aggregated over this feature"
    fill_row = np.array([frame[c].nunique() for c in feat_cols])

    # Level 1 - every feature is aggregated
    stacked = [fill_row.reshape(1, -1)]
    stacked_ids = [np.array(['Level1_Total_X'])]

    # Levels 2 to 11
    for level, group_keys, kept_cols in level_table:
        group_enc = frame.groupby(group_keys)[kept_cols].mean()
        block = np.repeat(fill_row[np.newaxis, :], group_enc.shape[0], axis=0)
        block[:, kept_cols] = group_enc.values
        stacked.append(block)

        prefix = 'Level' + str(level) + '_'
        if len(group_keys) == 1:
            stacked_ids.append(prefix + group_enc.index.values + '_X')
        else:
            stacked_ids.append(prefix + group_enc.index.get_level_values(0)
                               + '_' + group_enc.index.get_level_values(1))

    # Level 12 - the original encodings
    leaf = frame.set_index(['item_id', 'store_id'])[feat_cols]
    stacked.append(leaf.values)
    stacked_ids.append('Level12_' + leaf.index.get_level_values(0) + '_' + leaf.index.get_level_values(1))

    # repeat the per-series encodings along a new time axis
    all_levels = np.concatenate(stacked, axis=0)
    all_levels = np.repeat(all_levels[:, np.newaxis, :], n_steps, axis=1)

    return all_levels, np.concatenate(stacked_ids, axis=0).astype('<U28')


def update_preds_acc_hierarchy(prev_preds, preds, affected_ids):
    """
    Propagates new level 12 predictions to every aggregated series that contains them

    prev_preds: Previously stored predictions for all series (num_all_series, n_timesteps),
                e.g. (42840, n_timesteps)
    preds: Current batch predictions (batch_size, n_timesteps)
    affected_ids: integer tensor (batch_size, num_levels) with, for each series in preds, the rows
                  of prev_preds it contributes to; the last column is the row of the series itself

    Returns a new tensor with the updated predictions (prev_preds itself is not modified)

    Note: the number of hierarchy levels is read from affected_ids (previously hard-coded to 12)
    """

    num_levels = affected_ids.shape[1]

    # get the change in predictions for the batch series and
    # repeat it once for every row of prev_preds the series contributes to
    change_preds = (preds - prev_preds[affected_ids[:, -1]]).repeat_interleave(num_levels, dim=0)

    # index_add accumulates over repeated indices, so several batch series that share
    # an aggregate all add their change to it
    return prev_preds.index_add(0, affected_ids.flatten(), change_preds)

### Data Loader

In [5]:
# Dataset (Input Pipeline)
class CustomDataset(data_utils.Dataset):
    """
    Custom dataset

    Let:
    training period timesteps = [0, N]
    prediction period timesteps = [N+1, N+P]

    Arguments:
    X_prev_day_sales : previous day sales, indexed as [timestep, series]
    X_enc_only_feats : aggregated series' previous day sales for training period ([0, N])
                       (stored, but not read by __getitem__)
    X_enc_dec_feats : sell price and categorical features for training and prediction period ([0, N+P]),
                      indexed as [timestep, series, feature]; the sell price is the first feature and
                      the 5 categorical encodings are the last ones
    X_calendar : calendar features for training and prediction period ([0, N+P]);
                 the last 2 columns are the encoded event names
    norm_factor : per-sample factor the sales features are divided by
    norm_factor_sell_p : per-sample factor the sell price is divided by
    window_time_range : (encoder start, decoder start, decoder end) timesteps, or a sequence of such
                        triples (one per window) when window_id is given
    lagged_feats, rolling_feats : optional lagged / rolling sales features, indexed like X_prev_day_sales
    Y : actual sales, denoting targets for prediction period ([N+1, N+P]), indexed as [series, timestep]
    rmsse_denominator, wrmsse_weights : per-series scale and weight (required when Y is given)
    window_id : optional window index of every sample
    config : object with `add_random_noise`, `noise_rate` and `rolling`; with config=None no noise is added
    is_training : random noise is only added while training

    Returns (per item):
    [inputs, norm_factor] when Y is None, otherwise
    [inputs, y, norm_factor, ids_idx, [scale, weight], window_id], where inputs is the list
    x_enc: concatenated encoder features (except embedding)
    x_enc_emb: concatenated encoder embedding features
    x_calendar_enc_emb: encoder calendar embedding features
    x_dec: concatenated decoder features (except embedding)
    x_dec_emb: concatenated decoder embedding features
    x_calendar_dec_emb: decoder calendar embedding features
    x_sales_feats_dec: previous day sales (and lagged features) for the prediction period
    """

    # number of series over all hierarchy levels; samples of consecutive windows are laid out
    # one block of this many series after the other
    _NUM_SERIES = 42840

    def __init__(self, X_prev_day_sales, X_enc_only_feats, X_enc_dec_feats, X_calendar, norm_factor, norm_factor_sell_p,
                 window_time_range, lagged_feats=None, rolling_feats=None, Y=None, rmsse_denominator=None,
                 wrmsse_weights=None, window_id=None, config=None, is_training=True):

        self.X_prev_day_sales = X_prev_day_sales
        self.X_enc_only_feats = X_enc_only_feats
        self.X_enc_dec_feats = X_enc_dec_feats
        self.X_calendar = X_calendar
        self.norm_factor = norm_factor
        self.norm_factor_sell_p = norm_factor_sell_p
        self.window_time_range = window_time_range
        self.window_id = window_id
        self.lagged_feats = lagged_feats
        self.rolling_feats = rolling_feats
        self.config = config
        self.is_training = is_training

        if Y is not None:
            self.Y = torch.from_numpy(Y).float()
            self.rmsse_denominator = torch.from_numpy(rmsse_denominator).float()
            self.wrmsse_weights = torch.from_numpy(wrmsse_weights).float()
        else:
            self.Y = None
            self.rmsse_denominator = None
            self.wrmsse_weights = None

    def __len__(self):
        return self.norm_factor.shape[0]

    def _sample_noise(self, std, size):
        """
        Returns multiplicative noise of the given size: 1 everywhere, except for a random
        `config.noise_rate` share of the entries, which are drawn from N(1, std) clipped at 0
        """
        random_noise = np.clip(np.random.normal(1, std, size), 0, None)
        noise = np.ones_like(random_noise)
        # builtin bool instead of np.bool, which does not exist in NumPy 1.24 - 1.26
        mask = np.random.choice([0, 1], size=noise.shape, p=((1 - self.config.noise_rate),
                                                             self.config.noise_rate)).astype(bool)
        noise[mask] = random_noise[mask]
        return noise

    def __getitem__(self, idx):
        if self.window_id is not None:
            window_id = self.window_id[idx]
            time_range = self.window_time_range[window_id]
            ids_idx = idx - (window_id * self._NUM_SERIES)
        else:
            window_id = 0
            time_range = self.window_time_range
            ids_idx = idx

        # scale and weight only exist (and are only returned) when targets are available
        if self.Y is not None:
            scale = self.rmsse_denominator[ids_idx]
            weight = self.wrmsse_weights[ids_idx]

        enc_start, enc_end, dec_end = time_range[0], time_range[1], time_range[2]
        enc_timesteps = enc_end - enc_start
        dec_timesteps = dec_end - enc_start - enc_timesteps
        num_embedding = 5
        num_cal_embedding = 2

        # Filter data for time range of the selected window, also normalize prev_day_sales and sell_price
        norm_factor = self.norm_factor[idx]
        X_calendar = self.X_calendar[enc_start:dec_end]

        X_prev_day_sales = self.X_prev_day_sales[enc_start:enc_end, ids_idx] / norm_factor
        X_prev_day_sales_dec = self.X_prev_day_sales[enc_end:dec_end, ids_idx] / norm_factor
        # negative values are markers, not sales; keep them at exactly -1 after the normalization
        X_prev_day_sales[X_prev_day_sales < 0] = -1.0
        X_prev_day_sales_dec[X_prev_day_sales_dec < 0] = -1.0

        if self.lagged_feats is not None:
            X_lag_feats_enc = self.lagged_feats[enc_start:enc_end, ids_idx] / norm_factor
            X_lag_feats_dec = self.lagged_feats[enc_end:dec_end, ids_idx] / norm_factor
            X_lag_feats_enc[X_lag_feats_enc < 0] = -1.0
            X_lag_feats_dec[X_lag_feats_dec < 0] = -1.0
            # rolling features for decoder will be calculated on the fly (by including predictions for the prev steps)
            X_roll_feats_enc = self.rolling_feats[enc_start:enc_end, ids_idx] / norm_factor

        # If training and if enabled in config, multiply sales features by random noise
        # (new value will be lower bound by 0)
        if self.is_training and self.config is not None and self.config.add_random_noise:
            observed = X_prev_day_sales >= 0
            if observed.any():
                # one noise vector over encoder + decoder steps; the decoder part is its tail
                noise = self._sample_noise(X_prev_day_sales[observed].std(), dec_end - enc_start)
                observed_dec = X_prev_day_sales_dec >= 0

                X_prev_day_sales[observed] *= noise[:enc_timesteps][observed]
                X_prev_day_sales_dec[observed_dec] *= noise[enc_end - dec_end:][observed_dec]

            if self.lagged_feats is not None:
                # lagged features
                lag_observed = X_lag_feats_enc >= 0
                if lag_observed.any():
                    noise = self._sample_noise(X_lag_feats_enc[lag_observed].std(0),
                                               [dec_end - enc_start, X_lag_feats_enc.shape[1]])
                    lag_observed_dec = X_lag_feats_dec >= 0

                    X_lag_feats_enc[lag_observed] *= noise[:enc_timesteps][lag_observed]
                    X_lag_feats_dec[lag_observed_dec] *= noise[enc_end - dec_end:][lag_observed_dec]

                # rolling features
                # (the same noise is applied to the first and to the remaining len(config.rolling)
                #  columns - presumably two statistics per rolling window; verify against caller)
                num_rolling = len(self.config.rolling)
                noise = self._sample_noise(X_roll_feats_enc[:, :num_rolling].std(0),
                                           [enc_timesteps, num_rolling])

                X_roll_feats_enc[:, :num_rolling] *= noise
                X_roll_feats_enc[:, num_rolling:] *= noise

        X_enc_dec_feats = self.X_enc_dec_feats[enc_start:dec_end, ids_idx]

        # Directly dividing the sell price column leads to memory explosion
        norm_factor_sell_p = np.ones_like(X_enc_dec_feats, np.float64)
        norm_factor_sell_p[:, 0] = self.norm_factor_sell_p[idx]
        X_enc_dec_feats = X_enc_dec_feats / norm_factor_sell_p

        # input data for encoder
        x_enc_dec_feats_enc = X_enc_dec_feats[:enc_timesteps, :-num_embedding].reshape(enc_timesteps, -1)

        x_prev_day_sales_enc = X_prev_day_sales.reshape(-1, 1)
        x_sales_feats_enc = x_prev_day_sales_enc if self.lagged_feats is None \
            else np.concatenate([x_prev_day_sales_enc, X_lag_feats_enc, X_roll_feats_enc], 1)

        x_calendar_enc = X_calendar[:enc_timesteps, :-num_cal_embedding]
        x_calendar_enc_emb = X_calendar[:enc_timesteps, -num_cal_embedding:].reshape(enc_timesteps, -1)

        x_enc = np.concatenate([x_enc_dec_feats_enc, x_calendar_enc, x_sales_feats_enc], axis=1)
        x_enc_emb = X_enc_dec_feats[:enc_timesteps, -num_embedding:].reshape(enc_timesteps, -1)

        # input data for decoder
        x_enc_dec_feats_dec = X_enc_dec_feats[enc_timesteps:, :-num_embedding].reshape(dec_timesteps, -1)
        x_calendar_dec = X_calendar[enc_timesteps:, :-num_cal_embedding]
        x_calendar_dec_emb = X_calendar[enc_timesteps:, -num_cal_embedding:].reshape(dec_timesteps, -1)

        x_prev_day_sales_dec = X_prev_day_sales_dec.reshape(-1, 1)
        x_sales_feats_dec = x_prev_day_sales_dec if self.lagged_feats is None \
            else np.concatenate([x_prev_day_sales_dec, X_lag_feats_dec], 1)

        x_dec = np.concatenate([x_enc_dec_feats_dec, x_calendar_dec], axis=1)
        x_dec_emb = X_enc_dec_feats[enc_timesteps:, -num_embedding:].reshape(dec_timesteps, -1)

        inputs = [torch.from_numpy(x_enc).float(), torch.from_numpy(x_enc_emb).long(),
                  torch.from_numpy(x_calendar_enc_emb).long(),
                  torch.from_numpy(x_dec).float(), torch.from_numpy(x_dec_emb).long(),
                  torch.from_numpy(x_calendar_dec_emb).long(),
                  torch.from_numpy(x_sales_feats_dec).float()]

        if self.Y is None:
            return [inputs, norm_factor]

        Y = self.Y[ids_idx, enc_end:dec_end]

        return [inputs,
                Y, torch.from_numpy(np.array(norm_factor)).float(),
                ids_idx,
                [scale, weight],
                window_id]


class DataLoader:
    """Prepare the aggregated sales arrays and wrap them in ``torch`` data loaders.

    The constructor reads the pickle at ``config.data_file``, aggregates the series with
    ``get_aggregated_series`` / ``get_aggregated_encodings`` and caches the arrays that the
    three ``create_*_loader`` methods slice from.

    Layout, as implied by the slicing done in this class: ``X_prev_day_sales``,
    ``X_enc_dec_feats``, ``X_enc_only_feats`` and ``X_calendar`` are indexed by timestep on
    axis 0, whereas ``Y`` is indexed by timestep on axis 1.

    ``n_windows`` is 1 until ``create_train_loader`` runs in sliding-window mode, after which
    it holds the number of training windows that were generated.
    """

    def __init__(self, config):
        """Load ``config.data_file`` and build the aggregated arrays.

        Args:
            config: object exposing ``data_file`` here and, for the loader factories,
                ``training_ts`` / ``validation_ts`` / ``test_ts``, ``sliding_window``,
                ``window_length``, ``lag_and_roll_feats``, ``lags``, ``rolling`` and ``batch_size``.
        """
        self.config = config

        # load data
        # NOTE: unpickling can execute arbitrary code; `config.data_file` must be a trusted file.
        with open(self.config.data_file, 'rb') as f:
            data_dict = pkl.load(f)

        self.ids = data_dict['sales_data_ids']
        self.enc_dec_feat_names = data_dict['enc_dec_feat_names']
        self.sell_price_i = self.enc_dec_feat_names.index('sell_price')

        self.X_prev_day_sales, self.agg_ids, _ = get_aggregated_series(data_dict['X_prev_day_sales'].T, self.ids)
        self.X_prev_day_sales = self.X_prev_day_sales.T

        self.X_enc_dec_feats = data_dict['X_enc_dec_feats']
        # un-aggregated sell prices are kept because the loss weights are computed from them
        self.sell_price_l12 = self.X_enc_dec_feats[:, :, self.sell_price_i]
        sell_price_all, _, _ = get_aggregated_series(self.sell_price_l12.T, self.ids, 'mean')
        encodings_all, _ = get_aggregated_encodings(self.X_enc_dec_feats[:, :, 1:].transpose(1, 0, 2), self.ids)
        # NOTE(review): the rebuilt array puts the aggregated sell price at index 0 and takes the
        # encodings from columns 1:, yet it is later indexed with `sell_price_i` - this presumably
        # relies on 'sell_price' being the first feature; confirm against the preprocessing step.
        self.X_enc_dec_feats = np.concatenate([sell_price_all[:, :, np.newaxis], encodings_all], axis=2) \
            .transpose(1, 0, 2)

        self.Y_l12 = data_dict['Y']
        self.Y, _, _ = get_aggregated_series(data_dict['Y'], self.ids)

        # for prev_day_sales, set value as -1 for the period the product was not actively sold
        self.X_prev_day_sales_unsold_negative = self.X_prev_day_sales.copy()
        for idx, first_non_zero_idx in enumerate((self.X_prev_day_sales != 0).argmax(axis=0)):
            self.X_prev_day_sales_unsold_negative[:first_non_zero_idx, idx] = -1

        self.X_enc_only_feats = data_dict['X_enc_only_feats']
        self.X_calendar = data_dict['X_calendar']
        self.n_windows = 1

    # ------------------------------------------------------------------ helpers

    def _resolve_timesteps(self, ts_name, data_start_t, horizon_start_t, horizon_end_t):
        """Return the three timesteps, taking all of them from ``config.<ts_name>`` if any is None."""
        if data_start_t is None or horizon_start_t is None or horizon_end_t is None:
            defaults = getattr(self.config, ts_name)
            return defaults['data_start_t'], defaults['horizon_start_t'], defaults['horizon_end_t']
        return data_start_t, horizon_start_t, horizon_end_t

    def _scale_denominators(self, horizon_start_t):
        """Return the per-series loss scale computed from timesteps before ``horizon_start_t``.

        For each series this is the mean absolute difference between ``Y`` and the previous-day
        sales (cast to int64), taken from the first non-zero previous-day sale onwards. A mean of
        0 is replaced by 1 so the value can safely be used as a divisor.
        """
        absolute_movement = np.absolute(self.Y.T[:horizon_start_t] -
                                        self.X_prev_day_sales[:horizon_start_t]).astype(np.int64)
        first_active_sell = (self.X_prev_day_sales[:horizon_start_t] != 0).argmax(axis=0)
        rmsse_den = []
        for series_idx, first_active_sell_idx in enumerate(first_active_sell):
            den = absolute_movement[first_active_sell_idx:, series_idx].mean()
            rmsse_den.append(den if den != 0 else 1)
        return np.array(rmsse_den)

    def _loss_weights(self, horizon_start_t):
        """Return the WRMSSE / SPL weights derived from the 28 timesteps before ``horizon_start_t``."""
        weights, _ = get_weights_all_levels(self.Y_l12[:, horizon_start_t - 28:horizon_start_t],
                                            self.sell_price_l12[horizon_start_t - 28:horizon_start_t, :].T,
                                            self.ids)
        return weights

    def _norm_factors(self, data_start_t, horizon_start_t):
        """Return ``(sales_norm, sell_price_norm)`` for the input window.

        Sales are normalised by the per-series mean and sell prices by the per-series median over
        ``[data_start_t, horizon_start_t)``; factors equal to 0 are replaced by 1.
        """
        norm_factor = np.mean(self.X_prev_day_sales[data_start_t:horizon_start_t], 0)
        norm_factor[norm_factor == 0] = 1.

        sell_p = self.X_enc_dec_feats[data_start_t:horizon_start_t, :, self.sell_price_i].copy().astype(float)
        norm_factor_sell_p = np.median(sell_p, 0)
        norm_factor_sell_p[norm_factor_sell_p == 0] = 1.
        return norm_factor, norm_factor_sell_p

    def _lag_and_roll_feats(self, data_start_t):
        """Return ``(lagged_feats, rolling_feats)`` aligned with ``data_start_t``, or ``(None, None)``.

        Lag features are built from the "unsold = -1" sales array, one slice per configured lag
        (largest lag first). Rolling features are the rolling means followed by the rolling
        standard deviations of the plain previous-day sales, one slice per configured window.

        Raises:
            ValueError: if fewer than ``max(lags + rolling)`` timesteps precede ``data_start_t``.
        """
        if not self.config.lag_and_roll_feats:
            return None, None

        max_prev_ts_req = max(self.config.lags + self.config.rolling)
        history_start_t = data_start_t - max_prev_ts_req
        if history_start_t < 0:
            # a negative start would silently slice from the end of the array instead
            raise ValueError(f'data_start_t={data_start_t} leaves fewer than {max_prev_ts_req} '
                             f'previous timesteps for the lag and rolling features')

        unsold_negative = self.X_prev_day_sales_unsold_negative[history_start_t:].astype(np.int32)
        lagged_feats = []
        for lag_i in sorted(self.config.lags, reverse=True):
            lag_i_feat = np.roll(unsold_negative, lag_i, axis=0)  # np.roll returns a new array
            lag_i_feat[:lag_i] = 0  # discard the values that wrapped around from the end
            lagged_feats.append(lag_i_feat)
        lagged_feats = np.stack(lagged_feats, axis=2)[max_prev_ts_req:]

        roll_df = pd.DataFrame(self.X_prev_day_sales[history_start_t:].astype(np.int32))
        # rolling over rows is the default; the explicit `axis` argument is deprecated in pandas
        roll_i_means = [roll_df.rolling(roll_i).mean().fillna(0).values for roll_i in self.config.rolling]
        roll_i_stds = [roll_df.rolling(roll_i).std().fillna(0).values for roll_i in self.config.rolling]
        rolling_feats = np.stack(roll_i_means + roll_i_stds, 2)[max_prev_ts_req:]
        return lagged_feats, rolling_feats

    # ------------------------------------------------------------------ loaders

    def create_train_loader(self, data_start_t=None, horizon_start_t=None, horizon_end_t=None):
        """Build the shuffled training loader.

        If any of the three timesteps is None, all three are taken from ``config.training_ts``.
        With ``config.sliding_window`` enabled, one window is generated every 28 timesteps and
        ``self.n_windows`` is updated; the per-window scales, weights and normalisation factors
        are concatenated in window order.

        Raises:
            ValueError: if sliding-window mode yields no window for the given timesteps.
        """
        data_start_t, horizon_start_t, horizon_end_t = self._resolve_timesteps(
            'training_ts', data_start_t, horizon_start_t, horizon_end_t)

        # Run a sliding window of length "window_length" and train for the next month of each window
        if self.config.sliding_window:
            window_length = self.config.window_length
            window_horizon_starts = range(data_start_t + window_length, horizon_end_t, 28)
            if len(window_horizon_starts) == 0:
                raise ValueError('sliding window produced no training windows: '
                                 f'data_start_t + window_length = {data_start_t + window_length} '
                                 f'is not before horizon_end_t = {horizon_end_t}')

            window_time_range, norm_factor, norm_factor_sell_p = [], [], []
            weights, scales = [], []
            for idx, w_horizon_start_t in enumerate(window_horizon_starts):
                w_data_start_t = data_start_t + (idx * 28)
                w_horizon_end_t = w_horizon_start_t + 28
                # window boundaries are stored relative to data_start_t
                window_time_range.append([w_data_start_t - data_start_t, w_horizon_start_t - data_start_t,
                                          w_horizon_end_t - data_start_t])

                scales.append(self._scale_denominators(w_horizon_start_t))
                weights.append(self._loss_weights(w_horizon_start_t))

                w_norm_factor, w_norm_factor_sell_p = self._norm_factors(w_data_start_t, w_horizon_start_t)
                norm_factor.append(w_norm_factor)
                norm_factor_sell_p.append(w_norm_factor_sell_p)

            self.n_windows = len(window_horizon_starts)
            scales = np.concatenate(scales, 0)
            weights = np.concatenate(weights, 0)
            norm_factor = np.concatenate(norm_factor, 0)
            norm_factor_sell_p = np.concatenate(norm_factor_sell_p, 0)
            window_time_range = np.array(window_time_range)
            # one window id per (window, series) pair, in the same order as the concatenations above
            window_id = np.arange(self.n_windows).repeat(self.X_enc_dec_feats.shape[1])

        else:
            scales = self._scale_denominators(horizon_start_t)
            weights = self._loss_weights(horizon_start_t)
            norm_factor, norm_factor_sell_p = self._norm_factors(data_start_t, horizon_start_t)
            window_time_range = np.array([0, horizon_start_t - data_start_t, horizon_end_t - data_start_t])
            window_id = None

        # Add rolling and lag features
        lagged_feats, rolling_feats = self._lag_and_roll_feats(data_start_t)

        dataset = CustomDataset(self.X_prev_day_sales_unsold_negative[data_start_t:],
                                self.X_enc_only_feats[data_start_t:],
                                self.X_enc_dec_feats[data_start_t:],
                                self.X_calendar[data_start_t:],
                                norm_factor, norm_factor_sell_p, window_time_range,
                                lagged_feats, rolling_feats,
                                Y=self.Y[:, data_start_t:],
                                rmsse_denominator=scales, wrmsse_weights=weights, window_id=window_id,
                                config=self.config)

        return torch.utils.data.DataLoader(dataset=dataset, batch_size=self.config.batch_size, shuffle=True,
                                           num_workers=3, pin_memory=True)

    def create_val_loader(self, data_start_t=None, horizon_start_t=None, horizon_end_t=None):
        """Build the (unshuffled) validation loader.

        If any of the three timesteps is None, all three are taken from ``config.validation_ts``.
        """
        data_start_t, horizon_start_t, horizon_end_t = self._resolve_timesteps(
            'validation_ts', data_start_t, horizon_start_t, horizon_end_t)

        scales = self._scale_denominators(horizon_start_t)
        weights = self._loss_weights(horizon_start_t)
        norm_factor, norm_factor_sell_p = self._norm_factors(data_start_t, horizon_start_t)
        window_time_range = [0, horizon_start_t - data_start_t, horizon_end_t - data_start_t]

        # Add rolling and lag features
        lagged_feats, rolling_feats = self._lag_and_roll_feats(data_start_t)

        dataset = CustomDataset(self.X_prev_day_sales_unsold_negative[data_start_t:],
                                self.X_enc_only_feats[data_start_t:],
                                self.X_enc_dec_feats[data_start_t:],
                                self.X_calendar[data_start_t:],
                                norm_factor, norm_factor_sell_p, window_time_range,
                                lagged_feats, rolling_feats,
                                Y=self.Y[:, data_start_t:],
                                rmsse_denominator=scales, wrmsse_weights=weights,
                                config=self.config, is_training=False)

        return torch.utils.data.DataLoader(dataset=dataset, batch_size=self.config.batch_size, num_workers=3,
                                           pin_memory=True)

    def create_test_loader(self, data_start_t=None, horizon_start_t=None, horizon_end_t=None):
        """Build the (unshuffled) test loader; no targets, scales or weights are passed to the dataset.

        If any of the three timesteps is None, all three are taken from ``config.test_ts``.
        """
        data_start_t, horizon_start_t, horizon_end_t = self._resolve_timesteps(
            'test_ts', data_start_t, horizon_start_t, horizon_end_t)

        norm_factor, norm_factor_sell_p = self._norm_factors(data_start_t, horizon_start_t)
        window_time_range = [0, horizon_start_t - data_start_t, horizon_end_t - data_start_t]

        # Add rolling and lag features
        lagged_feats, rolling_feats = self._lag_and_roll_feats(data_start_t)

        dataset = CustomDataset(self.X_prev_day_sales_unsold_negative[data_start_t:],
                                self.X_enc_only_feats[data_start_t:],
                                self.X_enc_dec_feats[data_start_t:],
                                self.X_calendar[data_start_t:],
                                norm_factor, norm_factor_sell_p, window_time_range,
                                lagged_feats, rolling_feats, config=self.config, is_training=False)

        return torch.utils.data.DataLoader(dataset=dataset, batch_size=self.config.batch_size, num_workers=3,
                                           pin_memory=True)

### Config

In [6]:
class Config:
    """Static training configuration, read as class attributes by the loaders, checkpointing and trainer."""

    resume_training = False
    resume_from_fold = 1  # In case of k-fold training [1, k]

    # Fold currently being trained (1-based); None means a single, non k-fold run.
    # The k-fold driver overwrites this before each fold. Defining it here keeps
    # `config.fold` readable even when that driver has not run.
    fold = None

    loss_fn = 'SPLLoss'
    metric = 'SPLMetric'
    secondary_metric = 'WRMSSEMetric'
    architecture = 'seq2seq_w_attn_on_hid'

    # Running a sliding window training will help increase the training data
    sliding_window = True  # Note: sliding window has not been tested with WRMSSELoss
    window_length = 28 * 13

    lag_and_roll_feats = True  # Note: Currently only works with dilated_seq2seq & seq2seq_w_attn_on_hid architectures
    lags = list(range(27, 42))
    rolling = [7, 14, 30, 60, 180]

    # Regularization
    add_random_noise = True
    noise_rate = 0.5

    # *** RNN *** #
    # hidden dimension and no. of layers will be the same for both encoder and decoder
    rnn_num_hidden = 128
    rnn_num_layers = 2
    bidirectional = True
    enc_rnn_dropout = 0.2
    dec_rnn_dropout = 0.0
    teacher_forcing_ratio = 0.0

    # *** Transformer *** #
    enc_nhead = 4
    enc_nlayers = 2
    enc_dropout = 0.1
    dec_nhead = 4
    dec_nlayers = 2
    dec_dropout = 0.1

    # num_epochs = 200
    num_epochs = 2
    batch_size = 160
    learning_rate = 0.0003

    # training, validation and test periods
    # Every boundary is written as an offset, in multiples of 28 timesteps, back from 1969 - 1
    # (presumably the last available timestep index - confirm against the processed data).
    training_ts = {'data_start_t': 1969 - 1 - (28 * 29), 'horizon_start_t': 1969 - 1 - (28 * 3),
                   'horizon_end_t': 1969 - 1 - (28 * 2)}
    validation_ts = {'data_start_t': 1969 - 1 - (28 * 15), 'horizon_start_t': 1969 - 1 - (28 * 2),
                     'horizon_end_t': 1969 - 1 - (28 * 1)}
    test_ts = {'data_start_t': 1969 - 1 - (28 * 14), 'horizon_start_t': 1969 - 1 - (28 * 1),
               'horizon_end_t': 1969 - 1 - (28 * 0)}

    # Parameters for k-fold training
    k_fold = True
    # One (training periods, validation periods) pair per fold. The last fold uses the same
    # boundaries as `training_ts` / `validation_ts`; each earlier fold is shifted 28 timesteps back.
    k_fold_splits = list(zip(
        [
            {'data_start_t': 1969 - 1 - (28 * 31), 'horizon_start_t': 1969 - 1 - (28 * 5),
             'horizon_end_t': 1969 - 1 - (28 * 4)},
            {'data_start_t': 1969 - 1 - (28 * 30), 'horizon_start_t': 1969 - 1 - (28 * 4),
             'horizon_end_t': 1969 - 1 - (28 * 3)},
            {'data_start_t': 1969 - 1 - (28 * 29), 'horizon_start_t': 1969 - 1 - (28 * 3),
             'horizon_end_t': 1969 - 1 - (28 * 2)},
        ],
        [
            {'data_start_t': 1969 - 1 - (28 * 17), 'horizon_start_t': 1969 - 1 - (28 * 4),
             'horizon_end_t': 1969 - 1 - (28 * 3)},
            {'data_start_t': 1969 - 1 - (28 * 16), 'horizon_start_t': 1969 - 1 - (28 * 3),
             'horizon_end_t': 1969 - 1 - (28 * 2)},
            {'data_start_t': 1969 - 1 - (28 * 15), 'horizon_start_t': 1969 - 1 - (28 * 2),
             'horizon_end_t': 1969 - 1 - (28 * 1)},
        ],
    ))

    data_file = './data.pickle'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Training Utils

In [7]:
import torch
import shutil
import os
import numpy as np


class ModelCheckpoint:
    """Save and restore training state (model, optimizer, scheduler and progress counters).

    Two files live in ``weight_dir``: the latest checkpoint, rewritten on every ``save``, and a
    copy of it taken whenever ``save`` is called with ``is_best=True``. When the config carries
    a non-None ``fold`` both file names are prefixed with ``fold_<fold>_``.
    """

    def __init__(self, weight_dir='./weights', config=None):
        """
        Args:
            weight_dir: directory holding the checkpoint files; created on first ``save``.
            config: object exposing ``device`` and, optionally, ``fold``. Defaults to ``Config``.
        """
        if config is None:
            # resolved at call time so the class does not need `Config` to exist when it is defined
            config = Config
        self.weight_dir = weight_dir
        self.config = config
        fold = getattr(config, 'fold', None)  # configs without a fold behave like a non k-fold run
        file_prefix = '' if fold is None else f'fold_{fold}_'
        self.filename = os.path.join(self.weight_dir, file_prefix + 'model_latest_checkpoint.pth.tar')
        self.best_filename = os.path.join(self.weight_dir, file_prefix + 'model_best.pth.tar')

    def save(self, is_best, min_val_error, num_bad_epochs, epoch, model, optimizer, scheduler=None):
        """Write the latest checkpoint and, if ``is_best``, copy it to the best-model file."""
        if self.weight_dir:
            os.makedirs(self.weight_dir, exist_ok=True)

        save_dict = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            # stored as a plain float: a NumPy scalar would be pickled as a NumPy object, which
            # `torch.load` refuses to unpickle when it runs in weights-only mode
            'min_val_error': None if min_val_error is None else float(min_val_error),
            'num_bad_epochs': num_bad_epochs,
            'scheduler': None if scheduler is None else scheduler.state_dict(),
        }
        torch.save(save_dict, self.filename)
        if is_best:
            shutil.copyfile(self.filename, self.best_filename)

    def load(self, model, optimizer=None, scheduler=None, load_best=False):
        """Restore state from the latest (or best) checkpoint.

        Args:
            model: module whose parameters are overwritten in place.
            optimizer: optional optimizer to restore.
            scheduler: optional scheduler to restore; left untouched if the checkpoint was
                saved without one.
            load_best: read the best-model file instead of the latest checkpoint.

        Returns:
            ``(model, optimizer, scheduler, [start_epoch, min_val_error, num_bad_epochs])`` where
            ``start_epoch`` is the saved epoch plus one.

        Raises:
            FileNotFoundError: if the selected checkpoint file does not exist.
        """
        load_filename = self.best_filename if load_best else self.filename
        if not os.path.isfile(load_filename):
            raise FileNotFoundError(f'No checkpoint found at {load_filename}')

        checkpoint = torch.load(load_filename, map_location=self.config.device)
        model.load_state_dict(checkpoint['model'])
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
        if scheduler is not None and checkpoint['scheduler'] is not None:
            scheduler.load_state_dict(checkpoint['scheduler'])

        start_epoch = checkpoint['epoch'] + 1
        min_val_error = checkpoint['min_val_error']
        num_bad_epochs = checkpoint['num_bad_epochs']

        return model, optimizer, scheduler, [start_epoch, min_val_error, num_bad_epochs]


class EarlyStopping:
    """Signal when a monitored metric has stopped improving.

    Call ``step(metric)`` once per epoch; it returns True when training should stop, i.e. when
    the metric is NaN or when it has failed to improve on the best value seen so far for
    ``patience`` consecutive calls. With ``patience=0`` early stopping is disabled and ``step``
    always returns False.

    Args:
        mode: ``'min'`` if lower is better, ``'max'`` if higher is better.
        min_delta: margin by which the metric must beat the best value to count as improved;
            an absolute amount, or a percentage of the best value when ``percentage`` is True.
        patience: number of consecutive non-improving calls tolerated before stopping.
        percentage: interpret ``min_delta`` as a percentage of the best value.

    Raises:
        ValueError: if ``mode`` is neither ``'min'`` nor ``'max'``.

    author:https://github.com/stefanonardo
    source: https://gist.github.com/stefanonardo/693d96ceb2f531fa05db530f3e21517d
    """

    def __init__(self, mode='min', min_delta=0, patience=10, percentage=False):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience
        self.best = None
        self.num_bad_epochs = 0
        self.is_better = None
        self._init_is_better(mode, min_delta, percentage)

        if patience == 0:
            # disable early stopping: shadow `step` on the instance with a constant False
            self.is_better = lambda a, b: True
            self.step = lambda a: False

    def step(self, metrics):
        """Record one metric value and return True if training should stop."""
        # NaN is checked before anything else: storing it as `best` would make every later
        # comparison False and stop training after `patience` epochs even if the metric recovers.
        if np.isnan(metrics):
            return True

        if self.best is None:
            self.best = metrics
            return False

        if self.is_better(metrics, self.best):
            self.num_bad_epochs = 0
            self.best = metrics
        else:
            self.num_bad_epochs += 1

        return self.num_bad_epochs >= self.patience

    def _init_is_better(self, mode, min_delta, percentage):
        """Install ``self.is_better(value, best)`` for the requested mode and delta type."""
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')

        if percentage:
            def margin(best):
                return best * min_delta / 100
        else:
            def margin(best):
                return min_delta

        if mode == 'min':
            self.is_better = lambda a, best: a < best - margin(best)
        else:
            self.is_better = lambda a, best: a > best + margin(best)

# Workflow

### Read Data

In [8]:
# Run the preprocessing step with read_data's default input and output directories spelled out.
print('Processing Data:\n')
read_data(input_data_dir='./m5-forecasting-uncertainty', output_dir='.')
print('\nCompleted')

Processing Data:

* Processing calendar features
* Encoding categorical features
* Add previous day sales and merge sell prices
* Save processed data

Completed


### Train Data

In [9]:
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from importlib import import_module
import shutil
import glob
import os
import sys

# from data_loader.data_generator import DataLoader
# from utils.data_utils import *
# from utils.training_utils import ModelCheckpoint, EarlyStopping
from losses_and_metrics import loss_functions, metrics
# from config import Config

# Seed NumPy and PyTorch (plus the current CUDA device, when one is available)
# with the same value so that repeated runs draw the same random numbers.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)


class Trainer:
    """Build the model, data loaders and training utilities from a config and run the training loop.

    The model comes from ``models.<config.architecture>.create_model``; the loss and the two
    metrics are looked up by name in ``loss_functions`` / ``metrics``. Losses whose name starts
    with ``'SPL'`` are aggregated over batches with a sum, all others with a mean.
    """

    def __init__(self, config):
        """Create the model, optimizer, scheduler, data loaders, checkpointing and log writer.

        When ``config.resume_training`` is set, the latest checkpoint is loaded; otherwise the
        log directory of this run is emptied first.
        """
        self.config = config
        self.terminal_width = shutil.get_terminal_size((80, 20)).columns
        fold = getattr(self.config, 'fold', None)  # a config without `fold` is a non k-fold run

        # Model
        print(f' Model: {self.config.architecture} '.center(self.terminal_width, '*'))
        model_type = import_module('models.' + self.config.architecture)
        create_model = getattr(model_type, 'create_model')
        self.model = create_model(self.config)
        print(self.model, end='\n\n')

        # Loss, Optimizer and LRScheduler
        self.criterion = getattr(loss_functions, self.config.loss_fn)(self.config)
        self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.config.learning_rate, alpha=0.95)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.5,
                                                                    patience=4, verbose=True)
        self.early_stopping = EarlyStopping(patience=10)
        self.agg_sum = self.config.loss_fn[:3] == 'SPL'
        self.loss_agg = np.sum if self.agg_sum else np.mean

        # Metric
        self.metric = getattr(metrics, config.metric)()
        self.metric_2 = getattr(metrics, config.secondary_metric)()

        print(' Loading Data '.center(self.terminal_width, '*'))
        data_loader = DataLoader(self.config)
        self.ids = data_loader.ids

        self.train_loader = data_loader.create_train_loader()
        self.val_loader = data_loader.create_val_loader()
        # read after create_train_loader(), which sets the window count in sliding-window mode
        self.n_windows = data_loader.n_windows

        self.start_epoch, self.min_val_error = 1, None
        # Load checkpoint if training is to be resumed
        self.model_checkpoint = ModelCheckpoint(config=self.config)
        if config.resume_training:
            self.model, self.optimizer, self.scheduler, [self.start_epoch, self.min_val_error, num_bad_epochs] = \
                self.model_checkpoint.load(self.model, self.optimizer, self.scheduler)
            self.early_stopping.best = self.min_val_error
            self.early_stopping.num_bad_epochs = num_bad_epochs
            print(f'Resuming model training from epoch {self.start_epoch}')
        else:
            # remove previous logs, if any
            self._clear_logs('./logs' if fold is None else f'./logs/fold_{fold}')

        # logging
        self.writer = SummaryWriter('logs') if fold is None else SummaryWriter(f'logs/fold_{fold}')

    @staticmethod
    def _clear_logs(log_dir):
        """Delete every entry (hidden ones included) directly inside ``log_dir``."""
        for entry in glob.glob(f'{log_dir}/.*') + glob.glob(f'{log_dir}/*'):
            if os.path.isdir(entry) and not os.path.islink(entry):
                shutil.rmtree(entry)
            else:
                os.remove(entry)

    def _to_device(self, x, y, norm_factor, loss_input):
        """Move one batch's tensors to ``config.device``."""
        device = self.config.device
        return ([inp.to(device) for inp in x], y.to(device), norm_factor.to(device),
                [inp.to(device) for inp in loss_input])

    def _get_val_loss_and_err(self):
        """Evaluate on the validation loader.

        Returns:
            ``(loss, error, secondary_error)``: the batch losses aggregated with
            ``self.loss_agg`` and the two metrics computed over the whole validation set.
        """
        self.model.eval()
        progbar = tqdm(self.val_loader)
        progbar.set_description("             ")
        losses, epoch_preds, epoch_ys, epoch_ws, epoch_scales = [], [], [], [], []
        # no gradients are needed for evaluation; skipping the autograd graph saves memory
        with torch.no_grad():
            for x, y, norm_factor, ids_idx, loss_input, _ in progbar:
                epoch_ys.append(y.data.numpy())
                epoch_scales.append(loss_input[0].data.numpy())
                epoch_ws.append(loss_input[1].data.numpy())

                x, y, norm_factor, loss_input = self._to_device(x, y, norm_factor, loss_input)

                # the model output is multiplied back by each series' normalisation factor
                preds = self.model(*x) * norm_factor[:, None, None]
                epoch_preds.append(preds.data.cpu().numpy())
                loss = self.criterion(preds, y, *loss_input)
                losses.append(loss.data.cpu().numpy())

        epoch_preds, epoch_ys = np.concatenate(epoch_preds, axis=0), np.concatenate(epoch_ys, axis=0)
        epoch_ws, epoch_scales = np.concatenate(epoch_ws, axis=0), np.concatenate(epoch_scales, axis=0)

        val_error = self.metric.get_error(epoch_preds, epoch_ys, epoch_scales, epoch_ws)
        # index 4 on the last axis feeds the point metric - presumably the median quantile; confirm
        val_error_2 = self.metric_2.get_error(epoch_preds[:, :, 4], epoch_ys, epoch_scales, epoch_ws)

        return self.loss_agg(losses), val_error, val_error_2

    def train(self):
        """Run the training loop from ``start_epoch`` to ``config.num_epochs`` (inclusive).

        Each epoch trains on every batch, evaluates on the validation loader, steps the LR
        scheduler on the validation error, saves a checkpoint, writes TensorBoard scalars and
        stops early when ``self.early_stopping`` says so.
        """
        print(' Training '.center(self.terminal_width, '*'), end='\n\n')

        for epoch in range(self.start_epoch, self.config.num_epochs + 1):
            print(f' Epoch [{epoch}/{self.config.num_epochs}] '.center(self.terminal_width, 'x'))
            self.model.train()
            progbar = tqdm(self.train_loader)
            losses, epoch_preds, epoch_ys, epoch_ws, epoch_scales = [], [], [], [], []
            for i, [x, y, norm_factor, ids_idx, loss_input, window_id] in enumerate(progbar):
                x, y, norm_factor, loss_input = self._to_device(x, y, norm_factor, loss_input)

                # Forward + Backward + Optimize
                self.optimizer.zero_grad()
                preds = self.model(*x) * norm_factor[:, None, None]

                if self.config.sliding_window:
                    # training metrics are computed on the samples of the last window only
                    in_last_window = window_id == self.n_windows - 1
                    if torch.sum(in_last_window) > 0:
                        epoch_ys.append(y[in_last_window].data.cpu().numpy().reshape(-1, 28))
                        epoch_scales.append(loss_input[0][in_last_window].data.cpu().numpy().reshape(-1))
                        epoch_ws.append(loss_input[1][in_last_window].data.cpu().numpy().reshape(-1))
                        epoch_preds.append(preds[in_last_window].data.cpu().numpy().reshape(-1, 28, 9))
                else:
                    epoch_ys.append(y.data.cpu().numpy())
                    epoch_scales.append(loss_input[0].data.cpu().numpy())
                    epoch_ws.append(loss_input[1].data.cpu().numpy())
                    epoch_preds.append(preds.data.cpu().numpy())

                loss = self.criterion(preds, y, *loss_input)
                losses.append(loss.data.cpu().numpy())

                if self.agg_sum:
                    # extrapolate the running sum to a full epoch and report it per window
                    progbar.set_description("loss = %0.3f " % np.round(
                        (len(self.train_loader) / (i + 1)) * self.loss_agg(losses) / self.n_windows, 3))
                else:
                    progbar.set_description("loss = %0.3f " % np.round(self.loss_agg(losses), 3))

                loss.backward()
                self.optimizer.step()

            # Get training and validation loss and error
            epoch_preds, epoch_ys = np.concatenate(epoch_preds, axis=0), np.concatenate(epoch_ys, axis=0)
            epoch_ws, epoch_scales = np.concatenate(epoch_ws, axis=0), np.concatenate(epoch_scales, axis=0)

            if self.agg_sum:
                train_loss = self.loss_agg(losses) / self.n_windows
            else:
                train_loss = self.loss_agg(losses)

            train_error = self.metric.get_error(epoch_preds, epoch_ys, epoch_scales, epoch_ws)
            train_error_2 = self.metric_2.get_error(epoch_preds[:, :, 4], epoch_ys, epoch_scales, epoch_ws)

            val_loss, val_error, val_error_2 = self._get_val_loss_and_err()

            print(f'Training Loss: {train_loss:.4f}, Training Error: {train_error:.4f}, '
                  f'Training Secondary Error: {train_error_2:.4f}\n'
                  f'Validation Loss: {val_loss:.4f}, Validation Error: {val_error:.4f}, '
                  f'Validation Secondary Error: {val_error_2:.4f}')

            # Change learning rate according to scheduler
            self.scheduler.step(val_error)

            # track the best validation error seen so far
            is_best = self.min_val_error is None or val_error < self.min_val_error
            if is_best:
                self.min_val_error = val_error
                print(f'Best model obtained at the end of epoch {epoch}')

            # Update the early-stopping state before saving, so the checkpoint stores this
            # epoch's bad-epoch counter and a resumed run continues from the right count.
            stop_training = self.early_stopping.step(val_error)

            # save checkpoint and best model
            self.model_checkpoint.save(is_best, self.min_val_error, self.early_stopping.num_bad_epochs,
                                       epoch, self.model, self.optimizer, self.scheduler)

            # write logs
            step = epoch * i  # `i` is the index of the last training batch of this epoch
            self.writer.add_scalar(f'{self.config.loss_fn}/train', train_loss, step)
            self.writer.add_scalar(f'{self.config.loss_fn}/val', val_loss, step)
            self.writer.add_scalar(f'{self.config.metric}/train', train_error, step)
            self.writer.add_scalar(f'{self.config.metric}/val', val_error, step)
            self.writer.add_scalar(f'{self.config.secondary_metric}/train', train_error_2, step)
            self.writer.add_scalar(f'{self.config.secondary_metric}/val', val_error_2, step)

            # Early Stopping
            if stop_training:
                print(' Training Stopped'.center(self.terminal_width, '*'))
                print(f'Early stopping triggered after epoch {epoch}')
                break

        self.writer.close()


if __name__ == "__main__":
    # Script entry point: train one model, or one model per fold when
    # k-fold training is switched on in the config.
    #
    # To capture all output in a log file, redirect the standard streams here:
    #   sys.stdout = open('train.log', 'w')
    #   sys.stderr = sys.stdout
    config = Config
    terminal_width = shutil.get_terminal_size((80, 20)).columns

    if not config.k_fold:
        # Plain training run: no fold is associated with the config
        config.fold = None
        trainer = Trainer(config)
        trainer.train()
    else:
        print(' K-fold Training '.center(terminal_width, '*'))

        # `resume_from_fold` is 1-based; convert it to the 0-based index of
        # the first fold that still has to be trained
        if config.resume_training:
            start_fold = config.resume_from_fold - 1
        else:
            start_fold = 0

        # Walk through the folds, skipping the ones preceding the resume point
        for fold, (fold_train_ts, fold_val_ts) in enumerate(config.k_fold_splits):
            if fold < start_fold:
                continue

            # Expose the (1-based) fold number and its splits through the config
            config.fold = fold + 1
            print()
            print(f' Fold [{config.fold}/{len(config.k_fold_splits)}] '.center(terminal_width, '*'))
            config.training_ts = fold_train_ts
            config.validation_ts = fold_val_ts

            trainer = Trainer(config)
            trainer.train()

            # Only the first trained fold resumes from a checkpoint;
            # every later fold starts from scratch
            config.resume_training = False

******************************* K-fold Training ********************************

********************************** Fold [1/3] **********************************
************************* Model: seq2seq_w_attn_on_hid *************************
Seq2Seq(
  (encoder): Encoder(
    (embeddings): ModuleList(
      (0): Embedding(3050, 50)
      (1): Embedding(8, 4)
      (2): Embedding(4, 2)
      (3): Embedding(11, 5)
      (4): Embedding(4, 2)
    )
    (cal_embedding): Embedding(31, 16)
    (rnns): ModuleList(
      (0): LSTM(132, 128, bidirectional=True)
      (1): LSTM(256, 128, bidirectional=True)
    )
    (rnn_dropouts): ModuleList(
      (0): Dropout(p=0.2, inplace=False)
    )
  )
  (decoder): AttnDecoder(
    (embeddings): ModuleList(
      (0): Embedding(3050, 50)
      (1): Embedding(8, 4)
      (2): Embedding(4, 2)
      (3): Embedding(11, 5)
      (4): Embedding(4, 2)
    )
    (cal_embedding): Embedding(31, 16)
    (attns): ModuleList(
      (0): Linear(in_features=644, out_

loss = 0.354 : 100%|██████████| 3749/3749 [31:45<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:37<00:00,  7.21it/s]
loss = 0.279 : 100%|██████████| 3749/3749 [31:47<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:37<00:00,  7.17it/s]
loss = 0.248 : 100%|██████████| 3749/3749 [31:45<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:37<00:00,  7.20it/s]
loss = 0.205 : 100%|██████████| 3749/3749 [31:45<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:37<00:00,  7.17it/s]
loss = 0.259 : 100%|██████████| 3749/3749 [31:43<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:37<00:00,  7.16it/s]
loss = 0.210 : 100%|██████████| 3749/3749 [31:44<00:00,  1.97it/s]
             : 100%|██████████| 268/268 [00:36<00:00,  7.26it/s]
