In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange
from itertools import islice
plt.style.use('ggplot')

import mxnet as mx

from gluonts.dataset.common import ListDataset
from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer
from gluonts.dataset.util import to_pandas
from gluonts.distribution import NegativeBinomialOutput

from demo_utils import *


# Date boundaries of the experiment, as ISO-formatted strings.
# Note that TEST_START falls before TRAIN_END, so the two ranges overlap.
TRAIN_START, TRAIN_END = '2018-01-01', '2019-09-28'
TEST_START, TEST_END = '2019-06-28', '2019-11-29'

# The four boundaries bundled in the order
# (train_start, train_end, test_start, test_end).
TIME_FRAME = [TRAIN_START, TRAIN_END, TEST_START, TEST_END]


# Load the raw sales table and treat a missing sale quantity as zero sales.
# Chained instead of `inplace=True` so re-running the cell always starts
# from the file contents.
sales_df = pd.read_csv('cid2_794_sales.csv').fillna({'sale_qtty': 0})

In [39]:
# Give every distinct cid3 value a dense integer index (in sorted order).
cid3_dict = {
    cid3_value: position
    for position, cid3_value in enumerate(sorted(set(sales_df.cid3)))
}
cid3_cnt = len(cid3_dict)

# generate_holiday comes from demo_utils; its result replaces sales_df.
# NOTE(review): because sales_df is rebound, re-running this cell feeds the
# already-processed frame back into generate_holiday — confirm that is safe.
sales_df = generate_holiday(sales_df, TIME_FRAME)


In [41]:
# Distinct SKU ids in ascending order.
sku_list = list(set(sales_df.item_sku_id))
sku_list.sort()

# Empty containers for the three splits.
train_data = []
validation_data = []
test_data = []

In [42]:
# Pick a single SKU (index 1, i.e. the second entry of the sorted sku_list)
# to walk through the dataset construction by hand.
i = 1
sku = sku_list[i]

In [43]:
# Show the id of the selected SKU.
sku

147986

In [49]:
# Same four boundaries as TIME_FRAME, rebuilt as a separate list in the order
# (train_start, train_end, test_start, test_end).
time_frame = [TRAIN_START, TRAIN_END, TEST_START, TEST_END]

In [55]:
# Step-by-step construction of the train / validation / test entries for the
# single SKU selected above.

# All rows of this SKU in chronological order.
sales_df_sub = sales_df[sales_df.item_sku_id == sku].sort_values('date').reset_index(drop=True)
cid3 = sales_df_sub.cid3[0]
train_start, train_end, test_start, test_end = time_frame
# NOTE(review): date_diff comes from demo_utils; the "- 1" presumably
# compensates for how that helper counts days — confirm the horizon is not
# off by one.
decoder_len = date_diff(train_end, test_end) - 1
feat_cols = ['item_sku_id', 'date', 'sale_qtty', 'booking_flag', 'booking_pay_flag',
             'presale_flag', 'presale_pay_flag', 'instant_flag', 'expose_flag', 'instant_hour',
             'instant_price', 'redprice', 'nominal_netprice']
sales_df_sub = sales_df_sub[feat_cols]

# Both windows begin at train_start (Series.between is inclusive on both ends);
# the test window simply extends further, up to test_end.
train_data = sales_df_sub[sales_df_sub.date.between(train_start, train_end)]
test_data = sales_df_sub[sales_df_sub.date.between(train_start, test_end)]

# The series start must be the date of the first row that is actually kept.
# Reading it from the unfiltered frame is wrong whenever the SKU has rows
# before train_start. Fall back to the old value only if the window is empty.
start = test_data.date.iloc[0] if len(test_data) else sales_df_sub.date[0]

cols_to_drop = ['item_sku_id', 'date', 'sale_qtty']
# .copy() so the column assignments below act on independent frames.
Xtrain, ytrain = train_data.drop(cols_to_drop, axis=1).copy(), train_data.sale_qtty.values
Xtest, ytest = test_data.drop(cols_to_drop, axis=1).copy(), test_data.sale_qtty.values

# Standardise the continuous covariates using training-window statistics only.
cols_to_normalize = ['instant_price', 'redprice', 'nominal_netprice', 'instant_hour']
for col in cols_to_normalize:
    temp_mean, temp_std = Xtrain[col].mean(), Xtrain[col].std()
    if not temp_std >= 1e-4:
        # (Near-)constant column, or NaN std from a single-row window:
        # centre only, so we never divide by ~0 or NaN.
        Xtrain[col] = Xtrain[col] - temp_mean
        Xtest[col] = Xtest[col] - temp_mean
        continue
    Xtrain[col] = (Xtrain[col] - temp_mean) / temp_std
    Xtest[col] = (Xtest[col] - temp_mean) / temp_std
Xtrain, Xtest = Xtrain.values, Xtest.values

train_listdataset = {'start': start,
                     'target': ytrain,
                     'feat_static_cat': cid3_dict[cid3],
                     'feat_dynamic_real': Xtrain.T}

validation_listdataset = {'start': start,
                          'target': ytest,
                          'feat_static_cat': cid3_dict[cid3],
                          'feat_dynamic_real': Xtest.T}

# test data's target should only contain dates before prediction_length days.
# test data's feat_dynamic_real should contain dates for total_length days
# An explicit end index is used because ytest[:-0] would be an empty array.
test_listdataset = {'start': start,
                    'target': ytest[:max(len(ytest) - decoder_len, 0)],
                    'feat_static_cat': cid3_dict[cid3],
                    'feat_dynamic_real': Xtest.T}

In [61]:
# One-element list holding the training entry built above.
a = [train_listdataset]

In [63]:
# Wrap the entry list in a GluonTS ListDataset with a daily ('1D') frequency.
# Note: this rebinds train_data, which previously held the filtered DataFrame.
train_data = ListDataset(a, freq='1D')

In [65]:
# Materialise the dataset to inspect the processed entry.
list(train_data)

[{'start': Timestamp('2018-01-01 00:00:00', freq='D'),
  'target': array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  2.,  1.,  1.,  0.,  3.,  2.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,
          1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
          1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          1.,  0.,  1.,  6.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
          1.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  2.,
          1.,  1.,  2.,  0.,  0., 11.,  1.,  0.

In [None]:
def generate_sku_listdataset(sku, sales_df, time_frame, cid3_dict):
    """Build the train / validation / test dataset entries for a single SKU.

    Parameters
    ----------
    sku
        Value matched against ``sales_df.item_sku_id``.
    sales_df : pandas.DataFrame
        Sales table. Must provide ``item_sku_id``, ``date``, ``cid3``,
        ``sale_qtty`` and every covariate column named in ``feat_cols``.
    time_frame : sequence
        ``(train_start, train_end, test_start, test_end)``. Both windows are
        selected with ``Series.between`` (inclusive bounds) and both begin at
        ``train_start``; ``test_start`` is unpacked but not used.
    cid3_dict : mapping
        Maps a ``cid3`` value to the index stored as ``feat_static_cat``.

    Returns
    -------
    tuple of dict
        ``(train, validation, test)``, each with the keys ``start``,
        ``target``, ``feat_static_cat`` and ``feat_dynamic_real``
        (covariates transposed to ``(n_features, n_rows)``). The test entry
        carries the full-length covariates but a target shortened by
        ``decoder_len`` values.

    Raises
    ------
    KeyError
        If ``sku`` has no rows in ``sales_df`` or its ``cid3`` is missing
        from ``cid3_dict``.
    """
    # All rows of this SKU in chronological order.
    sales_df_sub = sales_df[sales_df.item_sku_id == sku].sort_values('date').reset_index(drop=True)
    cid3 = sales_df_sub.cid3[0]
    static_cat = cid3_dict[cid3]

    train_start, train_end, test_start, test_end = time_frame
    # NOTE(review): date_diff comes from demo_utils; the "- 1" presumably
    # compensates for how that helper counts days — confirm the horizon is
    # not off by one.
    decoder_len = date_diff(train_end, test_end) - 1
    feat_cols = ['item_sku_id', 'date', 'sale_qtty', 'booking_flag', 'booking_pay_flag',
                 'presale_flag', 'presale_pay_flag', 'instant_flag', 'expose_flag', 'instant_hour',
                 'instant_price', 'redprice', 'nominal_netprice']
    sales_df_sub = sales_df_sub[feat_cols]
    train_data = sales_df_sub[sales_df_sub.date.between(train_start, train_end)]
    test_data = sales_df_sub[sales_df_sub.date.between(train_start, test_end)]

    # The series start must be the date of the first row that is actually
    # kept. Reading it from the unfiltered frame is wrong whenever the SKU has
    # rows before train_start. Fall back to the old value only if the window
    # is empty.
    if len(test_data):
        start = test_data.date.iloc[0]
    else:
        start = sales_df_sub.date[0]

    cols_to_drop = ['item_sku_id', 'date', 'sale_qtty']
    # .copy() so the column assignments below act on independent frames.
    Xtrain = train_data.drop(cols_to_drop, axis=1).copy()
    Xtest = test_data.drop(cols_to_drop, axis=1).copy()
    ytrain = train_data.sale_qtty.values
    ytest = test_data.sale_qtty.values

    # Standardise the continuous covariates using training-window statistics
    # only, and apply the same shift/scale to the test window.
    cols_to_normalize = ['instant_price', 'redprice', 'nominal_netprice', 'instant_hour']
    for col in cols_to_normalize:
        temp_mean, temp_std = Xtrain[col].mean(), Xtrain[col].std()
        # (Near-)constant column, or NaN std from a single-row window: centre
        # only, so we never divide by ~0 or NaN.
        scale = temp_std if temp_std >= 1e-4 else 1.0
        Xtrain[col] = (Xtrain[col] - temp_mean) / scale
        Xtest[col] = (Xtest[col] - temp_mean) / scale

    Xtrain, Xtest = Xtrain.values, Xtest.values

    train_listdataset = {'start': start,
                         'target': ytrain,
                         'feat_static_cat': static_cat,
                         'feat_dynamic_real': Xtrain.T}

    validation_listdataset = {'start': start,
                              'target': ytest,
                              'feat_static_cat': static_cat,
                              'feat_dynamic_real': Xtest.T}

    # test data's target should only contain dates before prediction_length days.
    # test data's feat_dynamic_real should contain dates for total_length days
    # An explicit end index is used because ytest[:-0] would be an empty array.
    context_len = max(len(ytest) - decoder_len, 0)
    test_listdataset = {'start': start,
                        'target': ytest[:context_len],
                        'feat_static_cat': static_cat,
                        'feat_dynamic_real': Xtest.T}

    return train_listdataset, validation_listdataset, test_listdataset