In [1]:
import torch
import numpy as np
import pandas as pd
import category_encoders as ce
from tqdm import notebook
import matplotlib.pyplot as plt
import gc
import pickle as pkl
import shutil
from tqdm import tqdm, notebook
import glob
import os
from scipy import stats

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
with open('../../accuracy_stream/data/data.pickle', 'rb') as f:
    data_dict = pkl.load(f)

ids = data_dict['sales_data_ids']
# calendar_index = data_dict['calendar_index']
X_prev_day_sales = data_dict['X_prev_day_sales']
X_enc_only_feats = data_dict['X_enc_only_feats']
X_enc_dec_feats = data_dict['X_enc_dec_feats']
X_calendar = data_dict['X_calendar']
enc_dec_feat_names = data_dict['enc_dec_feat_names']
Y = data_dict['Y']

In [3]:
import sys

sys.path.extend(['..'])
from data_loader.data_generator import DataLoader
from utils.data_utils import *
from utils.training_utils import ModelCheckpoint, EarlyStopping
from losses_and_metrics import loss_functions, metrics
from config import Config

seed = 0
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
np.random.seed(seed)

In [4]:
import torch


class Config:

    resume_training = False

    loss_fn = 'SPLLoss'
    metric = 'SPLMetric'
    secondary_metric = 'WRMSSEMetric'
    architecture = 'seq2seq'

    # Running a sliding window training will help increase the training data
    sliding_window = False  # Note: sliding window has not been tested with WRMSSELoss
    window_length = 28 * 13

    # hidden dimension and no. of layers will be the same for both encoder and decoder
    rnn_num_hidden = 2
    rnn_num_layers = 1
    bidirectional = False
    enc_rnn_dropout = 0.2
    dec_rnn_dropout = 0.0

    num_epochs = 1
    batch_size = 128
    learning_rate = 0.001

    # training, validation and test periods
    training_ts = {'data_start_t': 1969 - 1 - (28 * 7), 'horizon_start_t': 1969 - 1 - (28 * 4),
                   'horizon_end_t': 1969 - 1 - (28 * 3)}
    validation_ts = {'data_start_t': 1969 - 1 - (28 * 6), 'horizon_start_t': 1969 - 1 - (28 * 3),
                     'horizon_end_t': 1969 - 1 - (28 * 2)}
    test_ts = {'data_start_t': 1969 - 1 - (28 * 15), 'horizon_start_t': 1969 - 1 - (28 * 2),
               'horizon_end_t': 1969 - 1 - (28 * 1)}

    data_file = '../../accuracy_stream/data/data.pickle'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = Config

In [5]:
class Trainer:
    def __init__(self, config, model):
        self.config = config
        self.terminal_width = shutil.get_terminal_size((80, 20)).columns

        # Model
        print(f' Model: {self.config.architecture} '.center(self.terminal_width, '*'))
        self.model = model

        # Loss, Optimizer and LRScheduler
        self.criterion = getattr(loss_functions, config.loss_fn)(self.config)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.5,
                                                                    patience=5, verbose=True)
        self.early_stopping = EarlyStopping(patience=10)
        self.agg_sum = self.config.loss_fn[:3] == 'SPL'
        self.loss_agg = np.sum if self.agg_sum else np.mean

        # Metric
        self.metric = getattr(metrics, config.metric)()
        self.metric_2 = getattr(metrics, config.secondary_metric)()

        print(f' Loading data '.center(self.terminal_width, '*'))
        data_loader = DataLoader(self.config)
        self.ids = data_loader.ids
        self.train_loader = data_loader.create_train_loader()
        self.val_loader = data_loader.create_val_loader()
        self.n_windows = data_loader.n_windows
        self.rnn_input = []


        self.start_epoch, self.min_val_error = 1, None

    def _get_val_loss_and_err(self):
        self.model.eval()
        progbar = tqdm(self.val_loader)
        progbar.set_description("             ")
        losses, epoch_preds, epoch_ys, epoch_ws, epoch_scales = [], [], [], [], []
        for i, [x, y, norm_factor, ids_idx, loss_input, _] in enumerate(progbar):
            epoch_ys.append(y.data.numpy())
            epoch_scales.append(loss_input[0].data.numpy())
            epoch_ws.append(loss_input[1].data.numpy())

            x = [inp.to(self.config.device) for inp in x]
            y = y.to(self.config.device)
            norm_factor = norm_factor.to(self.config.device)
            loss_input = [inp.to(self.config.device) for inp in loss_input]

            preds = self.model(*x) * norm_factor[:, None, None]
            epoch_preds.append(preds.data.cpu().numpy())
            loss = self.criterion(preds, y, *loss_input)
            losses.append(loss.data.cpu().numpy())

        epoch_preds, epoch_ys = np.concatenate(epoch_preds, axis=0), np.concatenate(epoch_ys, axis=0)
        epoch_ws, epoch_scales = np.concatenate(epoch_ws, axis=0), np.concatenate(epoch_scales, axis=0)

        val_error = self.metric.get_error(epoch_preds, epoch_ys, epoch_scales, epoch_ws)
        val_error_2 = self.metric_2.get_error(epoch_preds[:, :, 4], epoch_ys, epoch_scales, epoch_ws)

        return self.loss_agg(losses), val_error, val_error_2

    def train(self):
        print(f' Training '.center(self.terminal_width, '*'), end='\n\n')

        for epoch in range(self.start_epoch, self.config.num_epochs + 1):
            print(f' Epoch [{epoch}/{self.config.num_epochs}] '.center(self.terminal_width, 'x'))
            self.model.train()
            progbar = notebook.tqdm(self.train_loader)
            losses, epoch_preds, epoch_ys, epoch_ws, epoch_scales = [], [], [], [], []
            for i, [x, y, norm_factor, ids_idx, loss_input, window_id] in enumerate(progbar):
                x = [inp.to(self.config.device) for inp in x]
#                 self.rnn_input.append(x[3].data.cpu().numpy())
                y = y.to(self.config.device)
                norm_factor = norm_factor.to(self.config.device)
                loss_input = [inp.to(self.config.device) for inp in loss_input]

                # Forward + Backward + Optimize
                self.optimizer.zero_grad()
                preds = self.model(*x) * norm_factor[:, None, None]

                if self.config.sliding_window:
                    if torch.sum(window_id == self.n_windows - 1) > 0:
                        epoch_ys.append(y[window_id == self.n_windows - 1].data.cpu().numpy().reshape(-1, 28))
                        epoch_scales.append(loss_input[0][window_id == self.n_windows - 1]
                                            .data.cpu().numpy().reshape(-1))
                        epoch_ws.append(loss_input[1][window_id == self.n_windows - 1]
                                        .data.cpu().numpy().reshape(-1))
                        epoch_preds.append(preds[window_id == self.n_windows - 1].data.cpu().numpy().reshape(-1, 28, 9))
                else:
                    epoch_ys.append(y.data.cpu().numpy())
                    epoch_scales.append(loss_input[0].data.cpu().numpy())
                    epoch_ws.append(loss_input[1].data.cpu().numpy())
                    epoch_preds.append(preds.data.cpu().cpu().numpy())

                loss = self.criterion(preds, y, *loss_input)
                losses.append(loss.data.cpu().numpy())

                if self.agg_sum:
                    progbar.set_description("loss = %0.3f " % np.round(
                        (len(self.train_loader) / (i + 1)) * self.loss_agg(losses) / self.n_windows, 3))
                else:
                    progbar.set_description("loss = %0.3f " % np.round(self.loss_agg(losses), 3))

                loss.backward()
                self.optimizer.step()

            # Get training and validation loss and error
            epoch_preds, epoch_ys = np.concatenate(epoch_preds, axis=0), np.concatenate(epoch_ys, axis=0)
            epoch_ws, epoch_scales = np.concatenate(epoch_ws, axis=0), np.concatenate(epoch_scales, axis=0)

            if self.agg_sum:
                train_loss = self.loss_agg(losses) / self.n_windows
            else:
                train_loss = self.loss_agg(losses)

            train_error = self.metric.get_error(epoch_preds, epoch_ys, epoch_scales, epoch_ws)
            train_error_2 = self.metric_2.get_error(epoch_preds[:, :, 4], epoch_ys, epoch_scales, epoch_ws)

            val_loss, val_error, val_error_2 = self._get_val_loss_and_err()

            print(f'Training Loss: {train_loss:.4f}, Training Error: {train_error:.4f}, '
                  f'Training Secondary Error: {train_error_2:.4f}\n'
                  f'Validation Loss: {val_loss:.4f}, Validation Error: {val_error:.4f}, '
                  f'Validation Secondary Error: {val_error_2:.4f}')

### Build Transformer

In [6]:
import torch
import torch.nn as nn
import torch.utils.data


# Build a seq2seq model
# Encoder
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_sizes, cal_embedding_sizes, config):
        super(Encoder, self).__init__()
        self.config = config
        self.input_size = input_size
        self.max_length = config.window_length if config.sliding_window \
            else config.training_ts['horizon_start_t'] - config.training_ts['data_start_t']

        self.embeddings = nn.ModuleList([nn.Embedding(classes, hidden_size)
                                         for classes, hidden_size in embedding_sizes])
        self.cal_embedding = nn.Embedding(cal_embedding_sizes[0], cal_embedding_sizes[1])
        
        self.pos_embedding = nn.Embedding(self.max_length, self.input_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.input_size, nhead=4)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        
        self.dropout = nn.Dropout(0.1)
        self.scale = torch.sqrt(torch.FloatTensor([self.input_size])).to(self.config.device)

    def forward(self, x, x_emb, x_cal_emb):
        x, x_emb, x_cal_emb = x.permute(1, 0, 2), x_emb.permute(1, 0, 2), x_cal_emb.permute(1, 0, 2)  # make time-major
        output_emb = [emb(x_emb[:, :, i]) for i, emb in enumerate(self.embeddings)]
        output_emb = torch.cat(output_emb, 2)

        # share embedding layer for both the calendar events
        output_emb_cal = [self.cal_embedding(x_cal_emb[:, :, 0]), self.cal_embedding(x_cal_emb[:, :, 1])]
        output_emb_cal = torch.cat(output_emb_cal, 2)

        x = torch.cat([x, output_emb, output_emb_cal], 2).permute(1, 0, 2)
        
        batch_size = x.shape[0]
        x_len = x.shape[1]
        
        # Positional Encoding
        pos = torch.arange(0, x_len).unsqueeze(0).repeat(batch_size, 1).to(self.config.device)
        x = self.dropout((x * self.scale) + self.pos_embedding(pos))
        x = x.permute(1, 0, 2)
        
        # Transformer
        output = self.transformer_encoder(x)

        return output


# Decoder
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_sizes, cal_embedding_sizes, output_size, config):
        super(Decoder, self).__init__()
        self.config = config
        self.input_size = input_size
        self.max_length = 28

        self.embeddings = nn.ModuleList([nn.Embedding(classes, hidden_size)
                                         for classes, hidden_size in embedding_sizes])
        self.cal_embedding = nn.Embedding(cal_embedding_sizes[0], cal_embedding_sizes[1])
        
        self.pos_embedding = nn.Embedding(self.max_length, self.input_size)
        decoder_layer = nn.TransformerDecoderLayer(d_model=self.input_size, nhead=4)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=2)
        
        self.dropout = nn.Dropout(0.1)
        self.scale = torch.sqrt(torch.FloatTensor([self.input_size])).to(self.config.device)
        self.pred = nn.Linear(self.input_size, output_size)


    def forward(self, enc_out, y_mask, x, x_emb, x_cal_emb):
        x, x_emb, x_cal_emb = x.permute(1, 0, 2), x_emb.permute(1, 0, 2), x_cal_emb.permute(1, 0, 2)  # make time-major
        output_emb = [emb(x_emb[:, :, i]) for i, emb in enumerate(self.embeddings)]
        output_emb = torch.cat(output_emb, 2)

        # share embedding layer for both the calendar events
        output_emb_cal = [self.cal_embedding(x_cal_emb[:, :, 0]), self.cal_embedding(x_cal_emb[:, :, 1])]
        output_emb_cal = torch.cat(output_emb_cal, 2)

        x = torch.cat([x, output_emb, output_emb_cal], 2).permute(1, 0, 2)

        batch_size = x.shape[0]
        x_len = x.shape[1]
        
        # Positional Encoding
        pos = torch.arange(0, x_len).unsqueeze(0).repeat(batch_size, 1).to(self.config.device)
        x = self.dropout((x * self.scale) + self.pos_embedding(pos))
        x = x.permute(1, 0, 2)
        
        # Transformer
        output = self.transformer_decoder(x, enc_out, y_mask)
        
        output = self.pred(output.permute(1, 0, 2))
        return output


class TransformerModel(nn.Module):
    def __init__(self, encoder, decoder, config):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.pred_len = 28
        
    def forward(self, x_enc, x_enc_emb, x_cal_enc_emb, x_dec, x_dec_emb, x_cal_dec_emb, x_prev_day_sales_dec):
        batch_size = x_dec.shape[0]

        encoder_output = self.encoder(x_enc, x_enc_emb, x_cal_enc_emb)
        
        # If running in eval mode, run through the decoder in single steps and use predicted x_prev_sales
        # as actual x_prev_sales is not available
        if self.training:
            x_dec = torch.cat([x_dec, x_prev_day_sales_dec], dim=2)
            y_mask = torch.tril(torch.ones((self.pred_len, self.pred_len))).bool()
            predictions = self.decoder(encoder_output, y_mask, x_dec, x_dec_emb, x_cal_dec_emb)
        else:
            
            # create a tensor to store the outputs
            predictions = torch.zeros(batch_size, self.pred_len, 9).to(self.config.device)
            
            for timestep in range(self.pred_len):

                if timestep == 0:
                    # for the first timestep of decoder, use previous steps' sales
                    dec_input = torch.cat([x_dec[:, 0, :], x_prev_day_sales_dec[:, 0]], dim=1).unsqueeze(1)
                else:
                    # for next timestep, current timestep's output will serve as the input along with other features
                    dec_input = torch.cat([x_dec[:, timestep, :], decoder_output[:, 4].unsqueeze(1)], dim=1).unsqueeze(1)
                
                y_mask = torch.tril(torch.ones((1, 1))).bool()
                decoder_output = self.decoder(encoder_output, y_mask, dec_input,
                                              x_dec_emb[:, timestep, :].unsqueeze(1),
                                              x_cal_dec_emb[:, timestep, :].unsqueeze(1))[:, 0, :]

                # add predictions to predictions tensor
                predictions[:, timestep] = decoder_output

        return predictions

In [7]:
config = Config

In [8]:
# for item_id, dept_id, cat_id, store_id, state_id respectively
embedding_sizes = [(3049 + 1, 2), (7 + 1, 1), (3 + 1, 1), (10 + 1, 1), (3 + 1, 1)]
cal_embedding_sizes = (31, 1)
num_features_enc = 12 + sum([j for i, j in embedding_sizes]) + cal_embedding_sizes[1] * 2
num_features_dec = 12 + sum([j for i, j in embedding_sizes]) + cal_embedding_sizes[1] * 2
enc = Encoder(num_features_enc, embedding_sizes, cal_embedding_sizes, config)
dec = Decoder(num_features_dec, embedding_sizes, cal_embedding_sizes, 9, config)
model = TransformerModel(enc, dec, config)
model.to(config.device)

TransformerModel(
  (encoder): Encoder(
    (embeddings): ModuleList(
      (0): Embedding(3050, 2)
      (1): Embedding(8, 1)
      (2): Embedding(4, 1)
      (3): Embedding(11, 1)
      (4): Embedding(4, 1)
    )
    (cal_embedding): Embedding(31, 1)
    (pos_embedding): Embedding(84, 20)
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=20, out_features=20, bias=True)
          )
          (linear1): Linear(in_features=20, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=20, bias=True)
          (norm1): LayerNorm((20,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((20,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        

In [9]:
trainer = Trainer(config, model)
trainer.train()

******************************** Model: seq2seq ********************************
********************************* Loading data *********************************
*********************************** Training ***********************************

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx Epoch [1/1] xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))




             : 100%|██████████| 335/335 [01:37<00:00,  3.43it/s]


Training Loss: 0.4396, Training Error: 0.4396, Training Secondary Error: 49.2589
Validation Loss: 0.3600, Validation Error: 0.3600, Validation Secondary Error: 39.3912
