In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
from itertools import product

from tqdm import tqdm_notebook

import time as time

import random
import math
import os

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from pylab import rcParams

import seaborn as sns

from sklearn import metrics
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset

from utils import (
    load_and_partition_data,
    split_sequence
)

import utils

import warnings
warnings.filterwarnings("ignore")

## Data Preprocessing

In [2]:
# Raw input tables, read from the current working directory.
train = pd.read_csv(filepath_or_buffer="sales_train_validation.csv")
sell_prices = pd.read_csv(filepath_or_buffer="sell_prices.csv")
calendar = pd.read_csv(filepath_or_buffer="calendar.csv")
# The evaluation file covers 28 more dates than `train`.
validation = pd.read_csv(filepath_or_buffer="sales_train_evaluation.csv")

In [3]:
def _to_daily_frame(wide_sales, calendar_df):
    """Reshape a wide sales table into a date-indexed frame.

    Parameters
    ----------
    wide_sales : pandas.DataFrame
        One row per series, with one column per day whose label starts with ``'d_'``.
    calendar_df : pandas.DataFrame
        Must provide a ``'d'`` column (day label) and a ``'date'`` column
        (``YYYY-MM-DD`` strings).

    Returns
    -------
    pandas.DataFrame
        Rows are days (``DatetimeIndex``); columns are the integer positions
        ``0..n-1`` of the rows of ``wide_sales``.

    Raises
    ------
    KeyError
        If a day column of ``wide_sales`` has no entry in ``calendar_df['d']``.
    """
    # Prefix match instead of a substring match, so only true day columns are picked.
    day_cols = [col for col in wide_sales.columns if col.startswith('d_')]
    # Look dates up by day label so they stay aligned with `day_cols`
    # regardless of the row order of the calendar table.
    date_by_day = calendar_df.set_index('d')['date']
    daily = wide_sales[day_cols].T
    daily.index = pd.to_datetime(date_by_day.loc[day_cols].to_numpy(), format='%Y-%m-%d')
    # Series are addressed by position downstream, so label columns 0..n-1.
    daily.columns = list(range(daily.shape[1]))
    return daily


df_sales = _to_daily_frame(train, calendar)
df_validation = _to_daily_frame(validation, calendar)

In [4]:
SEED = 1345


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); hash randomisation of the running kernel is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN auto-tuner, which may select different kernels between runs.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

## Running Transformer Model

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [6]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence, then apply dropout.

    Even feature indices receive ``sin(position * div_term)`` and odd indices
    ``cos(position * div_term)``. The table is precomputed for ``max_len``
    positions and stored as a non-trainable buffer named ``pe`` with shape
    ``[1, max_len, d_model]``.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Longest sequence length that can be encoded.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
        super().__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        # [max_len, ceil(d_model / 2)]
        angles = position * div_term
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine slot fewer than sine slots, so
        # trim the angle table; for even d_model the slice is a no-op.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dropout(x + pe[:, :seq_len])`` for ``x`` of shape ``[bs, seq_len, d_model]``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)


class TransformerWithPE(torch.nn.Module):
    """Batch-first encoder-decoder transformer with linear embeddings and positional encoding.

    Encoder and decoder inputs are projected to ``embed_dim`` by separate
    linear layers, pass through one shared ``PositionalEncoding`` module, go
    through ``torch.nn.Transformer`` and are projected back to ``out_dim``.

    Args:
        in_dim: Feature size of the encoder input.
        out_dim: Feature size of the decoder input and of the prediction.
        embed_dim: Model (embedding) dimension of the transformer.
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
    """

    def __init__(
        self, in_dim: int, out_dim: int, embed_dim: int, num_heads: int, num_layers: int
    ) -> None:
        super().__init__()
        self.positional_encoding = PositionalEncoding(embed_dim)
        self.encoder_embedding = torch.nn.Linear(
            in_features=in_dim, out_features=embed_dim
        )
        self.decoder_embedding = torch.nn.Linear(
            in_features=out_dim, out_features=embed_dim
        )
        self.output_layer = torch.nn.Linear(in_features=embed_dim, out_features=out_dim)
        self.transformer = torch.nn.Transformer(
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            d_model=embed_dim,
            batch_first=True,
        )

    def forward(self, src: torch.Tensor, tgt: torch.Tensor) -> torch.Tensor:
        """Predict one output step per decoder input step.

        Args:
            src: Encoder input, ``[bs, src_seq_len, in_dim]``.
            tgt: Decoder input, ``[bs, tgt_seq_len, out_dim]``.

        Returns:
            Prediction of shape ``[bs, tgt_seq_len, out_dim]``.
        """
        # Decoder-input noise during training is intentionally disabled. If it
        # is ever re-enabled, gate it on `self.training`: `self.train` is a
        # bound method and therefore always truthy.
        # tgt = tgt + torch.normal(0, 0.1, size=tgt.shape).to(tgt.device)

        # Embed encoder input and add positional encoding.
        # [bs, src_seq_len, embed_dim]
        src = self.encoder_embedding(src)
        src = self.positional_encoding(src)

        # Causal mask so a decoder position cannot attend to later positions.
        # It is placed on the input's own device rather than on a notebook-level
        # global, so the module works wherever it (and its inputs) live.
        # [tgt_seq_len, tgt_seq_len]
        tgt_mask = torch.nn.Transformer.generate_square_subsequent_mask(
            tgt.shape[1]
        ).to(tgt.device)

        # Embed decoder input and add positional encoding.
        # [bs, tgt_seq_len, embed_dim]
        tgt = self.decoder_embedding(tgt)
        tgt = self.positional_encoding(tgt)

        # Get prediction from transformer and map to output dimension.
        # [bs, tgt_seq_len, embed_dim] -> [bs, tgt_seq_len, out_dim]
        pred = self.transformer(src, tgt, tgt_mask=tgt_mask)
        pred = self.output_layer(pred)

        return pred

    def infer(self, src: torch.Tensor, tgt_len: int) -> torch.Tensor:
        """Autoregressively generate ``tgt_len`` steps following ``src``.

        The decoder buffer is seeded with the last encoder step and filled one
        position at a time from the model's own predictions. The buffer uses
        ``src.shape[2]`` as its feature size, so this method requires
        ``in_dim == out_dim``. It does not switch to eval mode or disable
        gradients; callers are expected to do that.

        Args:
            src: Encoder input, ``[bs, src_seq_len, in_dim]``.
            tgt_len: Number of steps to generate.

        Returns:
            Generated steps, ``[bs, tgt_len, in_dim]`` (the seed step is dropped).
        """
        output = torch.zeros((src.shape[0], tgt_len + 1, src.shape[2])).to(src.device)
        output[:, 0] = src[:, -1]
        for i in range(tgt_len):
            # Position i of the prediction only depends on output[:, : i + 1]
            # because of the causal mask, so the still-zero tail is harmless.
            output[:, i + 1] = self.forward(src, output)[:, i]

        return output[:, 1:]

In [7]:
# Hyperparameters shared by every per-item model trained below.
optimal_batch_size: int = 32        # mini-batch size of the DataLoaders
optimal_d_model: int = 64           # transformer embedding dimension
optimal_num_head: int = 4           # attention heads
optimal_num_layer: int = 2          # encoder layers and decoder layers
optimal_sequence_length: int = 28   # encoder input window, in days

### Batch 1 

In [8]:
# Baseline forecasts for batch 1. The first CSV column is dropped
# (presumably the index column written by an earlier to_csv -- confirm).
results1 = pd.read_csv(filepath_or_buffer="Baseline_transformer_Results_1.csv").iloc[:, 1:]
# Re-attach the dates that follow the first input window.
results1.index = df_validation.index[optimal_sequence_length:]

In [9]:
# One row of metrics per item; the "_pr" and "_mu" suffixes refer to the two
# forecast variants evaluated in the loop below.
summary_df = pd.DataFrame(
    columns=[
        "rmse_train_transformer_pr",
        "r2_train_transformer_pr",
        "rmse_test_transformer_pr",
        "r2_test_transformer_pr",
        "rmse_train_transformer_mu",
        "r2_train_transformer_mu",
        "rmse_test_transformer_mu",
        "r2_test_transformer_mu",
        "Time Taken",
    ]
)
# Empty prediction tables indexed by the dates after the first input window.
prediction_df_1 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
prediction_df_2 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
results = results1

# Per-item pipeline: for each item column of the baseline results, build a
# two-channel (scaled sales, scaled baseline residual) series, train a fresh
# TransformerWithPE on it, then score two recursive 28-step forecasts (the
# "pr" and "mu" variants) against df_validation.
for string_i in results.columns: 
    
    # Column labels are strings; as integers they index df_sales / df_validation by position.
    i = int(string_i)
    start_time = time.time()

    # Baseline residuals: actual sales minus the baseline prediction
    # (the last 28 baseline rows are dropped before subtracting).
    original_prediction_residuals_df = df_sales.iloc[optimal_sequence_length:, i] - results[string_i][:-28]
    mu = original_prediction_residuals_df.mean()
    # Left-pad with the mean residual so the series matches df_sales.index[:-1] in length.
    # NOTE(review): with optimal_sequence_length - 1 pad values, the residual of
    # df_sales row t+1 lands on row t, i.e. next to the previous day's sales --
    # confirm this one-step shift is intended.
    total_residuals = [mu]*(optimal_sequence_length-1)+list(original_prediction_residuals_df)
    total_residuals = np.array(total_residuals).reshape(len(total_residuals), 1)
    total_residuals_df = pd.DataFrame(total_residuals)
    total_residuals_df.index = df_sales.index[:-1]

    # Scale both channels to [-1, 1]. y_scaler is fit on the whole sales column,
    # r_scaler on the padded residuals; the last sales row is left out of the inputs.
    y_scaler = MinMaxScaler((-1, 1))
    r_scaler = MinMaxScaler((-1, 1))
    y_scaler.fit(df_sales.iloc[:, i:i+1].values.reshape(-1, 1))
    r_scaler.fit(total_residuals_df.values.reshape(-1, 1))
    normalized_y = y_scaler.transform(df_sales.iloc[:-1, i:i+1])
    normalized_r = r_scaler.transform(total_residuals_df)
    train_data_normalized = np.concatenate([normalized_y, normalized_r], axis=1)
    # Presumably sliding windows of optimal_sequence_length + 1 rows -- helper lives in utils, TODO confirm.
    sequences = load_and_partition_data(train_data_normalized, optimal_sequence_length+1)
    # The last 28 windows are held out for validation; the rest are used for training.
    train_data = torch.tensor(np.array(sequences[:-28]), dtype=torch.float)
    val_data = torch.tensor(np.array(sequences[-28:]), dtype=torch.float)
    train_loader = DataLoader(train_data, batch_size=optimal_batch_size , shuffle=True, drop_last=True)
    val_loader = DataLoader(val_data, batch_size=optimal_batch_size, shuffle=False, drop_last=False)
    ##### Training #####
    # Hyperparameters
    num_features = len(train_data_normalized[0])
    D_MODEL = optimal_d_model  
    NUM_HEADS = optimal_num_head 
    NUM_LAYERS = optimal_num_layer 
    NUM_EPOCHS = 200
    LR = 0.001
    # A new model is created for every item.
    model = TransformerWithPE(num_features, num_features, D_MODEL, NUM_HEADS, NUM_LAYERS)
    model.to(device)
    early_stopper = utils.EarlyStopper(patience=20)
    # Criterion, optimizer and learning-rate scheduler
    criterion = torch.nn.MSELoss().to(device)    # mean-squared error for regression
    optimizer = torch.optim.Adam(model.parameters(), lr=LR,weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,  patience=20,factor =0.1 ,min_lr=1e-7, eps=1e-08)
    # Per-epoch loss histories (collected but not used further in this cell).
    train_loss_, val_loss_ = [], []
    for epoch in range(NUM_EPOCHS): 
        train_loss, val_loss = 0, 0
        model.train()
        for batch in train_loader:
            # Presumably splits each window into encoder input, decoder input and
            # decoder target at the given ratio -- helper lives in utils, TODO confirm.
            src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
            pred = model(src.to(device), tgt.to(device))
            loss = criterion(pred, tgt_y.to(device))
            train_loss += loss.item()
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # NOTE(review): MSELoss averages within a batch by default, so the sum of
        # batch losses is divided by the sample count rather than the batch count;
        # this only rescales the logged value -- confirm it is intended.
        train_loss/=len(train_loader.dataset)
        train_loss_.append(train_loss)
        # Evaluate on the held-out validation windows.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
                val = model(src.to(device), tgt.to(device))
                loss = criterion(val, tgt_y.to(device))
                val_loss += loss.item()
        val_loss/=len(val_loader.dataset)
        val_loss_.append(val_loss)
        scheduler.step(val_loss)
        # The stopper receives the model and a checkpoint path; presumably it
        # saves the best model there -- confirm in utils.EarlyStopper.
        if early_stopper.early_stop(val_loss, model, 'Models/transformer_final_w_residuals.pth'):
            early_stopped = epoch+1
            break
    ##### Testing 1 ("pr"): the model's own residual forecast is fed back #####
    # NOTE(review): the checkpoint path is shared by all items, and torch.load
    # unpickles arbitrary objects -- only load checkpoints written by this notebook.
    model = torch.load('Models/transformer_final_w_residuals.pth')
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        # NOTE(review): 28/29 is hard-coded here, while training uses
        # optimal_sequence_length/(optimal_sequence_length+1); equal only while the length is 28.
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    # Teacher-forced predictions over all windows, flattened to rows of (sales, residual).
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    # Start the recursive forecast from the last optimal_sequence_length input rows.
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    # Replace the forecast's sales channel with the scaled last observed sales
    # value before rolling the window forward.
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Slide the window: drop the oldest step, append the latest one-step forecast.
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    # Only the sales channel (column 0) is kept for scoring.
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])

    sales_validation = pd.DataFrame(df_validation.iloc[:,i])
    
    # Back to the original sales scale; "train" metrics cover everything except
    # the last 28 rows, "test" metrics the last 28 rows.
    # NOTE(review): the metric variables are named *_lstm_* although the model is a transformer.
    prediction_df_1[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_1 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i]))
    r2_train_lstm_1 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i])
    rmse_test_lstm_1 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_1.iloc[-28:][i]))
    r2_test_lstm_1 = r2_score(sales_validation[-28:], prediction_df_1.iloc[-28:][i])
    ##### Testing 2 ("mu"): the residual channel is reset to the mean residual at every step #####
    # Same procedure as Testing 1 up to the recursive loop.
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Overwrite the residual channel with the scaled mean residual before feeding it back.
        forecasted[0][0][1] = torch.tensor(r_scaler.transform([[mu]])[0][0]).to(device)
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])
    
    prediction_df_2[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_2 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i]))
    r2_train_lstm_2 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i])
    rmse_test_lstm_2 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_2.iloc[-28:][i]))
    r2_test_lstm_2 = r2_score(sales_validation[-28:], prediction_df_2.iloc[-28:][i])
    # Wall-clock time for training plus both forecast variants of this item.
    time_taken = time.time() - start_time

    # One summary row per item, in the column order of summary_df.
    summary_df.loc[i] = [rmse_train_lstm_1, r2_train_lstm_1, rmse_test_lstm_1, r2_test_lstm_1, rmse_train_lstm_2, r2_train_lstm_2, rmse_test_lstm_2, r2_test_lstm_2, time_taken]    
    print(f"Item {i} test r2(pr), test r2(mu), time taken: {r2_test_lstm_1}, {r2_test_lstm_2}, {time_taken}")


# Write the batch-1 metrics table and both prediction tables to CSV.
summary_df.to_csv(path_or_buf="Transformer_error_modelling_Summary_1.csv")
prediction_df_1.to_csv(path_or_buf="Transformer_error_modelling_Results_1_1.csv")
prediction_df_2.to_csv(path_or_buf="Transformer_error_modelling_Results_1_2.csv")

Item 2 test r2(pr), test r2(mu), time taken: -0.4572266356826966, -0.38677913103702943, 62.00556302070618
Item 268 test r2(pr), test r2(mu), time taken: -0.019748091499781184, -0.05160965513894222, 25.518983602523804
Item 345 test r2(pr), test r2(mu), time taken: -0.28420537082767305, -1.018158565068115, 40.21719288825989
Item 380 test r2(pr), test r2(mu), time taken: -0.23837314121914766, -0.25064531753435215, 30.759873628616333
Item 465 test r2(pr), test r2(mu), time taken: -0.20849677005533485, -0.19427787457403656, 29.76356840133667
Item 478 test r2(pr), test r2(mu), time taken: -0.28746446131097403, -0.5553771468633992, 29.376527309417725
Item 520 test r2(pr), test r2(mu), time taken: -0.24199676956041039, -0.2314286338463254, 41.96453833580017
Item 529 test r2(pr), test r2(mu), time taken: -0.627303411357325, -0.6474059616092065, 37.351189613342285
Item 569 test r2(pr), test r2(mu), time taken: -0.05014907584273587, -0.024964243633691208, 47.93453240394592
Item 576 test r2(pr), t

### Batch 2

In [10]:
# Baseline forecasts for batch 2. The first CSV column is dropped
# (presumably the index column written by an earlier to_csv -- confirm).
results2 = pd.read_csv(filepath_or_buffer="Baseline_transformer_Results_2.csv").iloc[:, 1:]
# Re-attach the dates that follow the first input window.
results2.index = df_validation.index[optimal_sequence_length:]

In [11]:
# One row of metrics per item; the "_pr" and "_mu" suffixes refer to the two
# forecast variants evaluated in the loop below.
summary_df = pd.DataFrame(
    columns=[
        "rmse_train_transformer_pr",
        "r2_train_transformer_pr",
        "rmse_test_transformer_pr",
        "r2_test_transformer_pr",
        "rmse_train_transformer_mu",
        "r2_train_transformer_mu",
        "rmse_test_transformer_mu",
        "r2_test_transformer_mu",
        "Time Taken",
    ]
)
# Empty prediction tables indexed by the dates after the first input window.
prediction_df_1 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
prediction_df_2 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
results = results2

# Per-item pipeline: for each item column of the baseline results, build a
# two-channel (scaled sales, scaled baseline residual) series, train a fresh
# TransformerWithPE on it, then score two recursive 28-step forecasts (the
# "pr" and "mu" variants) against df_validation.
for string_i in results.columns: 
    
    # Column labels are strings; as integers they index df_sales / df_validation by position.
    i = int(string_i)
    start_time = time.time()

    # Baseline residuals: actual sales minus the baseline prediction
    # (the last 28 baseline rows are dropped before subtracting).
    original_prediction_residuals_df = df_sales.iloc[optimal_sequence_length:, i] - results[string_i][:-28]
    mu = original_prediction_residuals_df.mean()
    # Left-pad with the mean residual so the series matches df_sales.index[:-1] in length.
    # NOTE(review): with optimal_sequence_length - 1 pad values, the residual of
    # df_sales row t+1 lands on row t, i.e. next to the previous day's sales --
    # confirm this one-step shift is intended.
    total_residuals = [mu]*(optimal_sequence_length-1)+list(original_prediction_residuals_df)
    total_residuals = np.array(total_residuals).reshape(len(total_residuals), 1)
    total_residuals_df = pd.DataFrame(total_residuals)
    total_residuals_df.index = df_sales.index[:-1]

    # Scale both channels to [-1, 1]. y_scaler is fit on the whole sales column,
    # r_scaler on the padded residuals; the last sales row is left out of the inputs.
    y_scaler = MinMaxScaler((-1, 1))
    r_scaler = MinMaxScaler((-1, 1))
    y_scaler.fit(df_sales.iloc[:, i:i+1].values.reshape(-1, 1))
    r_scaler.fit(total_residuals_df.values.reshape(-1, 1))
    normalized_y = y_scaler.transform(df_sales.iloc[:-1, i:i+1])
    normalized_r = r_scaler.transform(total_residuals_df)
    train_data_normalized = np.concatenate([normalized_y, normalized_r], axis=1)
    # Presumably sliding windows of optimal_sequence_length + 1 rows -- helper lives in utils, TODO confirm.
    sequences = load_and_partition_data(train_data_normalized, optimal_sequence_length+1)
    # The last 28 windows are held out for validation; the rest are used for training.
    train_data = torch.tensor(np.array(sequences[:-28]), dtype=torch.float)
    val_data = torch.tensor(np.array(sequences[-28:]), dtype=torch.float)
    train_loader = DataLoader(train_data, batch_size=optimal_batch_size , shuffle=True, drop_last=True)
    val_loader = DataLoader(val_data, batch_size=optimal_batch_size, shuffle=False, drop_last=False)
    ##### Training #####
    # Hyperparameters
    num_features = len(train_data_normalized[0])
    D_MODEL = optimal_d_model  
    NUM_HEADS = optimal_num_head 
    NUM_LAYERS = optimal_num_layer 
    NUM_EPOCHS = 200
    LR = 0.001
    # A new model is created for every item.
    model = TransformerWithPE(num_features, num_features, D_MODEL, NUM_HEADS, NUM_LAYERS)
    model.to(device)
    early_stopper = utils.EarlyStopper(patience=20)
    # Criterion, optimizer and learning-rate scheduler
    criterion = torch.nn.MSELoss().to(device)    # mean-squared error for regression
    optimizer = torch.optim.Adam(model.parameters(), lr=LR,weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,  patience=20,factor =0.1 ,min_lr=1e-7, eps=1e-08)
    # Per-epoch loss histories (collected but not used further in this cell).
    train_loss_, val_loss_ = [], []
    for epoch in range(NUM_EPOCHS): 
        train_loss, val_loss = 0, 0
        model.train()
        for batch in train_loader:
            # Presumably splits each window into encoder input, decoder input and
            # decoder target at the given ratio -- helper lives in utils, TODO confirm.
            src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
            pred = model(src.to(device), tgt.to(device))
            loss = criterion(pred, tgt_y.to(device))
            train_loss += loss.item()
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # NOTE(review): MSELoss averages within a batch by default, so the sum of
        # batch losses is divided by the sample count rather than the batch count;
        # this only rescales the logged value -- confirm it is intended.
        train_loss/=len(train_loader.dataset)
        train_loss_.append(train_loss)
        # Evaluate on the held-out validation windows.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
                val = model(src.to(device), tgt.to(device))
                loss = criterion(val, tgt_y.to(device))
                val_loss += loss.item()
        val_loss/=len(val_loader.dataset)
        val_loss_.append(val_loss)
        scheduler.step(val_loss)
        # The stopper receives the model and a checkpoint path; presumably it
        # saves the best model there -- confirm in utils.EarlyStopper.
        if early_stopper.early_stop(val_loss, model, 'Models/transformer_final_w_residuals.pth'):
            early_stopped = epoch+1
            break
    ##### Testing 1 ("pr"): the model's own residual forecast is fed back #####
    # NOTE(review): the checkpoint path is shared by all items, and torch.load
    # unpickles arbitrary objects -- only load checkpoints written by this notebook.
    model = torch.load('Models/transformer_final_w_residuals.pth')
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        # NOTE(review): 28/29 is hard-coded here, while training uses
        # optimal_sequence_length/(optimal_sequence_length+1); equal only while the length is 28.
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    # Teacher-forced predictions over all windows, flattened to rows of (sales, residual).
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    # Start the recursive forecast from the last optimal_sequence_length input rows.
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    # Replace the forecast's sales channel with the scaled last observed sales
    # value before rolling the window forward.
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Slide the window: drop the oldest step, append the latest one-step forecast.
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    # Only the sales channel (column 0) is kept for scoring.
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])

    sales_validation = pd.DataFrame(df_validation.iloc[:,i])
    
    # Back to the original sales scale; "train" metrics cover everything except
    # the last 28 rows, "test" metrics the last 28 rows.
    # NOTE(review): the metric variables are named *_lstm_* although the model is a transformer.
    prediction_df_1[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_1 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i]))
    r2_train_lstm_1 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i])
    rmse_test_lstm_1 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_1.iloc[-28:][i]))
    r2_test_lstm_1 = r2_score(sales_validation[-28:], prediction_df_1.iloc[-28:][i])
    ##### Testing 2 ("mu"): the residual channel is reset to the mean residual at every step #####
    # Same procedure as Testing 1 up to the recursive loop.
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Overwrite the residual channel with the scaled mean residual before feeding it back.
        forecasted[0][0][1] = torch.tensor(r_scaler.transform([[mu]])[0][0]).to(device)
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])
    
    prediction_df_2[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_2 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i]))
    r2_train_lstm_2 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i])
    rmse_test_lstm_2 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_2.iloc[-28:][i]))
    r2_test_lstm_2 = r2_score(sales_validation[-28:], prediction_df_2.iloc[-28:][i])
    # Wall-clock time for training plus both forecast variants of this item.
    time_taken = time.time() - start_time

    # One summary row per item, in the column order of summary_df.
    summary_df.loc[i] = [rmse_train_lstm_1, r2_train_lstm_1, rmse_test_lstm_1, r2_test_lstm_1, rmse_train_lstm_2, r2_train_lstm_2, rmse_test_lstm_2, r2_test_lstm_2, time_taken]    
    print(f"Item {i} test r2(pr), test r2(mu), time taken: {r2_test_lstm_1}, {r2_test_lstm_2}, {time_taken}")

# Write the batch-2 metrics table and both prediction tables to CSV.
summary_df.to_csv(path_or_buf="Transformer_error_modelling_Summary_2.csv")
prediction_df_1.to_csv(path_or_buf="Transformer_error_modelling_Results_2_1.csv")
prediction_df_2.to_csv(path_or_buf="Transformer_error_modelling_Results_2_2.csv")

Item 6100 test r2(pr), test r2(mu), time taken: -0.22951782591004766, -0.21142702529061097, 19.55533242225647
Item 6366 test r2(pr), test r2(mu), time taken: -0.2788048761388724, -0.3989091185389335, 42.336856842041016
Item 6443 test r2(pr), test r2(mu), time taken: -0.25609705329119925, -0.06422524929257745, 32.28115367889404
Item 6478 test r2(pr), test r2(mu), time taken: -0.22997247625210093, -0.20761939038207666, 33.776047468185425
Item 6563 test r2(pr), test r2(mu), time taken: -0.04605290955320607, -0.024572877644703395, 31.01759934425354
Item 6576 test r2(pr), test r2(mu), time taken: -1.103184482832134, -1.1386869283192658, 43.519251346588135
Item 6618 test r2(pr), test r2(mu), time taken: -0.2418079754729403, -0.17738798519226395, 41.08570432662964
Item 6627 test r2(pr), test r2(mu), time taken: -0.2794527274882823, -0.3055695363424429, 31.575950384140015
Item 6667 test r2(pr), test r2(mu), time taken: -0.03238008706281459, -0.5385357598849316, 33.6034619808197
Item 6674 test 

### Batch 3

In [9]:
# Baseline forecasts for batch 3. The first CSV column is dropped
# (presumably the index column written by an earlier to_csv -- confirm).
results3 = pd.read_csv(filepath_or_buffer="Baseline_transformer_Results_3.csv").iloc[:, 1:]
# Re-attach the dates that follow the first input window.
results3.index = df_validation.index[optimal_sequence_length:]

In [10]:
# One row of metrics per item; the "_pr" and "_mu" suffixes refer to the two
# forecast variants evaluated in the loop below.
summary_df = pd.DataFrame(
    columns=[
        "rmse_train_transformer_pr",
        "r2_train_transformer_pr",
        "rmse_test_transformer_pr",
        "r2_test_transformer_pr",
        "rmse_train_transformer_mu",
        "r2_train_transformer_mu",
        "rmse_test_transformer_mu",
        "r2_test_transformer_mu",
        "Time Taken",
    ]
)
# Empty prediction tables indexed by the dates after the first input window.
prediction_df_1 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
prediction_df_2 = pd.DataFrame(index=df_validation.index[optimal_sequence_length:])
results = results3

# Per-item pipeline: for each item column of the baseline results, build a
# two-channel (scaled sales, scaled baseline residual) series, train a fresh
# TransformerWithPE on it, then score two recursive 28-step forecasts (the
# "pr" and "mu" variants) against df_validation.
for string_i in results.columns: 
    
    # Column labels are strings; as integers they index df_sales / df_validation by position.
    i = int(string_i)
    start_time = time.time()

    # Baseline residuals: actual sales minus the baseline prediction
    # (the last 28 baseline rows are dropped before subtracting).
    original_prediction_residuals_df = df_sales.iloc[optimal_sequence_length:, i] - results[string_i][:-28]
    mu = original_prediction_residuals_df.mean()
    # Left-pad with the mean residual so the series matches df_sales.index[:-1] in length.
    # NOTE(review): with optimal_sequence_length - 1 pad values, the residual of
    # df_sales row t+1 lands on row t, i.e. next to the previous day's sales --
    # confirm this one-step shift is intended.
    total_residuals = [mu]*(optimal_sequence_length-1)+list(original_prediction_residuals_df)
    total_residuals = np.array(total_residuals).reshape(len(total_residuals), 1)
    total_residuals_df = pd.DataFrame(total_residuals)
    total_residuals_df.index = df_sales.index[:-1]

    # Scale both channels to [-1, 1]. y_scaler is fit on the whole sales column,
    # r_scaler on the padded residuals; the last sales row is left out of the inputs.
    y_scaler = MinMaxScaler((-1, 1))
    r_scaler = MinMaxScaler((-1, 1))
    y_scaler.fit(df_sales.iloc[:, i:i+1].values.reshape(-1, 1))
    r_scaler.fit(total_residuals_df.values.reshape(-1, 1))
    normalized_y = y_scaler.transform(df_sales.iloc[:-1, i:i+1])
    normalized_r = r_scaler.transform(total_residuals_df)
    train_data_normalized = np.concatenate([normalized_y, normalized_r], axis=1)
    # Presumably sliding windows of optimal_sequence_length + 1 rows -- helper lives in utils, TODO confirm.
    sequences = load_and_partition_data(train_data_normalized, optimal_sequence_length+1)
    # The last 28 windows are held out for validation; the rest are used for training.
    train_data = torch.tensor(np.array(sequences[:-28]), dtype=torch.float)
    val_data = torch.tensor(np.array(sequences[-28:]), dtype=torch.float)
    train_loader = DataLoader(train_data, batch_size=optimal_batch_size , shuffle=True, drop_last=True)
    val_loader = DataLoader(val_data, batch_size=optimal_batch_size, shuffle=False, drop_last=False)
    ##### Training #####
    # Hyperparameters
    num_features = len(train_data_normalized[0])
    D_MODEL = optimal_d_model  
    NUM_HEADS = optimal_num_head 
    NUM_LAYERS = optimal_num_layer 
    NUM_EPOCHS = 200
    LR = 0.001
    # A new model is created for every item.
    model = TransformerWithPE(num_features, num_features, D_MODEL, NUM_HEADS, NUM_LAYERS)
    model.to(device)
    early_stopper = utils.EarlyStopper(patience=20)
    # Criterion, optimizer and learning-rate scheduler
    criterion = torch.nn.MSELoss().to(device)    # mean-squared error for regression
    optimizer = torch.optim.Adam(model.parameters(), lr=LR,weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,  patience=20,factor =0.1 ,min_lr=1e-7, eps=1e-08)
    # Per-epoch loss histories (collected but not used further in this cell).
    train_loss_, val_loss_ = [], []
    for epoch in range(NUM_EPOCHS): 
        train_loss, val_loss = 0, 0
        model.train()
        for batch in train_loader:
            # Presumably splits each window into encoder input, decoder input and
            # decoder target at the given ratio -- helper lives in utils, TODO confirm.
            src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
            pred = model(src.to(device), tgt.to(device))
            loss = criterion(pred, tgt_y.to(device))
            train_loss += loss.item()
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # NOTE(review): MSELoss averages within a batch by default, so the sum of
        # batch losses is divided by the sample count rather than the batch count;
        # this only rescales the logged value -- confirm it is intended.
        train_loss/=len(train_loader.dataset)
        train_loss_.append(train_loss)
        # Evaluate on the held-out validation windows.
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                src, tgt, tgt_y = split_sequence(batch, optimal_sequence_length/(optimal_sequence_length+1))
                val = model(src.to(device), tgt.to(device))
                loss = criterion(val, tgt_y.to(device))
                val_loss += loss.item()
        val_loss/=len(val_loader.dataset)
        val_loss_.append(val_loss)
        scheduler.step(val_loss)
        # The stopper receives the model and a checkpoint path; presumably it
        # saves the best model there -- confirm in utils.EarlyStopper.
        if early_stopper.early_stop(val_loss, model, 'Models/transformer_final_w_residuals.pth'):
            early_stopped = epoch+1
            break
    ##### Testing 1 ("pr"): the model's own residual forecast is fed back #####
    # NOTE(review): the checkpoint path is shared by all items, and torch.load
    # unpickles arbitrary objects -- only load checkpoints written by this notebook.
    model = torch.load('Models/transformer_final_w_residuals.pth')
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        # NOTE(review): 28/29 is hard-coded here, while training uses
        # optimal_sequence_length/(optimal_sequence_length+1); equal only while the length is 28.
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    # Teacher-forced predictions over all windows, flattened to rows of (sales, residual).
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    # Start the recursive forecast from the last optimal_sequence_length input rows.
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    # Replace the forecast's sales channel with the scaled last observed sales
    # value before rolling the window forward.
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Slide the window: drop the oldest step, append the latest one-step forecast.
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    # Only the sales channel (column 0) is kept for scoring.
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])

    sales_validation = pd.DataFrame(df_validation.iloc[:,i])
    
    # Back to the original sales scale; "train" metrics cover everything except
    # the last 28 rows, "test" metrics the last 28 rows.
    # NOTE(review): the metric variables are named *_lstm_* although the model is a transformer.
    prediction_df_1[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_1 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i]))
    r2_train_lstm_1 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_1.iloc[:-28][i])
    rmse_test_lstm_1 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_1.iloc[-28:][i]))
    r2_test_lstm_1 = r2_score(sales_validation[-28:], prediction_df_1.iloc[-28:][i])
    ##### Testing 2 ("mu"): the residual channel is reset to the mean residual at every step #####
    # Same procedure as Testing 1 up to the recursive loop.
    sequences_tensor = Variable(torch.Tensor(sequences))
    model.eval()
    with torch.no_grad():
        src, tgt, tgt_y = split_sequence(sequences_tensor, 28/29)
        predicts_ = model(src.to(device), tgt.to(device))
    predicts_ = predicts_.cpu().numpy().reshape(-1, 2)
    forecasts_ = np.zeros((28, 2))
    src = train_data_normalized[-optimal_sequence_length:]
    src = Variable(torch.Tensor(src)).unsqueeze(0)
    tgt_len = 1
    model.eval()
    with torch.no_grad():
        forecasted = model.infer(src.to(device), tgt_len)
    predicts_ = np.concatenate([predicts_, forecasted.cpu().data.numpy()[0]])
    forecasted[0][0][0] = y_scaler.transform([[df_sales.iloc[-1, i]]])[0][0]
    for j in range(28):
        # Overwrite the residual channel with the scaled mean residual before feeding it back.
        forecasted[0][0][1] = torch.tensor(r_scaler.transform([[mu]])[0][0]).to(device)
        src = torch.cat((src[:, 1:].to(device), forecasted), dim = 1)
        model.eval()
        with torch.no_grad():
            forecasted = model.infer(src.to(device), tgt_len)
        forecasts_[j][0] = forecasted.cpu().data.numpy()[0][0][0]
        forecasts_[j][1] = forecasted.cpu().data.numpy()[0][0][1]
    all_prediction = np.append(predicts_[:, 0], forecasts_[:, 0])
    
    prediction_df_2[i] = y_scaler.inverse_transform(np.array(all_prediction).reshape(-1, 1)).reshape(-1)
    rmse_train_lstm_2 = np.sqrt(mean_squared_error(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i]))
    r2_train_lstm_2 = r2_score(sales_validation[optimal_sequence_length:-28], prediction_df_2.iloc[:-28][i])
    rmse_test_lstm_2 = np.sqrt(mean_squared_error(sales_validation[-28:], prediction_df_2.iloc[-28:][i]))
    r2_test_lstm_2 = r2_score(sales_validation[-28:], prediction_df_2.iloc[-28:][i])
    # Wall-clock time for training plus both forecast variants of this item.
    time_taken = time.time() - start_time

    # One summary row per item, in the column order of summary_df.
    summary_df.loc[i] = [rmse_train_lstm_1, r2_train_lstm_1, rmse_test_lstm_1, r2_test_lstm_1, rmse_train_lstm_2, r2_train_lstm_2, rmse_test_lstm_2, r2_test_lstm_2, time_taken]    
    print(f"Item {i} test r2(pr), test r2(mu), time taken: {r2_test_lstm_1}, {r2_test_lstm_2}, {time_taken}")

# Write the batch-3 metrics table and both prediction tables to CSV.
summary_df.to_csv(path_or_buf="Transformer_error_modelling_Summary_3.csv")
prediction_df_1.to_csv(path_or_buf="Transformer_error_modelling_Results_3_1.csv")
prediction_df_2.to_csv(path_or_buf="Transformer_error_modelling_Results_3_2.csv")

Item 12269 test r2(pr), test r2(mu), time taken: -0.346587183162123, -0.6128355778685319, 21.31509804725647
Item 12274 test r2(pr), test r2(mu), time taken: -0.16606068016396436, -0.16667339173268814, 28.31914782524109
Item 12308 test r2(pr), test r2(mu), time taken: 0.0, 0.0, 45.57865238189697
Item 12314 test r2(pr), test r2(mu), time taken: -0.15652431175403936, -0.12927761654269077, 25.127774477005005
Item 12342 test r2(pr), test r2(mu), time taken: -0.011697271834518963, -0.018647401108710104, 23.55124807357788
Item 12363 test r2(pr), test r2(mu), time taken: -0.12889761420239987, -0.09692135776371247, 31.76591920852661
Item 12466 test r2(pr), test r2(mu), time taken: -0.05278446340332299, -0.008100617530055976, 28.664662837982178
Item 12490 test r2(pr), test r2(mu), time taken: -0.001956009029135508, -0.0004993643424890948, 23.0234375
Item 12493 test r2(pr), test r2(mu), time taken: -0.3064072394130417, -0.3053770070742996, 47.81609344482422
Item 12552 test r2(pr), test r2(mu), ti