# Seq2Seq setup
[example here](https://www.kaggle.com/code/omershect/learning-pytorch-seq2seq-with-m5-data-set)

In [26]:
import os
import json
import logging
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils import (
    train_model,
    preprocess_data, 
    SequenceDataset, 
    score_model, 
    predict,
    get_predictions,
    plot_predictions)

In [27]:
# Read the subway parquet file and replace missing values with 0 in one step,
# then display the frame's dimensions.
filename = "../data/mta_subway_221231_100wk_dbscan.parquet"
df = pd.read_parquet(filename).fillna(0)
df.shape

(16800, 704)

In [28]:
# Keep only the first 1000 rows so the rest of the notebook runs on a small subset.
df = df.head(1000)

In [29]:
# Split the frame into train/test parts and obtain the feature collection via
# the project helper from utils (its split rule is not visible in this notebook).
df_train, df_test, features = preprocess_data(df)
df_test.shape

(200, 704)

In [30]:
# Train and test datasets share the same windowing settings, so define them once.
# `target` is left at SequenceDataset's default.
_window_kwargs = dict(
    features=features,
    sequence_length=336,
    horizon_length=168,
    forecast_lead=1,
)
train_dataset = SequenceDataset(df_train, **_window_kwargs)
test_dataset = SequenceDataset(df_test, **_window_kwargs)
# Need to set up SequenceDataset to take the output length as an argument
# to specify the y sequence length.

# Currently only returns the next single row; want to set it up to return a row for each hour
# over the next week.

In [31]:
# Batch both datasets; training batches are reshuffled, test order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [7]:
class Attention(nn.Module):
    """Concat-and-score attention over the encoder outputs.

    Each encoder time step is concatenated with the top layer of the decoder
    hidden state, passed through a tanh-activated linear layer, and reduced
    to a scalar score; scores are softmax-normalised over the time axis.

    Args:
        enc_hid_dim: Feature size of each encoder output step.
        dec_hid_dim: Feature size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights for every encoder time step.

        Args:
            hidden: Decoder hidden state, ``(num_layers, batch, dec_hid_dim)``.
            encoder_outputs: Encoder outputs, ``(batch, src_len, enc_hid_dim)``.

        Returns:
            Tensor of shape ``(batch, src_len)`` whose rows sum to 1.
        """
        src_len = encoder_outputs.shape[1]
        # Use the top LSTM layer's state (previously hard-coded as index 2)
        # and repeat it along the time axis so it lines up with
        # encoder_outputs for any batch size, not only batch size 1.
        top_hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(
            self.attn(torch.cat((top_hidden, encoder_outputs), dim=2))
        )
        attention = self.v(energy).squeeze(2)
        # torch.softmax avoids depending on an `F` alias that the file never imports.
        return torch.softmax(attention, dim=1)
    
class AttentionDecoder(nn.Module):
    """Three-layer LSTM decoder step conditioned on an attention context.

    Args:
        seq_len: Stored on the instance; not used by the computation here.
        attention: Callable ``(hidden, encoder_outputs) -> weights`` returning
            ``(batch, src_len)`` attention weights.
        input_dim: Hidden size of the decoder LSTM.
        num_features: Size of the vector produced per decoding step.
        encoder_hidden_state: Feature size of each encoder output step.
        dropout: Dropout applied between the stacked LSTM layers.
    """

    def __init__(self, seq_len, attention, input_dim, num_features, encoder_hidden_state=512, dropout=0):
        super().__init__()
        self.seq_len = seq_len
        self.attention = attention
        self.input_dim = input_dim
        self.hidden_dim = input_dim
        self.num_features = num_features
        self.dropout = dropout
        self.lstm = nn.LSTM(
            # One scalar input value concatenated with the attention context.
            input_size=encoder_hidden_state + 1,
            hidden_size=input_dim,
            num_layers=3,
            batch_first=True,
            dropout=self.dropout,
        )
        # The projection consumes [LSTM output ; context], i.e.
        # hidden_dim + encoder_hidden_state features. Sizing it as
        # hidden_dim * 2 only worked when both sizes happened to be equal.
        self.linear = nn.Linear(self.hidden_dim + encoder_hidden_state, self.num_features)

    def forward(self, x, input_h, input_c, encoder_outputs):
        """Run one decoding step.

        Args:
            x: Previous scalar value per sample (``batch`` elements in total).
            input_h: LSTM hidden state, ``(3, batch, input_dim)``.
            input_c: LSTM cell state, ``(3, batch, input_dim)``.
            encoder_outputs: ``(batch, src_len, encoder_hidden_state)``.

        Returns:
            Tuple ``(output, hn, cn)`` where ``output`` is
            ``(batch, num_features)`` and ``hn``/``cn`` are the updated states.
        """
        batch_size = encoder_outputs.shape[0]
        weights = self.attention(input_h, encoder_outputs).unsqueeze(1)
        # (batch, 1, src_len) @ (batch, src_len, enc) -> (batch, 1, enc)
        weighted = torch.bmm(weights, encoder_outputs)
        x = x.reshape((batch_size, 1, 1))
        rnn_input = torch.cat((x, weighted), dim=2)
        output, (hn, cn) = self.lstm(rnn_input, (input_h, input_c))
        # Drop the length-1 time axis (dim 1) rather than dim 0 so batches
        # larger than one keep their batch dimension.
        combined = torch.cat((output.squeeze(1), weighted.squeeze(1)), dim=1)
        return self.linear(combined), hn, cn

In [17]:
class Encoder(nn.Module):
    """LSTM encoder that summarises an input window.

    Args:
        window: Input window length; stored on the instance only.
        num_features: Number of features per time step (LSTM input size).
        hidden_units: LSTM hidden size.
        embedding_dim: Unused; kept so existing callers keep working.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between stacked LSTM layers.
    """

    def __init__(self, window, num_features, hidden_units, embedding_dim=64, num_layers=1, dropout=0):
        super().__init__()
        self.window = window
        self.num_features = num_features
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(
            input_size=num_features,
            hidden_size=hidden_units,
            batch_first=True,
            num_layers=self.num_layers,
            dropout=self.dropout,
        )

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, seq_len, num_features)``.

        Returns:
            ``(outputs, (h, c))`` exactly as produced by ``nn.LSTM``:
            ``outputs`` is ``(batch, seq_len, hidden_units)`` and ``h``/``c``
            are ``(num_layers, batch, hidden_units)``.
        """
        # nn.LSTM starts from zero hidden/cell states when none are given and
        # allocates them to match the input. Building them by hand on the
        # default device/dtype broke as soon as the model or data was moved
        # to another device or precision.
        outputs, (h, c) = self.lstm(x)
        return outputs, (h, c)

    
class Decoder(nn.Module):
    """Single-layer LSTM decoder that emits one forecast step per call.

    Args:
        seq_len: Stored on the instance; not used by the computation here.
        input_dim: Hidden size of the LSTM (also exposed as ``hidden_dim``).
        num_features: Size of both the step input and the step output.
        dropout: Dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self, seq_len, input_dim, num_features, dropout=0):
        super().__init__()
        self.seq_len = seq_len
        self.input_dim = input_dim
        self.hidden_dim = input_dim
        self.num_features = num_features
        self.dropout = dropout
        # Recurrent core followed by a projection back to feature space.
        self.lstm = nn.LSTM(
            input_size=self.num_features,
            hidden_size=input_dim,
            num_layers=1,
            batch_first=True,
            dropout=self.dropout,
        )
        self.linear = nn.Linear(self.hidden_dim, self.num_features)

    def forward(self, x, input_h, input_c):
        """Advance the decoder by one time step.

        Args:
            x: Previous step's values, reshaped to ``(batch, 1, num_features)``.
            input_h: LSTM hidden state from the previous step.
            input_c: LSTM cell state from the previous step.

        Returns:
            ``(prediction, hn, cn)`` with ``prediction`` of shape
            ``(batch, 1, num_features)`` and the updated LSTM states.
        """
        step_input = x.reshape((x.shape[0], 1, self.num_features))
        lstm_out, (hn, cn) = self.lstm(step_input, (input_h, input_c))
        prediction = self.linear(lstm_out)
        return prediction, hn, cn

class Seq2Seq(nn.Module):
    """Encoder/decoder LSTM that forecasts ``horizon`` steps from a window.

    The encoder consumes the whole input window; the decoder is then run
    autoregressively, starting from the last observed time step and feeding
    each prediction back in as the next input.

    Args:
        num_features: Number of features per time step (input and output).
        window: Length of the input window.
        horizon: Number of future steps to predict.
        hidden_units: Hidden size shared by encoder and decoder LSTMs.
        num_layers: Number of stacked encoder LSTM layers.
        dropout: Dropout between stacked encoder LSTM layers.
    """

    def __init__(self, num_features, window, horizon, hidden_units, num_layers=1, dropout=0):
        super().__init__()
        self.num_features = num_features
        self.window = window
        self.horizon = horizon
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = dropout
        self.encoder = Encoder(
            window=self.window,
            num_features=self.num_features,
            hidden_units=self.hidden_units,
            num_layers=self.num_layers,
            dropout=self.dropout,
        )
        # NOTE: an attention-based decoder (Attention + AttentionDecoder) was
        # prototyped here; the plain LSTM decoder below is the active one.
        self.decoder = Decoder(
            seq_len=self.window,
            input_dim=self.hidden_units,
            num_features=self.num_features,
        )

    def forward(self, x):
        """Predict the next ``horizon`` steps.

        Args:
            x: Input window of shape ``(batch, window, num_features)``.

        Returns:
            Tensor of shape ``(batch, horizon, num_features)``.
        """
        batch_size = x.shape[0]
        _, (h, c) = self.encoder(x)
        # The decoder LSTM has a single layer, so hand it only the top
        # encoder layer's state (a no-op when the encoder has one layer).
        h, c = h[-1:].contiguous(), c[-1:].contiguous()

        # Seed the decoder with the last observed time step.
        prev_output = x[:, -1, :].unsqueeze(1)
        steps = []
        for _ in range(self.horizon):
            prev_output, h, c = self.decoder(prev_output, h, c)
            prev_output = prev_output.reshape((batch_size, 1, self.num_features))
            steps.append(prev_output)

        # Concatenate along the time axis. Stacking to (horizon, batch, F) and
        # then reshaping to (batch, horizon, F) does not transpose and mixed
        # up samples and time steps whenever batch_size > 1.
        return torch.cat(steps, dim=1)

In [18]:
# Build the model with the same window/horizon lengths used for the datasets
# (sequence_length=336, horizon_length=168) and one input per entry in `features`.
model = Seq2Seq(
    num_features=len(features), 
    window=336, 
    horizon=168, 
    hidden_units=16
)

# Mean-squared error between predicted and target sequences, optimised with Adam.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

Rewrite `train_model()` to output the whole sequence of length `horizon`.


`avg_loss = train_model(train_loader, model, loss_function, optimizer=optimizer)`

In [19]:
# Grab a single batch from the training loader for a quick manual check.
X, y = next(iter(train_loader))

In [20]:
# Shape of the sampled input batch.
X.shape
# [batch size, window length, number of features]

torch.Size([4, 336, 704])

In [21]:
# Forward pass on the sampled batch; show the shape of the predicted sequence.
pred_y = model(X)
pred_y.shape

torch.Size([4, 168, 704])

In [22]:
# Shape of the prediction for the first sample in the batch.
pred_y[0].shape

torch.Size([168, 704])

In [23]:
# Shape of the target batch, for comparison with pred_y.shape.
y.shape

torch.Size([4, 168, 704])

In [24]:
# MSE between the model's prediction and the target for the sampled batch.
loss_function(pred_y, y)

tensor(1.0738, grad_fn=<MseLossBackward0>)

In [25]:
# Train via the project helper from utils and display the value it returns.
# NOTE(review): presumably one pass over train_loader returning the mean loss — confirm in utils.
avg_loss = train_model(
    train_loader, 
    model, 
    loss_function, 
    optimizer
)
avg_loss

1.0236536163091658