# Setup

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# %cd 'YOUR_DRIVE'

In [1]:
import math
import random
from pathlib import Path
import sys
sys.path.insert(0,str(Path().absolute().joinpath("data")))

from data import prepareData

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
from torch.utils.data import Dataset, DataLoader, random_split

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


########### import yours ###########

####################################

BATCH_SIZE = 64

# TRAIN_RATIO: train dataset ratio, should be a float in (0, 0.8]
# (0.8-TRAIN_RATIO) will be used for valid dataset
TRAIN_RATIO = 0.6 

In [2]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')



## Util

**Do NOT Modify** code blocks in this section

In [31]:
SEED = 1234

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True

In [32]:
def train(model, iterator, optimizer, loss_fn, clip):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        if batch[0].shape[0]==BATCH_SIZE:
            src = batch[0].to(device)
            trg = batch[1].to(device)
            optimizer.zero_grad()

            output = model(src, trg)
            loss = loss_fn(output, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [33]:
def evaluate(model, iterator, loss_fn):
    
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            if batch[0].shape[0]==BATCH_SIZE:
                src = batch[0].to(device)
                trg = batch[1].to(device)

                output = model(src, trg)

                loss = loss_fn(output, trg)
                epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

## Dataset & Dataloader

**Do NOT Modify** code blocks in this section

In [34]:
MAX_LENGTH = 10
VALID_RATIO = 0.8-TRAIN_RATIO

SOS_token = 0
EOS_token = 1

In [35]:
class TranslateDataset(Dataset):
    def __init__(self, max_length=10, fra2eng=True):
        self.input_lang, self.output_lang, self.pairs = prepareData('eng', 'fra', max_length=max_length, reverse=fra2eng)
        self.max_length=max_length

        self.input_lang.addWord('PAD')
        self.output_lang.addWord('PAD')
        self.input_lang_pad = self.input_lang.word2index['PAD']
        self.output_lang_pad = self.output_lang.word2index['PAD']
        
        print("data example")
        print(random.choice(self.pairs))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        x, y = self._tensorsFromPair(pair)
        return x, y

    def _tensorFromSentence(self, lang, sentence):
        indexes = [lang.word2index[word] for word in sentence.split(' ')]
        indexes.append(EOS_token)
        return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

    def _tensorsFromPair(self, pair):
        input_tensor = self._tensorFromSentence(self.input_lang, pair[0])
        target_tensor = self._tensorFromSentence(self.output_lang, pair[1])
        return (input_tensor, target_tensor)
    
    def collate_fn(self, data):
        x_batch = []; y_batch = []
        
        for x, y in data:
            if x.shape[0] < self.max_length-1:
                x = torch.cat([x, self.input_lang_pad*torch.ones((self.max_length-1 - x.shape[0], 1), dtype=x.dtype)])
            elif x.shape[0] > self.max_length-1:
                x = x[:self.max_length-1]
            if y.shape[0] < self.max_length-1:
                y = torch.cat([y, self.output_lang_pad*torch.ones((self.max_length-1 - y.shape[0], 1), dtype=y.dtype)])
            elif y.shape[0] > self.max_length-1:
                y = y[:self.max_length-1]

            x_batch.append(torch.cat([torch.tensor([SOS_token]), x.squeeze(1)]))
            y_batch.append(torch.cat([torch.tensor([SOS_token]), y.squeeze(1)]))
        
        return torch.stack(x_batch), torch.stack(y_batch)

dataset = TranslateDataset(max_length=MAX_LENGTH)

train_size = int(len(dataset)*TRAIN_RATIO)
valid_size = int(len(dataset)*VALID_RATIO)
train_data, valid_data, test_data = random_split(dataset, [train_size, valid_size, len(dataset)-(train_size+valid_size)],)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
data example
['tu me fais de l ombre .', 'you re blocking my light .']


# 1. Seq2Seq model with Attention Mechanism

## Implement LSTM Seq2Seq Model

In [36]:
class LSTMEncoder(nn.Module):
    
    def __init__(self, in_dim, emb_dim, hid_dim):
        super(LSTMEncoder, self).__init__()

        self.embedding = nn.Embedding(in_dim, emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, num_layers=1, batch_first=True)

    def forward(self, input, hidden, cell):
        '''
        Q2 - (a)
        Implement forward method of LSTM Encoder Module

        INPUT
        - input: input sentence, (B, max_len)
        - hidden: initialized hidden state, (1, B, hid_dim)
        - cell: initialized cell state, (1, B, hid_dim)

        OUTPUT
        What to be returned depends on your implementation of LSTMSeq2Seq. (Q2 - (b))
        Feel free to return outputs you need.
        some examples below
        - hidden states of encoder

        '''
        ################### YOUR CODE ##|#################
        hidden=hidden.to(DEVICE)
        cell=cell.to(DEVICE)
        
        
        embedded_input=self.embedding(input).to(DEVICE) #(B, max_len, emb_dim)
        hiddens, (hidden, cell)=self.lstm(embedded_input, (hidden,cell)) #(B, 1, hid_dim)
#         print("embedded_input (B, 1, hid_dim)", hidden.shape)
#         print("embedded_input (B, 1, hid_dim)", cell.shape)
        
        
        return hiddens, (hidden, cell)
        #################################################

In [37]:
# lstme=LSTMEncoder(4346,512,256)
# hid_dim=256
# sample_x, sample_y = next(iter(train_dataloader))
# sample_x = sample_x.squeeze(0)
# sample_y = sample_y.squeeze(0)
# embedded_x = nn.Embedding(4346, 512)(sample_x)
# print(embedded_x.shape)
# hidden_0 = torch.zeros(1,BATCH_SIZE, hid_dim)  # (1, Hout) for unbatched input
# cell_0 =  torch.zeros(1,BATCH_SIZE,hid_dim)# (1, Hcell) for unbatched input

# hiddens, (hidden, cell)=lstme(sample_x, hidden_0, cell_0)
# print('LSTM Encoder outputs')
# print(f'hiddens: {hiddens.shape}\thidden: {hidden.shape}\tcell: {cell.shape}')

In [38]:
class AttnLSTMDecoder(nn.Module):

    def __init__(self, emb_dim, hid_dim, out_dim, dropout, enc_hiddens=None):
        super(AttnLSTMDecoder, self).__init__()
        
        self.t = 0 # (t)th token decoder
        self.enc_hiddens = enc_hiddens # encoder output
        self.dropout = dropout
        
        self.embedding = nn.Embedding(out_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hid_dim, hid_dim, batch_first=True)
        self.classifier = nn.Linear(hid_dim, out_dim)
        self.cell_0 = torch.zeros(1, hid_dim)

    def forward(self, input, hidden, cell):
        
        '''
        Q2 - (a)
        Implement forward method of LSTM Decoder Module with dot-product attention

        INPUT
        - input: input sentence (B, 1)
        - hidden: previous hidden state (B, hid_dim)
        - cell: previous cell state (1, B, hid_dim)

        OUTPUT
        What to be returned depends on your implementation of LSTMSeq2Seq. (Q2 - (b))
        Feel free to return outputs you need.
        Some examples below
        - predicted token embedding (B, n_words of target language), (B, emb_dim), etc.
        - current hidden state
        - current cell state
        '''

        ################### YOUR CODE ###################
        query = hidden # set query to calculate attention #(1,64,512)
#         print("Query Dimension (1,64,512)", query.shape)
        # Attention
        kv=self.enc_hiddens # (64,10,512)
        
#         print("Query Dimension (64,10,512)", kv.shape)
        
        attn_score=torch.bmm(kv, torch.permute(query, (1,2,0)))
        attn_coefficient =  F.softmax(attn_score, dim=0)#(64,10, 1)
#         print("attn_coefficient Dimension (64,10, 1)", attn_coefficient.shape)
        
        
        weighted_kv = kv*attn_coefficient #(64,10,512)
#         print("weighted_kv Dimension (64,10,512)", weighted_kv.shape)
        
        weighted_sum = torch.sum(weighted_kv, dim=1) # (64,512)
        
        weighted_sum=torch.reshape(weighted_sum,(weighted_sum.shape[0],1,weighted_sum.shape[1]))
        #(64,1,256)
#         print("weighted_sum Dimension (64,1,512)", weighted_sum.shape)
        
        embedded_input=self.embedding(input)
        embedded_input=torch.reshape(embedded_input,(BATCH_SIZE, 1, hid_dim))
#         print("embedded_input:\t",  embedded_input.shape)
#         print("weighted_sum shape:\t",  weighted_sum.shape)
        new_input=torch.cat((embedded_input, weighted_sum) ,dim=2) #(64,1,768)
#         print("new_input shape:(64,1,768)\t",  new_input.shape)
        
        
        hidden=torch.reshape(hidden, (1, BATCH_SIZE, hid_dim))
        hiddens, (hidden, cell)=self.lstm(new_input, (hidden,cell)) #(hiddens: 64, 1, 256)
        self.t += 1 # update time for each forward
        prediction=self.classifier(hiddens.squeeze(0)) #(64,1,2804)
        return prediction, hidden, cell
        #################################################


In [39]:
# out_dim=2805
# emb_dim=512
# dec_embedder = nn.Embedding(out_dim, emb_dim)
# embedded_y = dec_embedder(sample_y) # ground truth
# lstm = nn.LSTM(emb_dim + hid_dim, hid_dim, batch_first=True)
# hidden=torch.randn(64, 256)
# cell=torch.randn(1,64,256)
# input=torch.randint(1,100,(64,1))
# x=torch.randn(1,64,256)
# x=torch.permute(x, (1,2,0))
# print(x.shape)
# y=torch.randn(64,10,256)
# sttn=torch.bmm(y,x)
# attn_coefficient =  F.softmax(sttn, dim=0)
# weighted_kv = y*attn_coefficient
# print(weighted_kv.shape)
# weighted_sum = torch.sum(weighted_kv, dim=1) # attention value . concat되는 친구
# weighted_sum=torch.reshape(weighted_sum,(weighted_sum.shape[0],1,weighted_sum.shape[1]))
# print(weighted_sum.shape)
# embedded_input=dec_embedder(input)
# print(embedded_input.shape)
# new_input=torch.cat((embedded_input, weighted_sum) ,dim=2)
# hidden=torch.reshape(hidden, (1,64,256))
# print("1",new_input.shape)
# print("2",hidden.shape)
# print("3",cell.shape)
# hiddens, (hidden, cell)=lstm(new_input, (hidden,cell))
# print(hiddens.shape)
# print(hidden.shape)
# print(cell.shape)

In [64]:
# This code block has been referred to https://deep-learning-study.tistory.com/685

class LSTMSeq2Seq(nn.Module):
    def __init__(self, in_dim, out_dim, emb_dim, hid_dim, device, dropout):
        super(LSTMSeq2Seq, self).__init__()

        self.in_dim = in_dim
        self.out_dim = out_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.device = device
        self.dropout = dropout
        
        self.encoder = LSTMEncoder(in_dim, emb_dim, hid_dim)
        self.decoder = AttnLSTMDecoder(emb_dim, hid_dim, out_dim, dropout)
        
    def forward(self, src, trg):
        '''
        Q2 - (b)
        Implement forward method of LSTM Seq2Seq Module
        (Decoder module should attend encoder's outputs using dot product.)
        
        INPUT
        - src: source language batched data (B, max_len)
        - trg: target language batched data (B, max_len)

        OUTPUT
        - output of one-hot prediction (B, out_dim, max_len)
        '''
        batch_size, mx_len = src.shape
        ################### YOUR CODE ###################

        # Encoder (start from zero-hidden & zero-cell states)
        
        
        hidden_0 = torch.zeros(1,BATCH_SIZE, hid_dim)  # (1, Hout) for unbatched input
        cell_0 =  torch.zeros(1,BATCH_SIZE,hid_dim)# (1, Hcell) for unbatched input
        hiddens, (hidden, cell)=self.encoder(src, hidden_0, cell_0)
        # Decoder
        self.decoder.enc_hiddens = hiddens # set encoder's hidden states
        outputs = torch.zeros(MAX_LENGTH, batch_size, out_dim).to(self.device) # to store each decoder's output
        input=trg[:,0]
        for t in range(1, MAX_LENGTH): # for each t'th token, get decoder outputs
            output,hidden,cell=self.decoder(input, hidden, cell)
            outputs[t]=torch.reshape(output,(batch_size, out_dim))
            continue
        outputs=torch.permute(outputs,(1,2,0))
        self.decoder.t=0 # after for loop, reset decoder's time to evaluate properly
        return outputs
        ################### YOUR CODE ###################

In [63]:
# This code block has been referred to https://deep-learning-study.tistory.com/685

class LSTMSeq2Seqforeval(nn.Module):
    def __init__(self, in_dim, out_dim, emb_dim, hid_dim, device, dropout):
        super(LSTMSeq2Seqforeval, self).__init__()

        self.in_dim = in_dim
        self.out_dim = out_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.device = device
        self.dropout = dropout
        
        self.encoder = LSTMEncoder(in_dim, emb_dim, hid_dim)
        self.decoder = AttnLSTMDecoder(emb_dim, hid_dim, out_dim, dropout)
        
    def forward(self, src, trg):
        '''
        Q2 - (b)
        Implement forward method of LSTM Seq2Seq Module
        (Decoder module should attend encoder's outputs using dot product.)
        
        INPUT
        - src: source language batched data (B, max_len)
        - trg: target language batched data (B, max_len)

        OUTPUT
        - output of one-hot prediction (B, out_dim, max_len)
        '''
        batch_size, mx_len = src.shape
        ################### YOUR CODE ###################

        # Encoder (start from zero-hidden & zero-cell states)
        
        
        hidden_0 = torch.zeros(1,BATCH_SIZE, hid_dim)  # (1, Hout) for unbatched input
        cell_0 =  torch.zeros(1,BATCH_SIZE,hid_dim)# (1, Hcell) for unbatched input
        hiddens, (hidden, cell)=self.encoder(src, hidden_0, cell_0)
        # Decoder
        self.decoder.enc_hiddens = hiddens # set encoder's hidden states
        outputs = torch.zeros(MAX_LENGTH, batch_size, out_dim).to(self.device) # to store each decoder's output
        input=trg[:,0]
        print(input.shape)
        for t in range(1, MAX_LENGTH): # for each t'th token, get decoder outputs
            output,hidden,cell=self.decoder(input, hidden, cell)
            outputs[t]=torch.reshape(output,(batch_size, out_dim))
            print(output.shape)
            input=output
            continue
        outputs=torch.permute(outputs,(1,2,0))
        self.decoder.t=0 # after for loop, reset decoder's time to evaluate properly
        return outputs
        ################### YOUR CODE ###################


## Training

In [23]:
'''
Q2 - (c)
Train your Seq2Seq model and plot losses and perplexities.
Upon successful training, the test perplexity should be less than 5.
You may use visualization libraries for plotting and modify training options such as hyperparameters and optimizer.

'''

'\nQ2 - (c)\nTrain your Seq2Seq model and plot losses and perplexities.\nUpon successful training, the test perplexity should be less than 5.\nYou may use visualization libraries for plotting and modify training options such as hyperparameters and optimizer.\n\n'

In [65]:
in_dim = dataset.input_lang.n_words
out_dim = dataset.output_lang.n_words
hid_dim = 256
emb_dim = 256
dropout = 0.5
learning_rate=1e-2
N_EPOCHS = 100
valid_every=1

best_valid_loss = float('inf')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMSeq2Seq(in_dim, out_dim, emb_dim, hid_dim, device, dropout).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
loss_fn = nn.CrossEntropyLoss(ignore_index = dataset.output_lang_pad)

In [67]:
# Train your model

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_dataloader, optimizer, loss_fn, 1)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    
    if epoch%valid_every==0:
        print("==========================")
        valid_loss = evaluate(model, valid_dataloader, loss_fn)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            model.decoder.t=0
            torch.save(model.state_dict(), 'lstm-attn-model.pt')

        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01
	Train Loss: 4.732 | Train PPL: 113.571
	 Val. Loss: 4.421 |  Val. PPL:  83.207
Epoch: 02
	Train Loss: 4.425 | Train PPL:  83.523
	 Val. Loss: 4.255 |  Val. PPL:  70.491
Epoch: 03
	Train Loss: 4.279 | Train PPL:  72.151


KeyboardInterrupt: 

In [105]:
in_dim = dataset.input_lang.n_words
out_dim = dataset.output_lang.n_words
hid_dim = 256
emb_dim = 256
dropout = 0.7
learning_rate=1e-3
N_EPOCHS = 100
valid_every=1

best_valid_loss = float('inf')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMSeq2Seq(in_dim, out_dim, emb_dim, hid_dim, device, dropout).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(ignore_index = dataset.output_lang_pad)

In [106]:
# Train your model

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_dataloader, optimizer, loss_fn, 1)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    
    if epoch%valid_every==0:
        print("==========================")
        valid_loss = evaluate(model, valid_dataloader, loss_fn)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            model.decoder.t=0
            torch.save(model.state_dict(), 'lstm-attn-model.pt')

        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01
	Train Loss: 4.541 | Train PPL:  93.751
	 Val. Loss: 3.798 |  Val. PPL:  44.597
Epoch: 02
	Train Loss: 3.684 | Train PPL:  39.789
	 Val. Loss: 3.526 |  Val. PPL:  33.993
Epoch: 03
	Train Loss: 3.463 | Train PPL:  31.928
	 Val. Loss: 3.396 |  Val. PPL:  29.843
Epoch: 04
	Train Loss: 3.318 | Train PPL:  27.617
	 Val. Loss: 3.304 |  Val. PPL:  27.209
Epoch: 05
	Train Loss: 3.190 | Train PPL:  24.281
	 Val. Loss: 3.225 |  Val. PPL:  25.154
Epoch: 06
	Train Loss: 3.072 | Train PPL:  21.582
	 Val. Loss: 3.160 |  Val. PPL:  23.563
Epoch: 07
	Train Loss: 2.963 | Train PPL:  19.363
	 Val. Loss: 3.126 |  Val. PPL:  22.784
Epoch: 08
	Train Loss: 2.853 | Train PPL:  17.343
	 Val. Loss: 3.049 |  Val. PPL:  21.089
Epoch: 09
	Train Loss: 2.750 | Train PPL:  15.635
	 Val. Loss: 3.009 |  Val. PPL:  20.265
Epoch: 10
	Train Loss: 2.655 | Train PPL:  14.227
	 Val. Loss: 2.983 |  Val. PPL:  19.740
Epoch: 11
	Train Loss: 2.566 | Train PPL:  13.019
	 Val. Loss: 2.948 |  Val. PPL:  19.062
Epoch: 12


In [104]:
# Test your model

loaded_model = LSTMSeq2Seq(in_dim, out_dim, emb_dim, hid_dim, device, dropout).to(device)
loaded_model.load_state_dict(torch.load('lstm-attn-model.pt'))

test_loss = evaluate(loaded_model, test_dataloader, loss_fn)
print(f'\t Test. Loss: {valid_loss:.3f} |  Test. PPL: {math.exp(valid_loss):7.3f}')

	 Test. Loss: 3.307 |  Test. PPL:  27.300


## [Bonus] Implement GRU Seq2Seq Model

In [None]:
'''
Q2 - (d)
Change the modules(encoder, decoder) in Seq2Seq model to GRU, and repeat (a)~(c).

'''

In [53]:
class GRUEncoder(nn.Module):
    
    def __init__(self, in_dim, emb_dim, hid_dim):
        super(GRUEncoder, self).__init__()
        ################### YOUR CODE ###################
        self.embedding = nn.Embedding(in_dim, emb_dim)
        self.lstm = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, num_layers=1, batch_first=True)

        #################################################

    def forward(self, input, hidden):

        ################### YOUR CODE ###################
        hidden=hidden.to(DEVICE)
        cell=cell.to(DEVICE)
        
        
        embedded_input=self.embedding(input).to(DEVICE) #(B, max_len, emb_dim)
        hiddens, (hidden, cell)=self.lstm(embedded_input, (hidden,cell)) #(B, 1, hid_dim)

        return hiddens, (hidden, cell)
        #################################################


In [54]:
class AttnGRUDecoder(nn.Module):

    def __init__(self, emb_dim, hid_dim, out_dim, dropout, enc_hiddens=None):
        super(AttnGRUDecoder, self).__init__()
        
        ################### YOUR CODE ###################
        self.t = 0 # (t)th token decoder
        self.enc_hiddens = enc_hiddens # encoder output
        self.dropout = dropout
        
        self.embedding = nn.Embedding(out_dim, emb_dim)
        self.lstm = nn.GRU(emb_dim + hid_dim, hid_dim, batch_first=True)
        self.classifier = nn.Linear(hid_dim, out_dim)
        self.cell_0 = torch.zeros(1, hid_dim)
        #################################################

    def forward(self, input, hidden):

        ################### YOUR CODE ###################
        query = hidden # set query to calculate attention #(1,64,512)
#         print("Query Dimension (1,64,512)", query.shape)
        # Attention
        kv=self.enc_hiddens # (64,10,512)
        
#         print("Query Dimension (64,10,512)", kv.shape)
        
        attn_score=torch.bmm(kv, torch.permute(query, (1,2,0)))
        attn_coefficient =  F.softmax(attn_score, dim=0)#(64,10, 1)
#         print("attn_coefficient Dimension (64,10, 1)", attn_coefficient.shape)
        
        
        weighted_kv = kv*attn_coefficient #(64,10,512)
#         print("weighted_kv Dimension (64,10,512)", weighted_kv.shape)
        
        weighted_sum = torch.sum(weighted_kv, dim=1) # (64,512)
        
        weighted_sum=torch.reshape(weighted_sum,(weighted_sum.shape[0],1,weighted_sum.shape[1]))
        #(64,1,256)
#         print("weighted_sum Dimension (64,1,512)", weighted_sum.shape)
        
        embedded_input=self.embedding(input)
        embedded_input=torch.reshape(embedded_input,(BATCH_SIZE, 1, hid_dim))
#         print("embedded_input:\t",  embedded_input.shape)
#         print("weighted_sum shape:\t",  weighted_sum.shape)
        new_input=torch.cat((embedded_input, weighted_sum) ,dim=2) #(64,1,768)
#         print("new_input shape:(64,1,768)\t",  new_input.shape)
        
        
        hidden=torch.reshape(hidden, (1, BATCH_SIZE, hid_dim))
        hiddens, (hidden, cell)=self.lstm(new_input, (hidden,cell)) #(hiddens: 64, 1, 256)
        self.t += 1 # update time for each forward
        prediction=self.classifier(hiddens.squeeze(0)) #(64,1,2804)
        return prediction, hidden, cell
        return 
        #################################################


In [55]:
class GRUSeq2Seq(nn.Module):
    def __init__(self, in_dim, out_dim, emb_dim, hid_dim, device, dropout):
        super(GRUSeq2Seq, self).__init__()

        self.in_dim = in_dim
        self.out_dim = out_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.device = device
        self.dropout = dropout
        
        self.encoder = GRUEncoder(in_dim, emb_dim, hid_dim)
        self.decoder = AttnGRUDecoder(emb_dim, hid_dim, out_dim, dropout)
        
    def forward(self, src, trg):
        
        batch_size, mx_len = src.shape
 # Encoder (start from zero-hidden & zero-cell states)
        
        
        self.hidden_0 = torch.zeros(BATCH_SIZE, hid_dim)  # (1, Hout) for unbatched input
        self.cell_0 =  torch.zeros(1,BATCH_SIZE,hid_dim)# (1, Hcell) for unbatched input
        hiddens, (hidden, cell)=self.encoder(src, hidden_0, cell_0)
        # Decoder
        self.decoder.enc_hiddens = hiddens # set encoder's hidden states
        outputs = torch.zeros(MAX_LENGTH, batch_size, out_dim).to(self.device) # to store each decoder's output
        input=trg[:,0]
        for t in range(1, MAX_LENGTH): # for each t'th token, get decoder outputs
            output,hidden,cell=self.decoder(input, hidden, cell)
            outputs[t]=torch.reshape(output,(batch_size, out_dim))
            continue
        outputs=torch.permute(outputs,(1,2,0))
        self.decoder.t=0 # after for loop, reset decoder's time to evaluate properly
        return outputs
        #################################################
        


In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gru_model = GRUSeq2Seq(in_dim, out_dim, emb_dim, hid_dim, device, dropout).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(ignore_index = dataset.output_lang_pad)

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_dataloader, optimizer, loss_fn, 1)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    
    if epoch%valid_every==0:
        print("==========================")
        valid_loss = evaluate(model, valid_dataloader, loss_fn)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            model.decoder.t=0
            torch.save(model.state_dict(), 'gru-attn-model.pt')

        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

RuntimeError: For batched 3-D input, hx and cx should also be 3-D but got (2-D, 3-D) tensors

In [121]:
loaded_model = GRUSeq2Seq(in_dim, out_dim, emb_dim, hid_dim, device, dropout).to(device)
model.load_state_dict(torch.load('gru-attn-model.pt'))

test_loss = evaluate(model, test_dataloader, loss_fn)
print(f'\t Test. Loss: {valid_loss:.3f} |  Test. PPL: {math.exp(valid_loss):7.3f}')

	 Test. Loss: -332.651 |  Test. PPL:   0.000


# 2. Seq2Seq model with Transformer

## Implement Transformer Seq2Seq Model

In [None]:
class TransEncoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, ff_dim, dropout, device, max_length = MAX_LENGTH):
        super().__init__()

        self.hid_dim = hid_dim
        self.max_length = max_length

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        
        encoder_layer = TransformerEncoderLayer(hid_dim, n_heads, ff_dim, dropout, batch_first=True)
        self.encoder = TransformerEncoder(encoder_layer, n_layers)
        
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.tensor([hid_dim], device = device, dtype=torch.float32))
        
    def forward(self, src, pos_emb, src_mask):
        '''
        Q3 - (c)
        Implement forward method of TransEncoder Module
        (Use torch.nn.TransformerEncoder, torch.nn.TransformerEncoderLayer)
        
        INPUT
        - src: source language batched data (B, max_len)
        - pos_emb: positional embedding (max_len, hid_dim)
        - src_mask: padding mask tensor for source sentences (B, max_len)

        OUTPUT
        What to be returned depends on your implementation of TransSeq2Seq.
        Feel free to return outputs you need.
        Some examples below,

        - encoder output (B, max_len, hid_dim)
        '''
        batch_size, src_len = src.shape
        #################### YOUR CODE ####################

        
        return None
        ###################################################

In [None]:
class TransDecoder(nn.Module):
    def __init__(self, out_dim, hid_dim, n_layers, n_heads, ff_dim, dropout, device, max_length = MAX_LENGTH):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.max_length = max_length
        
        self.tok_embedding = nn.Embedding(out_dim, hid_dim)
        
        decoder_layer = TransformerDecoderLayer(hid_dim, n_heads, ff_dim, dropout, batch_first=True)
        self.decoder = TransformerDecoder(decoder_layer, n_layers)
        
        self.fc_out = nn.Linear(hid_dim, out_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.tensor([hid_dim], device = device, dtype=torch.float32))
        
    def forward(self, trg, pos_emb, enc_src, trg_mask, trg_sub_mask, src_mask):
        '''
        Q3 - (c)
        Implement forward method of TransDecoder Module
        (Use torch.nn.TransformerDecoder, torch.nn.TransformerDecoderLayer)
        
        INPUT
        - trg: target language batched data (B, max_len)
        - pos_emb: positional embedding (max_len, hid_dim)
        - enc_src: encoder outputs (B, max_len, hid_dim)
        - trg_mask: padding mask tensor for target sentences (B, max_len)
        - trg_sub_mask: subsequent mask for target sentences (max_len, max_len)
        - src_mask: padding mask tensor for source sentences (B, max_len)

        OUTPUT
        What to be returned depends on your implementation of TransSeq2Seq.
        Feel free to return outputs you need.
        Some examples below,

        - decoder output (B, max_len, out_dim)
        '''
        batch_size, trg_len = trg.shape

        #################### YOUR CODE ####################
        
        return None
        ###################################################

In [None]:
class TransSeq2Seq(nn.Module):
    def __init__(self, in_dim, out_dim, hid_dim, ff_dim, n_layers, n_heads, dropout_p, device, max_length=MAX_LENGTH):
        super().__init__()
        
        self.device = device
        self.hid_dim = hid_dim
        self.max_length = max_length

        self.encoder = TransEncoder(in_dim, hid_dim, n_layers[0], n_heads, ff_dim, dropout_p, device)
        self.decoder = TransDecoder(out_dim, hid_dim, n_layers[1], n_heads, ff_dim, dropout_p, device)
        
    def make_src_mask(self, src):
        '''
        Q3 - (b)
        Implement mask generating function

        INPUT
        - src: batched input sentences (B, max_len)

        OUTPUT
        - Boolean padding mask tensor (B, max_len)
        '''
        #################### YOUR CODE ####################
        
        return None
        ###################################################

    def make_trg_mask(self, trg):
        '''
        Q3 - (b)
        Implement mask generating function

        INPUT
        - trg: batched target sentences (B, max_len)

        OUTPUT
        - A tuple of a padding mask tensor and a subsequent mask tensor ((B, max_len), (max_len, max_len))
        '''
        #################### YOUR CODE ####################

        return None
        ###################################################

    def forward(self, src, trg):
        '''
        Q3 - (c)
        Implement forward method of TransSeq2Seq Module
        
        INPUT
        - src: source language batched data (B, max_len)
        - trg: target language batched data (B, max_len)

        OUTPUT
        - decoder output (B, out_dim, max_dim)
        
        '''
        #################### YOUR CODE ####################
        
        return None
        ###################################################
    
    
    def get_pos_emb(self):
        '''
        Q3 - (a)
        Implement absolute positional embedding

        OUTPUT
        - positional embedding tensor (max_len, hid_dim)
        '''
        #################### YOUR CODE ####################

        return None
        ###################################################

## Training

In [None]:
'''
Q3 - (d)
Train your Seq2Seq model and plot losses and perplexities.
Upon successful training, the test perplexity should be less than 2.
You may use visualization libraries for plotting and modify training options such as hyperparameters and optimizer.

Based on the results from lSTM(GRU)-based and transformer-based Seq2Seq models,
briefly describe which approach is better and why.
'''

In [None]:
in_dim = dataset.input_lang.n_words
out_dim = dataset.output_lang.n_words
hid_dim = 256
ff_dim = 1024
n_enc_layers = 4
n_dec_layers = 4
n_layers = [n_enc_layers, n_dec_layers]
n_heads = 8
dropout = 0.1

learning_rate=1e-2
N_EPOCHS = 100
valid_every=5
best_valid_loss = float('inf')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransSeq2Seq(in_dim, out_dim, hid_dim, ff_dim, n_layers, n_heads, dropout, device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(ignore_index = dataset.output_lang_pad)
# print(model)

In [None]:
# Train your model
for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_dataloader, optimizer, loss_fn, 1)
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    
    if epoch%valid_every==0:
        print("==========================")
        valid_loss = evaluate(model, valid_dataloader, loss_fn)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            model.decoder.t=0
            torch.save(model.state_dict(), 'transformer-model.pt')

        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Test your model
loaded_model = TransSeq2Seq(in_dim, out_dim, hid_dim, ff_dim, n_layers, n_heads, dropout, device).to(device)
loaded_model.load_state_dict(torch.load('transformer-model.pt'))

test_loss = evaluate(loaded_model, test_dataloader, loss_fn)
print(f'\t Test. Loss: {valid_loss:.3f} |  Test. PPL: {math.exp(valid_loss):7.3f}')