## Importing required libraries

In [1]:
# Silence every warning for the remainder of the notebook.
import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``: accepts any arguments and returns None."""
    return None


# Rebind the module attribute so that calls made through ``warnings.warn(...)``
# reach the no-op above instead of the real implementation.
warnings.warn = warn
# Also install a blanket "ignore" filter.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import random
import math
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Iterable, List

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k

from torchdata.datapipes.iter import IterableWrapper, Mapper

from nltk.translate.bleu_score import sentence_bleu

!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 13.4 MB/s eta 0:00:01
     --------- ------------------------------ 2.9/12.8 MB 7.3 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 7.7 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 8.4 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 9.2 MB/s eta 0:00:01
     ------------------------------------ --- 11.8/12.8 MB 9.7 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 9.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/rel

#### Checking if CUDA is available

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using cuda device


## The Encoder

In [4]:
# an RNN encoder implemented using "nn.Embedding", "nn.LSTM", "nn.Dropout" from the "pytorch" library
class Encoder(nn.Module):
    """LSTM encoder that turns a sequence of token indices into final LSTM states.

    Args:
        vocab_len: number of entries in the embedding table.
        emb_dim: size of each token embedding.
        hid_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability applied to the embeddings and,
            when ``n_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, vocab_len, emb_dim, hid_dim, n_layers, dropout_prob):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_len, emb_dim)
        # nn.LSTM applies dropout only *between* stacked layers. With a single
        # layer a non-zero value does nothing except raise a UserWarning, so
        # pass 0.0 in that case (numerically identical).
        lstm_dropout = dropout_prob if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=lstm_dropout)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_batch):
        """Encode ``input_batch`` and return the LSTM's final ``(hidden, cell)`` states.

        Args:
            input_batch: integer tensor of token indices. The LSTM is not
                ``batch_first``, so a batched input is ``[src_len, batch_size]``
                (the layout used by the notebook's encoder-decoder demo). It
                must already be on the same device as the module.

        Returns:
            Tuple ``(hidden, cell)``; for a batched input each has shape
            ``[n_layers, batch_size, hid_dim]``. The per-step LSTM outputs are
            discarded.
        """
        # The embedding output already lives on the module's device, so no
        # extra ``.to(...)`` (and no dependency on a global ``device``) is needed.
        embed = self.dropout(self.embedding(input_batch))
        _, (hidden, cell) = self.lstm(embed)

        return hidden, cell

#### Encoder -> example of one forward pass

In [5]:
# toy hyper-parameters for the encoder walkthrough
vocab_len, emb_dim = 8, 10
hid_dim, n_layers = 8, 1
dropout_prob = 0.5

# build the encoder and move its parameters to the selected device
encoder_t = Encoder(
    vocab_len,
    emb_dim,
    hid_dim,
    n_layers,
    dropout_prob,
).to(device)

In [6]:
# dummy data: one un-batched sequence, where 0,3,4,2,1 are vocab indices
# (a 1-D tensor has nothing to transpose, so no `.t()` is needed)
src_batch = torch.tensor([0,3,4,2,1]).to(device)

# getting the embedding of the text token indices
embedded = encoder_t.embedding(src_batch)
# applying dropout to the embeddings; the result is already on the same
# device as `embedded`, so no further `.to(device)` is required
embedded_dropout = encoder_t.dropout(embedded)
# passing through the lstm
outputs, (hidden_t, cell_t) = encoder_t.lstm(embedded_dropout)

# NOTE: each "shape ->" label prints only the first dimension (`shape[0]`),
# not the full shape of the tensor.
print(f"Input(src) tensor  [shape -> {src_batch.shape[0]}]:-\n", src_batch)
print(f"\nEmbedded tokens  [shape -> {embedded.shape[0]}]:-\n", embedded)
print(f"\nAfter dropout:-  [shape -> {embedded_dropout.shape[0]}]\n", embedded_dropout)
print(f"\nHidden:-         [shape -> {hidden_t.shape[0]}]\n", hidden_t)
print(f"\nCell:-           [shape -> {cell_t.shape[0]}]\n", cell_t)
print(f"\nOutput:-         [shape -> {outputs.shape[0]}]\n", outputs)

Input(src) tensor  [shape -> 5]:-
 tensor([0, 3, 4, 2, 1], device='cuda:0')

Embedded tokens  [shape -> 5]:-
 tensor([[ 0.6591, -1.1215,  2.0592, -0.5522,  0.9117, -0.4574,  2.6500, -1.0586,
         -0.4957,  1.5255],
        [-0.2039,  0.0906,  1.2810, -0.9542, -1.7231,  0.6712, -1.2482, -0.1359,
         -0.0165,  0.9027],
        [-0.0992,  0.7913, -1.1062, -0.7618, -0.7259, -1.2637,  0.3867,  1.5638,
         -0.6415, -1.0602],
        [-1.4792, -1.7376, -0.9612,  0.5893,  0.9242, -0.7253, -0.2297,  1.5039,
         -1.3460,  0.2714],
        [ 1.5120,  1.7224, -0.0128, -0.1236,  0.4383, -0.4804,  0.0082,  1.7586,
         -3.0753,  0.4310]], device='cuda:0', grad_fn=<EmbeddingBackward0>)

After dropout:-  [shape -> 5]
 tensor([[ 1.3181, -2.2431,  0.0000, -0.0000,  1.8234, -0.0000,  5.3000, -2.1173,
         -0.9914,  0.0000],
        [-0.0000,  0.0000,  2.5620, -1.9084, -3.4461,  0.0000, -2.4964, -0.2719,
         -0.0329,  1.8055],
        [-0.1983,  0.0000, -2.2123, -1.5236, -0

## The Decoder

In [7]:
# an RNN decoder implemented using "nn.Embedding", "nn.Linear", "nn.LSTM", "nn.Dropout", "nn.LogSoftmax" from the "pytorch" library
class Decoder(nn.Module):
    """Single-step LSTM decoder that outputs log-probabilities over the target vocabulary.

    Args:
        output_dim: size of the target vocabulary (embedding rows and number
            of output classes).
        emb_dim: size of each token embedding.
        hid_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when
            ``n_layers > 1``, between the stacked LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # nn.LSTM applies dropout only *between* stacked layers. With a single
        # layer a non-zero value does nothing except raise a UserWarning, so
        # pass 0.0 in that case (numerically identical).
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=lstm_dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        # despite the attribute name, this is a *log*-softmax over the vocab dimension
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: token indices for the current step, shape ``[batch_size]``.
            hidden: previous LSTM hidden state.
            cell: previous LSTM cell state.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds
            log-probabilities of shape ``[batch_size, output_dim]`` and
            ``hidden`` / ``cell`` are the updated LSTM states.
        """
        # add a sequence dimension of length 1: [batch_size] -> [1, batch_size]
        step_input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # drop the length-1 sequence dimension before the projection
        prediction_logit = self.fc_out(output.squeeze(0))
        prediction = self.softmax(prediction_logit)

        return prediction, hidden, cell

#### Decoder -> example of one forward pass

In [8]:
# toy hyper-parameters for the decoder walkthrough
output_dim, emb_dim = 6, 10
hid_dim, n_layers = 8, 1
dropout = 0.5

# build the decoder and move its parameters to the selected device
decoder_t = Decoder(
    output_dim,
    emb_dim,
    hid_dim,
    n_layers,
    dropout,
).to(device)

In [9]:
# dummy data
input_t = torch.tensor([0]).to(device) # <bos>

# getting the embedding of the input text token <bos>
embedded = decoder_t.embedding(input_t)
# applying dropout to the embedding; the result is already on the same
# device as `embedded`, so no further `.to(device)` is required
embedded_dropout = decoder_t.dropout(embedded)
# passing through the lstm, starting from the encoder states (`hidden_t`, `cell_t`)
# produced by the encoder forward-pass cell above
output, (hidden, cell) = decoder_t.lstm(embedded_dropout, (hidden_t, cell_t))
# passing through the fully connected layer
prediction_logit = decoder_t.fc_out(output)
# applying log-softmax (`decoder_t.softmax` is an nn.LogSoftmax)
prediction = decoder_t.softmax(prediction_logit)

# NOTE: each "shape ->" label prints only the first dimension (`shape[0]`),
# not the full shape of the tensor.
print(f"Input(target) tensor  [shape -> {input_t.shape[0]}]:-\n", input_t)
print(f"\nEmbedded tokens  [shape -> {embedded.shape[0]}]:-\n", embedded)
print(f"\nAfter dropout:-  [shape -> {embedded_dropout.shape[0]}]\n", embedded_dropout)
print(f"\nRNN Output:-     [shape -> {output.shape[0]}]\n", output)
print(f"\nFC layer Out:-   [shape -> {prediction_logit.shape[0]}]\n", prediction_logit)
print(f"\nSoftmax to Out:- [shape -> {prediction.shape[0]}]\n", prediction)

Input(target) tensor  [shape -> 1]:-
 tensor([0], device='cuda:0')

Embedded tokens  [shape -> 1]:-
 tensor([[-1.8793,  0.2864,  0.0866,  0.3047, -0.1428, -0.4833, -2.7147, -0.0226,
         -2.0847, -0.2899]], device='cuda:0', grad_fn=<EmbeddingBackward0>)

After dropout:-  [shape -> 1]
 tensor([[-0.0000,  0.0000,  0.1731,  0.0000, -0.2856, -0.0000, -0.0000, -0.0000,
         -0.0000, -0.0000]], device='cuda:0', grad_fn=<NativeDropoutBackward0>)

RNN Output:-     [shape -> 1]
 tensor([[ 0.0945,  0.0011,  0.0397, -0.0620,  0.1445, -0.1348,  0.0785, -0.2011]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

FC layer Out:-   [shape -> 1]
 tensor([[ 0.3602, -0.0872, -0.0362, -0.2333,  0.2648, -0.2661]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

Softmax to Out:- [shape -> 1]
 tensor([[-1.4605, -1.9078, -1.8568, -2.0540, -1.5559, -2.0868]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>)


#### Encoder-decoder connection

In [10]:
# dummy data: one source sequence, where 0,3,4,2,1 are vocab indices
src = torch.tensor([[0,3,4,2,1]])
# transpose to [src_len, batch_size] = [5, 1]
src = src.t().to(device)
# instantiating the encoder model
encoder_t = Encoder(vocab_len, emb_dim, hid_dim, n_layers, dropout_prob).to(device)
# getting the encoder's final hidden and cell states
hidden_t, cell_t = encoder_t(src)

# dummy target sequence, shape [trg_len, batch_size] = [5, 1]
trg = torch.tensor([[0],[2],[3],[5],[1]]).to(device) # 0 -> <bos>
# instantiating the decoder model
decoder_t = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

# a tensor to store decoder outputs at each time step
batch_size = trg.shape[1]
trg_len = trg.shape[0]
trg_vocab_size = decoder_t.output_dim
# allocate directly on the target device instead of creating on CPU and copying
outputs_t = torch.zeros(trg_len, batch_size, trg_vocab_size, device=device)

# the first input to the decoder is the <bos> token
# (named `decoder_input` so the builtin `input` is not shadowed notebook-wide)
decoder_input = trg[0,:]

# looping through the trg length; row 0 of `outputs_t` is never written and stays zero
for t in range(1, trg_len):
    output_t, hidden_t, cell_t = decoder_t(decoder_input, hidden_t, cell_t)
    # storing the output of the current time step
    outputs_t[t] = output_t
    # getting the predicted token index
    top_1 = output_t.argmax(1)
    # deciding whether to use teacher forcing
    teacher_force = random.random() < 0.5 # 0.5 -> teacher forcing ratio
    decoder_input = trg[t] if teacher_force else top_1

print(f"Decoder output in each time step [shape -> {outputs_t.shape}]:-\n",outputs_t)

Decoder output in each time step [shape -> torch.Size([5, 1, 6])]:-
 tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-1.7104, -1.5063, -2.1073, -2.0045, -1.8040, -1.7341]],

        [[-1.6352, -1.6171, -2.0413, -2.0507, -1.7456, -1.7512]],

        [[-1.8648, -1.5223, -2.0661, -2.0419, -1.6634, -1.7097]],

        [[-1.9186, -1.4293, -2.1572, -2.0169, -1.6606, -1.7432]]],
       device='cuda:0', grad_fn=<CopySlices>)


In [33]:
# greedy prediction: for every time step and batch element, take the index
# of the highest-scoring entry along the vocabulary dimension (dim 2)
pred_tokens = outputs_t.argmax(dim=2)
print(pred_tokens)

tensor([[0],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')


## Sequence-to-sequence model

In [38]:
# connecting encoder and decoder components to create the seq2seq model
class Seq2Seq(nn.Module):
    """Sequence-to-sequence model that feeds an encoder's final states into a step-wise decoder.

    Args:
        encoder: module whose call returns ``(hidden, cell)`` and which
            exposes ``hid_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell)`` returning
            ``(output, hidden, cell)``; exposes ``hid_dim``, ``n_layers`` and
            ``output_dim``.
        device: device on which the output buffer is allocated.
        trg_vocab: stored on the instance as ``self.trg_vocab``; not used by
            ``forward``.

    Raises:
        AssertionError: if encoder and decoder disagree on ``hid_dim`` or
            ``n_layers``.
    """

    def __init__(self, encoder, decoder, device, trg_vocab):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.trg_vocab = trg_vocab

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimension of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, conditioned on the encoding of ``src``.

        Args:
            src: source batch passed unchanged to the encoder.
            trg: target token indices, shape ``[trg_len, batch_size]``;
                ``trg[0]`` is the first decoder input (``<bos>``).
            teacher_forcing_ratio: probability, per time step, of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape ``[trg_len, batch_size, output_dim]`` holding the
            decoder output for every step ``t >= 1``; row 0 is left as zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs, allocated on the model's own device
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        # last encoder hidden/cell state; these are already on the encoder's device
        hidden, cell = self.encoder(src)

        # first input to decoder is <bos>
        dec_input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(dec_input, hidden, cell)
            # storing the output from the current time step
            outputs[t] = output
            # getting the predicted token index
            top_1 = output.argmax(1)
            # deciding whether to use teacher forcing for the next step
            teacher_force = random.random() < teacher_forcing_ratio
            dec_input = trg[t] if teacher_force else top_1

        return outputs

#### A function to train the model

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    train_iterator = tqdm(iterator, desc="Training", leavel=False)

    for i, (src, trg) in enumerate(iterator):
        # senting the src anf trg tensors to device
        src = src.to(device)
        trg = trg.to(device)

        # clearing the gradinet from previous batch
        optimizer.zero_grad()

        # the models predictions - token probabilities
        output = model(src, trg)

        #----- trg shape -> [trg len, batch_size]
        #----- output shape -> [trg_len, batch_size, output_dim]

        output_dim = output.size[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].contiguous().view(-1)

        #----- trg shape -> [(trg len -1) * batch_size]
        #----- output shape -> [(trg len -1) * batch_size, output dim]

        # computing the loss
        loss = creterion(output, trg)
        # compiting the gradient
        loss.backward()
        # cliping the gradien
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # updating the weights
        optimizer.step()

        # Updating the tqdm
        train_iterator.set_postfix(loss=loss.)
        