## Importing required libraries

In [1]:
# silencing every warning so that the notebook output stays readable
import warnings


def warn(*args, **kwargs):
    """Do-nothing replacement for ``warnings.warn``; accepts and discards any arguments."""
    return None


# the 'ignore' filter mutes warnings routed through the filter machinery, and the
# no-op above replaces ``warnings.warn`` for code that looks it up at call time
warnings.filterwarnings('ignore')
warnings.warn = warn

In [2]:
import numpy as np
import random
import math
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Iterable, List

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k

from torchdata.datapipes.iter import IterableWrapper, Mapper

from nltk.translate.bleu_score import sentence_bleu

!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------ --------------------- 6.0/12.8 MB 37.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 40.2 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     --------------- ------------------------ 5.5/14.6 MB 41.9 MB/s eta 0:00:01
     --------------------------------------  14.4/14.6 MB 45.2 MB/s eta 0:00:01
     --------------------------------------- 14.6/14.6 MB 41.8 MB/s eta 0:00:0

#### Checking if CUDA is available

In [3]:
# selecting the GPU when PyTorch can see one, otherwise falling back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device} device")

Using cuda device


## The Encoder

In [4]:
# an RNN encoder implemented using the "nn.Embedding", "nn.LSTM" and "nn.Dropout" modules from the "pytorch" library
class Encoder(nn.Module):
    """LSTM encoder that turns a sequence of token indices into context vectors.

    Args:
        vocab_len: number of rows in the embedding table (source vocabulary size).
        emb_dim: size of each token embedding.
        hid_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability applied to the embeddings; the same value
            is passed to ``nn.LSTM``, where PyTorch applies it only between
            stacked layers.
    """

    def __init__(self, vocab_len, emb_dim, hid_dim, n_layers, dropout_prob):
        super().__init__()

        # kept as attributes so that Seq2Seq can check them against the decoder
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_len, emb_dim)
        # batch_first is left at its default (False): the time step is the first axis
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout_prob)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_batch):
        """Encode ``input_batch`` and return the final ``(hidden, cell)`` states.

        Args:
            input_batch: integer tensor of token indices shaped [src_len, batch_size];
                it must live on the same device as the module's parameters.

        Returns:
            Tuple ``(hidden, cell)``, each shaped [n_layers, batch_size, hid_dim].
        """
        # the embedding output already lives on the module's device, so no explicit
        # transfer (and no dependency on a notebook-level `device`) is needed
        embed = self.dropout(self.embedding(input_batch))
        # the per-step outputs are not needed; only the final states are passed on
        _, (hidden, cell) = self.lstm(embed)

        return hidden, cell

#### Encoder -> example of one forward pass

In [5]:
# toy hyper-parameters for a tiny encoder
vocab_len, emb_dim = 8, 10
hid_dim, n_layers = 8, 1
dropout_prob = 0.5

# building the encoder and placing it on the selected device
encoder_t = Encoder(vocab_len, emb_dim, hid_dim, n_layers, dropout_prob).to(device)

In [6]:
# dummy data: one unbatched sequence, where 0,3,4,2,1 are vocab indices
# (a 1-D tensor is returned unchanged by .t(), so no transpose is applied)
src_batch = torch.tensor([0,3,4,2,1]).to(device)

# getting the embedding of the text token indices
embedded = encoder_t.embedding(src_batch)
# applying dropout to embedded (already on `device`, so no further transfer is needed)
embedded_dropout = encoder_t.dropout(embedded)
# passing through the lstm
outputs, (hidden_t, cell_t) = encoder_t.lstm(embedded_dropout)

# printing the full shape of every tensor rather than only its first dimension
print(f"Input(src) tensor  [shape -> {src_batch.shape}]:-\n", src_batch)
print(f"\nEmbedded tokens  [shape -> {embedded.shape}]:-\n", embedded)
print(f"\nAfter dropout:-  [shape -> {embedded_dropout.shape}]\n", embedded_dropout)
print(f"\nHidden:-         [shape -> {hidden_t.shape}]\n", hidden_t)
print(f"\nCell:-           [shape -> {cell_t.shape}]\n", cell_t)
print(f"\nOutput:-         [shape -> {outputs.shape}]\n", outputs)

Input(src) tensor  [shape -> 5]:-
 tensor([0, 3, 4, 2, 1], device='cuda:0')

Embedded tokens  [shape -> 5]:-
 tensor([[-0.1802,  1.1975,  1.0771, -0.5216,  2.1803,  0.1691, -1.1963,  0.6785,
         -1.0588, -0.8614],
        [ 1.2755,  2.2683, -0.1140, -1.1156, -0.7328, -0.0107,  0.2911, -0.2835,
         -0.1929,  0.2068],
        [-0.2381, -0.0269, -0.5800,  0.2044,  0.7740,  0.9528,  0.8390, -0.9844,
          0.0256,  1.5315],
        [ 2.0494, -0.1680, -0.0388,  0.9677,  0.2004, -0.5216, -1.0922,  0.2936,
         -1.2084, -0.5807],
        [ 1.1939,  1.2736,  0.4263,  0.5206, -1.7993,  0.9224,  0.1417,  0.2404,
         -1.0601,  0.1225]], device='cuda:0', grad_fn=<EmbeddingBackward0>)

After dropout:-  [shape -> 5]
 tensor([[-0.3604,  0.0000,  2.1542, -0.0000,  0.0000,  0.3383, -2.3926,  0.0000,
         -0.0000, -0.0000],
        [ 2.5510,  0.0000, -0.2280, -0.0000, -0.0000, -0.0000,  0.5821, -0.5670,
         -0.3857,  0.4136],
        [-0.0000, -0.0539, -1.1599,  0.0000,  0

## The Decoder

In [7]:
# a single-step RNN decoder built from the "nn.Embedding", "nn.LSTM", "nn.Linear", "nn.Dropout" and "nn.LogSoftmax" modules of the "pytorch" library
class Decoder(nn.Module):
    """LSTM decoder that predicts the next target token from the previous one.

    Args:
        output_dim: size of the target vocabulary (embedding rows and output scores).
        emb_dim: size of each token embedding.
        hid_dim: size of the LSTM hidden and cell states.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability for the embeddings, also handed to ``nn.LSTM``.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # bookkeeping attributes read by the surrounding seq2seq code
        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

        # token index -> embedding -> LSTM -> vocabulary scores
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)

        # parameter-free layers
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: token indices for the current step, shaped [batch_size].
            hidden: previous hidden state of the LSTM.
            cell: previous cell state of the LSTM.

        Returns:
            Tuple ``(prediction, hidden, cell)``: log-probabilities over the
            vocabulary shaped [batch_size, output_dim], plus the updated states.
        """
        # adding a leading time axis of length one: [batch_size] -> [1, batch_size]
        step_tokens = input.unsqueeze(0)
        step_embedded = self.dropout(self.embedding(step_tokens))

        lstm_out, (hidden, cell) = self.lstm(step_embedded, (hidden, cell))

        # dropping the time axis again before projecting onto the vocabulary
        scores = self.fc_out(lstm_out.squeeze(0))
        return self.softmax(scores), hidden, cell

#### Decoder -> example of one forward pass

In [8]:
# toy hyper-parameters for a tiny decoder
output_dim, emb_dim = 6, 10
hid_dim, n_layers = 8, 1
dropout = 0.5

# building the decoder and placing it on the selected device
decoder_t = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

In [9]:
# dummy data
input_t = torch.tensor([0]).to(device) # <bos>

# getting the embedding of the input text token <bos>
embedded = decoder_t.embedding(input_t)
# applying dropout to embedded (already on `device`, so no further transfer is needed)
embedded_dropout = decoder_t.dropout(embedded)
# passing through the lstm, starting from the encoder states hidden_t / cell_t
output, (hidden, cell) = decoder_t.lstm(embedded_dropout, (hidden_t, cell_t))
# passing through the fully connected layer
prediction_logit = decoder_t.fc_out(output)
# applying log-softmax
prediction = decoder_t.softmax(prediction_logit)

# printing the full shape of every tensor rather than only its first dimension
print(f"Input(target) tensor  [shape -> {input_t.shape}]:-\n", input_t)
print(f"\nEmbedded tokens  [shape -> {embedded.shape}]:-\n", embedded)
print(f"\nAfter dropout:-  [shape -> {embedded_dropout.shape}]\n", embedded_dropout)
print(f"\nRNN Output:-     [shape -> {output.shape}]\n", output)
print(f"\nFC layer Out:-   [shape -> {prediction_logit.shape}]\n", prediction_logit)
print(f"\nSoftmax to Out:- [shape -> {prediction.shape}]\n", prediction)

Input(target) tensor  [shape -> 1]:-
 tensor([0], device='cuda:0')

Embedded tokens  [shape -> 1]:-
 tensor([[-2.7828,  0.2202,  1.3489,  1.2117,  0.0890,  0.3613, -0.1720,  0.3643,
         -1.5604, -0.0764]], device='cuda:0', grad_fn=<EmbeddingBackward0>)

After dropout:-  [shape -> 1]
 tensor([[-0.0000,  0.4404,  2.6978,  0.0000,  0.0000,  0.0000, -0.3440,  0.0000,
         -0.0000, -0.0000]], device='cuda:0', grad_fn=<NativeDropoutBackward0>)

RNN Output:-     [shape -> 1]
 tensor([[ 0.2054, -0.0121, -0.1001,  0.1277, -0.0731, -0.2087,  0.1585,  0.1621]],
       device='cuda:0', grad_fn=<SqueezeBackward1>)

FC layer Out:-   [shape -> 1]
 tensor([[ 0.2296,  0.0203,  0.0551, -0.0038,  0.0826, -0.1642]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

Softmax to Out:- [shape -> 1]
 tensor([[-1.6056, -1.8148, -1.7801, -1.8390, -1.7526, -1.9994]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward0>)


#### Encoder-decoder connection

In [10]:
# dummy data
src = torch.tensor([[0,3,4,2,1]]) # where 0,3,4,2,1 are vocab indices
src = src.t().to(device)
# instantiating the encoder model
encoder_t = Encoder(vocab_len, emb_dim, hid_dim, n_layers, dropout_prob).to(device)
# getting the encoder output
hidden_t , cell_t = encoder_t(src)

# `src` is what the encoder consumes, so it is labelled as the encoder input
print(f"Encoder Input:- [shape - {src.shape}]"," (src_length, batch_size)\n",src)
print("\nEncoder Context vectors:-")
print(f"\nHidden:- [shape - {hidden_t.shape}]"," (1, batch_size, hid_dim)\n",hidden_t)
print(f"\nCell:- [shape - {cell_t.shape}]"," (1, batch_size, hid_dim)\n", cell_t)



# dummy data
trg = torch.tensor([[0],[2],[3],[5],[1]]).to(device) # 0 -> <bos>
# instantiating the decoder model
decoder_t = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout).to(device)

# a tensor to store decoder outputs at each time step
batch_size = trg.shape[1]
trg_len = trg.shape[0]
trg_vocab_size = decoder_t.output_dim
outputs_t = torch.zeros(trg_len, batch_size, trg_vocab_size).to(device)

# the first input to the decoder is the <bos> token
input = trg[0,:]

print("\n----------------------------------------------------------------------------------------------------------------------------------------------")
print(f"\nTarget String:- [shape - {trg.shape}]"," (trg_length, batch_size)\n",trg)
print(f"\nFirst Decoder Input - <bos> [shape - {input.shape}]"," (batch_size)\n",input, "\n")

# looping through the trg length; row 0 of outputs_t is never written and stays zero
for t in range(1, trg_len):
    # hidden_t / cell_t are overwritten with the decoder's updated states at every step
    output_t, hidden_t, cell_t = decoder_t(input, hidden_t, cell_t)
    # storing the output in current time step
    outputs_t[t] = output_t
    # getting the predicted token index
    top_1 = output_t.argmax(1)
    # deciding whether to use teacher forcing
    teacher_force = random.random() < 0.5 # 0.5 -> teacher forcing ratio
    # feeding either the ground-truth token or the model's own prediction to the next step
    input = trg[t] if teacher_force else top_1

print(f"Decoder output in each time step [shape -> {outputs_t.shape}]"," (trg_length, batch_size, trg_vocab_size)\n",outputs_t)

Decoder Input:- [shape - torch.Size([5, 1])]  (src_length, batch_size)
 tensor([[0],
        [3],
        [4],
        [2],
        [1]], device='cuda:0')

Encoder Context vectors:-

Hidden:- [shape - torch.Size([1, 1, 8])]  (1, batch_size, hid_dim)
 tensor([[[-0.0707, -0.0221, -0.0760, -0.5548,  0.0131, -0.1154,  0.0085,
           0.5803]]], device='cuda:0', grad_fn=<CudnnRnnBackward0>)

Cell:- [shape - torch.Size([1, 1, 8])]  (1, batch_size, hid_dim)
 tensor([[[-0.1098, -0.1428, -0.1020, -0.6944,  0.0744, -0.4329,  0.4973,
           0.8614]]], device='cuda:0', grad_fn=<CudnnRnnBackward0>)

----------------------------------------------------------------------------------------------------------------------------------------------

Target String:- [shape - torch.Size([5, 1])]  (trg_length, batch_size)
 tensor([[0],
        [2],
        [3],
        [5],
        [1]], device='cuda:0')

First Decoder Input - <bos> [shape - torch.Size([1])]  (batch_size)
 tensor([0], device='cuda:0') 


In [11]:
# turning the stored scores into predictions: the highest-scoring vocabulary index
# per time step (row 0 was never filled by the decoding loop and is still all zeros)
pred_tokens = torch.argmax(outputs_t, dim=2)
print(pred_tokens)

tensor([[0],
        [1],
        [1],
        [2],
        [1]], device='cuda:0')


## Sequence-to-sequence model

In [38]:
# connecting encoder and decoder components to create the seq2seq model
class Seq2Seq(nn.Module):
    """Sequence-to-sequence model that chains an encoder and a step-wise decoder.

    Args:
        encoder: module called as ``encoder(src)`` returning ``(hidden, cell)``;
            must expose ``hid_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell)`` returning
            ``(prediction, hidden, cell)``; must expose ``hid_dim``, ``n_layers``
            and ``output_dim``.
        trg_vocab: target vocabulary object, stored as ``self.trg_vocab``.

    Raises:
        AssertionError: if encoder and decoder disagree on ``hid_dim`` or ``n_layers``.
    """

    def __init__(self, encoder, decoder, trg_vocab):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.trg_vocab = trg_vocab

        # the encoder states are fed straight into the decoder LSTM, so their
        # shapes have to match
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimension of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, conditioned on the encoding of ``src``.

        Args:
            src: source token indices, shaped [src_len, batch_size].
            trg: target token indices, shaped [trg_len, batch_size]; row 0 holds <bos>.
            teacher_forcing_ratio: probability, drawn once per time step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor shaped [trg_len, batch_size, output_dim] with the decoder output of
            every step; row 0 is never written and stays zero.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs, allocated next to the target batch so the
        # model does not depend on a notebook-level `device` variable
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)

        # last encoder hidden and cell states; they already live on the model's device
        hidden, cell = self.encoder(src)

        # first input to decoder is <bos>
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            # storing the output from the current time step
            outputs[t] = output
            # getting the predicted token index
            top_1 = output.argmax(1)
            # deciding whether to use teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            decoder_input = trg[t] if teacher_force else top_1

        return outputs

#### Example loss of one example document

In [13]:
# the raw decoder outputs collected in the encoder-decoder example
print(f"Decoder output in each time step [shape -> {outputs_t.shape}]"," (trg_length, batch_size, trg_vocab_size)\n",outputs_t)

# working on a detached copy: dropping the <bos> row and flattening the time and
# batch axes into a single one
output_t1 = outputs_t.detach().clone()[1:].view(-1, output_dim)
print(f"\nRemoving the 1st row <bos> and re-shaping the output [shape - {output_t1.shape}]"," ([(trg_length-1) * batch_size], trg_vocab_size)\n",output_t1)
print("\n-------------------------------------------------------------------------------------------------------------------------------------------\n")

# the same treatment for the target indices
print(f"\nTarget String:- [shape - {trg.shape}]"," (trg_length, batch_size)\n",trg)
trg1 = trg.detach().clone()[1:].contiguous().view(-1)
print(f"\nRemoving the 1st row <bos> and re-shaping the target [shape - {trg1.shape}]"," [(trg_length-1) * batch_size]\n",trg1)
print("\n-------------------------------------------------------------------------------------------------------------------------------------------\n")

# the decoder already emits log-probabilities (nn.LogSoftmax); CrossEntropyLoss applies
# log-softmax once more, which leaves normalised log-probabilities unchanged
print(f"Cross Entropy Loss:- ", nn.CrossEntropyLoss()(output_t1,trg1))

Decoder output in each time step [shape -> torch.Size([5, 1, 6])]  (trg_length, batch_size, trg_vocab_size)
 tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]],

        [[-2.1110, -1.6511, -1.6739, -1.7298, -1.9239, -1.7363]],

        [[-2.1344, -1.5531, -1.7289, -1.6742, -2.0126, -1.7632]],

        [[-2.1961, -1.6754, -1.4862, -1.8491, -1.9812, -1.7147]],

        [[-2.1349, -1.6192, -1.7047, -1.6984, -1.9182, -1.7603]]],
       device='cuda:0', grad_fn=<CopySlices>)

Removing the 1st row <bos> and re-shaping the output [shape - torch.Size([4, 6])]  ([(trg_length-1) * batch_size], trg_vocab_size)
 tensor([[-2.1110, -1.6511, -1.6739, -1.7298, -1.9239, -1.7363],
        [-2.1344, -1.5531, -1.7289, -1.6742, -2.0126, -1.7632],
        [-2.1961, -1.6754, -1.4862, -1.8491, -1.9812, -1.7147],
        [-2.1349, -1.6192, -1.7047, -1.6984, -1.9182, -1.7603]],
       device='cuda:0')

-----------------------------------------------------------------------------------------------

#### A function to train the model

In [14]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: seq2seq model called as ``model(src, trg)``; expected to return a
            tensor shaped [trg_len, batch_size, output_dim].
        iterator: iterable yielding ``(src, trg)`` tensor pairs, with ``trg`` shaped
            [trg_len, batch_size].
        optimizer: optimizer holding the model's parameters.
        criterion: loss callable taking ``(output, target)`` with output shaped
            [N, output_dim] and target shaped [N].
        clip: maximum gradient norm handed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by the number of batches processed,
        or 0.0 when the iterator yields no batch.
    """
    model.train()

    epoch_loss = 0
    # counted inside the loop so the data-loader is not iterated a second time
    # just to find out how many batches it holds
    n_batches = 0
    train_iterator = tqdm(iterator, desc="Training", leave=False)

    for src, trg in train_iterator:
        # sending the src and trg tensors to device
        src = src.to(device)
        trg = trg.to(device)

        # clearing the gradient from the previous batch
        optimizer.zero_grad()

        # the model's predictions - token log-probabilities
        output = model(src, trg)

        #----- trg shape -> [trg len, batch_size]
        #----- output shape -> [trg_len, batch_size, output_dim]

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim) # removing <bos> and re-shaping
        trg = trg[1:].contiguous().view(-1) # removing <bos> and re-shaping

        #----- trg shape -> [(trg len -1) * batch_size]
        #----- output shape -> [(trg len -1) * batch_size, output dim]

        # computing the loss
        loss = criterion(output, trg)
        # computing the gradient
        loss.backward()
        # clipping the gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # updating the weights
        optimizer.step()

        # updating the tqdm progress bar
        train_iterator.set_postfix(loss=loss.item())

        epoch_loss += loss.item()
        n_batches += 1

    return epoch_loss / n_batches if n_batches else 0.0

#### A function to evaluate the model

In [15]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch over ``iterator`` without updating the model.

    Args:
        model: seq2seq model called as ``model(src, trg, teacher_forcing_ratio)``;
            expected to return a tensor shaped [trg_len, batch_size, output_dim].
        iterator: iterable yielding ``(src, trg)`` tensor pairs, with ``trg`` shaped
            [trg_len, batch_size].
        criterion: loss callable taking ``(output, target)`` with output shaped
            [N, output_dim] and target shaped [N].

    Returns:
        Sum of the per-batch losses divided by the number of batches processed,
        or 0.0 when the iterator yields no batch.
    """
    model.eval()

    epoch_loss = 0
    # counted inside the loop so the data-loader is not iterated a second time
    # just to find out how many batches it holds
    n_batches = 0
    valid_iterator = tqdm(iterator, desc="Evaluating", leave=False)

    # no gradients are needed while scoring the model
    with torch.no_grad():

        for src, trg in valid_iterator:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, 0) # 0 -> turning off teacher forcing
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim) # removing <bos> and re-shaping
            trg = trg[1:].contiguous().view(-1) # removing <bos> and re-shaping

            loss = criterion(output, trg)
            valid_iterator.set_postfix(loss=loss.item())
            epoch_loss += loss.item()
            n_batches += 1

    return epoch_loss / n_batches if n_batches else 0.0

## Data pre-processing

In [16]:
# running the companion script that contains all the transformation processes on data.
# Later cells use get_translation_dataloaders, vocab_transform and index_to_eng, none of
# which are defined in this notebook, so they presumably come from this script — TODO confirm
%run Multi30K_de_en_dataloader.py

In [23]:
# building the train and validation data-loaders with a batch size of 4
train_dataloader, valid_dataloader = get_translation_dataloaders(batch_size=4)

# pulling a single batch off a fresh iterator to look at the source / target tensors
src, trg = next(iter(train_dataloader))

print("The first batch in train_dataloader:- ")
print("\nSRC tensor:-\n", src)
print("\nTRG tensor:-\n", trg)

The first batch in train_dataloader:- 

SRC tensor:-
 tensor([[    2,     2,     2,     2],
        [    3,  5510,  5510, 12642],
        [    1,     3,     3,     8],
        [    1,     1,     1,  1701],
        [    1,     1,     1,     3]], device='cuda:0')

TRG tensor:-
 tensor([[   2,    2,    2,    2],
        [   3, 6650,  216,    6],
        [   1, 4623,  110, 3398],
        [   1,  259, 3913,  202],
        [   1,  172, 1650,  109],
        [   1, 9953, 3823,   37],
        [   1,  115,   71,    3],
        [   1,  692, 2808,    1],
        [   1, 3428, 2187,    1],
        [   1,    5,    5,    1],
        [   1,    3,    3,    1]], device='cuda:0')


In [32]:
# printing the data from an example batch
# (a fresh iterator is advanced 1000 times; `german` / `english` keep the last batch drawn)
data_itr = iter(train_dataloader)
for n in range(1000):
    german, english = next(data_itr)

print("The 1000th batch in train_dataloader:- ")
print("\nGerman tensor:-\n", german)
print("\nEnglish tensor:-\n", english)

# transposing so that iterating over the tensor below walks over sentences; the batches
# printed above appear to be laid out as [sequence_length, batch_size]
german = german.T
english = english.T

print("\nGerman T tensor:-\n", german)
print("\nEnglish T tensor:-\n", english)

# printing the actual text
# NOTE(review): the German indices are decoded with index_to_eng, the same helper that is
# used for the English rows below — confirm whether a German-specific helper should be
# used here instead
print("\nGerman Text:-\n") # author's note: the German text is in reverse — TODO confirm against the data-loader script
for e in german:
    print(index_to_eng(e))

print("\nEnglish Text:-\n")
for e in english:
    print(index_to_eng(e))


The 1000th batch in train_dataloader:- 

German tensor:-
 tensor([[   2,    2,    2,    2],
        [  21,   84,    5,   14],
        [ 773,   42,  315,   17],
        [8314,  561,  149,  332],
        [  10,   25,   22,   63],
        [ 541,  458, 1121,    6],
        [3125,   22,  104, 4505],
        [ 174,   94,  901,  468],
        [   4,    4,    4,    4],
        [   3,    3,    3,    3]], device='cuda:0')

English tensor:-
 tensor([[   2,    2,    2,    2],
        [  19,   83, 2989,    6],
        [  52,   17,   10,   16],
        [ 266,  363,   56,  616],
        [   7,  354,   18,    4],
        [ 287,   20,   27,  477],
        [1198, 1528,  515,   29],
        [ 134,    5,   60,  175],
        [  67,    3,  210,   28],
        [   4,    1, 2688,  256],
        [2266,    1,   63,  552],
        [   5,    1,  811,    5],
        [   3,    1,    5,    3],
        [   1,    1,    3,    1]], device='cuda:0')

German T tensor:-
 tensor([[   2,   21,  773, 8314,   10,  541, 3125, 

In [25]:
# displaying the English batch tensor as the cell output
# NOTE(review): this cell's execution count (25) is lower than that of the cell that
# transposes `english` (32), and the saved output is the un-transposed tensor
english

tensor([[   2,    2,    2,    2],
        [  19,   83, 2989,    6],
        [  52,   17,   10,   16],
        [ 266,  363,   56,  616],
        [   7,  354,   18,    4],
        [ 287,   20,   27,  477],
        [1198, 1528,  515,   29],
        [ 134,    5,   60,  175],
        [  67,    3,  210,   28],
        [   4,    1, 2688,  256],
        [2266,    1,   63,  552],
        [   5,    1,  811,    5],
        [   3,    1,    5,    3],
        [   1,    1,    3,    1]], device='cuda:0')

## Training the model

#### Initializations

In [33]:
SEED = 1234 # one fixed seed shared by every random number generator below

# asking cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

# PyTorch generators: CPU first, then the current CUDA device
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# NumPy and Python's built-in generator (the latter drives the teacher-forcing draws)
np.random.seed(SEED)
random.seed(SEED)

In [39]:
# model hyper-parameters
INPUT_DIM = len(vocab_transform['de'])   # German vocabulary size -> encoder embedding rows
OUTPUT_DIM = len(vocab_transform['en'])  # English vocabulary size -> decoder output scores
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# .to(device) has to be called on the assembled model, not on the vocab object,
# otherwise the encoder / decoder parameters are never moved to the selected device
model = Seq2Seq(enc, dec, trg_vocab=vocab_transform['en']).to(device)

In [43]:
# initializing the initial weights
def init_weights(m):
    """Redraw every parameter reachable from ``m`` from a uniform(-0.08, 0.08) distribution, in place."""
    low, high = -0.08, 0.08
    for weight in m.parameters():
        nn.init.uniform_(weight.data, low, high)

# nn.Module.apply calls init_weights on the model and on each of its sub-modules, then
# returns the model itself, whose repr is shown as the cell output
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19214, 128)
    (lstm): LSTM(128, 256, dropout=0.3)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(10837, 128)
    (lstm): LSTM(128, 256, dropout=0.3)
    (fc_out): Linear(in_features=256, out_features=10837, bias=True)
    (softmax): LogSoftmax(dim=1)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (trg_vocab): Vocab()
)

In [49]:
# number of trainable parameters
def count_parameters(model):
    """Return the total number of scalar entries across the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# the thousands-separator format spec has to sit inside the braces to take effect
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 7422165:, trainable parameters


In [51]:
# loss: cross-entropy over the target vocabulary; positions whose target is the
# English <pad> index are excluded through ignore_index
PAD_IDX = vocab_transform['en'].get_stoi()['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# optimizer: Adam over all model parameters with PyTorch's default hyper-parameters
optimizer = optim.Adam(model.parameters())

In [None]:
# a helper function for claculating time taken for training
def epoch_time(start_time, end_time):
    elasped_time = end_time - start_time
    elasped_mins = int(elasped_time/60)
    elasped_secs = int(elasped_time - )