# Encoder Decoder Model
## Sequence to Sequence Learning with Neural Networks (https://arxiv.org/abs/1409.3215)

In [1]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb
# Comments: https://colab.research.google.com/drive/1NmWujB2PoJk24uOwZ4cAfX3O8cZyigyf
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/seq2seq1.png

In [2]:
%matplotlib inline

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from nltk.translate.bleu_score import corpus_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from tqdm import tqdm

In [4]:
# Variables
# Notebook-wide configuration. Every name below is read as a global by the
# classes/functions defined in later cells.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# train/test split and weight initialisation differ between runs.

# Reserved vocabulary indices; they must stay in sync with Lang.index2word,
# which hard-codes {0: "<SOS>", 1: "<EOS>", 2: "<PAD>"}.
SOS_token = 0
EOS_token = 1
PAD_token = 2
# Read by train(): when True the loss is NLLLoss(ignore_index=PAD_token).
is_ignore_pads = True
# Fixed padded sequence width and number of decoder steps. Sentences are kept
# only when they have strictly fewer than MAX_LENGTH words (see filterPair),
# which leaves room for the appended EOS token.
MAX_LENGTH = 10
# Passed to EncoderDecoderTranslation; used as embedding size and GRU width.
hidden_size = 128
# Training batch size passed to get_dataloader in the training cells.
# NOTE(review): the first "Prepare Data" cell passes the literal 32 instead.
batch_size = 64
# Number of epochs passed to train().
epochs = 200
# Fraction of sentence pairs assigned to the training split.
SPLIT_RATIO = 0.95


# English sentence prefixes. Only referenced from a commented-out clause in
# DataHandler.filterPair, so currently unused.
# NOTE(review): trailing spaces are inconsistent ("he is" vs "he s "), which
# matters if these are ever used with str.startswith — confirm intent.
ENG_PREFIXES = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Data location
# Relative path opened by DataHandler.readLangs; lines are split on tabs.
file_path = 'data/eng-fra.txt'

In [5]:
# Language class handler
class Lang:
    """Vocabulary of a single language.

    Keeps three lookup tables: word -> index, word -> occurrence count and
    index -> word. Indices 0, 1 and 2 are pre-assigned to the special tokens
    ``<SOS>``, ``<EOS>`` and ``<PAD>``; real words are numbered from 3 upward
    in the order they are first registered.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Special tokens occupy the first slots of the index space.
        self.index2word = dict(enumerate(("<SOS>", "<EOS>", "<PAD>")))
        # Next free index; starts after the special tokens.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of ``word``, assigning a new index if unseen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        fresh_index = self.n_words
        self.word2index[word] = fresh_index
        self.word2count[word] = 1
        self.index2word[fresh_index] = word
        self.n_words = fresh_index + 1


class PreProcess(object):
    """Stateless text-normalisation helpers applied to raw corpus sentences.

    The helpers are static methods, so they can be called either through the
    class (``PreProcess.normalizeString(s)``) or through an instance.
    """

    # Thanks to https://stackoverflow.com/a/518232/2809427
    @staticmethod
    def unicodeToAscii(s):
        """Return ``s`` with combining marks (accents) removed.

        The string is decomposed with NFD and every character in Unicode
        category ``Mn`` is dropped. Characters that have no such decomposition
        are returned unchanged, so the result is not guaranteed to be ASCII.
        """
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    @staticmethod
    def normalizeString(s):
        """Lowercase, trim and reduce ``s`` to letters plus ``!`` and ``?``.

        Steps: lowercase and strip, remove accents, put a space in front of
        ``.``, ``!`` and ``?``, then collapse every run of characters other
        than ASCII letters, ``!`` and ``?`` into a single space. Because ``.``
        is not in the kept set, full stops are removed by the last step.
        """
        s = PreProcess.unicodeToAscii(s.lower().strip())
        # Detach sentence punctuation so it becomes its own token.
        s = re.sub(r"([.!?])", r" \1", s)
        # Everything outside letters / ! / ? (including '.') becomes one space.
        s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
        return s.strip()

In [6]:
class DataHandler(object):
    """Read the parallel corpus, filter it and build the two vocabularies.

    All helpers are static; they rely on the notebook-level names
    ``file_path``, ``MAX_LENGTH``, ``PreProcess`` and ``Lang``.
    """

    @staticmethod
    def readLangs(lang1, lang2, reverse=False):
        """Load ``file_path`` and return ``(input_lang, output_lang, pairs)``.

        Each line is split on tabs and both sentences are normalised with
        ``PreProcess.normalizeString``. With ``reverse=True`` every pair is
        flipped and the language names are swapped, so ``lang2`` becomes the
        input language. The returned ``Lang`` objects are still empty.
        """
        print("Reading lines...")

        # The context manager closes the file handle; the original left it open.
        with open(file_path, encoding='utf-8') as corpus_file:
            lines = corpus_file.read().strip().split('\n')

        # Keep only the first two tab-separated columns so that any additional
        # column cannot end up as a "sentence" once the pair is reversed.
        pairs = [
            [PreProcess.normalizeString(s) for s in l.split('\t')[:2]]
            for l in lines
        ]

        # Reverse pairs, make Lang instances
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_lang = Lang(lang2)
            output_lang = Lang(lang1)
        else:
            input_lang = Lang(lang1)
            output_lang = Lang(lang2)

        return input_lang, output_lang, pairs

    @staticmethod
    def filterPair(p):
        """Return True when both sentences have fewer than MAX_LENGTH words.

        The strict ``<`` leaves one slot free for the EOS token appended
        during tokenisation. A further ``p[1].startswith(ENG_PREFIXES)``
        condition existed in the original as commented-out code and is not
        applied.
        """
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH

    @staticmethod
    def filterPairs(pairs):
        """Return the sub-list of ``pairs`` accepted by ``filterPair``."""
        return [pair for pair in pairs if DataHandler.filterPair(pair)]

    @staticmethod
    def prepareData(lang1, lang2, reverse=False):
        """Read, filter and index the corpus.

        Returns ``(input_lang, output_lang, pairs)`` where ``pairs`` has been
        length-filtered and both ``Lang`` objects contain every word of the
        remaining pairs. Progress is reported with ``print``.
        """
        # initiate language objects, and get pairs
        input_lang, output_lang, pairs = DataHandler.readLangs(lang1, lang2, reverse)

        print("Read %s sentence pairs" % len(pairs))
        pairs = DataHandler.filterPairs(pairs)
        print("Trimmed to %s sentence pairs" % len(pairs))
        print("Counting words...")

        # Register pairs with lang objects
        for pair in pairs:
            input_lang.addSentence(pair[0])
            output_lang.addSentence(pair[1])
        print("Counted words:")
        print(input_lang.name, input_lang.n_words)
        print(output_lang.name, output_lang.n_words)
        return input_lang, output_lang, pairs


In [7]:
import random

class DataLoaderHandler(object):
    """Convert sentence pairs into padded index tensors and DataLoaders.

    All helpers are static. Several of them read notebook-level names
    (``EOS_token``, ``PAD_token``, ``MAX_LENGTH``, ``SPLIT_RATIO``, ``device``).
    """

    @staticmethod
    def sentenceFromIndices(lang, indices):
        """Join the words that ``lang.index2word`` maps ``indices`` to."""
        return ' '.join([lang.index2word[index] for index in indices])

    @staticmethod
    def indexesFromSentence(lang, sentence):
        """Return the vocabulary index of every space-separated word.

        Raises ``KeyError`` for a word missing from ``lang.word2index``.
        """
        return [lang.word2index[word] for word in sentence.split(' ')]

    @staticmethod
    def tensorFromSentence(lang, sentence):
        """Return a ``(1, len + 1)`` long tensor of indices ending with EOS."""
        indexes = DataLoaderHandler.indexesFromSentence(lang, sentence)
        indexes.append(EOS_token)
        return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

    @staticmethod
    def tensorsFromPair(pair):
        """Return ``(input_tensor, target_tensor)`` for one sentence pair.

        NOTE: this reads the notebook-level globals ``input_lang`` and
        ``output_lang``; it only works after a cell has defined them.
        """
        input_tensor = DataLoaderHandler.tensorFromSentence(input_lang, pair[0])
        target_tensor = DataLoaderHandler.tensorFromSentence(output_lang, pair[1])
        return (input_tensor, target_tensor)

    @staticmethod
    def split_train_test(pairs, split_ratio):
        """Randomly split ``pairs`` into ``(train_pairs, test_pairs)``.

        ``split_ratio`` is the fraction of pairs placed in the training list.
        The shuffle is done on a copy, so the caller's list keeps its order
        (the original implementation shuffled the argument in place).
        """
        shuffled = list(pairs)
        random.shuffle(shuffled)

        split_idx = int(len(shuffled) * split_ratio)

        # Slicing already yields two new, independent lists.
        train_pairs = shuffled[:split_idx]
        test_pairs = shuffled[split_idx:]

        return train_pairs, test_pairs

    @staticmethod
    def tokenize_into_numpy_arrays(pairs, n, input_lang, output_lang):
        """Encode ``pairs`` as two ``(n, MAX_LENGTH)`` int32 index arrays.

        Rows are pre-filled with ``PAD_token``; each sentence is written from
        column 0 as its word indices followed by ``EOS_token``. Sentences must
        therefore have at most ``MAX_LENGTH - 1`` words, which
        ``DataHandler.filterPair`` guarantees for filtered pairs.
        """
        # TODO: TRY INPUT AS VARIABLE LENGTH
        # Padding uses PAD_token rather than zeros because index 0 is SOS.
        input_ids = np.full((n, MAX_LENGTH), PAD_token, dtype=np.int32)
        target_ids = np.full((n, MAX_LENGTH), PAD_token, dtype=np.int32)

        for idx, (inp, tgt) in enumerate(pairs):
            # Get list of token-indices
            inp_ids = DataLoaderHandler.indexesFromSentence(input_lang, inp)
            tgt_ids = DataLoaderHandler.indexesFromSentence(output_lang, tgt)

            # Append <end of string> tokens
            inp_ids.append(EOS_token)
            tgt_ids.append(EOS_token)

            # Assign token indices in the main array
            input_ids[idx, :len(inp_ids)] = inp_ids
            target_ids[idx, :len(tgt_ids)] = tgt_ids
        return input_ids, target_ids

    @staticmethod
    def get_dataloader(batch_size):
        """Build the vocabularies and the train/test DataLoaders.

        Returns ``(input_lang, output_lang, train_dataloader, test_dataloader)``.
        The corpus is loaded with ``reverse=True``. Training batches have
        ``batch_size`` rows; the test loader uses one batch holding the whole
        test split. Both loaders draw rows through a ``RandomSampler``.
        """
        # prepare language data
        input_lang, output_lang, pairs = DataHandler.prepareData('eng', 'fra', True)

        train_pairs, test_pairs = DataLoaderHandler.split_train_test(pairs, SPLIT_RATIO)
        n_train, n_test = len(train_pairs), len(test_pairs)

        train_input_ids, train_target_ids = DataLoaderHandler.tokenize_into_numpy_arrays(train_pairs, n_train, input_lang, output_lang)
        train_data = TensorDataset(
                        torch.LongTensor(train_input_ids).to(device),
                        torch.LongTensor(train_target_ids).to(device)
        )

        test_input_ids, test_target_ids = DataLoaderHandler.tokenize_into_numpy_arrays(test_pairs, n_test, input_lang, output_lang)
        test_data = TensorDataset(
                        torch.LongTensor(test_input_ids).to(device),
                        torch.LongTensor(test_target_ids).to(device)
        )

        # Create a sampler
        train_sampler = RandomSampler(train_data)
        test_sampler = RandomSampler(test_data)

        # Create a torch dataloader
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        # One batch containing the complete test split.
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=n_test)

        print(f"Train and Test Dataset # samples: {len(train_data)}, {len(test_data)}")
        print(f"Train and Test Dataloader # batches: {len(train_dataloader)}, {len(test_dataloader)}")

        return input_lang, output_lang, train_dataloader, test_dataloader

In [8]:
# Prepare Data
# input_lang, output_lang, pairs = DataHandler.prepareData('eng', 'fra', True)
# print(random.choice(pairs))

# NOTE(review): this call passes the literal 32 rather than the configured
# `batch_size`; the later cells call get_dataloader(batch_size) again, which
# re-reads the corpus, re-splits it randomly and overwrites all four names.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(32)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 3138, 1


**Helpers**

In [None]:
import time
import math
import matplotlib.pyplot as plt
# NOTE(review): the notebook enables `%matplotlib inline` in an earlier cell
# and Helpers.showPlot ends with plt.show(); switching to the 'agg' backend
# here presumably prevents that figure from rendering inline — confirm.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

class Helpers(object):
    """Timing and plotting utilities used by the training loop.

    All helpers are static, so they work through the class or an instance.
    """

    @staticmethod
    def asMinutes(s):
        """Format a duration of ``s`` seconds as ``'<m>m <s>s'``."""
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    @staticmethod
    def timeSince(since, percent):
        """Return ``'<elapsed> (- <remaining>)'`` for a running job.

        ``since`` is a ``time.time()`` timestamp of the start and ``percent``
        the completed fraction of the work (must be non-zero). The remaining
        time is extrapolated linearly as ``elapsed / percent - elapsed``.
        """
        now = time.time()
        s = now - since
        es = s / (percent)
        rs = es - s
        return '%s (- %s)' % (Helpers.asMinutes(s), Helpers.asMinutes(rs))

    @staticmethod
    def showPlot(points):
        """Plot ``points`` as a line with y-axis ticks every 0.2.

        Only one figure is created; the original called ``plt.figure()``
        before ``plt.subplots()``, which left an extra empty figure open on
        every call.
        """
        fig, ax = plt.subplots()
        # this locator puts ticks at regular intervals
        loc = ticker.MultipleLocator(base=0.2)
        ax.yaxis.set_major_locator(loc)
        ax.plot(points)
        plt.show()

The Model

In [None]:
# Comments: https://colab.research.google.com/drive/1NmWujB2PoJk24uOwZ4cAfX3O8cZyigyf#scrollTo=ARbOsC8bpH7O
class EncoderRNN(nn.Module):
    """GRU encoder: token ids -> embeddings -> dropout -> GRU.

    The embedding dimension equals ``hidden_size`` and the GRU is created
    with ``batch_first=True``.
    """

    def __init__(self, vocab_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # One embedding row of width hidden_size per vocabulary entry.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Return the GRU's ``(output, hidden)`` for a batch of token ids."""
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

class DecoderRNN(nn.Module):
    """GRU decoder that emits exactly ``MAX_LENGTH`` steps per sequence.

    Each step embeds the previous token, applies ReLU, advances the GRU and
    projects the result onto the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps starting from ``SOS_token``.

        Args:
            encoder_outputs: encoder output; only its batch size (dim 0) and
                device are used here.
            encoder_hidden: initial hidden state of the decoder GRU.
            target_tensor: optional target token ids. When given, teacher
                forcing is used and column ``i`` of the target is fed as the
                input of step ``i + 1``, so it needs at least ``MAX_LENGTH``
                columns. When ``None`` the arg-max of each step is fed back.

        Returns:
            ``(decoder_outputs, decoder_hidden, predicted_decoder_tokens)``:
            log-probabilities concatenated along dim 1, the final GRU hidden
            state, and a list with one ``(batch, 1)`` token tensor per step.
            The list is empty when teacher forcing is used.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the incoming
        # tensors instead of relying on the notebook-global `device`.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token,
            dtype=torch.long, device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        predicted_decoder_tokens = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the ground-truth token as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input
                predicted_decoder_tokens.append(decoder_input)

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        # The third element is the (possibly empty) list of greedy predictions.
        return decoder_outputs, decoder_hidden, predicted_decoder_tokens

    def forward_step(self, input, hidden):
        """Run one decoding step; return ``(vocabulary logits, new hidden)``."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

class EncoderDecoderTranslation(nn.Module):
    """Sequence-to-sequence model pairing an EncoderRNN with a DecoderRNN.

    Vocabulary sizes are taken from the ``n_words`` attribute of the two
    language objects; both sub-modules are moved to ``device`` on creation.
    """

    def __init__(self, input_lang, output_lang, hidden_size, device):
        super().__init__()

        source_vocab_size = input_lang.n_words
        target_vocab_size = output_lang.n_words

        self.encoder = EncoderRNN(source_vocab_size, hidden_size).to(device)
        self.decoder = DecoderRNN(hidden_size, target_vocab_size).to(device)
        self.device = device

    def forward(self, input_tensor, target_tensor=None):
        """Encode ``input_tensor`` and decode it.

        ``target_tensor`` is forwarded to the decoder unchanged. Returns the
        decoder's first and third results: the per-step outputs and the list
        of predicted tokens.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_tensor)
        log_probs, _final_hidden, greedy_tokens = self.decoder(
            encoder_outputs, encoder_hidden, target_tensor
        )
        return log_probs, greedy_tokens


In [None]:
def train_epoch(dataloader, encoder_decoder, encoder_decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    For every ``(input_tensor, target_tensor)`` batch the model is called with
    the target (teacher forcing), the loss is computed on the flattened
    outputs and targets, and one optimizer step is taken. A tqdm progress bar
    is shown. Returns the mean of the per-batch losses.
    """
    running_loss = 0
    for input_tensor, target_tensor in tqdm(dataloader):
        # Clear gradients left over from the previous batch.
        encoder_decoder_optimizer.zero_grad()

        log_probs, _ = encoder_decoder(input_tensor, target_tensor)

        # Flatten every dimension except the vocabulary one for the criterion.
        vocab_size = log_probs.size(-1)
        flat_predictions = log_probs.view(-1, vocab_size)
        flat_targets = target_tensor.view(-1)
        batch_loss = criterion(flat_predictions, flat_targets)

        # Back-propagate, then update the weights.
        batch_loss.backward()
        encoder_decoder_optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def normalize_tensors_to_tokens(tensor, remove_first_idx=False):
    """Convert a tensor of token ids into lists truncated at the first EOS.

    ``tensor.tolist()`` must yield one sequence per row. When
    ``remove_first_idx`` is true the first token of every row is dropped
    first. Each row is then cut just before its first ``EOS_token``; rows
    without an EOS are kept whole. Tokens before the EOS are not filtered.
    """
    sequences = tensor.tolist()

    if remove_first_idx:
        sequences = [row[1:] for row in sequences]

    trimmed = []
    for row in sequences:
        # Position of the first EOS, or the row length when there is none.
        cut = row.index(EOS_token) if EOS_token in row else len(row)
        trimmed.append(row[:cut])

    return trimmed


def predict(data_loader, encoder_decoder):
    """Run the model without teacher forcing over every batch of a loader.

    The model is switched to eval mode and gradients are disabled.

    Args:
        data_loader: iterable of ``(input_tensor, target_tensor)`` batches.
        encoder_decoder: model called as ``encoder_decoder(input_tensor)`` and
            returning ``(decoder_outputs, per_step_token_tensors)``.

    Returns:
        ``(list_decoder_outputs, input_tensor, target_tensor,
        predicted_decoder_tokens)``: the list of per-batch decoder outputs,
        followed by inputs, targets and predicted tokens concatenated over
        ALL batches along dim 0. The original returned only the last batch
        for these three; for a single-batch loader the values are unchanged.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Eval Mode. Turn off dropout and batchnorm
    encoder_decoder.eval()

    list_decoder_outputs = []
    input_batches = []
    target_batches = []
    prediction_batches = []

    # ensure no gradients are calculated with no_grad() to preserve memory
    with torch.no_grad():
        for input_tensor, target_tensor in data_loader:
            decoder_outputs, step_tokens = encoder_decoder(input_tensor)
            list_decoder_outputs.append(decoder_outputs)

            input_batches.append(input_tensor)
            target_batches.append(target_tensor)
            # Merge timesteps of decoder predictions
            prediction_batches.append(torch.cat(step_tokens, dim=1))

    if not list_decoder_outputs:
        raise ValueError("predict() received a data_loader that yielded no batches")

    return (
        list_decoder_outputs,
        torch.cat(input_batches, dim=0),
        torch.cat(target_batches, dim=0),
        torch.cat(prediction_batches, dim=0),
    )


def calculate_bleu(test_target_tokens, predicted_decoder_tokens):
    """Return ``corpus_bleu`` of the predictions against the targets.

    Every target sequence is wrapped in a one-element list, i.e. each
    hypothesis is scored against exactly one reference.
    """
    references = [[reference] for reference in test_target_tokens]
    hypotheses = list(predicted_decoder_tokens)
    return corpus_bleu(references, hypotheses)

def train(train_dataloader, test_dataloader, encoder_decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train ``encoder_decoder`` for ``n_epochs`` and report loss and BLEU.

    An Adam optimizer is created over all model parameters. The loss is
    ``NLLLoss``, ignoring ``PAD_token`` targets when the global
    ``is_ignore_pads`` is true. After every epoch the model is evaluated on
    ``test_dataloader`` without teacher forcing and BLEU is computed on the
    EOS-truncated token ids. Every ``print_every`` epochs a line with elapsed
    time, progress, mean loss and the latest BLEU is printed; every
    ``plot_every`` epochs the mean loss is stored and the curve is plotted at
    the end. The model is left in eval mode. Returns ``None``.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # accumulates between two progress lines
    plot_loss_total = 0  # accumulates between two plot points

    optimizer = optim.Adam(encoder_decoder.parameters(), lr=learning_rate)

    # Optionally exclude padding positions from the loss.
    criterion = nn.NLLLoss(ignore_index = PAD_token) if is_ignore_pads else nn.NLLLoss()

    print('Time \t\t\t (Epoch\t%) \t Loss \t\t Bleu')
    for epoch in range(1, n_epochs + 1):
        # --- training pass ---
        encoder_decoder.train()
        epoch_loss = train_epoch(train_dataloader, encoder_decoder, optimizer, criterion)

        print_loss_total += epoch_loss
        plot_loss_total += epoch_loss

        # --- evaluation on the test set, free-running decoder ---
        encoder_decoder.eval()
        _, _, test_target_tensor, predicted_decoder_tokens = predict(test_dataloader, encoder_decoder)

        reference_tokens = normalize_tensors_to_tokens(test_target_tensor, False)
        hypothesis_tokens = normalize_tensors_to_tokens(predicted_decoder_tokens, False)
        bleu = calculate_bleu(reference_tokens, hypothesis_tokens)

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            elapsed = Helpers.timeSince(start, epoch / n_epochs)
            print('%s \t (%d \t %d%%) \t %.4f \t %.4f' % (
                elapsed, epoch, epoch / n_epochs * 100, print_loss_avg, bleu))

        if epoch % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    Helpers.showPlot(plot_losses)


In [None]:

# Rebuild vocabularies and loaders with the configured `batch_size`.
# NOTE(review): get_dataloader re-reads the corpus and draws a new random
# train/test split on every call, and the training cell below calls it again,
# so the objects created here are overwritten before they are used.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(batch_size)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 1569, 1


In [None]:
# is_ignore_pads = False
# # init encoder-decoder
# encoder_decoder = EncoderDecoderTranslation(input_lang, output_lang, hidden_size, device)

# train(train_dataloader, test_dataloader, encoder_decoder, epochs, print_every=5, plot_every=5)

In [None]:
# Train with padding positions excluded from the loss: train() reads the
# global `is_ignore_pads` when it builds the criterion.
is_ignore_pads = True
# Fresh vocabularies, random train/test split and loaders for this run.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(batch_size)

# init encoder-decoder
encoder_decoder = EncoderDecoderTranslation(input_lang, output_lang, hidden_size, device)

# Loss/BLEU are printed and the loss is sampled for plotting every 5 epochs.
train(train_dataloader, test_dataloader, encoder_decoder, epochs, print_every=5, plot_every=5)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 1569, 1
Time 			 (Epoch	%) 	 Loss 		 Bleu


100%|██████████| 1569/1569 [00:20<00:00, 75.22it/s]
100%|██████████| 1569/1569 [00:21<00:00, 72.57it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.21it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.10it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.93it/s]


1m 46s (- 69m 30s) 	 (5 	 2%) 	 2.6795 	 0.2050


100%|██████████| 1569/1569 [00:20<00:00, 77.05it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.64it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.76it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.78it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.08it/s]


3m 30s (- 66m 39s) 	 (10 	 5%) 	 1.4939 	 0.2824


100%|██████████| 1569/1569 [00:20<00:00, 75.51it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.97it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.06it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.94it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.67it/s]


5m 14s (- 64m 41s) 	 (15 	 7%) 	 1.0873 	 0.3209


100%|██████████| 1569/1569 [00:20<00:00, 78.05it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.80it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.32it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.74it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.47it/s]


6m 58s (- 62m 47s) 	 (20 	 10%) 	 0.8675 	 0.3474


100%|██████████| 1569/1569 [00:19<00:00, 80.45it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.63it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.98it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.95it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.84it/s]


8m 41s (- 60m 52s) 	 (25 	 12%) 	 0.7288 	 0.3668


100%|██████████| 1569/1569 [00:21<00:00, 72.09it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.12it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.07it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.50it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.11it/s]


10m 27s (- 59m 14s) 	 (30 	 15%) 	 0.6329 	 0.3685


100%|██████████| 1569/1569 [00:19<00:00, 79.37it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.14it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.60it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.23it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.81it/s]


12m 11s (- 57m 27s) 	 (35 	 17%) 	 0.5625 	 0.3880


100%|██████████| 1569/1569 [00:19<00:00, 78.81it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.14it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.94it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.35it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.15it/s]


13m 54s (- 55m 39s) 	 (40 	 20%) 	 0.5088 	 0.3906


100%|██████████| 1569/1569 [00:20<00:00, 76.00it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.35it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.83it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.65it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.05it/s]


15m 38s (- 53m 52s) 	 (45 	 22%) 	 0.4665 	 0.3943


100%|██████████| 1569/1569 [00:20<00:00, 77.00it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.56it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.72it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.85it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.10it/s]


17m 22s (- 52m 8s) 	 (50 	 25%) 	 0.4315 	 0.4010


100%|██████████| 1569/1569 [00:19<00:00, 79.30it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.28it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.29it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.31it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.15it/s]


19m 6s (- 50m 21s) 	 (55 	 27%) 	 0.4024 	 0.3943


100%|██████████| 1569/1569 [00:19<00:00, 80.23it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.64it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.07it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.72it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.38it/s]


20m 49s (- 48m 35s) 	 (60 	 30%) 	 0.3782 	 0.4057


100%|██████████| 1569/1569 [00:19<00:00, 80.40it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.37it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.29it/s]
100%|██████████| 1569/1569 [00:21<00:00, 73.88it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.06it/s]


22m 33s (- 46m 52s) 	 (65 	 32%) 	 0.3577 	 0.4125


100%|██████████| 1569/1569 [00:20<00:00, 77.61it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.98it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.59it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.31it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.47it/s]


24m 17s (- 45m 6s) 	 (70 	 35%) 	 0.3395 	 0.4111


100%|██████████| 1569/1569 [00:20<00:00, 76.17it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.88it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.49it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.92it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.95it/s]


26m 0s (- 43m 21s) 	 (75 	 37%) 	 0.3240 	 0.4136


100%|██████████| 1569/1569 [00:20<00:00, 77.77it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.30it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.68it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.40it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.36it/s]


27m 44s (- 41m 37s) 	 (80 	 40%) 	 0.3100 	 0.4134


100%|██████████| 1569/1569 [00:19<00:00, 79.65it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.12it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.20it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.07it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.27it/s]


29m 28s (- 39m 52s) 	 (85 	 42%) 	 0.2973 	 0.4129


100%|██████████| 1569/1569 [00:19<00:00, 80.20it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.78it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.98it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.60it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.19it/s]


31m 11s (- 38m 7s) 	 (90 	 45%) 	 0.2869 	 0.4211


100%|██████████| 1569/1569 [00:19<00:00, 79.62it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.65it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.50it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.21it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.79it/s]


32m 55s (- 36m 22s) 	 (95 	 47%) 	 0.2768 	 0.4210


100%|██████████| 1569/1569 [00:19<00:00, 79.09it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.65it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.25it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.43it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.30it/s]


34m 38s (- 34m 38s) 	 (100 	 50%) 	 0.2679 	 0.4218


100%|██████████| 1569/1569 [00:21<00:00, 74.42it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.46it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.95it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.26it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.55it/s]


36m 24s (- 32m 56s) 	 (105 	 52%) 	 0.2596 	 0.4225


100%|██████████| 1569/1569 [00:20<00:00, 77.63it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.56it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.30it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.64it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.26it/s]


38m 8s (- 31m 12s) 	 (110 	 55%) 	 0.2521 	 0.4221


100%|██████████| 1569/1569 [00:19<00:00, 79.74it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.21it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.36it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.01it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.83it/s]


39m 51s (- 29m 27s) 	 (115 	 57%) 	 0.2453 	 0.4208


100%|██████████| 1569/1569 [00:19<00:00, 80.51it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.04it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.95it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.26it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.75it/s]


41m 35s (- 27m 43s) 	 (120 	 60%) 	 0.2396 	 0.4242


100%|██████████| 1569/1569 [00:19<00:00, 80.42it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.18it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.65it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.94it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.80it/s]


43m 18s (- 25m 59s) 	 (125 	 62%) 	 0.2333 	 0.4334


100%|██████████| 1569/1569 [00:20<00:00, 77.43it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.65it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.04it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.08it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.32it/s]


45m 1s (- 24m 14s) 	 (130 	 65%) 	 0.2279 	 0.4234


100%|██████████| 1569/1569 [00:20<00:00, 76.39it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.01it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.03it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.03it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.86it/s]


46m 45s (- 22m 30s) 	 (135 	 67%) 	 0.2232 	 0.4246


100%|██████████| 1569/1569 [00:20<00:00, 76.53it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.68it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.83it/s]
100%|██████████| 1569/1569 [00:21<00:00, 74.59it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.78it/s]


48m 31s (- 20m 47s) 	 (140 	 70%) 	 0.2183 	 0.4313


100%|██████████| 1569/1569 [00:19<00:00, 79.53it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.27it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.86it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.56it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.56it/s]


50m 15s (- 19m 3s) 	 (145 	 72%) 	 0.2144 	 0.4295


100%|██████████| 1569/1569 [00:19<00:00, 79.84it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.91it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.20it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.20it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.10it/s]


51m 58s (- 17m 19s) 	 (150 	 75%) 	 0.2101 	 0.4303


100%|██████████| 1569/1569 [00:20<00:00, 78.18it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.20it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.54it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.94it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.07it/s]


53m 42s (- 15m 35s) 	 (155 	 77%) 	 0.2069 	 0.4237


100%|██████████| 1569/1569 [00:20<00:00, 76.37it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.10it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.93it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.31it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.24it/s]


55m 25s (- 13m 51s) 	 (160 	 80%) 	 0.2025 	 0.4304


100%|██████████| 1569/1569 [00:20<00:00, 76.76it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.50it/s]
100%|██████████| 1569/1569 [00:20<00:00, 74.72it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.43it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.55it/s]


57m 10s (- 12m 7s) 	 (165 	 82%) 	 0.1996 	 0.4317


100%|██████████| 1569/1569 [00:19<00:00, 78.99it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.26it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.94it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.46it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.97it/s]


58m 54s (- 10m 23s) 	 (170 	 85%) 	 0.1963 	 0.4284


100%|██████████| 1569/1569 [00:19<00:00, 79.69it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.28it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.51it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.66it/s]
100%|██████████| 1569/1569 [00:21<00:00, 74.05it/s]


60m 39s (- 8m 39s) 	 (175 	 87%) 	 0.1931 	 0.4285


100%|██████████| 1569/1569 [00:19<00:00, 79.26it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.26it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.13it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.19it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.89it/s]


62m 23s (- 6m 55s) 	 (180 	 90%) 	 0.1902 	 0.4342


100%|██████████| 1569/1569 [00:19<00:00, 79.95it/s]
100%|██████████| 1569/1569 [00:20<00:00, 75.73it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.90it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.22it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.99it/s]


64m 7s (- 5m 11s) 	 (185 	 92%) 	 0.1875 	 0.4331


100%|██████████| 1569/1569 [00:20<00:00, 77.40it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.11it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.23it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.54it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.75it/s]


65m 50s (- 3m 27s) 	 (190 	 95%) 	 0.1850 	 0.4299


100%|██████████| 1569/1569 [00:19<00:00, 80.35it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.63it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.86it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.17it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.16it/s]


67m 34s (- 1m 43s) 	 (195 	 97%) 	 0.1826 	 0.4304


100%|██████████| 1569/1569 [00:19<00:00, 78.53it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.32it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.66it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.31it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.22it/s]


69m 17s (- 0m 0s) 	 (200 	 100%) 	 0.1810 	 0.4329


In [None]:
import torch

def show_model_layers_and_params(model):
    """Print a summary of a model's direct children and trainable parameters.

    Three sections are written to stdout:
      1. every direct child module (``model.named_children()``) with its repr,
      2. one line per parameter that has ``requires_grad`` set, giving its
         element count and size in MB (``numel() * element_size() / 1024**2``),
      3. the totals over those same parameters.

    Parameters with ``requires_grad == False`` are skipped entirely, so they
    are counted in neither the per-parameter list nor the totals.

    Args:
        model: object exposing ``named_children()`` and ``named_parameters()``.

    Returns:
        None. The report is only printed.
    """
    bytes_per_mb = 1024 ** 2

    print("Model Layers:")
    print("--------------")
    for child_name, child_module in model.named_children():
        print(f"{child_name}: {child_module}")

    print("\nLayer-wise Number of Parameters and Memory Requirements:")
    print("-------------------------------------------------------")
    param_count_sum = 0
    memory_mb_sum = 0
    for param_name, tensor in model.named_parameters():
        # Frozen parameters are left out of the report.
        if not tensor.requires_grad:
            continue
        n_elements = tensor.numel()
        size_mb = n_elements * tensor.element_size() / bytes_per_mb
        print(f"{param_name}: {n_elements} parameters, {size_mb:.2f} MB")
        param_count_sum += n_elements
        memory_mb_sum += size_mb

    print("\nTotal Number of Parameters and Memory Usage:")
    print("------------------------------------------")
    print(f"Total parameters: {param_count_sum}")
    print(f"Total memory usage: {memory_mb_sum:.2f} MB")


In [None]:
# Run the model over the test dataloader and keep all four values returned by predict().
(
    test_list_decoder_outputs,
    test_input_tensor,
    test_target_tensor,
    predicted_decoder_tokens,
) = predict(test_dataloader, encoder_decoder)

In [None]:
# Example predictions without normalizing tokens:
# for the first three test samples print source, reference and prediction.
for sample_idx in range(3):
    if sample_idx:
        print()  # blank line between samples, none after the last one
    for row_lang, row_batch in (
        (input_lang, test_input_tensor),
        (output_lang, test_target_tensor),
        (output_lang, predicted_decoder_tokens),
    ):
        print(DataLoaderHandler.sentenceFromIndices(row_lang, row_batch[sample_idx].tolist()))

je voulais juste aller a l universite <EOS> <PAD> <PAD>
i just wanted to go to college <EOS> <PAD> <PAD>
i just wanted to go to college <EOS> <EOS> <EOS>

j ai un nouveau velo <EOS> <PAD> <PAD> <PAD> <PAD>
i ve got a new bike <EOS> <PAD> <PAD> <PAD>
i have a new bicycle <EOS> <EOS> <EOS> <EOS> <EOS>

je n aime pas cela <EOS> <PAD> <PAD> <PAD> <PAD>
i don t like this <EOS> <PAD> <PAD> <PAD> <PAD>
i don t like that <EOS> <EOS> <EOS> <EOS> <EOS>


In [None]:
# Normalize tokens
# Each name is rebound to the result of normalize_tensors_to_tokens applied to
# itself, so this cell is not idempotent: running it a second time feeds the
# already-normalized values back into the helper.
# The second argument is False for the source/target batches and True for the
# model output (presumably it flags decoder predictions -- confirm against the
# helper's definition).
# NOTE(review): in the recorded outputs the normalized predictions lose their
# first word ("i just wanted ..." is shown as "just wanted ...") while the
# targets keep theirs -- verify that the True path does not drop a real token.
test_input_tensor = normalize_tensors_to_tokens(test_input_tensor, False)
test_target_tensor = normalize_tensors_to_tokens(test_target_tensor, False)
predicted_decoder_tokens = normalize_tensors_to_tokens(predicted_decoder_tokens, True)

In [None]:
# Example Predictions with normalizing tokens
print(DataLoaderHandler.sentenceFromIndices(input_lang, test_input_tensor[0]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, test_target_tensor[0]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, predicted_decoder_tokens[0]))
print()
print(DataLoaderHandler.sentenceFromIndices(input_lang, test_input_tensor[1]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, test_target_tensor[1]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, predicted_decoder_tokens[1]))
print()
print(DataLoaderHandler.sentenceFromIndices(input_lang, test_input_tensor[2]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, test_target_tensor[2]))
print(DataLoaderHandler.sentenceFromIndices(output_lang, predicted_decoder_tokens[2]))

je voulais juste aller a l universite
i just wanted to go to college
just wanted to go to college

j ai un nouveau velo
i ve got a new bike
have a new bicycle

je n aime pas cela
i don t like this
don t like that


In [None]:
# Second experiment: repeat the whole pipeline with is_ignore_pads switched off.
# NOTE(review): presumably this flag decides whether PAD positions are left out
# of the loss -- confirm where it is read. It is a notebook-wide variable, so it
# stays False for every cell executed after this one.
is_ignore_pads = False
# Rebinds input_lang, output_lang and both dataloaders, replacing the objects
# used by the earlier cells.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(batch_size)

# init encoder-decoder
# A new EncoderDecoderTranslation instance replaces the one previously bound to
# encoder_decoder, so the prediction cells above can no longer be re-run
# against the earlier model once this cell has executed.
encoder_decoder = EncoderDecoderTranslation(input_lang, output_lang, hidden_size, device)

# Progress (loss and BLEU) is reported every 5 epochs. The recorded run below
# was stopped by hand (KeyboardInterrupt) shortly after the epoch-15 report.
train(train_dataloader, test_dataloader, encoder_decoder, epochs, print_every=5, plot_every=5)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 1569, 1
Time 			 (Epoch	%) 	 Loss 		 Bleu


100%|██████████| 1569/1569 [00:19<00:00, 79.16it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.63it/s]
100%|██████████| 1569/1569 [00:19<00:00, 80.26it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.16it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.46it/s]


1m 43s (- 67m 6s) 	 (5 	 2%) 	 1.7946 	 0.2093


100%|██████████| 1569/1569 [00:20<00:00, 76.47it/s]
100%|██████████| 1569/1569 [00:19<00:00, 79.98it/s]
100%|██████████| 1569/1569 [00:20<00:00, 76.76it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.92it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.11it/s]


3m 27s (- 65m 43s) 	 (10 	 5%) 	 0.9899 	 0.3008


100%|██████████| 1569/1569 [00:21<00:00, 73.38it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.23it/s]
100%|██████████| 1569/1569 [00:20<00:00, 78.24it/s]
100%|██████████| 1569/1569 [00:19<00:00, 78.77it/s]
100%|██████████| 1569/1569 [00:20<00:00, 77.13it/s]


5m 13s (- 64m 25s) 	 (15 	 7%) 	 0.7173 	 0.3431


100%|██████████| 1569/1569 [00:19<00:00, 78.82it/s]
 15%|█▌        | 243/1569 [00:03<00:16, 79.13it/s]


KeyboardInterrupt: ignored

# TO TRY
- Ignore the padding token in the loss (ignore index) - mixed results
- Evaluation metric - BLEU (done)
- Sequence clipping at the `<EOS>` token - done
- Shifted target sequence? - ignore (done)
- Loss: which tokens to include? Include `<EOS>`? - `<EOS>` included, padding not included

In [None]:
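# NOTE: everything below is a commented-out earlier version of the training code
# (separate encoder/decoder modules and optimizers, no test dataloader). The live
# cells above call train(...) with a single encoder_decoder instead; kept for reference only.
# def train_epoch(dataloader, encoder, decoder, encoder_optimizer,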
#           decoder_optimizer, criterion):

#     total_loss = 0
#     for data in dataloader:
#         input_tensor, target_tensor = data

#         encoder_optimizer.zero_grad()
#         decoder_optimizer.zero_grad()

#         encoder_outputs, encoder_hidden = encoder(input_tensor)
#         decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

#         loss = criterion(
#             decoder_outputs.view(-1, decoder_outputs.size(-1)),
#             target_tensor.view(-1)
#         )
#         loss.backward()

#         encoder_optimizer.step()
#         decoder_optimizer.step()

#         total_loss += loss.item()

#     return total_loss / len(dataloader)

# def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
#                print_every=100, plot_every=100):
#     start = time.time()
#     plot_losses = []
#     print_loss_total = 0  # Reset every print_every
#     plot_loss_total = 0  # Reset every plot_every

#     encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
#     criterion = nn.NLLLoss()

#     for epoch in range(1, n_epochs + 1):
#         loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
#         print_loss_total += loss
#         plot_loss_total += loss

#         if epoch % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (Helpers.timeSince(start, epoch / n_epochs),
#                                         epoch, epoch / n_epochs * 100, print_loss_avg))

#         if epoch % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

#     Helpers.showPlot(plot_losses)

# hidden_size = 128
# batch_size = 32
# input_lang, output_lang, train_dataloader = DataLoaderHandler.get_dataloader(batch_size)
# encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# # train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)