In [1]:
# Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation

# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/2%20-%20Learning%20Phrase%20Representations%20using%20RNN%20Encoder-Decoder%20for%20Statistical%20Machine%20Translation.ipynb
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb
# Comments: https://colab.research.google.com/drive/1NmWujB2PoJk24uOwZ4cAfX3O8cZyigyf
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/seq2seq7.png
# https://www.youtube.com/watch?v=BSSoEtv5jvQ&list=PLmZlBIcArwhPHmHzyM_cZJQ8_v5paQJTV&index=7
# https://machinelearningmastery.com/the-luong-attention-mechanism/
# https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
# https://github.com/tensorflow/nmt

In [2]:
%matplotlib inline

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from nltk.translate.bleu_score import corpus_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from tqdm import tqdm

In [4]:
# Variables
# Reserved vocabulary indices. They must agree with the first three entries
# that Lang.__init__ puts into index2word.
SOS_token = 0  # fed to the decoder as its very first input token
EOS_token = 1  # appended to every tokenized sentence
PAD_token = 2  # fill value for the fixed-width id arrays
# When True, train() builds its loss with ignore_index=PAD_token.
is_ignore_pads = True
# Sentences must have fewer than MAX_LENGTH words; it is also the width of
# the id arrays and the number of decoding steps.
MAX_LENGTH = 10
hidden_size = 128  # embedding and GRU width passed to the model
batch_size = 1024  # training batch size passed to get_dataloader
epochs = 200  # number of epochs passed to train()
SPLIT_RATIO = 0.95  # fraction of the pairs that goes into the training split


# English sentence prefixes. The only reference to this tuple is the
# commented-out condition in DataHandler.filterPair.
ENG_PREFIXES = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Data location
# Read by DataHandler.readLangs: one pair per line, fields separated by a tab.
file_path = 'data/eng-fra.txt'

Mounted at /content/drive/


In [5]:
# Language class handler
class Lang:
    """Vocabulary of one language: word <-> index maps plus word counts.

    Indices 0, 1 and 2 are pre-assigned to the "<SOS>", "<EOS>" and "<PAD>"
    markers, so the first real word registered receives index 3.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = dict(enumerate(("<SOS>", "<EOS>", "<PAD>")))
        # Next free index; starts right after the three reserved markers.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of ``word``, giving it a new index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1


class PreProcess(object):
    """Stateless string-normalisation helpers.

    Both helpers are static methods, so they can be called on the class
    (``PreProcess.normalizeString(s)``) or on an instance.
    """

    # Turn a Unicode string to plain ASCII, thanks to
    # https://stackoverflow.com/a/518232/2809427
    @staticmethod
    def unicodeToAscii(s):
        """Return ``s`` NFD-decomposed with all combining marks ('Mn') removed."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    # Lowercase, trim, and remove non-letter characters
    @staticmethod
    def normalizeString(s):
        """Lowercase ``s``, strip accents and keep only letters, '!' and '?'.

        '!' and '?' end up separated from the preceding word by one space.
        '.' is first split off like the other two, but it is not in the
        keep-set of the second pattern and is therefore removed.
        """
        s = PreProcess.unicodeToAscii(s.lower().strip())
        # Put a space in front of sentence punctuation so it becomes a token.
        s = re.sub(r"([.!?])", r" \1", s)
        # Collapse every run of other characters into a single space.
        s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
        return s.strip()

In [6]:
class DataHandler(object):
    """Reads the tab-separated corpus, filters it and builds the vocabularies.

    All helpers are static methods. They rely on the module-level names
    ``file_path``, ``MAX_LENGTH``, ``Lang`` and ``PreProcess``.
    """

    # read langs and create lang objects, and pairs
    @staticmethod
    def readLangs(lang1, lang2, reverse=False):
        """Load ``file_path`` and return ``(input_lang, output_lang, pairs)``.

        Every line is split on tabs and each field is normalised with
        ``PreProcess.normalizeString``. With ``reverse=True`` the fields of
        every pair are reversed and ``lang2`` becomes the input language.
        The returned ``Lang`` objects are still empty.
        """
        print("Reading lines...")

        # Read the file and split into lines; the context manager closes the
        # handle even if reading fails.
        with open(file_path, encoding='utf-8') as corpus_file:
            lines = corpus_file.read().strip().split('\n')

        # Split every line into pairs and normalize
        pairs = [[PreProcess.normalizeString(s) for s in l.split('\t')] for l in lines]

        # Reverse pairs, make Lang instances
        if reverse:
            pairs = [list(reversed(p)) for p in pairs]
            input_lang = Lang(lang2)
            output_lang = Lang(lang1)
        else:
            input_lang = Lang(lang1)
            output_lang = Lang(lang2)

        return input_lang, output_lang, pairs

    # filter pairs with length < max length
    @staticmethod
    def filterPair(p):
        """Return True when both sentences have fewer than MAX_LENGTH words."""
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH
        # The prefix restriction is currently disabled:
        # and \
        # p[1].startswith(ENG_PREFIXES)

    # filter pairs
    @staticmethod
    def filterPairs(pairs):
        """Return the pairs accepted by ``filterPair``, in their original order."""
        return [pair for pair in pairs if DataHandler.filterPair(pair)]

    # Read data, filter data, register language objects
    @staticmethod
    def prepareData(lang1, lang2, reverse=False):
        """Read, filter and index the corpus.

        Returns ``(input_lang, output_lang, pairs)`` where both ``Lang``
        objects have been fed every sentence of the filtered ``pairs``.
        """
        # initiate language objects, and get pairs
        input_lang, output_lang, pairs = DataHandler.readLangs(lang1, lang2, reverse)

        print("Read %s sentence pairs" % len(pairs))
        pairs = DataHandler.filterPairs(pairs)
        print("Trimmed to %s sentence pairs" % len(pairs))
        print("Counting words...")

        # Register pairs with lang objects
        for pair in pairs:
            input_lang.addSentence(pair[0])
            output_lang.addSentence(pair[1])
        print("Counted words:")
        print(input_lang.name, input_lang.n_words)
        print(output_lang.name, output_lang.n_words)
        return input_lang, output_lang, pairs


In [7]:
import random

class DataLoaderHandler(object):
    """Turns sentence pairs into index tensors and torch DataLoaders.

    All helpers are static methods. Several of them read module-level names
    (``EOS_token``, ``PAD_token``, ``MAX_LENGTH``, ``SPLIT_RATIO``,
    ``device``).
    """

    @staticmethod
    def sentenceFromIndices(lang, indices):
        """Return the words for ``indices`` joined by single spaces."""
        return ' '.join([lang.index2word[index] for index in indices])

    @staticmethod
    def outputLangTokensFromIndices(lang, indices):
        """Return the list of words for ``indices``."""
        return [lang.index2word[index] for index in indices]

    # create a list of token-indices from a list of token
    @staticmethod
    def indexesFromSentence(lang, sentence):
        """Map every space-separated word of ``sentence`` to its index.

        Raises ``KeyError`` for a word that ``lang`` has never seen.
        """
        return [lang.word2index[word] for word in sentence.split(' ')]

    # create tensor from sentence
    @staticmethod
    def tensorFromSentence(lang, sentence):
        """Return a (1, n_tokens + 1) long tensor ending in ``EOS_token``."""
        indexes = DataLoaderHandler.indexesFromSentence(lang, sentence)
        indexes.append(EOS_token)
        return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

    # create tensors from pair of sentences
    @staticmethod
    def tensorsFromPair(pair):
        """Tensorize ``pair`` as ``(input_tensor, target_tensor)``.

        NOTE: this reads the module-level ``input_lang`` / ``output_lang``,
        so it only works after those globals have been assigned.
        """
        input_tensor = DataLoaderHandler.tensorFromSentence(input_lang, pair[0])
        target_tensor = DataLoaderHandler.tensorFromSentence(output_lang, pair[1])
        return (input_tensor, target_tensor)

    @staticmethod
    def split_train_test(pairs, split_ratio):
        """Randomly split ``pairs`` into ``(train_pairs, test_pairs)``.

        The first ``int(len(pairs) * split_ratio)`` pairs of a shuffled copy
        form the training split. ``pairs`` itself is left untouched, so
        calling this twice on the same list does not compound the shuffles.
        """
        # Shuffle a copy so the caller's list keeps its order.
        shuffled_pairs = list(pairs)
        random.shuffle(shuffled_pairs)

        # Calculate the split index
        split_idx = int(len(shuffled_pairs) * split_ratio)

        # Slicing already yields two independent lists.
        return shuffled_pairs[:split_idx], shuffled_pairs[split_idx:]

    @staticmethod
    def tokenize_into_numpy_arrays(pairs, n, input_lang, output_lang):
        """Return ``(input_ids, target_ids)`` int32 arrays of shape (n, MAX_LENGTH).

        Row ``i`` holds the token indices of pair ``i`` followed by
        ``EOS_token``; the remaining cells hold ``PAD_token``. A sentence
        whose ids (including EOS) exceed MAX_LENGTH makes the row assignment
        fail, so ``pairs`` must already be length-filtered.
        """
        # TODO: TRY INPUT AS VARIABLE LENGTH
        # Rows are pre-filled with PAD_token rather than zeros because index
        # 0 is already taken by the SOS token.
        input_ids = np.full((n, MAX_LENGTH), PAD_token, dtype=np.int32)
        target_ids = np.full((n, MAX_LENGTH), PAD_token, dtype=np.int32)

        for idx, (inp, tgt) in enumerate(pairs):
            # Get list of token-indices
            inp_ids = DataLoaderHandler.indexesFromSentence(input_lang, inp)
            tgt_ids = DataLoaderHandler.indexesFromSentence(output_lang, tgt)

            # Append <end of string> tokens
            inp_ids.append(EOS_token)
            tgt_ids.append(EOS_token)

            # Assign token indices in the main array
            input_ids[idx, :len(inp_ids)] = inp_ids
            target_ids[idx, :len(tgt_ids)] = tgt_ids
        return input_ids, target_ids

    # generate data loader
    @staticmethod
    def get_dataloader(batch_size):
        """Build the vocabularies and the train/test DataLoaders.

        Returns ``(input_lang, output_lang, train_dataloader,
        test_dataloader)``. The training loader yields batches of
        ``batch_size``; the test loader yields the whole test split as a
        single batch. Both datasets are moved to ``device`` up front.
        """
        # prepare language data
        input_lang, output_lang, pairs = DataHandler.prepareData('eng', 'fra', True)

        train_pairs, test_pairs = DataLoaderHandler.split_train_test(pairs, SPLIT_RATIO)
        n_train, n_test = len(train_pairs), len(test_pairs)

        train_input_ids, train_target_ids = DataLoaderHandler.tokenize_into_numpy_arrays(train_pairs, n_train, input_lang, output_lang)
        train_data = TensorDataset(
            torch.LongTensor(train_input_ids).to(device),
            torch.LongTensor(train_target_ids).to(device)
        )

        test_input_ids, test_target_ids = DataLoaderHandler.tokenize_into_numpy_arrays(test_pairs, n_test, input_lang, output_lang)
        test_data = TensorDataset(
            torch.LongTensor(test_input_ids).to(device),
            torch.LongTensor(test_target_ids).to(device)
        )

        # Create a sampler
        train_sampler = RandomSampler(train_data)
        test_sampler = RandomSampler(test_data)

        # Create a torch dataloader
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=n_test)

        print(f"Train and Test Dataset # samples: {len(train_data)}, {len(test_data)}")
        print(f"Train and Test Dataloader # batches: {len(train_dataloader)}, {len(test_dataloader)}")

        return input_lang, output_lang, train_dataloader, test_dataloader

In [8]:
# Prepare Data
# input_lang, output_lang, pairs = DataHandler.prepareData('eng', 'fra', True)
# print(random.choice(pairs))

# Builds the vocabularies and both loaders with a training batch size of 32.
# NOTE(review): the training cell further down calls get_dataloader(batch_size)
# again and rebinds all four names, so this call mainly serves as a smoke test.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(32)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 3138, 1


**Helpers**

In [9]:
import time
import math
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

class Helpers(object):
    """Timing and plotting utilities used by the training loop."""

    @staticmethod
    def asMinutes(s):
        """Format ``s`` seconds as ``'<minutes>m <seconds>s'`` (both truncated)."""
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    @staticmethod
    def timeSince(since, percent):
        """Return ``'<elapsed> (- <remaining>)'`` for a run started at ``since``.

        ``percent`` is the completed fraction of the run (0 < percent <= 1).
        The remaining time is extrapolated linearly from the elapsed time.
        Raises ``ZeroDivisionError`` when ``percent`` is 0.
        """
        now = time.time()
        s = now - since
        es = s / (percent)  # estimated total duration
        rs = es - s         # estimated remaining time
        return '%s (- %s)' % (Helpers.asMinutes(s), Helpers.asMinutes(rs))

    @staticmethod
    def showPlot(points):
        """Plot ``points`` as a line with y-axis ticks every 0.2."""
        # A single subplots() call creates the only figure needed. The former
        # extra plt.figure() left an empty figure open on every call.
        fig, ax = plt.subplots()
        # this locator puts ticks at regular intervals
        loc = ticker.MultipleLocator(base=0.2)
        ax.yaxis.set_major_locator(loc)
        ax.plot(points)
        plt.show()

The Model

In [10]:
# in: keys and query vector
# shapes: (batch_size, time_steps, hidden_size); (batch_size, 1, hidden_size)
class BhadanauAttentionBeforeOutput(nn.Module):
    """Additive attention over a sequence of keys for a single query.

    Keys and query are projected separately, summed (the query broadcasts
    over the time axis), passed through a ReLU and reduced to one score per
    time step. The scores are soft-maxed over time and used to average the
    un-projected keys.
    """

    def __init__(self, hidden_size):
        super().__init__()

        self.W_keys = nn.Linear(hidden_size, hidden_size)
        self.W_query = nn.Linear(hidden_size, hidden_size)
        self.W_combined = nn.Linear(hidden_size, 1)  # one scalar score per time step

    def forward(self, keys, query):
        """Return ``(context, weights)``.

        keys:    (batch_size, time_steps, hidden_size)
        query:   (batch_size, 1, hidden_size)
        context: (batch_size, 1, hidden_size)
        weights: (batch_size, 1, time_steps), summing to 1 over time_steps
        """
        # (batch_size, time_steps, hidden_size) + (batch_size, 1, hidden_size)
        projected_sum = self.W_keys(keys) + self.W_query(query)

        # (batch_size, time_steps, 1)
        scores = self.W_combined(F.relu(projected_sum))

        # Move the time axis last, (batch_size, 1, time_steps), and
        # normalise across it.
        weights = F.softmax(scores.transpose(1, 2), dim=-1)

        # Weighted average of the keys: (batch_size, 1, hidden_size)
        context = torch.bmm(weights, keys)

        return context, weights



class BhadanauAttentionBeforeOutputEncoderRNN(nn.Module):
    """Encoder: token embedding -> dropout -> single batch-first GRU."""

    def __init__(self, vocab_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width (hidden_size).
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode a batch of token ids and return the GRU's ``(output, hidden)``."""
        embedded_tokens = self.dropout(self.embedding(input))
        return self.gru(embedded_tokens)

class BhadanauAttentionBeforeOutputDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs before the output layer.

    At every step the current token is embedded and fed to the GRU. The new
    hidden state queries the attention layer, and the concatenation of
    hidden state and attended context is projected to vocabulary logits.
    Decoding always runs for ``MAX_LENGTH`` steps.
    """

    def __init__(self, hidden_size, output_size):
        super(BhadanauAttentionBeforeOutputDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)

        # Attention layer
        self.attention = BhadanauAttentionBeforeOutput(hidden_size)

        # Inputs: (token embedding, previous hidden state)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

        # Input: new hidden state concatenated with the attended context
        self.out = nn.Linear(hidden_size + hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps, starting from ``SOS_token``.

        With ``target_tensor`` given, step ``i + 1`` receives
        ``target_tensor[:, i]`` as input (teacher forcing). Otherwise the
        arg-max token of step ``i`` is fed back.

        Returns ``(decoder_outputs, decoder_hidden, predicted_decoder_tokens,
        predicted_attn_weights)``:
          * decoder_outputs: raw logits of all steps concatenated on dim 1
          * decoder_hidden: hidden state after the last step
          * predicted_decoder_tokens: list with one (batch_size, 1) tensor of
            arg-max tokens per step, or an empty list under teacher forcing
          * predicted_attn_weights: attention weights of all steps
            concatenated on dim 1
        """
        # Kept only because forward_step's signature takes it.
        encoder_context_vector = encoder_hidden

        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens next to the encoder outputs rather than on
        # the module-level ``device``, so decoding still works after the model
        # and its inputs have been moved to another device.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token,
            dtype=torch.long, device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        predicted_decoder_tokens = []
        predicted_attn_weights = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_context_vector, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            predicted_attn_weights.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input
                predicted_decoder_tokens.append(decoder_input)

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        predicted_attn_weights = torch.cat(predicted_attn_weights, dim=1)
        # No log_softmax here: the training loss is CrossEntropyLoss, which
        # expects raw logits.
        return decoder_outputs, decoder_hidden, predicted_decoder_tokens, predicted_attn_weights

    def forward_step(self, input, prev_hidden_state, encoder_context_vector, encoder_outputs):
        """Run one decoding step.

        ``encoder_context_vector`` is accepted but not used. Returns
        ``(logits, new_hidden, attn_weights)`` where ``logits`` has been
        permuted back to batch-first layout.
        """
        input_token_embedding = self.embedding(input)
        input_token_embedding_relued = F.relu(input_token_embedding)

        # Forward rnn cell pass; only the new hidden state is used below.
        _, gru_hidden = self.gru(input_token_embedding_relued, prev_hidden_state)

        # Get Attention context for this time step using the encoder outputs
        # and the current hidden state. The hidden state is (1, batch, hidden),
        # so swap the first two axes to use it as a batch-first query.
        query = gru_hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(encoder_outputs, query)  # keys, query

        # Back to the hidden state's layout so the two can be concatenated.
        context = context.permute(1, 0, 2)

        # Concatenate the current hidden state and this time step's attended context
        decoder_hidden_concatenate_attended_context = torch.cat((gru_hidden, context), 2)

        # Pass through the linear layer to get logits over the output vocabulary
        linear_output = self.out(decoder_hidden_concatenate_attended_context)

        # Swap the first two axes so the batch dimension comes first again.
        linear_output = linear_output.permute(1, 0, 2)

        return linear_output, gru_hidden, attn_weights

class BhadanauAttentionBeforeOutputEncoderDecoderTranslation(nn.Module):
    """Wires the encoder and the attention decoder into one translation model."""

    def __init__(self, input_lang, output_lang, hidden_size, device):
        super().__init__()

        # Vocabulary sizes come straight from the two Lang objects.
        source_vocab_size = input_lang.n_words
        target_vocab_size = output_lang.n_words

        self.encoder = BhadanauAttentionBeforeOutputEncoderRNN(source_vocab_size, hidden_size).to(device)
        self.decoder = BhadanauAttentionBeforeOutputDecoderRNN(hidden_size, target_vocab_size).to(device)
        self.device = device

    def forward(self, input_tensor, target_tensor=None):
        """Encode ``input_tensor`` and decode it.

        ``target_tensor`` is handed to the decoder unchanged. Returns
        ``(decoder_outputs, predicted_decoder_tokens, predicted_attn_weights)``;
        the decoder's final hidden state is dropped.
        """
        encoded_steps, encoded_state = self.encoder(input_tensor)
        logits, _, tokens, attention = self.decoder(encoded_steps, encoded_state, target_tensor)
        return logits, tokens, attention


In [11]:
def train_epoch(dataloader, encoder_decoder, encoder_decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Each batch is decoded with teacher forcing (the targets are passed to the
    model). Returns the mean of the per-batch losses.
    """
    epoch_loss = 0
    for source_batch, target_batch in tqdm(dataloader):
        # Gradients accumulate by default, so clear them for every batch.
        encoder_decoder_optimizer.zero_grad()

        logits, _, _ = encoder_decoder(source_batch, target_batch)

        # Flatten batch and time so each decoded position is one sample.
        vocab_size = logits.size(-1)
        batch_loss = criterion(logits.view(-1, vocab_size), target_batch.view(-1))

        # Back-propagate, then update the weights.
        batch_loss.backward()
        encoder_decoder_optimizer.step()

        epoch_loss += batch_loss.item()

    return epoch_loss / len(dataloader)

def normalize_tensors_to_tokens(tensor, remove_first_idx=False):
  """Convert a 2-D batch of token ids into lists clipped at the first EOS.

  ``tensor`` only needs a ``tolist()`` method that yields one list per
  sequence. With ``remove_first_idx`` the first token of every sequence is
  dropped before clipping. The EOS token and everything after it are not
  part of the result.
  """
  rows = tensor.tolist()
  first = 1 if remove_first_idx else 0

  clipped = []
  for row in rows:
    # Advance to the first EOS at or after `first`, or to the end of the row.
    stop = first
    while stop < len(row) and row[stop] != EOS_token:
      stop += 1
    clipped.append(row[first:stop])

  return clipped


def predict(data_loader, encoder_decoder):
  """Greedy-decode every batch of ``data_loader`` without teacher forcing.

  The model is switched to eval mode and left there. Returns
  ``(list_decoder_outputs, input_tensor, target_tensor,
  predicted_decoder_tokens, predicted_attn_weights)`` where

    * list_decoder_outputs is a list with the decoder outputs of each batch;
    * the other four are concatenated over all batches along dim 0, so row
      ``i`` of each refers to the same example. For a single-batch loader
      this equals that batch's tensors.

  Raises ``ValueError`` when ``data_loader`` yields no batch.
  """
  # Eval Mode. Turn off dropout and batchnorm
  encoder_decoder.eval()

  list_decoder_outputs = []
  batch_inputs, batch_targets, batch_tokens, batch_attn = [], [], [], []

  # ensure no gradients are calculated with no_grad() to preserve memory
  with torch.no_grad():
    for input_tensor, target_tensor in data_loader:
      decoder_outputs, step_tokens, attn_weights = encoder_decoder(input_tensor)
      list_decoder_outputs.append(decoder_outputs)

      batch_inputs.append(input_tensor)
      batch_targets.append(target_tensor)
      # Merge timesteps of decoder predictions: one column per step.
      batch_tokens.append(torch.cat(step_tokens, dim=1))
      batch_attn.append(attn_weights)

  if not list_decoder_outputs:
    raise ValueError("predict() received a data_loader that yields no batches")

  # Stitch the per-batch results together so results of earlier batches are
  # no longer silently dropped.
  return (
      list_decoder_outputs,
      torch.cat(batch_inputs, dim=0),
      torch.cat(batch_targets, dim=0),
      torch.cat(batch_tokens, dim=0),
      torch.cat(batch_attn, dim=0),
  )


def calculate_bleu(test_target_tokens, predicted_decoder_tokens):
  """Corpus-level BLEU of the predictions against one reference each.

  Both arguments are sequences of token lists, aligned by position.
  """
  # corpus_bleu expects a list of references per hypothesis.
  references = [[target] for target in test_target_tokens]
  hypotheses = list(predicted_decoder_tokens)
  return corpus_bleu(references, hypotheses)

def train(train_dataloader, test_dataloader, encoder_decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train ``encoder_decoder`` with Adam for ``n_epochs`` epochs.

    After every epoch the model is evaluated on ``test_dataloader`` without
    teacher forcing and a corpus BLEU is computed. Every ``print_every``
    epochs a progress line (time, epoch, mean loss since the last line, and
    the current BLEU) is printed; every ``plot_every`` epochs the mean loss
    is recorded. The recorded losses are plotted at the end.

    The loss ignores ``PAD_token`` targets when the module-level
    ``is_ignore_pads`` flag is true.
    """
    run_started = time.time()
    averaged_losses = []
    loss_since_print = 0  # reset after every progress line
    loss_since_plot = 0   # reset after every recorded plot point

    optimizer = optim.Adam(encoder_decoder.parameters(), lr=learning_rate)

    # Padding positions are excluded from the loss only when requested.
    criterion_options = {'ignore_index': PAD_token} if is_ignore_pads else {}
    criterion = nn.CrossEntropyLoss(**criterion_options)

    print('Time \t\t\t (Epoch\t%) \t Loss \t\t Bleu')
    for epoch in range(1, n_epochs + 1):
        # Training pass
        encoder_decoder.train()
        epoch_loss = train_epoch(train_dataloader, encoder_decoder, optimizer, criterion)
        loss_since_print += epoch_loss
        loss_since_plot += epoch_loss

        # Evaluation pass: free-running decoding on the test set
        encoder_decoder.eval()
        _, _, reference_ids, hypothesis_ids, _ = predict(test_dataloader, encoder_decoder)

        bleu = calculate_bleu(
            normalize_tensors_to_tokens(reference_ids, False),
            normalize_tensors_to_tokens(hypothesis_ids, False),
        )

        if epoch % print_every == 0:
            progress = epoch / n_epochs
            mean_print_loss = loss_since_print / print_every
            loss_since_print = 0
            print('%s \t (%d \t %d%%) \t %.4f \t %.4f' % (
                Helpers.timeSince(run_started, progress),
                epoch, progress * 100, mean_print_loss, bleu))

        if epoch % plot_every == 0:
            averaged_losses.append(loss_since_plot / plot_every)
            loss_since_plot = 0

    Helpers.showPlot(averaged_losses)


In [12]:
# train() reads this module-level flag to decide whether PAD targets are
# excluded from the loss.
is_ignore_pads = True
# Rebuild vocabularies and loaders with the configured training batch size.
# This re-reads the corpus and draws a fresh random train/test split.
input_lang, output_lang, train_dataloader, test_dataloader = DataLoaderHandler.get_dataloader(batch_size)

# init encoder-decoder
encoder_decoder = BhadanauAttentionBeforeOutputEncoderDecoderTranslation(input_lang, output_lang, hidden_size, device)

# Print a progress line and record a plot point every 5 epochs.
train(train_dataloader, test_dataloader, encoder_decoder, epochs, print_every=5, plot_every=5)

Reading lines...
Read 135842 sentence pairs
Trimmed to 105692 sentence pairs
Counting words...
Counted words:
fra 17865
eng 10699
Train and Test Dataset # samples: 100407, 5285
Train and Test Dataloader # batches: 99, 1
Time 			 (Epoch	%) 	 Loss 		 Bleu


100%|██████████| 99/99 [00:12<00:00,  7.72it/s]
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
100%|██████████| 99/99 [00:11<00:00,  8.78it/s]
100%|██████████| 99/99 [00:10<00:00,  9.07it/s]
100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
100%|██████████| 99/99 [00:11<00:00,  8.66it/s]


1m 3s (- 41m 8s) 	 (5 	 2%) 	 3.8746 	 0.1904


100%|██████████| 99/99 [00:11<00:00,  8.48it/s]
100%|██████████| 99/99 [00:11<00:00,  8.52it/s]
100%|██████████| 99/99 [00:12<00:00,  8.22it/s]
100%|██████████| 99/99 [00:12<00:00,  8.22it/s]
100%|██████████| 99/99 [00:11<00:00,  8.25it/s]


2m 6s (- 40m 4s) 	 (10 	 5%) 	 1.8292 	 0.3423


100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.45it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.28it/s]


3m 8s (- 38m 48s) 	 (15 	 7%) 	 1.2594 	 0.4043


100%|██████████| 99/99 [00:11<00:00,  8.36it/s]
100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.27it/s]


4m 11s (- 37m 46s) 	 (20 	 10%) 	 0.9839 	 0.4414


100%|██████████| 99/99 [00:11<00:00,  8.36it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]
100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
100%|██████████| 99/99 [00:11<00:00,  8.37it/s]
100%|██████████| 99/99 [00:11<00:00,  8.47it/s]


5m 14s (- 36m 43s) 	 (25 	 12%) 	 0.8209 	 0.4635


100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]
100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
100%|██████████| 99/99 [00:11<00:00,  8.37it/s]


6m 17s (- 35m 37s) 	 (30 	 15%) 	 0.7085 	 0.4781


100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.32it/s]
100%|██████████| 99/99 [00:11<00:00,  8.31it/s]
100%|██████████| 99/99 [00:11<00:00,  8.38it/s]
100%|██████████| 99/99 [00:11<00:00,  8.37it/s]


7m 20s (- 34m 35s) 	 (35 	 17%) 	 0.6242 	 0.4880


100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.49it/s]
100%|██████████| 99/99 [00:11<00:00,  8.27it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]


8m 22s (- 33m 30s) 	 (40 	 20%) 	 0.5577 	 0.4923


100%|██████████| 99/99 [00:11<00:00,  8.41it/s]
100%|██████████| 99/99 [00:11<00:00,  8.33it/s]
100%|██████████| 99/99 [00:11<00:00,  8.25it/s]
100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
100%|██████████| 99/99 [00:11<00:00,  8.41it/s]


9m 25s (- 32m 27s) 	 (45 	 22%) 	 0.5050 	 0.4936


100%|██████████| 99/99 [00:11<00:00,  8.42it/s]
100%|██████████| 99/99 [00:11<00:00,  8.41it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]
100%|██████████| 99/99 [00:11<00:00,  8.38it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]


10m 27s (- 31m 23s) 	 (50 	 25%) 	 0.4628 	 0.4993


100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.30it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.47it/s]
100%|██████████| 99/99 [00:11<00:00,  8.48it/s]


11m 31s (- 30m 22s) 	 (55 	 27%) 	 0.4245 	 0.5130


100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.33it/s]
100%|██████████| 99/99 [00:11<00:00,  8.45it/s]
100%|██████████| 99/99 [00:11<00:00,  8.40it/s]
100%|██████████| 99/99 [00:11<00:00,  8.45it/s]


12m 34s (- 29m 20s) 	 (60 	 30%) 	 0.3929 	 0.5158


100%|██████████| 99/99 [00:11<00:00,  8.32it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.41it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]


13m 37s (- 28m 18s) 	 (65 	 32%) 	 0.3649 	 0.5291


100%|██████████| 99/99 [00:11<00:00,  8.27it/s]
100%|██████████| 99/99 [00:11<00:00,  8.55it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.45it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]


14m 41s (- 27m 17s) 	 (70 	 35%) 	 0.3422 	 0.5222


100%|██████████| 99/99 [00:11<00:00,  8.33it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
100%|██████████| 99/99 [00:11<00:00,  8.42it/s]


15m 44s (- 26m 13s) 	 (75 	 37%) 	 0.3187 	 0.5284


100%|██████████| 99/99 [00:11<00:00,  8.42it/s]
100%|██████████| 99/99 [00:11<00:00,  8.45it/s]
100%|██████████| 99/99 [00:11<00:00,  8.35it/s]
100%|██████████| 99/99 [00:11<00:00,  8.53it/s]
100%|██████████| 99/99 [00:11<00:00,  8.32it/s]


16m 46s (- 25m 9s) 	 (80 	 40%) 	 0.3002 	 0.5329


100%|██████████| 99/99 [00:11<00:00,  8.41it/s]
100%|██████████| 99/99 [00:11<00:00,  8.41it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.35it/s]
100%|██████████| 99/99 [00:11<00:00,  8.37it/s]


17m 48s (- 24m 5s) 	 (85 	 42%) 	 0.2830 	 0.5378


100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.53it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.30it/s]
100%|██████████| 99/99 [00:11<00:00,  8.36it/s]


18m 51s (- 23m 2s) 	 (90 	 45%) 	 0.2684 	 0.5381


100%|██████████| 99/99 [00:11<00:00,  8.53it/s]
100%|██████████| 99/99 [00:11<00:00,  8.43it/s]
100%|██████████| 99/99 [00:11<00:00,  8.44it/s]
100%|██████████| 99/99 [00:11<00:00,  8.32it/s]
100%|██████████| 99/99 [00:11<00:00,  8.50it/s]


19m 53s (- 21m 59s) 	 (95 	 47%) 	 0.2546 	 0.5345


100%|██████████| 99/99 [00:11<00:00,  8.39it/s]
 65%|██████▍   | 64/99 [00:07<00:04,  8.32it/s]


KeyboardInterrupt: 

In [None]:
import torch

def show_model_layers_and_params(model):
    """Print the direct child modules of ``model`` and its trainable parameters.

    For every parameter with ``requires_grad`` set, the element count and the
    size in MB (element count times element size) are printed, followed by
    the totals over all of them. Nothing is returned.
    """
    bytes_per_mb = 1024 ** 2

    print("Model Layers:")
    print("--------------")
    for child_name, child in model.named_children():
        print(f"{child_name}: {child}")

    print("\nLayer-wise Number of Parameters and Memory Requirements:")
    print("-------------------------------------------------------")
    total_params = 0
    total_memory = 0
    for param_name, param in model.named_parameters():
        # Frozen parameters are left out of the report.
        if not param.requires_grad:
            continue
        count = param.numel()
        size_mb = count * param.element_size() / bytes_per_mb
        print(f"{param_name}: {count} parameters, {size_mb:.2f} MB")
        total_params += count
        total_memory += size_mb

    print("\nTotal Number of Parameters and Memory Usage:")
    print("------------------------------------------")
    print(f"Total parameters: {total_params}")
    print(f"Total memory usage: {total_memory:.2f} MB")


In [None]:
show_model_layers_and_params(encoder_decoder)

- Does dot product attention increase the number of params?
- Visualize Attention

In [None]:
# Predict
# Decode the test loader without teacher forcing. Later cells rely on the
# source ids, the reference ids, the predicted ids and the attention weights
# returned here.
test_list_decoder_outputs, test_input_tensor, test_target_tensor, predicted_decoder_tokens, predicted_attn_weights = predict(test_dataloader, encoder_decoder)

In [None]:
# Example Predictions without normalizing tokens
# For the first three test examples print source, reference and prediction,
# with a blank line between examples.
for example_idx in range(3):
    if example_idx:
        print()
    print(DataLoaderHandler.sentenceFromIndices(input_lang, test_input_tensor[example_idx].tolist()))
    print(DataLoaderHandler.sentenceFromIndices(output_lang, test_target_tensor[example_idx].tolist()))
    print(DataLoaderHandler.sentenceFromIndices(output_lang, predicted_decoder_tokens[example_idx].tolist()))

In [None]:
# Normalize tokens
# Clip every sequence at its first <EOS>. NOTE: this rebinds the three names
# from tensors to plain lists, so re-run the predict cell before running this
# cell a second time.
test_input_tensor = normalize_tensors_to_tokens(test_input_tensor, False)
test_target_tensor = normalize_tensors_to_tokens(test_target_tensor, False)
# The decoder never emits <SOS>: its first column is already the first
# predicted word. Dropping the first index would discard that word, so it is
# kept here, exactly as in the BLEU computation inside train().
predicted_decoder_tokens = normalize_tensors_to_tokens(predicted_decoder_tokens, False)

In [None]:
# Example Predictions with normalizing tokens
# For the first three test examples print source, reference and prediction,
# with a blank line between examples.
for example_idx in range(3):
    if example_idx:
        print()
    print(DataLoaderHandler.sentenceFromIndices(input_lang, test_input_tensor[example_idx]))
    print(DataLoaderHandler.sentenceFromIndices(output_lang, test_target_tensor[example_idx]))
    print(DataLoaderHandler.sentenceFromIndices(output_lang, predicted_decoder_tokens[example_idx]))

In [None]:
%matplotlib inline

In [None]:
def showAttention(input_words, output_words, attentions):
    """Draw ``attentions`` as a heat map labelled with the given words.

    ``input_words`` label the x axis and ``output_words`` the y axis; both
    must be lists because they are concatenated with ``['']``.
    ``attentions`` must offer ``.cpu().numpy()`` and is presumably a 2-D
    (output steps x input steps) tensor -- TODO confirm against the caller.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # ax.set_xticklabels([''] + input_sentence.split(' ') +
    #                    ['<EOS>'], rotation=90)
    # NOTE(review): the leading '' presumably absorbs the tick that the
    # MultipleLocator below places before the first cell -- confirm with the
    # matplotlib version in use.
    ax.set_xticklabels([''] + input_words, rotation=30)
    ax.set_yticklabels([''] + output_words, rotation=30)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    return


def visualize_attentions(idx, test_input_tensor, test_target_tensor, predicted_decoder_tokens, predicted_attn_weights, input_lang, output_lang):
    """Print the tokens of one example and plot its attention weights.

    Args:
        idx: index of the example to look at.
        test_input_tensor: indexable collection; entry ``idx`` is converted with
            ``.tolist()`` and mapped to tokens through ``input_lang``.
        test_target_tensor: same, mapped to tokens through ``output_lang``.
        predicted_decoder_tokens: same, mapped to tokens through ``output_lang``.
        predicted_attn_weights: indexable collection; entry ``idx`` is handed
            to ``showAttention`` as the attention matrix.
        input_lang: language object used for the input tokens.
        output_lang: language object used for the target and predicted tokens.

    Returns:
        None. Prints the input, target and predicted token lists (in that
        order) and then shows the attention plot with the input tokens on the
        x-axis and the predicted tokens on the y-axis.
    """
    to_tokens = DataLoaderHandler.outputLangTokensFromIndices

    source_tokens = to_tokens(input_lang, test_input_tensor[idx].tolist())
    reference_tokens = to_tokens(output_lang, test_target_tensor[idx].tolist())
    hypothesis_tokens = to_tokens(output_lang, predicted_decoder_tokens[idx].tolist())
    example_attention = predicted_attn_weights[idx]

    for token_list in (source_tokens, reference_tokens, hypothesis_tokens):
        print(token_list)

    showAttention(source_tokens, hypothesis_tokens, example_attention)


# Run predict() on the test dataloader and rebind the decoder outputs, test tensors,
# predicted tokens and attention weights to its results.
# NOTE(review): presumably needed because the normalization cell rebound
# test_input_tensor / test_target_tensor / predicted_decoder_tokens, while
# visualize_attentions calls .tolist() on their entries — confirm.
test_list_decoder_outputs, test_input_tensor, test_target_tensor, predicted_decoder_tokens, predicted_attn_weights = predict(test_dataloader, encoder_decoder)

In [None]:
# Print the token lists and show the attention heat map for the test example at index 1.
visualize_attentions(1, test_input_tensor, test_target_tensor, predicted_decoder_tokens, predicted_attn_weights, input_lang, output_lang)

# TO TRY
- Ignore padding tokens in the loss (`ignore_index`) – mixed results
- evaluation metric - bleu (done)
- EOS token related sequence clipping - done
- shifted target sequence? - ignore (done)
- Loss: which tokens should it include? Should `<EOS>` count? – `<EOS>` is included, padding is not
- The output vocabulary size is a major bottleneck (among other things): it can prevent large GPU batches, because the output vectors of shape `(batch_size, time_steps, vocab_size)` have to be stored

In [None]:
# NOTE(review): every line in this cell is commented out, so running it does nothing.
# It looks like an earlier training loop built on separate encoder/decoder modules and
# optimizers, whereas the live code calls predict(test_dataloader, encoder_decoder).
# TODO confirm it is obsolete, then delete it or move it out of the notebook.

# def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
#           decoder_optimizer, criterion):

#     total_loss = 0
#     for data in dataloader:
#         input_tensor, target_tensor = data

#         encoder_optimizer.zero_grad()
#         decoder_optimizer.zero_grad()

#         encoder_outputs, encoder_hidden = encoder(input_tensor)
#         decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

#         loss = criterion(
#             decoder_outputs.view(-1, decoder_outputs.size(-1)),
#             target_tensor.view(-1)
#         )
#         loss.backward()

#         encoder_optimizer.step()
#         decoder_optimizer.step()

#         total_loss += loss.item()

#     return total_loss / len(dataloader)

# def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
#                print_every=100, plot_every=100):
#     start = time.time()
#     plot_losses = []
#     print_loss_total = 0  # Reset every print_every
#     plot_loss_total = 0  # Reset every plot_every

#     encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
#     decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
#     criterion = nn.NLLLoss()

#     for epoch in range(1, n_epochs + 1):
#         loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
#         print_loss_total += loss
#         plot_loss_total += loss

#         if epoch % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (Helpers.timeSince(start, epoch / n_epochs),
#                                         epoch, epoch / n_epochs * 100, print_loss_avg))

#         if epoch % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

#     Helpers.showPlot(plot_losses)

# hidden_size = 128
# batch_size = 32
# input_lang, output_lang, train_dataloader = DataLoaderHandler.get_dataloader(batch_size)
# encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

# # train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

In [None]:
import matplotlib.pyplot as plt

# Minimal line-plot demo on hard-coded sample points.
# Data for the x-axis and y-axis
x_values = [1, 2, 3, 4, 5]
y_values = [2, 4, 6, 8, 10]

# Draw on an explicitly created figure/axes pair so the plot never lands on a
# figure left over from earlier pyplot calls.
fig, ax = plt.subplots()
ax.plot(x_values, y_values, marker='o', linestyle='-')  # 'o' for markers, '-' for line style
ax.set_title('Simple Line Plot')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.grid(True)  # Show grid
plt.show()
