# Assignment 4, task 2

Recurrent neural networks, particularly Gated Recurrent Units, were developed with a translation task in mind. We will now do a small English-to-Swedish translation experiment by training on a corpus from Tatoeba (https://tatoeba.org/en/).

To this end, we are going to use an Encoder-Decoder architecture. The encoder is a recurrent neural network with GRU cells, that encodes the English input sentence. The Encoder can be either uni-directional or bi-directional.

After the encoder has processed the English sentence, the decoder then takes over and generates the Swedish sentence, starting from the final hidden state of the encoder (or the concatenation of the two final hidden states, in the case of a bi-directional encoder). If an attention mechanism is used, the decoder will access all the hidden states of the encoder when deciding which word to output next.


In [35]:
# First run this cell
from datetime import datetime
import argparse
import random
import pickle
import codecs
import json
import os
import nltk
import torch
import numpy as np
from pprint import pprint

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from terminaltables import AsciiTable

In [36]:
# Mappings between symbols and integers, and vice versa.
# They are global for all datasets.
source_w2i = {}
source_i2w = []
target_w2i = {}
target_i2w = []

# The padding symbol will be used to ensure that all tensors in a batch
# have equal length.
PADDING_SYMBOL = ' '
source_w2i[PADDING_SYMBOL] = 0
source_i2w.append(PADDING_SYMBOL)
target_w2i[PADDING_SYMBOL] = 0
target_i2w.append(PADDING_SYMBOL)

START_SYMBOL = '<START>'
END_SYMBOL = '<END>'
UNK_SYMBOL = '<UNK>'
source_w2i[START_SYMBOL] = 1
source_i2w.append(START_SYMBOL)
target_w2i[START_SYMBOL] = 1
target_i2w.append(START_SYMBOL)
source_w2i[END_SYMBOL] = 2
source_i2w.append(END_SYMBOL)
target_w2i[END_SYMBOL] = 2
target_i2w.append(END_SYMBOL)
source_w2i[UNK_SYMBOL] = 3
source_i2w.append(UNK_SYMBOL)
target_w2i[UNK_SYMBOL] = 3
target_i2w.append(UNK_SYMBOL)

# Max number of words to be predicted if <END> symbol is not reached
MAX_PREDICTIONS = 20

In [37]:
def load_glove_embeddings(embedding_file):
    """
    Reads pre-made embeddings from a file
    """
    N = len(source_w2i)
    embeddings = [0]*N
    with codecs.open(embedding_file, 'r', 'utf-8') as f:
        for line in f:
            data = line.split()
            word = data[0].lower()
            if word not in source_w2i:
                source_w2i[word] = N
                source_i2w.append(word)
                N += 1
                embeddings.append(0)
            vec = [float(x) for x in data[1:]]
            D = len(vec)
            embeddings[source_w2i[word]] = vec
    # Add a '0' embedding for the padding symbol
    embeddings[0] = [0]*D
    # Check if there are words that did not have a ready-made Glove embedding
    # For these words, add a random vector
    for word in source_w2i:
        index = source_w2i[word]
        if embeddings[index] == 0:
            embeddings[index] = (np.random.random(D)-0.5).tolist()
    return D, embeddings

In [38]:
class TranslationDataset(Dataset):
    """
    A dataset with source sentences and their respective translations
    into the target language.

    Each sentence is represented as a list of word IDs. 
    """

    def __init__(self, filename, record_symbols=True):
        try:
            nltk.word_tokenize("hi there.")
        except LookupError:
            nltk.download('punkt')
        self.source_list = []
        self.target_list = []
        # Read the datafile
        with codecs.open(filename, 'r', 'utf-8') as f:
            lines = f.read().split('\n')
            for line in lines:
                if '\t' not in line:
                    continue
                s, t = line.split('\t')
                source_sentence = []
                for w in nltk.word_tokenize(s):
                    if w not in source_i2w and record_symbols:
                        source_w2i[w] = len(source_i2w)
                        source_i2w.append(w)
                    source_sentence.append(
                        source_w2i.get(w, source_w2i[UNK_SYMBOL]))
                source_sentence.append(source_w2i[END_SYMBOL])
                self.source_list.append(source_sentence)
                target_sentence = []
                for w in nltk.word_tokenize(t):
                    if w not in target_i2w and record_symbols:
                        target_w2i[w] = len(target_i2w)
                        target_i2w.append(w)
                    target_sentence.append(
                        target_w2i.get(w, target_w2i[UNK_SYMBOL]))
                target_sentence.append(target_w2i[END_SYMBOL])
                self.target_list.append(target_sentence)

    def __len__(self):
        return len(self.source_list)

    def __getitem__(self, idx):
        return self.source_list[idx], self.target_list[idx]

In [39]:
# Run this cell. The function below will take care of the case of
# sequences of unequal lengths.

def pad_sequence(batch, pad_source=source_w2i[PADDING_SYMBOL], pad_target=target_w2i[PADDING_SYMBOL]):
    source, target = zip(*batch)
    max_source_len = max(map(len, source))
    max_target_len = max(map(len, target))
    padded_source = [[b[i] if i < len(b) else pad_source for i in range(
        max_source_len)] for b in source]
    padded_target = [[l[i] if i < len(l) else pad_target for i in range(
        max_target_len)] for l in target]
    return padded_source, padded_target

Here is the implementation of the encoder. For task 2(a), you will need to fill a part of the code.


In [40]:
# ==================== Encoder ==================== #

class EncoderRNN(nn.Module):
    """
    Encodes a batch of source sentences. 
    """

    def __init__(self, no_of_input_symbols, embeddings=None, embedding_size=16, hidden_size=25,
                 encoder_bidirectional=False, device='cpu', use_gru=False, tune_embeddings=False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.is_bidirectional = encoder_bidirectional
        self.embedding = nn.Embedding(no_of_input_symbols, embedding_size)
        if embeddings != None:
            self.embedding.weight = nn.Parameter(torch.tensor(
                embeddings, dtype=torch.float), requires_grad=tune_embeddings)
        if use_gru:
            self.rnn = nn.GRU(embedding_size, hidden_size,
                              batch_first=True, bidirectional=self.is_bidirectional)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size,
                              batch_first=True, bidirectional=self.is_bidirectional)
        self.device = device
        self.to(device)

    def set_embeddings(self, embeddings):
        self.embedding.weight = torch.tensor(embeddings, dtype=torch.float)

    def forward(self, x):
        """
        x is a list of lists of size (batch_size,max_seq_length)
        Each inner list contains word IDs and represents one sentence.
        The whole list-of-lists represents a batch of sentences.

        Returns:
        the output from the encoder RNN: a pair of two tensors, one containing all hidden states, and one 
        containing the last hidden state (see https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)
        """

        x_tensor = torch.tensor(x).to(self.device)

        # FOR TASK (a), REPLACE THE FOLLOWING LINE WITH YOUR CODE
        embedded = self.embedding(x_tensor)
        return self.rnn(embedded)

Here is the decoder. For tasks (b) and (c), fill in the missing code in the 'forward' function.


In [41]:
# ==================== Decoder ==================== #

class DecoderRNN(nn.Module):

    def __init__(self, no_of_output_symbols, embedding_size=16, hidden_size=25, use_attention=True,
                 display_attention=False, device='cpu', use_gru=False):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(no_of_output_symbols, embedding_size)
        self.no_of_output_symbols = no_of_output_symbols
        # shouldn't W be 2*hidden_size
        self.W = nn.Parameter(torch.rand(hidden_size, hidden_size)-0.5)
        self.U = nn.Parameter(torch.rand(hidden_size, hidden_size)-0.5)
        self.v = nn.Parameter(torch.rand(hidden_size, 1)-0.5)
        self.use_attention = use_attention
        self.display_attention = display_attention
        if use_gru:
            self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        else:
            self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.output = nn.Linear(hidden_size, no_of_output_symbols)
        self.device = device
        self.to(device)

    def forward(self, inp, hidden, encoder_outputs):
        """
        'input' is a list of length batch_size, containing the current word
        of each sentence in the batch

        'hidden' is a tensor containing the last hidden state of the decoder, 
        for each sequence in the batch
        hidden.shape = (1, batch_size, hidden_size)

        'encoder_outputs' is a tensor containing all hidden states from the
        encoder (used in problem c)
        encoder_outputs.shape = (batch_size, max_seq_length, hidden_size)

        Note that 'max_seq_length' above refers to the max_seq_length
        of the encoded sequence (not the decoded sequence).

        Returns:
        If use_attention and display_attention are both True (task (c)), return a triple
        (logits for the predicted next word, hidden state, attention weights alpha)

        Otherwise (task (b)), return a pair
        (logits for the predicted next word, hidden state).
        """
        inp_tensor = torch.tensor(inp).to(self.device)

        # FOR (b) and (c) REPLACE THE FOLLOWING LINE WITH YOUR CODE
        embedded = self.embedding(inp_tensor).unsqueeze(1)

        if self.use_attention and self.display_attention:
            # FOR TASK (c), REPLACE THE FOLLOWING LINES WITH YOUR CODE
            # Calculate attention weights
            alpha = torch.matmul(torch.tanh(torch.matmul(hidden.permute(1, 0, 2), self.W) +
                                            torch.matmul(encoder_outputs, self.U)), self.v)
            alpha = F.softmax(alpha, dim=1)
            # Calculate context vector
            context = torch.bmm(alpha.permute(0, 2, 1), encoder_outputs)
            context = context.permute(1, 0, 2)
            output, hidden = self.rnn(embedded, context)
            return self.output(output), hidden, alpha

        else:
            output, hidden = self.rnn(embedded, hidden)
            return self.output(output), hidden

In [42]:
# This function will be used for evaluation of both the dev set (during training)
# and the test set (after training is finished).
def evaluate(ds, encoder, decoder):
    confusion = [[0 for a in target_i2w] for b in target_i2w]
    correct_sentences, incorrect_sentences = 0, 0
    for x, y in ds:
        predicted_sentence = []
        outputs, hidden = encoder([x])
        if encoder.is_bidirectional:
            hidden = hidden.permute((1, 0, 2)).reshape(1, -1).unsqueeze(0)
        predicted_symbol = target_w2i[START_SYMBOL]
        for correct in y:
            predictions, hidden = decoder([predicted_symbol], hidden, outputs)
            _, predicted_tensor = predictions.topk(1)
            predicted_symbol = predicted_tensor.detach().item()
            confusion[int(predicted_symbol)][int(correct)] += 1
            predicted_sentence.append(predicted_symbol)
        if predicted_sentence == y:
            correct_sentences += 1
        else:
            incorrect_sentences += 1
    correct_symbols = sum([confusion[i][i] for i in range(len(confusion))])
    all_symbols = torch.tensor(confusion).sum().item()

    # Construct a neat confusion matrix
    for i in range(len(confusion)):
        confusion[i].insert(0, target_i2w[i])
    first_row = ["Predicted/Real"]
    first_row.extend(target_i2w)
    confusion.insert(0, first_row)
    # t = AsciiTable( confusion )

    # print( t.table )
    print("Correctly predicted words    : ", correct_symbols)
    print("Incorrectly predicted words  : ", all_symbols-correct_symbols)
    print("Correctly predicted sentences  : ", correct_sentences)
    print("Incorrectly predicted sentences: ", incorrect_sentences)
    print()

In [43]:
# Use 'Run all cells' to do the training.

# ================ Hyper-parameters ================ #

use_attention = True
use_gru = True         # Use Gated Recurrent Units (rather than plain RNNs)
bidirectional = True   # Use a bidirectional encoder
use_embeddings = True      # Use pre-loaded Glove embeddings
tune_embeddings = True  # Fine-tune the Glove embeddings
batch_size = 64
hidden_size = 25       # Number of dimensions in the hidden state
learning_rate = 0.001
epochs = 50            # We will train for this many epochs
save = True           # Do not save the model

# ====================== Data ===================== #

training_file = '/datasets/dd2417/eng-swe-train.txt'
test_file = '/datasets/dd2417/eng-swe-test.txt'
dev_file = '/datasets/dd2417/eng-swe-dev.txt'

# ==================== Training ==================== #
# Reproducibility
# Read a bit more here -- https://pytorch.org/docs/stable/notes/randomness.html
random.seed(5719)
np.random.seed(5719)
# torch.manual_seed(5719)
# torch.use_deterministic_algorithms(True)

# Can we run on GPU?
if torch.cuda.is_available():
    print("Current device: {}".format(torch.cuda.get_device_name(0)))
else:
    print('Running on CPU')
print()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Read datasets
training_dataset = TranslationDataset(training_file)
dev_dataset = TranslationDataset(dev_file, record_symbols=False)

print("Number of source words: ", len(source_i2w))
print("Number of target words: ", len(target_i2w))
print("Number of training sentences: ", len(training_dataset))
print()

# If we have pre-computed word embeddings, then make sure these are used
if use_embeddings:
    embedding_size, embeddings = load_glove_embeddings(
        '/datasets/dd2417/glove.6B.50d.txt')
else:
    embedding_size = args.hidden_size
    embeddings = None

training_loader = DataLoader(
    training_dataset, batch_size=batch_size, collate_fn=pad_sequence)
dev_loader = DataLoader(
    dev_dataset, batch_size=batch_size, collate_fn=pad_sequence)

criterion = nn.CrossEntropyLoss()

encoder = EncoderRNN(
    len(source_i2w),
    embeddings=embeddings,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    encoder_bidirectional=bidirectional,
    tune_embeddings=tune_embeddings,
    use_gru=use_gru,
    device=device
)
decoder = DecoderRNN(
    len(target_i2w),
    embedding_size=embedding_size,
    hidden_size=hidden_size*(bidirectional+1),
    use_attention=use_attention,
    use_gru=use_gru,
    device=device
)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

encoder.train()
decoder.train()
print(datetime.now().strftime("%H:%M:%S"), "Starting training.")

for epoch in range(epochs):
    total_loss = 0
    # tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
    for source, target in training_loader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss = 0
        # hidden is (D * num_layers, B, H)
        outputs, hidden = encoder(source)
        if bidirectional:
            # (2, B, H) -> (B, 2 * H) -> (1, B, 2 * H)
            hidden = torch.cat(
                [hidden[0, :, :], hidden[1, :, :]], dim=1).unsqueeze(0)

        # The probability of doing teacher forcing will decrease
        # from 1 to 0 over the range of epochs. This could be implemented
        # like this:
        # teacher_forcing_ratio = 1- epoch/args.epochs
        # But, for now we will always use teacher forcing
        teacher_forcing_ratio = 1

        # The input to the decoder in the first time step will be
        # the boundary symbol, regardless if we are using teacher
        # forcing or not.
        idx = [target_w2i[START_SYMBOL] for sublist in target]
        predicted_symbol = [target_w2i[START_SYMBOL] for sublist in target]

        target_length = len(target[0])
        for i in range(target_length):
            use_teacher_forcing = (random.random() < teacher_forcing_ratio)
            if use_teacher_forcing:
                predictions, hidden = decoder(idx, hidden, outputs)
            else:
                # Here we input the previous prediction rather than the
                # correct symbol.
                predictions, hidden = decoder(
                    predicted_symbol, hidden, outputs)
            _, predicted_tensor = predictions.topk(1)
            predicted_symbol = predicted_tensor.squeeze().tolist()

            # The targets will be the ith symbol of all the target
            # strings. They will also be used as inputs for the next
            # time step if we use teacher forcing.
            idx = [sublist[i] for sublist in target]
            loss += criterion(predictions.squeeze(),
                              torch.tensor(idx).to(device))
        loss /= (target_length * batch_size)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        total_loss += loss
    print(datetime.now().strftime("%H:%M:%S"), "Epoch",
          epoch, "loss:", total_loss.detach().item())
    total_loss = 0

    if epoch % 10 == 0:
        print("Evaluating on the dev data...")
        evaluate(dev_dataset, encoder, decoder)

# ==================== Save the model  ==================== #

if (save):
    dt = str(datetime.now()).replace(
        ' ', '_').replace(':', '_').replace('.', '_')
    newdir = 'model_' + dt
    os.mkdir(newdir)
    torch.save(encoder.state_dict(), os.path.join(newdir, 'encoder.model'))
    torch.save(decoder.state_dict(), os.path.join(newdir, 'decoder.model'))
    with open(os.path.join(newdir, 'source_w2i'), 'wb') as f:
        pickle.dump(source_w2i, f)
        f.close()
    with open(os.path.join(newdir, 'source_i2w'), 'wb') as f:
        pickle.dump(source_i2w, f)
        f.close()
    with open(os.path.join(newdir, 'target_w2i'), 'wb') as f:
        pickle.dump(target_w2i, f)
        f.close()
    with open(os.path.join(newdir, 'target_i2w'), 'wb') as f:
        pickle.dump(target_i2w, f)
        f.close()

    settings = {
        'training_set': training_file,
        'test_set': test_file,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'hidden_size': hidden_size,
        'attention': use_attention,
        'bidirectional': bidirectional,
        'embedding_size': embedding_size,
        'use_gru': use_gru,
        'tune_embeddings': tune_embeddings
    }
    with open(os.path.join(newdir, 'settings.json'), 'w') as f:
        json.dump(settings, f)

# ==================== Evaluation ==================== #

encoder.eval()
decoder.eval()
print("Evaluating on the test data...")

test_dataset = TranslationDataset(test_file, record_symbols=False)
print("Number of test sentences: ", len(test_dataset))
print()

evaluate(test_dataset, encoder, decoder)

Current device: NVIDIA H100 80GB HBM3 MIG 1g.10gb

Number of source words:  8883
Number of target words:  12861
Number of training sentences:  33454

15:31:53 Starting training.
15:32:02 Epoch 0 loss: 23.64399528503418
Evaluating on the dev data...
Correctly predicted words    :  358
Incorrectly predicted words  :  2296
Correctly predicted sentences  :  0
Incorrectly predicted sentences:  370

15:32:31 Epoch 1 loss: 15.650080680847168
15:32:40 Epoch 2 loss: 14.015079498291016
15:32:49 Epoch 3 loss: 12.933985710144043
15:32:58 Epoch 4 loss: 12.133914947509766
15:33:08 Epoch 5 loss: 11.500115394592285
15:33:18 Epoch 6 loss: 10.970999717712402
15:33:27 Epoch 7 loss: 10.514049530029297
15:33:36 Epoch 8 loss: 10.108987808227539
15:33:45 Epoch 9 loss: 9.744508743286133
15:33:55 Epoch 10 loss: 9.411721229553223
Evaluating on the dev data...
Correctly predicted words    :  808
Incorrectly predicted words  :  1846
Correctly predicted sentences  :  0
Incorrectly predicted sentences:  370

15:34:

In [44]:
# ==================== User interaction ==================== #

decoder.display_attention = True
while (True):
    text = input("> ")
    if text == "":
        continue
    if text == "exit":
        break
    try:
        source_sentence = [source_w2i[w] for w in nltk.word_tokenize(text)]
    except KeyError:
        print("Erroneous input string")
        continue
    outputs, hidden = encoder([source_sentence])
    if encoder.is_bidirectional:
        hidden = hidden.permute((1, 0, 2)).reshape(1, -1).unsqueeze(0)

    predicted_symbol = target_w2i[START_SYMBOL]
    target_sentence = []
    attention_probs = []
    num_attempts = 0
    while num_attempts < MAX_PREDICTIONS:
        if use_attention:
            predictions, hidden, alpha = decoder(
                [predicted_symbol], hidden, outputs)
            attention_probs.append(alpha.permute(
                0, 2, 1).squeeze().detach().tolist())
        else:
            predictions, hidden = decoder([predicted_symbol], hidden, outputs)

        _, predicted_tensor = predictions.topk(1)
        predicted_symbol = predicted_tensor.detach().item()
        target_sentence.append(predicted_symbol)

        num_attempts += 1

        if predicted_symbol == target_w2i[END_SYMBOL]:
            break

    for i in target_sentence:
        print(target_i2w[i].encode('utf-8').decode(), end=' ')
    print()

    if use_attention:
        # Construct the attention table
        ap = torch.tensor(attention_probs).T
        if len(ap.shape) == 1:
            ap = ap.unsqueeze(0)
        attention_probs = ap.tolist()

        for i in range(len(attention_probs)):
            for j in range(len(attention_probs[i])):
                attention_probs[i][j] = "{val:.2f}".format(
                    val=attention_probs[i][j])
        for i in range(len(attention_probs)):
            if i < len(text):
                attention_probs[i].insert(0, source_i2w[source_sentence[i]])
            else:
                attention_probs[i].insert(0, ' ')
        first_row = ["Source/Result"]
        for w in target_sentence:
            first_row.append(target_i2w[w])
        attention_probs.insert(0, first_row)
        t = AsciiTable(attention_probs)
        print(t.table)

det där projektet var borta till sjukhuset mig slut kom för sent till sjukhuset mig slut kom för sent till 
+---------------+------+------+-----------+------+-------+------+-----------+------+------+------+------+------+------+-----------+------+------+------+------+------+------+
| Source/Result | det  | där  | projektet | var  | borta | till | sjukhuset | mig  | slut | kom  | för  | sent | till | sjukhuset | mig  | slut | kom  | för  | sent | till |
+---------------+------+------+-----------+------+-------+------+-----------+------+------+------+------+------+------+-----------+------+------+------+------+------+------+
| it            | 0.05 | 0.07 | 0.08      | 0.16 | 0.03  | 0.01 | 0.03      | 0.09 | 0.05 | 0.11 | 0.04 | 0.07 | 0.07 | 0.03      | 0.09 | 0.05 | 0.11 | 0.04 | 0.07 | 0.07 |
| is            | 0.07 | 0.08 | 0.10      | 0.11 | 0.05  | 0.05 | 0.06      | 0.10 | 0.05 | 0.20 | 0.13 | 0.23 | 0.12 | 0.03      | 0.11 | 0.05 | 0.20 | 0.13 | 0.23 | 0.12 |
| seven         | 0.11

  ap = torch.tensor(attention_probs).T


mamma vill emily följde ” smith ! <END> 
+---------------+-------+------+-------+--------+------+-------+------+-------+
| Source/Result | mamma | vill | emily | följde | ”    | smith | !    | <END> |
+---------------+-------+------+-------+--------+------+-------+------+-------+
| hello         | 1.00  | 1.00 | 1.00  | 1.00   | 1.00 | 1.00  | 1.00 | 1.00  |
+---------------+-------+------+-------+--------+------+-------+------+-------+
jag bestämmer ! <END> 
+---------------+------+-----------+------+-------+
| Source/Result | jag  | bestämmer | !    | <END> |
+---------------+------+-----------+------+-------+
| hello         | 0.33 | 0.32      | 0.62 | 0.30  |
| .             | 0.67 | 0.68      | 0.38 | 0.70  |
+---------------+------+-----------+------+-------+
