## German -> English translation (Seq2Seq with attention)

https://www.youtube.com/watch?v=sQUqQddQtB4&list=PLhhyoLH6IjfxeoooqP9rhU3HJIAVAJ3Vz&index=31

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k

from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random

from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

In [2]:
# Load the spaCy pipelines whose tokenizers are used below for German (source)
# and English (target) text.
# NOTE(review): 'de' / 'en' are shortcut names; newer spaCy releases reportedly only
# accept full package names (e.g. 'de_core_news_sm') -- confirm against the installed version.
spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')

In [3]:
def tokenizer_ger(text):
    """Split German ``text`` into a list of token strings using the spaCy German tokenizer."""
    pieces = []
    for token in spacy_ger.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenizer_eng(text):
    """Split English ``text`` into a list of token strings using the spaCy English tokenizer."""
    from operator import attrgetter

    # Pull the raw string out of every spaCy token, preserving order.
    return list(map(attrgetter('text'), spacy_eng.tokenizer(text)))

<br/>

### Preprocessing

In [4]:
# One Field per language: tokenize with the matching spaCy tokenizer, lowercase the
# text, and configure '<sos>' / '<eos>' as the start and end tokens of every sentence.
german = Field(tokenize=tokenizer_ger, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True, init_token='<sos>', eos_token='<eos>')


# Multi30k splits: the '.de' side is processed by `german`, the '.en' side by `english`
# (the order of `exts` and `fields` must match).
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))

In [5]:
# Build both vocabularies from the training split only: at most 10000 entries per
# language, and a token must occur at least twice to be included.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

<br/>

### Encoder

In [6]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder for the attention-based Seq2Seq model.

    Args:
        input_size: size of the source vocabulary (number of embedding rows).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: hidden size of the LSTM (per direction).
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and (when
            ``num_layers > 1``) between the stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)

        # nn.LSTM only applies dropout between stacked layers, so it is
        # disabled for a single layer (this also avoids the PyTorch warning).
        self.rnn = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers,
            bidirectional=True,
            dropout=p if num_layers > 1 else 0.0,
        )

        # The decoder is unidirectional, so instead of discarding either the
        # forward or the backward final state we concatenate both
        # (hidden_size * 2) and project them back down to hidden_size.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, x):
        """Encode a batch of source sentences.

        Args:
            x: token indices of shape (seq_length, N).

        Returns:
            encoder_states: per-timestep LSTM outputs (forward and backward
                concatenated), shape (seq_length, N, hidden_size * 2).
            hidden: projected final hidden state, shape (num_layers, N, hidden_size).
            cell: projected final cell state, shape (num_layers, N, hidden_size).
        """
        # x shape: (seq_length, N)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        # encoder_states holds the forward and backward hidden states of the
        # last layer for every timestep; hidden/cell hold only the final
        # states of each layer and direction.
        encoder_states, (hidden, cell) = self.rnn(embedding)
        # encoder_states shape: (seq_length, N, hidden_size * 2)
        # hidden, cell shape:   (num_layers * 2, N, hidden_size)

        # The first dimension is laid out as
        # [layer0_fwd, layer0_bwd, layer1_fwd, layer1_bwd, ...], so the even
        # rows are the forward states and the odd rows the backward states.
        # Pairing them per layer keeps this correct for any num_layers
        # (for one layer it is exactly hidden[0:1] and hidden[1:2]).
        hidden = self.fc_hidden(torch.cat((hidden[0::2], hidden[1::2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0::2], cell[1::2]), dim=2))
        # hidden, cell shape: (num_layers, N, hidden_size)

        return encoder_states, hidden, cell

### Decoder

In [7]:
class Decoder(nn.Module):
    """Unidirectional LSTM decoder with additive attention over the encoder states.

    Args:
        input_size: size of the target vocabulary (number of embedding rows).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: hidden size of the LSTM; the encoder states are expected
            to have ``hidden_size * 2`` features (forward + backward).
        output_size: number of output classes (target vocabulary size).
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and (when
            ``num_layers > 1``) between the stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)

        # The LSTM input is the context vector from the encoder
        # (hidden_size * 2: forward + backward) concatenated with the
        # embedding of the previous target token (embedding_size).
        # nn.LSTM only applies dropout between stacked layers, so it is
        # disabled for a single layer.
        self.rnn = nn.LSTM(
            hidden_size * 2 + embedding_size,
            hidden_size,
            num_layers,
            dropout=p if num_layers > 1 else 0.0,
        )

        # Scores one encoder position from [decoder hidden (hidden_size);
        # encoder state (hidden_size * 2)] -> a single energy value.
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.softmax = nn.Softmax(dim=0)  # normalize over the source sequence dimension
        self.relu = nn.ReLU()

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        """Run one decoding step.

        Args:
            x: previous target token indices, shape (N,).
            encoder_states: encoder outputs for every source timestep,
                shape (seq_length, N, hidden_size * 2).
            hidden: decoder hidden state, shape (num_layers, N, hidden_size).
            cell: decoder cell state, shape (num_layers, N, hidden_size).

        Returns:
            predictions: unnormalized scores over the target vocabulary,
                shape (N, output_size).
            hidden: updated hidden state, shape (num_layers, N, hidden_size).
            cell: updated cell state, shape (num_layers, N, hidden_size).
        """
        # x shape: (N) but we want (1, N)
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        sequence_length = encoder_states.shape[0]

        # Attention is driven by the top decoder layer only. Slicing with
        # [-1:] keeps the leading dimension at 1, so repeating it yields
        # exactly one copy per source timestep for any num_layers
        # (for a single layer this is the whole hidden state).
        h_reshaped = hidden[-1:].repeat(sequence_length, 1, 1)
        # h_reshaped shape: (seq_length, N, hidden_size)

        # Concatenate the previous decoder hidden state with every encoder
        # state and send it through a one-layer network to get the energies.
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # energy shape: (seq_length, N, 1)

        # Softmax over the source positions gives the attention weights, i.e.
        # how much each encoder state contributes to this decoding step.
        attention = self.softmax(energy)
        # attention shape: (seq_length, N, 1)

        attention = attention.permute(1, 2, 0)
        # attention shape: (N, 1, seq_length)

        encoder_states = encoder_states.permute(1, 0, 2)
        # encoder_states shape: (N, seq_length, hidden_size * 2)

        # Weighted sum of the encoder states.
        context_vector = torch.bmm(attention, encoder_states).permute(1, 0, 2)
        # context_vector shape: (N, 1, hidden_size * 2) -> (1, N, hidden_size * 2)

        # Concatenate the context vector with the embedding along the feature dimension.
        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input shape: (1, N, hidden_size * 2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape: (1, N, output_size)

        predictions = predictions.squeeze(0)
        # predictions shape: (N, output_size)

        return predictions, hidden, cell

### Seq2Seq

In [8]:
class Seq2Seq(nn.Module):
    """Ties an encoder and an attention decoder together for training.

    Args:
        encoder: module returning ``(encoder_states, hidden, cell)`` for a
            source batch.
        decoder: module performing one decoding step; it must expose its
            output projection as ``fc`` (an ``nn.Linear``), which is used to
            determine the target vocabulary size.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode ``target`` step by step, conditioned on ``source``.

        Args:
            source: source token indices, shape (source_len, N).
            target: target token indices, shape (target_len, N); row 0 is
                expected to hold the start-of-sentence token.
            teacher_force_ratio: probability of feeding the ground-truth
                token (instead of the model's own prediction) as the next
                decoder input.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size) holding the
            decoder scores; row 0 is left as zeros because no prediction is
            made for the start token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        # Read the vocabulary size and device from the model and its input
        # rather than from notebook globals, so the module does not depend on
        # hidden state defined in other cells.
        target_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        encoder_states, hidden, cell = self.encoder(source)

        # The first decoder input is the <sos> token.
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            # output shape: (N, target_vocab_size)

            outputs[t] = output

            best_guess = output.argmax(1)

            # Teacher forcing: sometimes feed the correct token, otherwise
            # feed the model's own best guess.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

<br/>

### Training

In [9]:
load_model = False  # when True, the training cell first restores 'my_checkpoint.pth.tar'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyperparameters
num_epochs = 20
learning_rate = 0.001
batch_size = 64


# Model hyperparameters
# The encoder consumes German token ids; the decoder consumes and predicts English token ids.
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)

# an embedding_size of around 100-300 is a good number,
# depending on the size of the dataset
encoder_embedding_size = 300
decoder_embedding_size = 300

hidden_size = 1024
num_layers = 1  # number of layers, 1 did the best during the experimentation
# With num_layers == 1 the LSTMs are built without dropout; these values are still
# used by the nn.Dropout applied to the embeddings in Encoder and Decoder.
enc_dropout = 0.5
dec_dropout = 0.5


# Tensorboard
writer = SummaryWriter(f'runs/loss_plot')  # log directory: runs/loss_plot
step = 0  # global step used as the x-axis for the logged training loss

In [10]:
# Batch iterators for the three splits. Examples are sorted by source length inside
# each batch (sort_within_batch + sort_key) and batches are created on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

# Encoder over the German vocabulary.
encoder_net = Encoder(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    enc_dropout
).to(device)

# Decoder over the English vocabulary (same vocabulary for input and output).
decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Target positions equal to the '<pad>' index are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# German example sentence that is translated at the start of every epoch to
# eyeball training progress.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [11]:
if load_model:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can also be restored on a CPU-only machine.
    load_checkpoint(torch.load('my_checkpoint.pth.tar', map_location=device), model, optimizer)


for epoch in range(num_epochs):
    print(f'\nEpoc [{epoch} / {num_epochs}]')

    # NOTE(review): the checkpoint is written *before* the epoch is trained, so
    # the weights produced by the final epoch are never saved -- confirm this
    # is intended.
    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Translate the fixed example sentence in eval mode (dropout disabled) to
    # get a qualitative view of progress.
    model.eval()

    translated_sentence = translate_sentence(model, sentence, german, english, device, max_length=50)

    print(f'Translated example sentence \n {translated_sentence}')

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        input_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(input_data, target)
        # output shape: (trg_len, batch_size, output_dim)

        # Drop the first timestep (the <sos> position, for which the model
        # makes no prediction) and flatten time and batch so the loss sees
        # (trg_len - 1) * batch_size rows.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Clip the gradient norm to keep the gradients in a healthy range
        # and avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Log a plain Python float instead of the graph-attached tensor.
        writer.add_scalar('Training loss', loss.item(), global_step=step)
        step += 1


Epoc [0 / 20]
=> Saving checkpoint
Translated example sentence 
 ['archway', 'unfriendly', 'viewing', 'year', 'steering', 'sashes', 'ultimate', 'retail', 'fatigued', 'appreciate', 'youth', 'racquet', 'gripping', 'marching', 'pitched', 'spotlights', 'bowls', 'rally', 'equipped', 'no', 'affection', 'affection', 'scores', 'woven', 'camel', 'treats', 'vaulting', 'sideline', 'stilts', 'safe', 'soil', 'presses', 'sleeve', 'fun', 'stretching', 'rope', 'spray', 'interior', 'uniformed', 'buddhist', 'eagerly', 'senior', 'consisting', 'observes', 'after', 'shuffles', 'burger', 'bookshelf', 'bookshelf', 'ninja']

Epoc [1 / 20]
=> Saving checkpoint
Translated example sentence 
 ['a', 'couple', 'with', 'a', 'large', '<unk>', '<unk>', '<unk>', 'on', 'a', 'large', '<unk>', '.', '<eos>']

Epoc [2 / 20]
=> Saving checkpoint
Translated example sentence 
 ['a', 'large', 'with', 'with', 'men', 'on', 'a', 'large', '<unk>', 'by', 'a', 'large', 'of', 'a', 'large', '.', '.', '<eos>']

Epoc [3 / 20]
=> Saving 

In [12]:
# BLEU on a slice of the test split, reported as a percentage.
# NOTE(review): test_data[1:100] skips example 0 and covers 99 examples -- confirm
# whether [:100] was intended.
# NOTE(review): the training loop leaves the model in train() mode; unless `bleu`
# switches to eval() internally, dropout is active here -- verify in utils.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 24.71
