# Neural Machine Translation with RNN

We apply RNNs to language translation, which is a typical sequence-to-sequence (seq2seq) learning task. We will use an Encoder-Decoder architecture in which both the Encoder and the Decoder are RNN models. 

We will use the data from [the Tatoeba Project](http://www.manythings.org/anki/) and specifically, we will start with the fra-eng (French-English) translation data.

# Data Processing

In [124]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from random import shuffle, sample
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

## Read raw texts and perform train-val-test splitting

In [132]:
## Read all lines and shuffle
# The corpus contains accented French text, so decode it explicitly as UTF-8
# instead of relying on the platform default, and close the handle promptly.
with open("datasets/fra-eng/fra.txt", "r", encoding="utf-8") as file:
    all_lines = file.readlines()
shuffle(all_lines)
# Work on a small random subset (at most 1000 sentence pairs); `min` keeps
# this from raising ValueError when the file has fewer lines than that.
all_lines = sample(all_lines, min(1000, len(all_lines)))
## Train-Val-Test split
# 80% train, 10% validation, and the remainder is used as the test set.
train_ratio, val_ratio = 0.8, 0.1
train_end = int(len(all_lines)*train_ratio)
val_end = int(len(all_lines)*(train_ratio+val_ratio))
train_raw = all_lines[:train_end]
val_raw = all_lines[train_end:val_end]
test_raw = all_lines[val_end:]

## Vocab, Dataset and Dataloader

In [133]:
def get_vocabs(lines, source_tokenizer, target_tokenizer):
    """Build the source and target vocabularies from raw tab-separated lines.

    Each line is split on tabs; field 0 is the source sentence and field 1 is
    the target sentence. Both are lower-cased before tokenization.

    Args:
        lines: iterable of raw text lines.
        source_tokenizer: callable mapping a string to a list of tokens.
        target_tokenizer: callable mapping a string to a list of tokens.

    Returns:
        A ``(source_vocab, target_vocab)`` tuple. Both vocabularies contain the
        special tokens ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``, and map
        unknown tokens to the index of ``<unk>``.
    """
    token_counts = (Counter(), Counter())
    for raw_line in lines:
        fields = raw_line.split("\t")  # source, target and note are separated by tab
        token_counts[0].update(source_tokenizer(fields[0].lower()))
        token_counts[1].update(target_tokenizer(fields[1].lower()))

    built = []
    for counts in token_counts:
        current = vocab(counts, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
        # Out-of-vocabulary lookups fall back to the <unk> index.
        current.set_default_index(current['<unk>'])
        built.append(current)
    return built[0], built[1]

def process_data(lines, source_tokenizer, target_tokenizer, source_vocab, target_vocab):
    """Convert raw tab-separated lines into (source_ids, target_ids) tensor pairs.

    Field 0 of each line is the source sentence and field 1 the target
    sentence; any further fields (e.g. an attribution note) are ignored, which
    matches how ``get_vocabs`` reads the same lines. Sentences are lower-cased
    before tokenization.

    Args:
        lines: iterable of raw text lines.
        source_tokenizer: callable mapping a string to a list of tokens.
        target_tokenizer: callable mapping a string to a list of tokens.
        source_vocab: callable mapping a list of tokens to a list of ids.
        target_vocab: callable mapping a list of tokens to a list of ids.

    Returns:
        A list of ``(source_tensor, target_tensor)`` tuples, one per line.

    Raises:
        ValueError: if a line has fewer than two tab-separated fields.
    """
    data = []
    for line in lines:
        # Drop the line terminator first so it cannot leak into the target
        # sentence when the line has only two columns.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            raise ValueError(
                f"expected at least 2 tab-separated fields, got {len(fields)}: {line!r}"
            )
        source, target = fields[0], fields[1]
        source_tensor = torch.tensor(source_vocab(source_tokenizer(source.lower())))
        target_tensor = torch.tensor(target_vocab(target_tokenizer(target.lower())))
        data.append((source_tensor, target_tensor))
    return data

def generate_batch(data_batch, source_vocab, target_vocab):
    """Collate (source, target) id tensors into two padded batch tensors.

    Every sequence is wrapped with the ``<bos>`` / ``<eos>`` ids of its own
    vocabulary, then the sequences are padded with that vocabulary's ``<pad>``
    id via ``pad_sequence`` (called with its default layout).

    Args:
        data_batch: iterable of ``(source_tensor, target_tensor)`` pairs.
        source_vocab: mapping supporting ``["<bos>"]``, ``["<eos>"]``, ``["<pad>"]``.
        target_vocab: same, for the target language.

    Returns:
        A ``(source_batch, target_batch)`` tuple of padded tensors.
    """
    def wrap(ids, vocabulary):
        # <bos> + token ids + <eos>
        start = torch.tensor([vocabulary["<bos>"]])
        end = torch.tensor([vocabulary["<eos>"]])
        return torch.cat([start, ids, end], dim=0)

    wrapped_sources, wrapped_targets = [], []
    for src_ids, tgt_ids in data_batch:
        wrapped_sources.append(wrap(src_ids, source_vocab))
        wrapped_targets.append(wrap(tgt_ids, target_vocab))

    padded_sources = pad_sequence(wrapped_sources, padding_value=source_vocab["<pad>"])
    padded_targets = pad_sequence(wrapped_targets, padding_value=target_vocab["<pad>"])
    return padded_sources, padded_targets

In [134]:
## Tokenizers come from spacy (English source, French target)
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
fr_tokenizer = get_tokenizer("spacy", language="fr_core_news_sm")

## Vocabularies are built from the training split only
en_vocab, fr_vocab = get_vocabs(train_raw, en_tokenizer, fr_tokenizer)

## Source/target token ids for each split
train_data, val_data, test_data = (
    process_data(raw_split, en_tokenizer, fr_tokenizer, en_vocab, fr_vocab)
    for raw_split in (train_raw, val_raw, test_raw)
)


## Train, val, test dataloaders
def collator(x):
    """Collate one batch using the English/French vocabularies built above."""
    return generate_batch(x, en_vocab, fr_vocab)


bsz = 16
# Only the training loader reshuffles its data.
train_iter = DataLoader(train_data, batch_size=bsz, shuffle=True, collate_fn=collator)
val_iter, test_iter = (
    DataLoader(split_data, batch_size=bsz, shuffle=False, collate_fn=collator)
    for split_data in (val_data, test_data)
)

# Building Seq2Seq Module

In [65]:
from torch import nn 
import torch.nn.functional as F 
import random

In [141]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU and
    projects the final forward/backward hidden states to the decoder's
    hidden size.

    Args:
        input_dim: size of the source vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: hidden size expected by the decoder.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(self.input_dim, self.emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(2*enc_hid_dim, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: tensor of source token ids; Seq2Seq.forward treats dim 1 as
                the batch dimension.

        Returns:
            ``(outputs, hidden)``: the GRU outputs for every position (last dim
            is ``2 * enc_hid_dim``) and the initial decoder hidden state (last
            dim is ``dec_hid_dim``).
        """
        # Apply the configured dropout to the embeddings, as the Decoder does;
        # previously self.dropout was created but never used.
        embeded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embeded)
        # hidden[-2] / hidden[-1] are the final forward / backward states;
        # concatenate them and project to the decoder hidden size.
        hidden = torch.tanh(self.fc(torch.cat([hidden[-2,:,:],hidden[-1,:,:]],dim=-1)))

        return outputs, hidden

class Attention(nn.Module):
    """Attention scorer over encoder outputs.

    Scores each source position from the concatenation of the decoder hidden
    state and that position's encoder output, then normalises the scores with
    a softmax over source positions.

    Args:
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: decoder hidden size.
        attn_dim: output size of the scoring linear layer.
    """

    def __init__(self,enc_hid_dim, dec_hid_dim, attn_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Encoder outputs carry both directions, hence the factor of two.
        self.attn_in = 2 * enc_hid_dim + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """Return attention weights with one row per batch element.

        Args:
            decoder_hidden: current decoder hidden state, batch first.
            encoder_outputs: encoder outputs with source positions on dim 0.

        Returns:
            Tensor whose last dimension indexes source positions and sums to 1.
        """
        n_positions = encoder_outputs.size(0)

        # Broadcast the hidden state across all source positions and put the
        # encoder outputs in batch-first order so the two line up.
        tiled_hidden = decoder_hidden.unsqueeze(1).expand(-1, n_positions, -1)
        batch_first_outputs = encoder_outputs.transpose(0, 1)

        features = torch.cat((tiled_hidden, batch_first_outputs), dim=-1)
        scores = torch.tanh(self.attn(features)).sum(dim=-1)
        return torch.softmax(scores, dim=-1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of one encoder direction.
        dec_hid_dim: decoder hidden size.
        dropout: dropout probability applied to the token embeddings.
        attention: callable producing attention weights; must expose an
            ``attn_in`` attribute.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the attended encoder context plus the token embedding.
        self.rnn = nn.GRU(2 * enc_hid_dim + emb_dim, dec_hid_dim)
        # The output layer sees GRU output, context and embedding concatenated.
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def _weighted_encoder_rep(self, decoder_hidden, encoder_outputs):
        """Return the attention-weighted sum of encoder outputs, with a leading dim of 1."""
        weights = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)
        # bmm needs batch-first operands; swap back to step-first afterwards.
        context = torch.bmm(weights, encoder_outputs.transpose(0, 1))
        return context.transpose(0, 1)

    def forward(self, input, decoder_hidden, encoder_outputs):
        """Decode one target step.

        Args:
            input: token ids for the current step, one per batch element.
            decoder_hidden: previous decoder hidden state.
            encoder_outputs: encoder outputs for all source positions.

        Returns:
            ``(output, decoder_hidden)``: scores over the target vocabulary and
            the updated hidden state.
        """
        token_emb = self.dropout(self.embedding(input.unsqueeze(0)))
        context = self._weighted_encoder_rep(decoder_hidden, encoder_outputs)

        rnn_out, next_hidden = self.rnn(
            torch.cat((token_emb, context), dim=2),
            decoder_hidden.unsqueeze(0),
        )

        # Drop the length-1 step dimension before the output projection.
        features = torch.cat(
            (rnn_out.squeeze(0), context.squeeze(0), token_emb.squeeze(0)),
            dim=1,
        )
        return self.out(features), next_hidden.squeeze(0)

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: callable returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: callable returning ``(output, hidden)`` for one step; must
            expose ``output_dim``.
        device: device on which the output buffer is placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the full decode loop.

        Args:
            src: source batch; dim 1 is the batch dimension.
            trg: target batch; dim 0 is the step dimension.
            teacher_forcing_ratio: probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                model's own prediction.

        Returns:
            Tensor of decoder scores indexed by (step, batch, vocab). Step 0 is
            never written and stays zero.
        """
        n_steps, n_batch = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(n_steps, n_batch, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        # Decoding starts from the first target row (the <bos> tokens).
        step_input = trg[0, :]
        for t in range(1, n_steps):
            scores, hidden = self.decoder(step_input, hidden, encoder_outputs)
            outputs[t] = scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = scores.argmax(1)  # greedy prediction
        return outputs

In [146]:
# Hyperparameters
emb_dim = 32
enc_hid_dim = 64
dec_hid_dim = 64
attn_dim = 8
dropout = 0.8

# Vocabulary sizes: English is the source side, French the target side.
input_dim = len(en_vocab)
output_dim = len(fr_vocab)

# Components are built in encoder -> attention -> decoder order.
encoder = Encoder(input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout)
attention = Attention(enc_hid_dim, dec_hid_dim, attn_dim)
decoder = Decoder(output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention)

# Use the "mps" backend when available (original note: only works on
# M-Series Mac); otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = Seq2Seq(encoder, decoder, device).to(device)

In [147]:
def init_weights(m: nn.Module):
    """Re-initialise all parameters reachable from ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); every other parameter is set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)


# Re-initialise the model's parameters with init_weights; as the last
# expression of the cell, the module that apply() returns is echoed below.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1186, 32)
    (rnn): GRU(32, 64, bidirectional=True)
    (fc): Linear(in_features=128, out_features=64, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=192, out_features=8, bias=True)
    )
    (embedding): Embedding(1483, 32)
    (rnn): GRU(160, 64)
    (out): Linear(in_features=224, out_features=1483, bias=True)
    (dropout): Dropout(p=0.8, inplace=False)
  )
)

# Train, Evaluation, Prediction

In [148]:
from torch import optim
import math

In [149]:
def train_step(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Args:
        model: callable taking ``(src, trg)`` and returning per-step scores.
        iterator: iterable of ``(src, trg)`` batches supporting ``len``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss applied to flattened scores and targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()

    running_loss = 0
    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)
        scores = model(src, trg)

        # Skip step 0 (the model never writes it) and flatten steps and batch
        # together so the criterion sees one row per target token.
        flat_scores = scores[1:].view(-1, scores.shape[-1])
        flat_targets = trg[1:].view(-1)
        batch_loss = criterion(flat_scores, flat_targets)

        optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate_step(model, iterator, criterion):
    """Compute the mean batch loss of ``model`` over ``iterator`` without training.

    Args:
        model: callable taking ``(src, trg)`` and returning per-step scores.
        iterator: iterable of ``(src, trg)`` batches supporting ``len``.
        criterion: loss applied to flattened scores and targets.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    # Must be *called*: the bare attribute access `model.eval` is a no-op and
    # left dropout active during validation.
    model.eval()

    epoch_loss = 0
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for src, trg in iterator:
            src, trg = src.to(device), trg.to(device)
            # NOTE(review): the model is called with its default
            # teacher_forcing_ratio, so the reported loss is still stochastic.
            output = model(src, trg)

            # Skip step 0 and flatten steps and batch together for the criterion.
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
    return epoch_loss/len(iterator)


def train(model, train_iter, val_iter, n_epochs, ignore_index, clip=1):
    """Train ``model`` for ``n_epochs`` and print loss and perplexity per epoch.

    Args:
        model: the module to optimise.
        train_iter: batches used for training.
        val_iter: batches used for validation after each epoch.
        n_epochs: number of passes over ``train_iter``.
        ignore_index: target id excluded from the cross-entropy loss.
        clip: maximum gradient norm forwarded to ``train_step``.
    """
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
    optimizer = optim.Adam(model.parameters())

    for epoch in range(n_epochs):
        train_loss = train_step(model, train_iter, optimizer, criterion, clip)
        val_loss = evaluate_step(model, val_iter, criterion)

        # Perplexity is reported as exp(mean loss).
        print(f"Epoch: {epoch+1}-----------------------------------------------------")
        print(f"\tTrain Loss: {train_loss} | Train PPL:{math.exp(train_loss)}")
        print(f"\tVal.  Loss: {val_loss} | Val. PPL:{math.exp(val_loss)}")

In [150]:
# Train for 2 epochs; the French <pad> id is passed as ignore_index so that
# padded target positions are excluded from the loss.
train(model, train_iter,val_iter, 2, fr_vocab["<pad>"])

Epoch: 1-----------------------------------------------------
	Train Loss: 6.752014045715332 | Train PPL:855.7806092744532
	Val.  Loss: 6.067500182560512 | Train PPL:431.60040966478596
Epoch: 2-----------------------------------------------------
	Train Loss: 5.8699533653259275 | Train PPL:354.23246036723316
	Val.  Loss: 6.121813501630511 | Train PPL:455.69034079383084
