## Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation

In [11]:
import torch
import torchtext
import torch.nn as nn
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data import functional
from torch import optim
from tqdm import tqdm
import sys
import random
import spacy

### 1. Prepare Data

#### Build Vocabulary for source and target languages

In [12]:
de_tokenizer = get_tokenizer('spacy', 'de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

def de_yield_tokens(train_iter):
    for de_text, _ in train_iter:
        yield de_tokenizer(de_text[:-1].lower())
        
def en_yield_tokens(train_iter):
    for _, en_text in train_iter:
        yield en_tokenizer(en_text[:-1].lower())
        
special_tokens = ["<unk>", "<pad>", "sos", "eos"]

train_iter = Multi30k(split=("train"))
source_vocab = build_vocab_from_iterator(de_yield_tokens(train_iter), 
                                         min_freq = 1,
                                         specials = special_tokens)
source_vocab.set_default_index(source_vocab["<unk>"])

train_iter = Multi30k(split=("train"))
target_vocab = build_vocab_from_iterator(en_yield_tokens(train_iter),
                                        min_freq = 1,
                                        specials = special_tokens)
target_vocab.set_default_index(target_vocab["<unk>"])

#### Build Dataloader

In [13]:
# Load data
train_iter, valid_iter, test_iter = Multi30k()

en_text_pipeline = lambda x:  target_vocab(["sos"] + en_tokenizer(x)[:-1] + ["eos"]) 
de_text_pipeline = lambda x: source_vocab(["sos"] + de_tokenizer(x)[:-1] + ["eos"])

BATCH_SIZE = 100

def collate_batch_input(batch):
    source_list, target_list = [], []
    for source_text, target_text in batch:  
        text_seq = de_text_pipeline(source_text.lower()) 
        source_list.append(torch.tensor(text_seq, dtype=torch.int64))
        text_seq = en_text_pipeline(target_text.lower())
        target_list.append(torch.tensor(text_seq[:-1], dtype=torch.int64))
               
    source_tensor = pad_sequence(source_list, batch_first=True, padding_value=source_vocab["<pad>"])
    target_tensor = pad_sequence(target_list, batch_first=True, padding_value=source_vocab["<pad>"])
    
    return source_tensor, target_tensor, 


train_dataset = functional.to_map_style_dataset(train_iter)
valid_dataset = functional.to_map_style_dataset(valid_iter)
test_dataset = functional.to_map_style_dataset(test_iter)

train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE,
                          shuffle = True, collate_fn = collate_batch_input)
valid_loader = DataLoader(valid_dataset, batch_size = BATCH_SIZE,
                          shuffle=False, collate_fn = collate_batch_input)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE,
                          shuffle=False, collate_fn = collate_batch_input)

### 2. Define Model

#### Encoder

<img src="img/seq2seq5.png"/>

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, pad_idx):
        super(Encoder, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx = pad_idx)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first = True)
        
    def forward(self, src):
        # src = [batch_size, src_len]
        
        embedded = self.embedding(src)
        # embedded = [batch_size, src_len, embed_dim]
        
        output, hidden = self.gru(embedded)
        # output = [batch_size, seq_len, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
        
        return hidden

### Decoder

<img src="img/seq2seq6.png"/>

In [15]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, pad_index):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embed_dim, padding_idx=pad_index)
        self.gru = nn.GRU(embed_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(embed_dim + hidden_dim*2, output_dim)
        
    def forward(self, input, hidden, context):
        # input = [batch_size]
        # hidden = [1, batch_size, hidden_dim]
        # context = [1, batch_size, hidden_dim]
        
        input = input.unsqueeze(1)
        # input = [batch_size, 1]
        
        embedded = self.embedding(input)
        # embed = [batch_size, 1, embed_dim]     
        
        context = context.permute(1,0,2)
        # embed = [batch_size, 1, embed_dim]
                
        embedded_context = torch.cat((embedded, context), dim=2)
        # embedded_context = [batch_size, 1, hidden_dim + embed_dim]
        
        output, hidden = self.gru(embedded_context, hidden)
        # output = [batch_size, 1, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
                
        fc_input = torch.cat((embedded.squeeze(1), hidden.squeeze(0), context.squeeze(1)), dim=1)
        # fc_input = [batch_size, embed_dim + hidden_dim + context_dim]
               
        fc_out = self.fc(fc_input.unsqueeze(1))
        # fc_out = [batch_size, embed_dim + hidden_dim + context_dim]
        
        return fc_out, hidden

### Seq2Seq

<img src="img/seq2seq7.png" />

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        
    def forward(self, src, trg, teacher_force_ratio = 0.5):
        # src = [batch_size, src_len]
        # trg = [batch_size, trg_len]
        
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size)
        
        context = self.encoder(src)
        hidden = context
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, context)  
            outputs[:, t, :] = output[:, 0, :]
            best_guess = output.argmax(2).squeeze(1)
            input = trg[:,t] if random.random() < teacher_force_ratio else best_guess
            
        return outputs

In [17]:
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)
EMBED_DIM = 50
HIDDEN_DIM = 60

src_pad_idx = source_vocab["<pad>"]
trg_pad_idx = target_vocab["<pad>"]

enc = Encoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM, src_pad_idx)
dec = Decoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM, trg_pad_idx,)

model = Seq2Seq(enc, dec)

In [18]:
# Calculate the number of trainable parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,149,265 trainable parameters


In [19]:
def train(model, dataloader):
    model.train()
    epoch_loss = 0
    
    for src, trg in tqdm(dataloader, desc='training...', file=sys.stdout):
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        output_dim = output.shape[-1]

        output = output.reshape(-1, output_dim)
        trg = trg.reshape(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        epoch_loss += loss.item()
    
    return epoch_loss/len(dataloader)


def evaluate(model, dataloader):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for src, trg in dataloader:
            output = model(src, trg)
            output_dim = output.shape[-1]
            output = output.reshape(-1, output_dim)
            trg = trg.reshape(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss/len(dataloader)  

In [20]:
N_EPOCHS = 10

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = trg_pad_idx)

for epoch in range(1, N_EPOCHS+1):
    train_loss = train(model, train_loader)
    val_loss = evaluate(model, valid_loader)
    print(f"| Epoch: {epoch}/{N_EPOCHS} | Train Loss: {train_loss} | Val Loss: {val_loss}")
    
eval_loss = evaluate(model, test_loader)
print("="*60)
print(eval_loss)    

torch.save(model, "ml_model_2.pt")

training...: 100%|███████████████████████████████████████████████████████████████████| 290/290 [11:50<00:00,  2.45s/it]
| Epoch: 1/10 | Train Loss: 5.838485459623666 | Val Loss: 5.255523508245295
training...: 100%|███████████████████████████████████████████████████████████████████| 290/290 [11:49<00:00,  2.45s/it]
| Epoch: 2/10 | Train Loss: 5.09617817155246 | Val Loss: 4.961202448064631
training...: 100%|███████████████████████████████████████████████████████████████████| 290/290 [11:41<00:00,  2.42s/it]
| Epoch: 3/10 | Train Loss: 4.807755406149503 | Val Loss: 4.770119883797386
training...: 100%|███████████████████████████████████████████████████████████████████| 290/290 [12:00<00:00,  2.48s/it]
| Epoch: 4/10 | Train Loss: 4.598086210777019 | Val Loss: 4.55740534175526
training...: 100%|███████████████████████████████████████████████████████████████████| 290/290 [12:40<00:00,  2.62s/it]
| Epoch: 5/10 | Train Loss: 4.426855216355159 | Val Loss: 4.407356782393022
training...: 100%|████

In [75]:
def translate_sentence(model, sentence):
    model.eval()
    tokens = ["sos"] + de_tokenizer(sentence.lower()) + ["eos"]
    sequence = source_vocab(tokens)
    sent_tensor = torch.LongTensor(sequence).unsqueeze(0)
    
    with torch.no_grad():
        hidden = model.encoder(sent_tensor)
        context = hidden
        
    outputs = target_vocab(["sos"])
        
    for _ in range(len(sequence)):
        previous_word = torch.LongTensor([outputs[-1]])
        
        with torch.no_grad():
            output, hidden = model.decoder(previous_word, hidden, context)
            best_guess = output.argmax(2).item()
            outputs.append(best_guess)
            if output.argmax(2).item() == source_vocab["<eos>"]:
                break
                
    tranlated_sent = target_vocab.lookup_tokens(outputs)
    return ' '.join(tranlated_sent[1:])

In [76]:
sentence = 'Ein kleines Mädchen in Rosa tanzt mit den Händen auf den Hüften.'
print(translate_sentence(model, sentence))

a little girl in a pink shirt is holding her head on the ground .


In [71]:
train_dataset[655]

('Ein kleines Mädchen in Rosa tanzt mit den Händen auf den Hüften.\n',
 'A little girl in pink dances with her hands on her hips.\n')

#### Reference
1. https://github.com/bentrevett