In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# One seed shared by every random number generator used in this notebook.
SEED = 515
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# Ask cuDNN to pick deterministic algorithms only.
torch.backends.cudnn.deterministic = True

# Load data

Install `spacy` and download the language models used by the English and German spaCy tokenizers.  
NOTE: Administrator permission may be required. 
```bash
$ pip install spacy
$ python -m spacy download en
$ python -m spacy download de
```

In [2]:
from torchtext.datasets import Multi30k
from torchtext.data import Field

# Source (German) and target (English) fields share every setting except the
# language of the spaCy tokenizer.
_shared_field_args = dict(tokenize="spacy",
                          init_token='<sos>',
                          eos_token='<eos>',
                          lower=True)
SRC = Field(tokenizer_language="de", **_shared_field_args)
TRG = Field(tokenizer_language="en", **_shared_field_args)

# `exts` selects the file extension of each language; `fields` assigns the
# matching Field to it, in the same order.
_multi30k_splits = Multi30k.splits(exts=('.de', '.en'),
                                   fields=(SRC, TRG),
                                   root='data/')
train_data, valid_data, test_data = _multi30k_splits

# Show the first tokenized training pair.
_first_example = train_data[0]
print(_first_example.src)
print(_first_example.trg)

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [3]:
# Build one vocabulary per language from the training split, with `min_freq=2`.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

# `stoi` is a dict mapping word to index
_src_stoi = SRC.vocab.stoi
print(list(_src_stoi.keys())[:5])
print(list(_src_stoi.values())[:5])
# `itos` is a list mapping index to word
for _field in (SRC, TRG):
    print(_field.vocab.itos[:8])

['<unk>', '<pad>', '<sos>', '<eos>', '.']
[0, 1, 2, 3, 4]
['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in']
['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the']


## `BucketIterator`: Iterate over the Datasets of Texts

In [4]:
from torch.utils.data import DataLoader
BATCH_SIZE = 4


def _identity_collate(examples):
    """Return the list of examples of one batch unchanged."""
    return examples


# The default collate function expects tensors, numpy arrays and similar
# containers, so the batch is passed through untouched here instead.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=_identity_collate)

# Only the first batch is inspected.
for i, batch in enumerate(train_loader):
    print(batch)
    break

[<torchtext.data.example.Example object at 0x7fd7eac40610>, <torchtext.data.example.Example object at 0x7fd7eac406d0>, <torchtext.data.example.Example object at 0x7fd7eac40810>, <torchtext.data.example.Example object at 0x7fd7f27c6d50>]


In [5]:
from torchtext.data import BucketIterator
BATCH_SIZE = 4
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# `BucketIterator` turns the word sequences into padded index tensors.
train_iterator = BucketIterator(train_data, batch_size=BATCH_SIZE, device=device)
# Only the first batch is inspected; `batch` is reused by the cells below.
for i, batch in enumerate(train_iterator):
    for _tensor in (batch.src, batch.trg):
        print(_tensor.size())
    break

torch.Size([18, 4])
torch.Size([18, 4])


In [6]:
# The tensors yielded by `BucketIterator` already contain the special tokens.
# With the vocabulary printed earlier, index 2 is <sos>, 3 is <eos>, 1 is <pad>
# and 0 is <unk>. The layout is (src_len, batch_size): one sentence per column.
batch.src

tensor([[   2,    2,    2,    2],
        [   5,    5,   18,    8],
        [  13,   13,   45, 3294],
        [   7,   29,  839,   31],
        [   6,  326,  498,  133],
        [  51,   12,    4,   10],
        [  79,   15,    3, 3135],
        [ 212,   34,    1,   21],
        [  27,   10,    1,   75],
        [   6,  339,    1,    4],
        [  51,   11,    1,    3],
        [ 320,   77,    1,    1],
        [  10,  277,    1,    1],
        [   6,    4,    1,    1],
        [  78,    3,    1,    1],
        [ 104,    1,    1,    1],
        [   4,    1,    1,    1],
        [   3,    1,    1,    1]], device='cuda:0')

In [7]:
# The target tensor carries the same special tokens as the source tensor
# (2 = <sos>, 3 = <eos>, 1 = <pad>, 0 = <unk>), and is laid out as
# (trg_len, batch_size): one sentence per column.
batch.trg

tensor([[   2,    2,    2,    2],
        [   9,    4,   16,    4],
        [   6,    9,  666,   38],
        [  25,   10,   17,   12],
        [  23,   36, 2008,   63],
        [ 436,    6,  167,  150],
        [   6,    7,   72,  129],
        [  43,  168,    5,   11],
        [  12,   12,    3,  245],
        [  25,    4,    1,    8],
        [ 268,  142,    1,  553],
        [  11,   28,    1,   11],
        [  59,  119,    1, 1626],
        [  77,    8,    1,    5],
        [   5,   27,    1,    3],
        [   3,  286,    1,    1],
        [   1,    5,    1,    1],
        [   1,    3,    1,    1]], device='cuda:0')

# Define Model

In [8]:
# The vocabulary sizes fix the input and output dimensions of the model.
IN_DIM, OUT_DIM = len(SRC.vocab), len(TRG.vocab)

# Small configuration used throughout this notebook.
# The larger alternative that was listed here used:
#   ENC_EMB_DIM = DEC_EMB_DIM = 256, ENC_HID_DIM = DEC_HID_DIM = 512,
#   ATTN_DIM = 64, ENC_DROPOUT = DEC_DROPOUT = 0.5
ENC_EMB_DIM = DEC_EMB_DIM = 32
ENC_HID_DIM = DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = DEC_DROPOUT = 0.5

## The Encoder

In [9]:
from typing import Tuple

class Encoder(nn.Module):
    """
    Bidirectional GRU encoder.

    Embeds the source tokens, runs them through a single-layer bidirectional
    GRU and projects the two final hidden states to the decoder hidden size.

    Parameters
    ----------
    in_dim: size of the source vocabulary.
    emb_dim: embedding dimension.
    enc_hid_dim: hidden size of each GRU direction.
    dec_hid_dim: hidden size of the decoder that consumes the returned state.
    dropout: dropout probability applied to the embeddings.
    """
    def __init__(self,  in_dim: int,  emb_dim: int, 
                 enc_hid_dim: int,  dec_hid_dim: int,  dropout: float):
        super().__init__()

        self.emb = nn.Embedding(in_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, enc_ins: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode a batch of source sentences.

        Parameters
        ----------
        enc_ins: token indices, (batch_size, src_len).

        Returns
        -------
        outs: per-position GRU outputs, (batch_size, src_len, enc_hid_dim*2).
        hidden: initial decoder hidden state, (batch_size, dec_hid_dim).

        NOTE: padding positions are not masked or packed here, so they are
        processed by the GRU like any other token.
        """
        # enc_ins: (batch_size, src_len)
        embedded = self.dropout(self.emb(enc_ins))
        # No initial hidden state is passed, so the GRU starts from zeros.
        # outs: (batch_size, src_len, enc_hid_dim*2)
        # hidden: (2, batch_size, enc_hid_dim)
        outs, hidden = self.rnn(embedded)
        # Concatenate the final hidden states of the forward (second to last)
        # and backward (last) directions.
        # hidden: (batch_size, enc_hid_dim*2)
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=-1)
        # hidden: (batch_size, dec_hid_dim)
        hidden = torch.tanh(self.fc(hidden))
        return outs, hidden

In [10]:
encoder = Encoder(IN_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
encoder = encoder.to(device)
# `batch.src` is (src_len, batch_size); the encoder expects batch-first input.
# No initial hidden state is provided, so it defaults to zeros.
enc_outs, dec_hidden = encoder(batch.src.T)
for _tensor in (enc_outs, dec_hidden):
    print(_tensor.size())

torch.Size([4, 18, 128])
torch.Size([4, 64])


## The Attention

In [11]:
class Attention(nn.Module):
    """
    Additive-style attention over the encoder outputs.

    The decoder hidden state is concatenated with every encoder output, passed
    through a linear layer and `tanh`, summed over the attention dimension and
    normalized with a softmax over the source positions.

    Parameters
    ----------
    enc_hid_dim: hidden size of each encoder direction (outputs have twice this size).
    dec_hid_dim: hidden size of the decoder.
    attn_dim: size of the intermediate attention representation.
    """
    def __init__(self, enc_hid_dim: int, dec_hid_dim: int, attn_dim: int):
        super().__init__()

        self.attn_in = enc_hid_dim*2 + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self, dec_hidden: torch.Tensor, enc_outs: torch.Tensor,
                mask: torch.Tensor = None) -> torch.Tensor:
        """
        Compute attention weights over the source positions.

        Parameters
        ----------
        dec_hidden: current decoder hidden state, (batch_size, dec_hid_dim).
        enc_outs: encoder outputs, (batch_size, src_len, enc_hid_dim*2).
        mask: optional (batch_size, src_len) tensor that is True/non-zero at
            real tokens and False/zero at padding. Masked positions receive
            zero weight. When omitted, every position takes part in the
            softmax, which is the original behavior.

        Returns
        -------
        Attention weights, (batch_size, src_len); every row sums to 1.
        """
        src_len = enc_outs.size(1)
        # `expand` broadcasts without copying; `torch.cat` below materializes it.
        # repeated_dec_hidden: (batch_size, src_len, dec_hid_dim)
        repeated_dec_hidden = dec_hidden.unsqueeze(1).expand(-1, src_len, -1)
        # enc_outs: (batch_size, src_len, enc_hid_dim*2)
        # energy: (batch_size, src_len, attn_dim)
        energy = torch.tanh(self.attn(torch.cat([repeated_dec_hidden, enc_outs], dim=-1)))
        # attn: (batch_size, src_len)
        attn = energy.sum(dim=-1)
        if mask is not None:
            # The most negative finite value underflows to exactly 0 after the
            # softmax, and unlike -inf it cannot yield NaN for a fully masked row.
            attn = attn.masked_fill(~mask.to(torch.bool), torch.finfo(attn.dtype).min)
        return F.softmax(attn, dim=-1)

In [12]:
attention = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
attention = attention.to(device)
# One weight per source position for every sentence in the batch.
attn = attention(dec_hidden, enc_outs)
print(attn.shape)

torch.Size([4, 18])


## The Decoder

In [13]:
class Decoder(nn.Module):
    """
    Single-step GRU decoder with attention over the encoder outputs.

    Parameters
    ----------
    out_dim: size of the target vocabulary.
    emb_dim: embedding dimension of the target tokens.
    enc_hid_dim: hidden size of each encoder direction.
    dec_hid_dim: hidden size of the decoder GRU.
    dropout: dropout probability applied to the embeddings.
    attention: module called as `attention(dec_hidden, enc_outs)` to get the
        attention weights.
    """
    def __init__(self, out_dim: int, emb_dim: int, enc_hid_dim: int,  dec_hid_dim: int,  
                 dropout: float, attention: Attention):
        super().__init__()

        self.attention = attention
        self.emb = nn.Embedding(out_dim, emb_dim)
        # Single-directional
        self.rnn = nn.GRU(enc_hid_dim*2 + emb_dim, dec_hid_dim, batch_first=True)
        self.fc = nn.Linear(enc_hid_dim*2 + dec_hid_dim + emb_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_ins: torch.Tensor, dec_hidden: torch.Tensor, 
                enc_outs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        One-step forward. 

        Parameters
        ----------
        dec_ins: current input tokens, (batch_size, 1).
        dec_hidden: previous decoder hidden state, (batch_size, dec_hid_dim).
        enc_outs: encoder outputs, (batch_size, src_len, enc_hid_dim*2).

        Returns
        -------
        outs: unnormalized scores over the target vocabulary,
            (batch_size, 1, trg_voc_size).
        dec_hidden: updated hidden state, (batch_size, dec_hid_dim).
        """
        # dec_ins: (batch_size, 1)
        # embedded: (batch_size, 1, dec_emb_dim)
        embedded = self.dropout(self.emb(dec_ins))
        
        # attn: (batch_size, src_len)
        attn = self.attention(dec_hidden, enc_outs)
        # Attention-weighted sum of the encoder outputs.
        # enc_outs: (batch_size, src_len, enc_hid_dim*2)
        # wtd_enc_rep: (batch_size, 1, enc_hid_dim*2)
        wtd_enc_rep = attn.unsqueeze(1).bmm(enc_outs)
        # rnn_ins: (batch_size, 1, enc_hid_dim*2 + dec_emb_dim)
        rnn_ins = torch.cat([embedded, wtd_enc_rep], dim=-1)
        # The GRU expects its hidden state as (num_layers=1, batch_size, dec_hid_dim).
        # outs: (batch_size, 1, dec_hid_dim)
        outs, dec_hidden = self.rnn(rnn_ins, dec_hidden.unsqueeze(0))
        # The output layer sees the GRU output, the context and the embedding.
        # outs: (batch_size, 1, trg_voc_size)
        outs = self.fc(torch.cat([outs, wtd_enc_rep, embedded], dim=-1))
        return outs, dec_hidden.squeeze(0)

In [14]:
decoder = Decoder(
    OUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attention
).to(device)

# Hidden state before the decoding step.
print(dec_hidden.shape)

# The first row of the (trg_len, batch_size) target holds the first token of
# every sentence; add a length-1 step axis to get (batch_size, 1).
dec_ins_0 = batch.trg[0][:, None]
# NOTE: this rebinds `dec_hidden`, so re-running the cell advances one more step.
dec_outs_0, dec_hidden = decoder(dec_ins_0, dec_hidden, enc_outs)
for _tensor in (dec_hidden, dec_outs_0):
    print(_tensor.shape)

torch.Size([4, 64])
torch.Size([4, 64])
torch.Size([4, 1, 5893])


## The Seq2Seq Model

In [15]:
class Seq2Seq(nn.Module):
    """
    Encoder-decoder wrapper that decodes one target token at a time.

    Parameters
    ----------
    encoder: module returning `(enc_outs, dec_hidden)` for a batch of source tokens.
    decoder: module performing one decoding step, called as
        `decoder(dec_ins_t, dec_hidden, enc_outs)`.
    device: stored on the instance as `self.device`.
    """
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, enc_ins: torch.Tensor, dec_ins: torch.Tensor, 
                teacher_forcing_ratio: float=0.5) -> torch.Tensor:
        """
        Decode along the target sequence, optionally with teacher forcing.

        Parameters
        ----------
        enc_ins: source token indices, (batch_size, src_len).
        dec_ins: target token indices starting with <sos>, (batch_size, trg_len).
        teacher_forcing_ratio: probability, drawn once per step for the whole
            batch, of feeding the ground-truth token instead of the model's
            own prediction as the next input.

        Returns
        -------
        Scores for target positions 1..trg_len-1,
        (batch_size, trg_len - 1, trg_voc_size).
        """
        # enc_ins: (batch_size, src_len)
        # dec_ins: (batch_size, trg_len)
        # No initial hidden state provided, default to be zeros. 
        enc_outs, dec_hidden = self.encoder(enc_ins)

        dec_outs = []
        # The first input to the decoder is the <sos> token. 
        # dec_ins_t: (batch_size, 1)
        dec_ins_t = dec_ins[:, 0].unsqueeze(1)
        for t in range(1, dec_ins.size(1)):
            # dec_outs_t: (batch_size, 1, trg_voc_size)
            dec_outs_t, dec_hidden = self.decoder(dec_ins_t, dec_hidden, enc_outs)
            dec_outs.append(dec_outs_t)
            if np.random.rand() < teacher_forcing_ratio:
                # Teacher forcing: feed the ground-truth token.
                dec_ins_t = dec_ins[:, t].unsqueeze(1)
            else:
                # Feed back the model's own most likely token.
                dec_ins_t = dec_outs_t.max(dim=-1)[1]
        return torch.cat(dec_outs, dim=1)

    @torch.no_grad()
    def translate(self, enc_ins: torch.Tensor, 
                  sos: int, eos: int, max_len: int=20) -> torch.Tensor:
        """
        Greedily decode a batch of source sentences.

        Decoding stops once every sentence has produced `eos`, or after
        `max_len` steps. For a sentence that finishes early, later positions
        are filled with `eos`. Gradients are not tracked. The train/eval mode
        is not changed here, so call `eval()` first to disable dropout.

        Parameters
        ----------
        enc_ins: source token indices, (batch_size, src_len).
        sos: index of the start-of-sentence token in the target vocabulary.
        eos: index of the end-of-sentence token in the target vocabulary.
        max_len: maximum number of decoded tokens.

        Returns
        -------
        Predicted token indices, (batch_size, n_steps) with n_steps <= max_len.
        The leading <sos> is not included.
        """
        enc_outs, dec_hidden = self.encoder(enc_ins)
        batch_size = enc_ins.size(0)
        top1s = []

        # The first input to the decoder is the <sos> token. 
        # dec_ins_t: (batch_size, 1)
        dec_ins_t = torch.full((batch_size, 1), sos, dtype=torch.long, device=enc_ins.device)
        # finished: (batch_size, 1), True once a sentence has emitted <eos>.
        finished = torch.zeros(batch_size, 1, dtype=torch.bool, device=enc_ins.device)
        for t in range(max_len):
            # dec_outs_t: (batch_size, 1, trg_voc_size)
            dec_outs_t, dec_hidden = self.decoder(dec_ins_t, dec_hidden, enc_outs)
            top1 = dec_outs_t.max(dim=-1)[1]
            # Sentences that already ended keep emitting <eos>.
            top1 = top1.masked_fill(finished, eos)
            top1s.append(top1)
            finished = finished | (top1 == eos)
            dec_ins_t = top1
            if finished.all():
                break
        return torch.cat(top1s, dim=1)

In [16]:
model = Seq2Seq(encoder, decoder, device)
model = model.to(device)
# Batch-first inputs; `teacher_forcing_ratio` keeps its default value.
dec_outs = model(batch.src.T, batch.trg.T)
dec_outs.shape

torch.Size([4, 17, 5893])

In [17]:
# Check that samples in a batch do not influence each other: samples 1 and 2
# must get the same outputs whether they are batched with sample 0 or sample 3.
# Eval mode plus a teacher forcing ratio of 1 makes both passes deterministic.
model.eval()
_src_bf, _trg_bf = batch.src.T, batch.trg.T
dec_outs_012 = model(_src_bf[:3], _trg_bf[:3], teacher_forcing_ratio=1)
dec_outs_123 = model(_src_bf[1:], _trg_bf[1:], teacher_forcing_ratio=1)
dec_outs_012[1:] == dec_outs_123[:2]

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]], device='cuda:0')

In [18]:
# Indices of the sentence delimiters in the target vocabulary.
TRG_SOS_IDX, TRG_EOS_IDX = (TRG.vocab.stoi[_tok] for _tok in ('<sos>', '<eos>'))

# Translate the first sentence of the batch only (slicing keeps the batch axis).
model.translate(batch.src.T[:1], sos=TRG_SOS_IDX, eos=TRG_EOS_IDX)

tensor([[2801,  532, 4413, 2061, 2615,  804, 5777, 4392, 4163,  748, 4600,  712,
         3128, 2249, 3905, 5049, 2866,  470, 1802, 5247]], device='cuda:0')

# Train Model

In [19]:
def init_weights(m: nn.Module):
    """
    Re-initialize the parameters of `m` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    every other parameter (e.g. biases) is set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)

def count_parameters(model: nn.Module):
    """Return the total number of elements in the trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [20]:
# Build fresh modules for training; the ones above were only used for shape checks.
encoder = Encoder(IN_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attention = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
decoder = Decoder(OUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device)
model = model.to(device)

# Re-initialize every parameter with `init_weights`.
model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,856,685 trainable parameters


### Notes on Padding
What should be noticed when using a mini-batch with sequences of different lengths? -> Padding
* For both input and output sequences, if the RNNs are bidirectional, the hidden states of the backward direction are incorrect, because the backward pass has to run through the padding positions before it reaches the real tokens. 
    * Use `pack_padded_sequence` and `pad_packed_sequence`.
* For the input sequence, some attention weights may be applied to the padding positions. 
* For the input sequence, a pooling operation along the sequence may include the padding positions. 
* For the output sequence, the loss calculation may include the padding positions. 
    * Use `ignore_index` parameter when creating the loss function. 

In [21]:
# NOTE: Padding positions in the target must not count towards the loss, so
# the padding index is looked up and handed to `nn.CrossEntropyLoss`.
PAD_IDX = TRG.vocab.stoi['<pad>']
print("Padding Index: %d" % PAD_IDX)

# ignore_index: a target value that is ignored and does not contribute to the input gradient.
_loss_kwargs = {'ignore_index': PAD_IDX, 'reduction': 'mean'}
loss_func = nn.CrossEntropyLoss(**_loss_kwargs)
# AdamW with its default settings.
# Alternative kept for reference: optim.Adadelta(model.parameters(), lr=1.0, rho=0.95)
optimizer = optim.AdamW(params=model.parameters())

Padding Index: 1


In [22]:
BATCH_SIZE = 128
# Validation and test batches are twice as large as training batches.
_eval_batch_size = BATCH_SIZE*2

# The iterators work like `DataLoader`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_sizes=(BATCH_SIZE, _eval_batch_size, _eval_batch_size),
    device=device)

# Only the first training batch is inspected.
for i, batch in enumerate(train_iterator):
    for _tensor in (batch.src, batch.trg):
        print(_tensor.size())
    break

torch.Size([30, 128])
torch.Size([29, 128])


In [23]:
import time
N_EPOCHS = 10
# Maximum gradient norm passed to `clip_grad_norm_`.
CLIP = 1

# Lowest validation loss seen so far; updated after every epoch.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # Training
    # Set training mode explicitly so dropout is active even if an earlier
    # cell left the model in eval mode.
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        # Forward pass (inputs are transposed to batch-first)
        dec_outs = model(batch.src.T, batch.trg.T)
        dec_outs_flattened = dec_outs.view(-1, dec_outs.size(-1))
        # The model predicts positions 1..trg_len-1, so drop the leading <sos>.
        trg_flattened = batch.trg.T[:, 1:].flatten()
        # Calculate loss
        loss = loss_func(dec_outs_flattened, trg_flattened)
        epoch_loss += loss.item()

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        # Update weights
        optimizer.step()
    train_loss = epoch_loss / len(train_iterator)

    # Validating
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in valid_iterator:
            # Forward pass
            dec_outs = model(batch.src.T, batch.trg.T, teacher_forcing_ratio=0)  #turn off teacher forcing
            dec_outs_flattened = dec_outs.view(-1, dec_outs.size(-1))
            trg_flattened = batch.trg.T[:, 1:].flatten()
            # Calculate loss
            loss = loss_func(dec_outs_flattened, trg_flattened)
            epoch_loss += loss.item()
    valid_loss = epoch_loss / len(valid_iterator)
    best_valid_loss = min(best_valid_loss, valid_loss)
    # Leave the model in training mode, as before.
    model.train()

    epoch_secs = time.time() - start_time
    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 22s
	Train Loss: 5.675 | Train PPL: 291.471
	 Val. Loss: 5.229 |  Val. PPL: 186.566
Epoch: 02 | Time: 0m 22s
	Train Loss: 4.979 | Train PPL: 145.352
	 Val. Loss: 5.118 |  Val. PPL: 166.937
Epoch: 03 | Time: 0m 22s
	Train Loss: 4.749 | Train PPL: 115.487
	 Val. Loss: 4.978 |  Val. PPL: 145.185
Epoch: 04 | Time: 0m 22s
	Train Loss: 4.600 | Train PPL:  99.510
	 Val. Loss: 4.874 |  Val. PPL: 130.843
Epoch: 05 | Time: 0m 22s
	Train Loss: 4.453 | Train PPL:  85.881
	 Val. Loss: 4.861 |  Val. PPL: 129.190
Epoch: 06 | Time: 0m 22s
	Train Loss: 4.351 | Train PPL:  77.579
	 Val. Loss: 4.748 |  Val. PPL: 115.300
Epoch: 07 | Time: 0m 22s
	Train Loss: 4.261 | Train PPL:  70.870
	 Val. Loss: 4.690 |  Val. PPL: 108.815
Epoch: 08 | Time: 0m 22s
	Train Loss: 4.160 | Train PPL:  64.068
	 Val. Loss: 4.658 |  Val. PPL: 105.404
Epoch: 09 | Time: 0m 22s
	Train Loss: 4.072 | Train PPL:  58.647
	 Val. Loss: 4.594 |  Val. PPL:  98.916
Epoch: 10 | Time: 0m 22s
	Train Loss: 3.989 | Train PPL

In [24]:
# Testing
model.eval()
epoch_loss = 0
# A single no-grad context covers the whole evaluation pass.
with torch.no_grad():
    for batch in test_iterator:
        # Forward pass without teacher forcing
        dec_outs = model(batch.src.T, batch.trg.T, teacher_forcing_ratio=0)
        _voc_size = dec_outs.size(-1)
        # Calculate loss against the target without its leading <sos>
        loss = loss_func(dec_outs.view(-1, _voc_size), batch.trg.T[:, 1:].flatten())
        epoch_loss += loss.item()
test_loss = epoch_loss / len(test_iterator)
model.train()

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |')

| Test Loss: 4.416 | Test PPL:  82.727 |


# Test Model

In [25]:
# Disable dropout for inference.
model.eval()

SRC_SOS_IDX = SRC.vocab.stoi['<sos>']
SRC_EOS_IDX = SRC.vocab.stoi['<eos>']
SRC_UNK_IDX = SRC.vocab.stoi['<unk>']

# Translate 10 randomly drawn test sentences and print them next to the reference.
for i in range(10):
    IDX = np.random.randint(0, len(test_data))
    example = test_data[IDX]
    print("De:", " ".join(example.src))

    # Numericalize by hand: <sos> + word indices (unknown words -> <unk>) + <eos>.
    src_ids = [SRC_SOS_IDX] + [SRC.vocab.stoi.get(w, SRC_UNK_IDX) for w in example.src] + [SRC_EOS_IDX]
    # Add a batch axis of size 1: (1, src_len).
    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)

    # No gradients are needed for inference; skip building the autograd graph.
    with torch.no_grad():
        trans_ids = model.translate(src, TRG_SOS_IDX, TRG_EOS_IDX).squeeze(0)
    trans = " ".join([TRG.vocab.itos[tok_idx.item()] for tok_idx in trans_ids])
    print("En (Trans):", trans)
    print("En (Real):", " ".join(example.trg))
    print("=" * 50)

De: ein kleines kind in grünen stiefeln spielt in einer schlammpfütze .
En (Trans): a little boy in a a a a a a a a a . <eos>
En (Real): a young child wearing green boots playing in a mud puddle
De: menschen fahren bei nacht mit mopeds die straße hinunter .
En (Trans): people are on a <unk> of a <unk> . <eos>
En (Real): people are driving scooters down the street at night .
De: eine gruppe asiatischer kinder in weißen hemden und kappen gibt eine vorstellung vor einer menge .
En (Trans): a group of people in a and and a and a and a and a and a . <eos>
En (Real): group of asian children dressed in white shirts and hats performing with a crowd looking on .
De: ein skateboarder fährt eine betonwand hoch und fällt beinahe beim versuch , einen trick zu machen .
En (Trans): a boy is a a a a a a a a the ball . <eos>
En (Real): a skateboarder rides up a concrete wall , nearly falling off as he tries a trick .
De: das ist eine gruppe von leuten , die auf einem event herumstehen .
En (Trans): a g