In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Load data

Install `spacy` and download the raw data for the English and German Spacy tokenizers.  
NOTE: Administrator permission required. 
```bash
$ pip install spacy
$ python -m spacy download en
$ python -m spacy download de
```

In [2]:
from torchtext.datasets import Multi30k
from torchtext.data import Field

SRC = Field(tokenize = "spacy",
            tokenizer_language="de",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = "spacy",
            tokenizer_language="en",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG), 
                                                    root='data/')
print(train_data[0].src)
print(train_data[0].trg)

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [3]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# A dict mapping word to index
print(list(SRC.vocab.stoi.keys())[:5])
print(list(SRC.vocab.stoi.values())[:5])
# A list mapping index to word
print(SRC.vocab.itos[:8])
print(TRG.vocab.itos[:8])

['<unk>', '<pad>', '<sos>', '<eos>', '.']
[0, 1, 2, 3, 4]
['<unk>', '<pad>', '<sos>', '<eos>', '.', 'ein', 'einem', 'in']
['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the']


## `BucketIterator`: Iterate over the Datasets of Texts

In [4]:
from torch.utils.data import DataLoader
BATCH_SIZE = 4
# The default collate function checks if the batch contains tensors, numpy-arrays, ...
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=lambda x: x)

for i, batch in enumerate(train_loader):
    print(batch)
    break

[<torchtext.data.example.Example object at 0x000002788AF3FD88>, <torchtext.data.example.Example object at 0x000002788AF3FF48>, <torchtext.data.example.Example object at 0x000002788B3F5D08>, <torchtext.data.example.Example object at 0x000002788AF35188>]


In [5]:
from torchtext.data import BucketIterator
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 4

# `BucketIterator` automatically transforms word sequences to tensors with paddings. 
train_iterator = BucketIterator(train_data, batch_size=BATCH_SIZE, device=device)
for i, batch in enumerate(train_iterator):
    print(batch.src.size())
    print(batch.trg.size())
    break

torch.Size([19, 4])
torch.Size([20, 4])


In [6]:
# `BucketIterator` automatically adds <sos>, <eos>, <pad>, <unk> to Tensors. 
batch.src

tensor([[   2,    2,    2,    2],
        [   5,    5,    8,    5],
        [  49,  227,   36,   13],
        [  31, 3787,   22,   11],
        [  21,   26,   80, 3077],
        [   6,    9,  280,   10],
        [ 107,  217,   68,   14],
        [   9,   19,   20,    0],
        [  37, 1192,   88,   12],
        [   5,   12,   27,   14],
        [2387,   14,   14,  546],
        [   7,  923,    0,   34],
        [  15,    7,    5,    4],
        [  81,    6,    0,    3],
        [  10, 1288,    4,    1],
        [1160,    9,    3,    1],
        [   4,  119,    1,    1],
        [   3,    4,    1,    1],
        [   1,    3,    1,    1]])

In [7]:
# `BucketIterator` automatically adds <sos>, <eos>, <pad>, <unk> to Tensors. 
batch.trg

tensor([[   2,    2,    2,    2],
        [   4,    4,    4,    4],
        [  55,  262,   38,    9],
        [  15,  667,   12,    8],
        [  32,   34,   24,    4],
        [  20,   10,   19,  101],
        [   4,  230,  389,   39],
        [  94,    4,    4,    6],
        [  15,  350, 1661,    4],
        [ 140,    8,  134, 1135],
        [   4,    4,    6,  197],
        [1293, 1000,   43,   11],
        [   6,    6,   12,    0],
        [  44,    4,    4,  630],
        [ 181, 1019, 1616,    5],
        [  11,    5,  443,    3],
        [  10,    3,   57,    1],
        [ 883,    1,    5,    1],
        [   5,    1,    3,    1],
        [   3,    1,    1,    1]])

# Define Model

In [8]:
IN_DIM = len(SRC.vocab)
OUT_DIM = len(TRG.vocab)
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ATTN_DIM = 64
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
ENC_HID_DIM = 64
DEC_HID_DIM = 64
ATTN_DIM = 8
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

## The Encoder

In [9]:
from typing import Tuple

class Encoder(nn.Module):
    def __init__(self,  in_dim: int,  emb_dim: int, 
                 enc_hid_dim: int,  dec_hid_dim: int,  dropout: float):
        super().__init__()

        self.emb = nn.Embedding(in_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(enc_hid_dim*2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, enc_ins: torch.Tensor) -> Tuple[torch.Tensor]:
        # enc_ins: (batch_size, src_len)
        embedded = self.dropout(self.emb(enc_ins))
        # outs: (batch_size, src_len, enc_hid_dim*2)
        # hidden: (2, batch_size, enc_hid_dim)
        outs, hidden = self.rnn(embedded)
        # Concatenate the last hidden states in two directions. 
        # hidden: (batch_size, enc_hid_dim*2)
        hidden = torch.cat([hidden[0], hidden[1]], dim=-1)
        # hidden: (batch_size, dec_hid_dim)
        hidden = torch.tanh(self.fc(hidden))
        return outs, hidden

In [10]:
encoder = Encoder(IN_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
# No initial hidden state provided, default to be zeros. 
enc_outs, dec_hidden = encoder(batch.src.T)
print(enc_outs.size())
print(dec_hidden.size())

torch.Size([4, 19, 128])
torch.Size([4, 64])


## The Attention

In [11]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim: int, dec_hid_dim: int, attn_dim: int):
        super().__init__()

        self.attn_in = enc_hid_dim*2 + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self, dec_hidden: torch.Tensor, enc_outs: torch.Tensor) -> torch.Tensor:
        src_len = enc_outs.size(1)
        # repeated_dec_hidden: (batch_size, src_len, dec_hid_dim)
        repeated_dec_hidden = dec_hidden.unsqueeze(1).repeat(1, src_len, 1)
        # enc_outs: (batch_size, src_len, enc_hid_dim*2)
        # energy: (batch_size, src_len, attn_dim)
        energy = torch.tanh(self.attn(torch.cat([repeated_dec_hidden, enc_outs], dim=-1)))
        # attn: (batch_size, src_len)
        attn = energy.sum(dim=-1)
        return F.softmax(attn, dim=-1)

In [12]:
attention = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
attn = attention(dec_hidden, enc_outs)
print(attn.size())

torch.Size([4, 19])


## The Decoder

In [13]:
class Decoder(nn.Module):
    def __init__(self, out_dim: int, emb_dim: int, enc_hid_dim: int,  dec_hid_dim: int,  
                 dropout: float, attention: Attention):
        super().__init__()

        self.attention = attention
        self.emb = nn.Embedding(out_dim, emb_dim)
        # Single-directional
        self.rnn = nn.GRU(enc_hid_dim*2 + emb_dim, dec_hid_dim, batch_first=True)
        self.fc = nn.Linear(enc_hid_dim*2 + dec_hid_dim + emb_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_ins: torch.Tensor, dec_hidden: torch.Tensor, 
                enc_outs: torch.Tensor) -> Tuple[torch.Tensor]:
        """
        One-step forward. 
        """
        # dec_ins: (batch_size, 1)
        # embedded: (batch_size, 1, dec_emb_dim)
        embedded = self.dropout(self.emb(dec_ins))
        
        # attn: (batch_size, src_len)
        attn = self.attention(dec_hidden, enc_outs)
        # enc_outs: (batch_size, src_len, enc_hid_dim*2)
        # wtd_enc_rep: (batch_size, 1, enc_hid_dim*2)
        wtd_enc_rep = attn.unsqueeze(1).bmm(enc_outs)
        # rnn_ins: (batch_size, 1, enc_hid_dim*2 + dec_emb_dim)
        rnn_ins = torch.cat([embedded, wtd_enc_rep], dim=-1)
        # outs: (batch_size, 1, dec_hid_dim)
        outs, dec_hidden = self.rnn(rnn_ins, dec_hidden.unsqueeze(0))
        # outs: (batch_size, 1, trg_voc_size)
        outs = self.fc(torch.cat([outs, wtd_enc_rep, embedded], dim=-1))
        return outs, dec_hidden.squeeze(0)

In [14]:
decoder = Decoder(OUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attention)

print(dec_hidden.size())

dec_ins_0 = batch.trg[0].unsqueeze(1)
dec_outs_0, dec_hidden = decoder(dec_ins_0, dec_hidden, enc_outs)
print(dec_hidden.size())
print(dec_outs_0.size())

torch.Size([4, 64])
torch.Size([4, 64])
torch.Size([4, 1, 5893])


## The Seq2Seq Model

In [15]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, enc_ins: torch.Tensor, dec_ins: torch.Tensor, 
                teacher_forcing_ratio: float=0.5) -> torch.Tensor:
        # enc_ins: (batch_size, src_len)
        # dec_ins: (batch_size, trg_len)
        # No initial hidden state provided, default to be zeros. 
        enc_outs, dec_hidden = self.encoder(enc_ins)

        dec_outs = []
        # The first input to the decoder is the <sos> token. 
        # dec_ins_t: (batch_size, 1)
        dec_ins_t = dec_ins[:, 0].unsqueeze(1)
        for t in range(1, dec_ins.size(1)):
            # dec_outs_t: (batch_size, 1, trg_voc_size)
            dec_outs_t, dec_hidden = decoder(dec_ins_t, dec_hidden, enc_outs)
            top1 = dec_outs_t.max(dim=-1)[1]
            if np.random.rand() < teacher_forcing_ratio:
                dec_ins_t = dec_ins[:, t].unsqueeze(1)
            else:
                dec_ins_t = top1
            dec_outs.append(dec_outs_t)
        return torch.cat(dec_outs, dim=1)

    def translate(self, enc_ins: torch.Tensor, 
                  sos: int, eos: int, max_len: int=20) -> torch.Tensor:
        enc_outs, dec_hidden = self.encoder(enc_ins)
        top1s = []

        # The first input to the decoder is the <sos> token. 
        # dec_ins_t: (batch_size=1, 1)
        dec_ins_t = torch.ones(enc_ins.size(0), 1, dtype=torch.long) * sos
        for t in range(max_len):
            # dec_outs_t: (batch_size=1, 1, trg_voc_size)
            dec_outs_t, dec_hidden = decoder(dec_ins_t, dec_hidden, enc_outs)
            top1 = dec_outs_t.max(dim=-1)[1]
            dec_ins_t = top1
            top1s.append(top1)
            if dec_ins_t.item() == eos:
                break
        return torch.cat(top1s, dim=1)

In [16]:
model = Seq2Seq(encoder, decoder, device).to(device)
dec_outs = model(batch.src.T, batch.trg.T)
dec_outs.size()

torch.Size([4, 19, 5893])

In [17]:
# Check if data are mixed across different samples in a batch.
model.eval()
dec_outs_012 = model(batch.src.T[:3], batch.trg.T[:3], 1)
dec_outs_123 = model(batch.src.T[1:], batch.trg.T[1:], 1)
dec_outs_012[1:] == dec_outs_123[:2]

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])

In [18]:
TRG_SOS_IDX = TRG.vocab.stoi['<sos>']
TRG_EOS_IDX = TRG.vocab.stoi['<eos>']

model.translate(batch.src.T[:1], TRG_SOS_IDX, TRG_EOS_IDX)

tensor([[2835, 4841,  545, 2087, 3754, 1951,  311, 5354, 1201, 2783, 4106, 4593,
         1528, 5818, 2057, 4030, 3182, 3798, 3611, 5134]])

# Train Model

In [19]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [20]:
encoder = Encoder(IN_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
attention = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
decoder = Decoder(OUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device).to(device)

model.apply(init_weights)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,856,685 trainable parameters


### Notes of Padding
What should be noticed when using a mini-batch with sequences of different lengths? -> Padding
* For both input and output sequences, if the RNNs are bidirectional, the initial hidden states from the backward direction is incorrect, as the hidden states have passed some padding positions. 
    * Use `pack_padded_sequence` and `pad_packed_sequence`.
* For input sequence, some attention weights may be applied to the padding positions. 
* For input sequence, the pooling operation along the sequence may include the padding positions. 
* For output sequence, the loss calculation may include the padding positions. 
    * Use `ignore_index` parameter when creating the loss function. 

In [21]:
# NOTE: When scoring the model performance, tell the `nn.CrossEntropyLoss` function to ignore the indices where the target is padding. 
PAD_IDX = TRG.vocab.stoi['<pad>']
print("Padding Index: %d" % PAD_IDX)

# ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient.
loss_func = nn.CrossEntropyLoss(ignore_index=PAD_IDX, reduction='mean')
optimizer = optim.AdamW(model.parameters())
#optimizer = optim.Adadelta(model.parameters(), lr=1.0, rho=0.95)

Padding Index: 1


In [22]:
BATCH_SIZE = 128

# The iterators work like `DataLoader`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_sizes=(BATCH_SIZE, BATCH_SIZE*2, BATCH_SIZE*2), 
    device=device)

for i, batch in enumerate(train_iterator):
    print(batch.src.size())
    print(batch.trg.size())
    break

torch.Size([27, 128])
torch.Size([26, 128])


In [23]:
import time
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # Training
    epoch_loss = 0
    for i, batch in enumerate(train_iterator):
        # Forward pass
        dec_outs = model(batch.src.T, batch.trg.T)
        dec_outs_flattened = dec_outs.view(-1, dec_outs.size(-1))
        trg_flattened = batch.trg.T[:, 1:].flatten()
        # Calculate loss
        loss = loss_func(dec_outs_flattened, trg_flattened)
        epoch_loss += loss.item()

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        # Update weights
        optimizer.step()
    train_loss = epoch_loss / len(train_iterator)

    # Validating
    model.eval()
    epoch_loss = 0
    for i, batch in enumerate(valid_iterator):
        with torch.no_grad():
            # Forward pass
            dec_outs = model(batch.src.T, batch.trg.T)
            dec_outs_flattened = dec_outs.view(-1, dec_outs.size(-1))
            trg_flattened = batch.trg.T[:, 1:].flatten()
            # Calculate loss
            loss = loss_func(dec_outs_flattened, trg_flattened)
            epoch_loss += loss.item()
    valid_loss = epoch_loss / len(valid_iterator)
    model.train()

    epoch_secs = time.time() - start_time
    epoch_mins, epoch_secs = int(epoch_secs // 60), int(epoch_secs % 60)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {np.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 3m 27s
	Train Loss: 5.687 | Train PPL: 294.946
	 Val. Loss: 5.050 |  Val. PPL: 156.088
Epoch: 02 | Time: 3m 21s
	Train Loss: 5.023 | Train PPL: 151.845
	 Val. Loss: 4.754 |  Val. PPL: 116.033
Epoch: 03 | Time: 3m 24s
	Train Loss: 4.747 | Train PPL: 115.195
	 Val. Loss: 4.497 |  Val. PPL:  89.757
Epoch: 04 | Time: 3m 19s
	Train Loss: 4.561 | Train PPL:  95.724
	 Val. Loss: 4.433 |  Val. PPL:  84.177
Epoch: 05 | Time: 3m 12s
	Train Loss: 4.414 | Train PPL:  82.581
	 Val. Loss: 4.193 |  Val. PPL:  66.214
Epoch: 06 | Time: 3m 10s
	Train Loss: 4.328 | Train PPL:  75.782
	 Val. Loss: 4.065 |  Val. PPL:  58.272
Epoch: 07 | Time: 3m 11s
	Train Loss: 4.234 | Train PPL:  68.960
	 Val. Loss: 4.198 |  Val. PPL:  66.535
Epoch: 08 | Time: 3m 13s
	Train Loss: 4.138 | Train PPL:  62.683
	 Val. Loss: 3.940 |  Val. PPL:  51.443
Epoch: 09 | Time: 3m 12s
	Train Loss: 4.032 | Train PPL:  56.349
	 Val. Loss: 3.868 |  Val. PPL:  47.836
Epoch: 10 | Time: 3m 12s
	Train Loss: 3.942 | Train PPL

In [24]:
# Testing
model.eval()
epoch_loss = 0
for i, batch in enumerate(test_iterator):
    with torch.no_grad():
        # Forward pass
        dec_outs = model(batch.src.T, batch.trg.T)
        dec_outs_flattened = dec_outs.view(-1, dec_outs.size(-1))
        trg_flattened = batch.trg.T[:, 1:].flatten()
        # Calculate loss
        loss = loss_func(dec_outs_flattened, trg_flattened)
        epoch_loss += loss.item()
test_loss = epoch_loss / len(test_iterator)
model.train()

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |')

| Test Loss: 3.775 | Test PPL:  43.593 |


# Test Model

In [25]:
model.eval()

SRC_SOS_IDX = SRC.vocab.stoi['<sos>']
SRC_EOS_IDX = SRC.vocab.stoi['<eos>']
SRC_UNK_IDX = SRC.vocab.stoi['<unk>']

for i in range(10):
    IDX = np.random.randint(0, len(test_data))
    print("De:", " ".join(test_data[IDX].src))

    src = [SRC_SOS_IDX] + [SRC.vocab.stoi.get(w, SRC_UNK_IDX) for w in test_data[IDX].src] + [SRC_EOS_IDX]
    src = torch.tensor(src, dtype=torch.long).unsqueeze(0)

    trans = model.translate(src, TRG_SOS_IDX, TRG_EOS_IDX).squeeze(0)
    trans = " ".join([TRG.vocab.itos[i.item()] for i in trans])
    print("En (Trans):", trans)
    print("En (Real):", " ".join(test_data[IDX].trg))
    print("=" * 50)

De: geschäftiges asiatisches einkaufszentrum mit papierlaternen und einkäufern .
En (Trans): a <unk> <unk> <unk> <unk> in a <unk> in a <unk> . <eos>
En (Real): busy asian mall with paper lanterns and shoppers .
De: sechs männer sitzen auf einem acker mit holzkisten .
En (Trans): three men are sitting on a a in a a in a . . <eos>
En (Real): six men sit in a field of crops containing wooden crates .
De: der weiße hund läuft im flachen wasser .
En (Trans): a dog dog dog in a a a a a . <eos>
En (Real): the white dog is running in the shallow water .
De: eine gruppe stammesangehöriger füllt in der wüste wasserbehälter .
En (Trans): a group of people are in a a in a a . . <eos>
En (Real): a tribal group filling water jugs in the desert .
De: vier personen spielen fußball auf einem strand .
En (Trans): three people are in a a in a a in a . . <eos>
En (Real): four people are playing soccer on a beach .
De: der mann im weißen t-shirt macht sich daran , auf einen fels zu klettern .
En (Trans): a