# Transformer in PyTorch (Attention Is All You Need)

![Transformer](fig/transformer.png)

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) with the same value so that runs are repeatable.
SEED = 515
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# Request deterministic cuDNN algorithms as well.
torch.backends.cudnn.deterministic = True

## Preparing Data

In [2]:
import spacy
# Load the German and English spaCy pipelines; the tokenizer helpers below
# call their `.tokenizer`.
spacy_de, spacy_en = spacy.load('de'), spacy.load('en')

def tokenize_de(text):
    """
    Split a German string into a list of token strings.

    The text is run through the tokenizer of the module-level `spacy_de`
    pipeline and the `.text` of every resulting token is returned, in order.
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """
    Split an English string into a list of token strings.

    The text is run through the tokenizer of the module-level `spacy_en`
    pipeline and the `.text` of every resulting token is returned, in order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [3]:
from torchtext.data import Field, BucketIterator

# Both fields share the same settings and differ only in their tokenizer.
# `include_lengths=True` makes each batch attribute a (tensor, lengths) pair,
# and `batch_first=False` keeps the tensors laid out as (step, batch).
shared_field_kwargs = dict(
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    include_lengths=True,
    batch_first=False,
)
SRC = Field(tokenize=tokenize_de, **shared_field_kwargs)
TRG = Field(tokenize=tokenize_en, **shared_field_kwargs)

In [4]:
from torchtext.datasets import Multi30k

# Load the Multi30k German/English splits with `data/` as the dataset root.
# Naming the fields `src` and `trg` is what exposes `example.src` and
# `example.trg` on every example.
train_data, valid_data, test_data = Multi30k.splits(
    exts=['.de', '.en'],
    fields=[('src', SRC), ('trg', TRG)],
    root='data/',
)

In [5]:
# Look at the first training pair: tokenized source, then tokenized target.
first_example = train_data[0]
print(first_example.src)
print(first_example.trg)

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [6]:
# Build both vocabularies from the training split only, with `min_freq=2`.
for vocab_field in (SRC, TRG):
    vocab_field.build_vocab(train_data, min_freq=2)

# Vocabulary sizes (source, target).
len(SRC.vocab), len(TRG.vocab)

(7855, 5893)

In [7]:
BATCH_SIZE = 128

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split, each producing batches of BATCH_SIZE on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [8]:
# Pull a single batch for inspection. Each attribute unpacks into a
# (token-id tensor, lengths tensor) pair.
batch = next(iter(train_iterator))
batch_src, batch_src_lens = batch.src
batch_trg, batch_trg_lens = batch.trg

print(batch_src)
print(batch_src_lens)
print(batch_trg)
print(batch_trg_lens)

tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,   5,  43,  ...,   5,  18,  18],
        [ 13,  13, 253,  ...,  13,  30,   0],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([14, 17, 12, 11, 17, 21, 12, 16, 14, 11, 23, 23,  8, 11,  9, 14, 19, 20,
        12, 16,  9, 11, 13, 20, 21, 29, 13, 22, 14, 16, 10,  9, 15, 12, 17, 10,
        14, 22, 17, 20, 23, 23, 12, 17, 15, 19, 17, 15, 16,  7, 14, 15, 16, 12,
        17, 14, 18, 18, 14, 14, 17, 21, 12, 12,  9, 19, 12, 14, 12, 11, 10, 13,
        18, 14,  9, 11, 10, 12, 10, 25, 14, 18, 15, 16, 15, 18, 13,  9, 21, 11,
        20, 12, 13, 14, 14, 17, 10, 13, 18, 30, 14, 12, 13,  9, 10, 15, 13, 10,
        12, 15, 13, 18, 17, 13, 11, 12, 10, 16, 12, 13, 24, 14, 19, 19, 10, 20,
        12, 11])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,   48,  ...,    4,   16,   16],
        [   9,    9,   25,  .

## Building the Model
### Multi-Head Attention

`nn.MultiheadAttention.forward`
* `key_padding_mask` is a binary mask: where the value is `True`, the corresponding key (source) position in the (`trg_step` * `src_step`) energy matrix is filled with `-inf` before being passed to `softmax`.  
* `attn_mask` is an additive mask (i.e. the values will be added to the energy matrix before `softmax`). Hence, the value being `-inf` means "masked", and the value being `0` means "not-masked". 
    * This mask aims to prevent attention to certain positions.  
    * A 2D mask is broadcast across all entries of the batch, while a 3D mask allows a different mask to be specified for each entry of the batch. 

In [26]:
# Vocabulary sizes and padding indices are read from the fields built earlier.
SRC_IN_DIM, TRG_IN_DIM = len(SRC.vocab), len(TRG.vocab)
ENC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
DEC_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Hidden size used by the embedding and attention modules.
HID_DIM = 256

# Encoder / decoder hyper-parameters (the two sides use the same values here).
ENC_LAYERS, DEC_LAYERS = 3, 3
ENC_HEADS, DEC_HEADS = 8, 8
ENC_PF_DIM, DEC_PF_DIM = 512, 512
ENC_DROPOUT, DEC_DROPOUT = 0.1, 0.1


# Stand-alone attention and embedding modules for a shape check. Both are put
# in eval mode before use.
attention = nn.MultiheadAttention(
    embed_dim=HID_DIM,
    num_heads=ENC_HEADS,
    dropout=ENC_DROPOUT,
).to(device).eval()
emb = nn.Embedding(SRC_IN_DIM, HID_DIM, padding_idx=ENC_PAD_IDX).to(device).eval()

# mask: (batch, step) -- True where the source token is the padding index
mask = batch_src.eq(emb.padding_idx).t()
# Q: (step, batch, hid_dim)
Q = emb(batch_src)
# Self-attention: the embedded source serves as query, key and value.
# attened_values: (trg_step, batch, hid_dim)
# attens: (batch, trg_step, src_step)
attened_values, attens = attention(query=Q, key=Q, value=Q, key_padding_mask=mask)

print(batch_src.shape)
print(attens.shape)
print(attened_values.shape)

torch.Size([30, 128])
torch.Size([128, 30, 30])
torch.Size([30, 128, 256])


In [20]:
# Largest deviation of an attention row sum from 1.
row_sum_error = (attens.sum(dim=-1) - 1).abs()
print(row_sum_error.max())
# Whether the attention weights are zero exactly at the masked (padding) key
# positions and non-zero everywhere else.
zero_weight = attens.eq(0)
print(zero_weight.eq(mask.unsqueeze(1)).all())

tensor(2.3842e-07, grad_fn=<MaxBackward1>)
tensor(True)


### Encoder Layer

In [28]:
model = nn.TransformerEncoderLayer?

[1;31mInit signature:[0m
[0mnn[0m[1;33m.[0m[0mTransformerEncoderLayer[0m[1;33m([0m[1;33m
[0m    [0md_model[0m[1;33m,[0m[1;33m
[0m    [0mnhead[0m[1;33m,[0m[1;33m
[0m    [0mdim_feedforward[0m[1;33m=[0m[1;36m2048[0m[1;33m,[0m[1;33m
[0m    [0mdropout[0m[1;33m=[0m[1;36m0.1[0m[1;33m,[0m[1;33m
[0m    [0mactivation[0m[1;33m=[0m[1;34m'relu'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
TransformerEncoderLayer is made up of self-attn and feedforward network.
This standard encoder layer is based on the paper "Attention Is All You Need".
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
in a different way during application.

Args:
    d_model: the number of expected features in the input (required

In [14]:
model.forward?

[1;31mSignature:[0m
[0mmodel[0m[1;33m.[0m[0mforward[0m[1;33m([0m[1;33m
[0m    [0msrc[0m[1;33m,[0m[1;33m
[0m    [0mtgt[0m[1;33m,[0m[1;33m
[0m    [0msrc_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtgt_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmemory_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msrc_key_padding_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtgt_key_padding_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmemory_key_padding_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Take in and process masked source/target sequences.

Args:
    src: the sequence to the encoder (required).
    tgt: the sequence to the decoder (required).
    src_mask: the additive mask for the src sequence (optional).
    tgt_mask: the additive mask for the tgt sequence (optional).
    memory_mask: the

In [10]:
# Encoder layer with d_model=512 and nhead=8; all other constructor arguments
# keep their defaults.
enc_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)

In [12]:
# Show the signature and docstring of the encoder layer's `forward` method
# (IPython `?` help).
enc_layer.forward?

[1;31mSignature:[0m [0menc_layer[0m[1;33m.[0m[0mforward[0m[1;33m([0m[0msrc[0m[1;33m,[0m [0msrc_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0msrc_key_padding_mask[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Pass the input through the encoder layer.

Args:
    src: the sequence to the encoder layer (required).
    src_mask: the mask for the src sequence (optional).
    src_key_padding_mask: the mask for the src keys per batch (optional).

Shape:
    see the docs in Transformer class.
[1;31mFile:[0m      e:\anaconda3\lib\site-packages\torch\nn\modules\transformer.py
[1;31mType:[0m      method
