In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn
import torch.nn.functional as F
from PositionwiseFFN import PositionwiseFFN
from DotProductAttention import DotProductAttention
from MultiHeadAttention import MultiHeadAttention

In [2]:
# Test PFFN: the feed-forward block must return a tensor shaped like its input.
dmodel = 64
inlen = 60  # not used in this cell; the MHA test cell further down reads it
# NOTE(review): assumes the constructor argument is the feature width — confirm
# against PositionwiseFFN.
ffn = PositionwiseFFN(dmodel)
# Derive the last dimension from dmodel instead of repeating the literal 64,
# so the input cannot drift out of sync with the layer under test.
x = torch.rand(32, 10, dmodel)
assert ffn(x).shape == x.shape, "PositionwiseFFN changed the input shape"


In [3]:
# Test DotProductAttention: shape check only.
### Test inputs from d2l.ai
queries = torch.normal(mean=0.0, std=1.0, size=(2, 10, 2))
keys = torch.normal(mean=0.0, std=1.0, size=(2, 10, 2))
values = torch.normal(mean=0.0, std=1.0, size=(2, 10, 4))
# Defined here but not passed to the attention call below.
valid_lens = torch.tensor([2, 6])

attention = DotProductAttention(dropout=0.5)
attention.eval()  # switch the module to evaluation mode before the check
# Expected result: queries' leading two dims with values' last dim, i.e. (2, 10, 4).
assert attention(queries, keys, values).shape == (
    queries.shape[0],
    queries.shape[1],
    values.shape[-1],
)


In [5]:
# Test MHA: self-attention over X must hand back a tensor shaped like X.
d_model = 100
num_heads = 5
batch_size = 2
attention = MultiHeadAttention(d_model, d_model, d_model, num_heads)

# inlen is defined in the PFFN test cell earlier in the notebook.
X = torch.ones(batch_size, inlen, d_model)

# NOTE(review): the fourth argument presumably selects the masking mode —
# confirm the accepted values against MultiHeadAttention.
assert attention(X, X, X, "None").shape == X.shape
assert attention(X, X, X, "future").shape == X.shape

In [16]:
# Visualize shaping of attention and masks
# Toy self-attention on random (2, 5, 8) tensors with a causal ("future") mask.
q = torch.rand((2,5,8))
v = torch.rand((2,5,8))
k = q  # keys are the queries themselves, so the raw scores are q @ q^T

K = q.shape[1]  # Sequence length

# Scale factor sqrt(d_k): d_k is the size of the last (feature) axis that the
# dot product sums over, not the sequence length on axis 1.
dk = q.shape[-1]**.5
I = (q@k.transpose(1,2))  # raw scores, shape (2, K, K)

# True strictly above the diagonal: position i must not attend to j > i.
future_mask = torch.triu(torch.ones((K, K)), diagonal=1).bool()
# -inf scores become exactly 0 after the softmax.
I = I.masked_fill(future_mask, float('-inf'))
a = nn.functional.softmax(I/dk, dim=-1)@v
print(f"I = {I}")
print(f"Attention = {a}")


I = tensor([[[2.8006,   -inf,   -inf,   -inf,   -inf],
         [1.2165, 2.0385,   -inf,   -inf,   -inf],
         [2.1119, 1.6105, 2.7295,   -inf,   -inf],
         [2.4833, 2.5296, 2.7958, 4.0144,   -inf],
         [2.4044, 1.8091, 1.8190, 2.8045, 3.2882]],

        [[1.0388,   -inf,   -inf,   -inf,   -inf],
         [0.7718, 1.0677,   -inf,   -inf,   -inf],
         [1.4157, 1.3924, 2.8830,   -inf,   -inf],
         [0.9099, 0.6977, 1.2599, 1.2861,   -inf],
         [1.3134, 1.2860, 1.9563, 0.8996, 2.5729]]])
Attention = tensor([[[0.9416, 0.8047, 0.8595, 0.8868, 0.8382, 0.5065, 0.6264, 0.4394],
         [0.5185, 0.8382, 0.7014, 0.8138, 0.5614, 0.3301, 0.3265, 0.6241],
         [0.5208, 0.4814, 0.5057, 0.6066, 0.5135, 0.4826, 0.3880, 0.5943],
         [0.4516, 0.5290, 0.6525, 0.6475, 0.5607, 0.3722, 0.5285, 0.3947],
         [0.6134, 0.6087, 0.6090, 0.6030, 0.4233, 0.4834, 0.5184, 0.4084]],

        [[0.5517, 0.2241, 0.8043, 0.6366, 0.1877, 0.9753, 0.4362, 0.0721],
         [0.3938, 

In [17]:
class EncoderBlock(nn.Module):
    """Transformer Encoder Block from Attention is All You Need. 
    Layer 1: MHA
    Layer 2: Layer normalization + residual connection
    Layer 3: Positionwise FFN 
    Layer 4: Layer normalization + residual connection
    
    Parameters 
        ----------
        d_model:
            Dimension model's latent space
        q:
            Query tensor shape (batch_size, K, d_model)
        k:
            Key tensor shape (batch_size, K, d_model)
        v:
            Value tensor shape (batch_size, K, d_model)

        Returns
        ----------
        Self attention:
            Tensor shape (batch_size, K, d_model)

    """