# 加性注意力例子

加性注意力分数计算：

$$a(\mathbf q, \mathbf k) = \mathbf w_v^\top \text{tanh}(\mathbf W_q\mathbf q + \mathbf W_k \mathbf k) \in \mathbb{R},$$

where the learnable parameters are

$\mathbf W_q\in\mathbb R^{h\times q}$, $\mathbf W_k\in\mathbb R^{h\times k}$, and $\mathbf w_v\in\mathbb R^{h}$.

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

from lib.d2l_torch import masked_softmax

## 加性注意力例子

- batch size 为 2，对于每一个 batch，有：
  - 两个 `query`，每个的长度为 $20$；
  - $10$ 个 `key-value pair`：
    - 其中 `key` 的长度为 $2$；
    - 其中 `value` 的长度为 $4$。

假设隐藏层大小 `h` 为 $16$，也就是 `query` 和 `key` 都会被线性变换为长度为 $16$ 的向量。

In [3]:
# Random queries and all-ones keys; the leading axis of every tensor is the batch size.
queries = torch.normal(0, 1, size=(2, 2, 20))
keys = torch.ones(2, 10, 2)
# Both value matrices in the minibatch are identical: 0..39 laid out as (10, 4).
values = (
    torch.arange(40, dtype=torch.float32)
    .reshape(1, 10, 4)
    .repeat(2, 1, 1)
)
# Print the shapes; the first entry of each one is the batch size.
print(queries.shape, keys.shape, values.shape)

torch.Size([2, 2, 20]) torch.Size([2, 10, 2]) torch.Size([2, 10, 4])


In [4]:
# Apply W_q and W_k: project queries and keys to vectors of length h.
# LazyLinear is given only the output width here.
num_hiddens = 16
W_k = nn.LazyLinear(out_features=num_hiddens, bias=False)
W_q = nn.LazyLinear(out_features=num_hiddens, bias=False)

# Keys are projected first, then queries (same call order as before).
_keys = W_k(keys)
_queries = W_q(queries)
print(_queries.shape, _keys.shape)  # shapes of the projected queries and keys



torch.Size([2, 2, 16]) torch.Size([2, 10, 16])


In [5]:
# Compute W_q q + W_k k_i: every query has to be added to every key.
# _queries: (batch_size, no. of queries, num_hiddens)
#        -> (batch_size, no. of queries, 1, num_hiddens)
# _keys:    (batch_size, no. of key-value pairs, num_hiddens)
#        -> (batch_size, 1, no. of key-value pairs, num_hiddens)
# Indexing with None inserts the singleton axes; broadcasting then forms all pairs.
wq_wk = _queries[:, :, None, :] + _keys[:, None, :, :]
print(wq_wk.shape)  # two queries, each paired with 10 keys, giving a length-16 vector per pair

torch.Size([2, 2, 10, 16])


In [6]:
# Element-wise tanh of the summed projections; the shape is unchanged.
features = wq_wk.tanh()
print(features.shape)

torch.Size([2, 2, 10, 16])


In [7]:
# Compute w_v^T tanh(W_q q + W_k k): each (query, key) pair yields one scalar.
w_v = nn.LazyLinear(out_features=1, bias=False)
scores = w_v(features)
print('Before Squeeze', scores.shape)
scores.squeeze_(-1)  # drop the trailing size-1 axis in place
print('After Squeeze', scores.shape)
# Scores of the first query (first batch element) against all keys.
first_query_scores = scores[0][0]
print(first_query_scores.shape)  # shape
print(first_query_scores)  # values

Before Squeeze torch.Size([2, 2, 10, 1])
After Squeeze torch.Size([2, 2, 10])
torch.Size([10])
tensor([0.3273, 0.3273, 0.3273, 0.3273, 0.3273, 0.3273, 0.3273, 0.3273, 0.3273,
        0.3273], grad_fn=<SelectBackward0>)


In [8]:
# Normalise the scores along the key axis (dim 2) with softmax.
attention_weight = torch.softmax(scores, dim=2)
print(attention_weight.shape)
print(attention_weight[0][0])

torch.Size([2, 2, 10])
tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000], grad_fn=<SelectBackward0>)


In [9]:
# Apply an attention mask so that some key positions are excluded.
# The first batch element looks at its first 2 keys, the second at its first 6.
valid_lens = torch.tensor([2, 6])
attention_weight = masked_softmax(scores, valid_lens)

print(attention_weight.shape)
print(attention_weight[0][0])

torch.Size([2, 2, 10])
tensor([0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], grad_fn=<SelectBackward0>)


## 定义「加性注意力」

In [31]:
class AdditiveAttention(nn.Module):
    """Additive attention.

    Queries and keys are each projected to ``num_hiddens`` features, summed
    for every (query, key) pair, passed through ``tanh`` and reduced to one
    scalar score per pair by ``w_v``. The scores are normalised with
    ``masked_softmax`` and used to weight the values.

    Args:
        num_hiddens: Output width of the query and key projections.
        dropout: Probability given to ``nn.Dropout``, which is applied to
            the attention weights.
        **kwargs: Forwarded unchanged to the parent constructor.
    """

    def __init__(self, num_hiddens, dropout, **kwargs):
        super().__init__(**kwargs)
        # key features -> num_hiddens
        self.W_k = nn.LazyLinear(out_features=num_hiddens, bias=False)
        # query features -> num_hiddens
        self.W_q = nn.LazyLinear(out_features=num_hiddens, bias=False)
        # num_hiddens -> 1 (one scalar score)
        self.w_v = nn.LazyLinear(out_features=1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        """Return the attention-weighted combination of ``values``.

        Args:
            queries: (batch_size, no. of queries, query dimension).
            keys: (batch_size, no. of key-value pairs, key dimension).
            values: (batch_size, no. of key-value pairs, value dimension).
            valid_lens: Passed straight to ``masked_softmax``; it says how
                many key-value pairs are taken into account.

        Returns:
            The result of ``torch.bmm`` between the (dropout-applied)
            attention weights and ``values``. The un-dropped weights are
            kept on ``self.attention_weights``.
        """
        # Project the query first and then the key, both to num_hiddens.
        projected_q = self.W_q(queries)
        projected_k = self.W_k(keys)

        # Insert singleton axes so the sum broadcasts over every pair:
        #   queries -> (batch_size, no. of queries, 1, num_hiddens)
        #   keys    -> (batch_size, 1, no. of key-value pairs, num_hiddens)
        #   sum     -> (batch_size, no. of queries, no. of key-value pairs,
        #               num_hiddens)
        pairwise = projected_q.unsqueeze(2) + projected_k.unsqueeze(1)
        activated = torch.tanh(pairwise)

        # w_v has a single output, so the last axis has size 1 and is removed.
        # scores: (batch_size, no. of queries, no. of key-value pairs),
        # i.e. one score per key-value pair for each query.
        scores = self.w_v(activated).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)

        weights = self.dropout(self.attention_weights)
        return torch.bmm(weights, values)

In [33]:
# One query per batch element, of length 20.
# Ten keys per batch element, each of length 2.
# Ten values per batch element, each of length 4.
queries = torch.normal(0, 1, size=(2, 1, 20))
keys = torch.ones(2, 10, 2)
# Both value matrices in the minibatch are identical: 0..39 laid out as (10, 4).
values = (
    torch.arange(40, dtype=torch.float32)
    .reshape(1, 10, 4)
    .repeat(2, 1, 1)
)
# The first batch element looks at its first 2 pairs, the second at its first 6.
valid_lens = torch.tensor([2, 6])

attention = AdditiveAttention(num_hiddens=8, dropout=0.1)
attention.eval()  # evaluation mode, so the dropout layer is inactive
attention(queries, keys, values, valid_lens)

tensor([[[ 2.0000,  3.0000,  4.0000,  5.0000]],

        [[10.0000, 11.0000, 12.0000, 13.0000]]], grad_fn=<BmmBackward0>)