[参考：The Annotated Transformer](http://nlp.seas.harvard.edu/annotated-transformer/)

In [2]:
import copy
import math
import time
import numpy as np
import matplotlib.pyplot as plt
import pdb
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embeddings
class Embeddings(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab: Number of entries in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
        # Embedding dimension; kept so forward() can derive the scale factor.
        self.d_model = d_model

    def forward(self, x):
        """Return the embeddings of the indices in ``x``, scaled by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.lut(x)
        return looked_up * scale

def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every layer its own parameters (same initial values).
        copies.append(copy.deepcopy(module))
    return copies

![](https://ai-studio-static-online.cdn.bcebos.com/f0f23768fde44668862580e5121e4a418a456ef7082c47e0b7f57864c17e3898)
![](https://ai-studio-static-online.cdn.bcebos.com/8ae4070f176e44fc9617562b8aef4ed9c41026cd5f8e4e63ab274b392af54cef)

In [5]:
# Scaled Dot-Product Attention
'''
注意力机制；可以参考网上的一些讲解： 如这里提供的：经典论文复现Transformer(理论篇)

对于单头注意力机制，简单理解过程如下：下面论述中，小写的qkv 表示某一个词向量对应的qkv， 大写的QKV表示整个输入句子对应的QKV

1. 输入是一句话，一句话中有很多单词，句子的长度为 length
2. embedding：句子是文本，计算机无法直接处理，需要把每个字转换为词向量，这就是 word_embedding， 每个词向量的维度是 d_model
3. 这样输入 input 的维度是： [batch, length, d_model]
4. 每个词都会生成 Q,K,V; 其中 Wq,Wk,Wv 的 shape 为 [d_model, d_k], [d_model, d_k], [d_model, d_v]
   Q = input * Wq
   K = input * Wk
   V = input * Wv
  Q:[batch, length, d_k]; K:[batch, length, d_k]; V:[batch, length, d_v]
5. 得到的 Q K V 就是上面第一个图的输入
   Q 乘以 K 的transpose，然后除以 d_k 的算术平方根， 结果的 shape 为 [batch, length, length]
   然后再乘以 V， 最后的shape 为 [batch, length, d_v]
'''
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    The scores are ``query @ key^T / sqrt(d_k)`` where ``d_k`` is the size of
    the last dimension of ``query``; a softmax over the last dimension turns
    them into weights that are then applied to ``value``.

    Args:
        query: Query tensor; expected as [B, L, d_k] (single head) or
            [B, h, L, d_k] (multi-head).
        key: Key tensor with the same layout as ``query``.
        value: Value tensor, [B, L, d_v] or [B, h, L, d_v].
        mask: Optional tensor broadcastable to the score shape. Positions
            where ``mask == 0`` are excluded from attention.
        dropout: Optional callable (e.g. ``nn.Dropout``) applied to the
            attention weights.

    Returns:
        A tuple ``(output, p_attn)``: the weighted values ([B, L, d_v] or
        [B, h, L, d_v]) and the attention weights ([B, L, L] or [B, h, L, L]).
    """
    d_k = query.size(-1)
    # Single head: scores [B, L, L]; multi-head: scores [B, h, L, L].
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Use the most negative finite value of the score dtype instead of a
        # hard-coded -1e9: -1e9 is not representable in float16, while the
        # softmax result for float32/float64 is unchanged (masked weights
        # still underflow to exactly zero).
        fill_value = torch.finfo(scores.dtype).min
        scores = scores.masked_fill(mask == 0, fill_value)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


'''
但是论文里面的是多头注意力， 如第二幅图.
如果是多头注意力:
沿着上述单头注意力机制的理解：
1. 输入 input 依然是 [batch, length, d_model]
2. 每个词也都会生成 Q K V, 但由于是多头，所以 Q K V 的 shape 为 
    Q:[batch, head, length, d_k], K:[batch, head, length, d_k], V:[batch, head, length, d_v]
    d_model = head * d_k
3. 然后Q 乘以 K 的transpose，然后除以 d_k 的算术平方根， 结果的 shape 为 [batch, head, length, length]
   然后再乘以 V， 最后的shape 为 [batch, head, length, d_v]
4. 最后还原成输入的 shape ： [batch, head, length, d_v] -> [batch, length, d_model]

实际实现中，上述第二幅图中， Q,K,V对应的就是 input，和第一幅图中的 Q, K, V不一样，是为了方便计算，第一幅中的Q, K, V是为了方便大家理解。
Q, K, V都是输入，然后经过 linear 层，才得到上面所说的shape 为 [batch, head, length, d_k]的 Q,K,V

所以论文中第二幅图中的Q,K,V 都是输入， 对应上述中的第一步；
经过linear层后，得到上述第二步中的shape为 [batch, head, length, d_k] 的Q,K,V
这样做的目的是封装性更好。

'''
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from ``h`` scaled dot-product heads.

    The query/key/value inputs are expected as [batch, L, d_model] (in
    essence the word vectors of the input sentence); the output has the same
    [batch, L, d_model] layout.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Store the head count and model size and create the projections.

        Args:
            h: Number of attention heads; must divide ``d_model``.
            d_model: Model (embedding) dimension.
            dropout: Dropout probability applied to the attention weights.
        """
        super().__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # Three input projections (query, key, value) plus the output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, and merge the heads again.

        Args:
            query, key, value: Input tensors, expected as [batch, L, d_model].
            mask: Optional mask; a head axis is inserted at dim 1 so the same
                mask is applied to all ``h`` heads.

        Returns:
            Tensor of shape [batch, L, d_model].
        """
        batch_size = query.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1)

        def split_heads(projection, tensor):
            # [batch, L, d_model] -> [batch, h, L, d_k], with d_model = h * d_k.
            projected = projection(tensor)
            return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        # 1) Linear projections, i.e. the per-token q, k and v for every head.
        q_proj, k_proj, v_proj, out_proj = self.linears
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # 2) Attention on all heads at once:
        #    context [batch, h, L, d_v], self.attn [batch, h, L, L].
        context, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) "Concatenate" the heads back into the input layout:
        #    [batch, h, L, d_v] -> [batch, L, h, d_v] -> [batch, L, d_model].
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        del query, key, value

        # Final output projection, still [batch, L, d_model].
        return out_proj(merged)


In [20]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable gain and bias.

    Note: ``eps`` is added to the standard deviation (not to the variance) and
    the standard deviation comes from ``Tensor.std`` with its default settings,
    so outputs are close to, but not bit-identical with, ``nn.LayerNorm``.

    Args:
        features: Size of the last (normalised) dimension.
        eps: Small constant added to the standard deviation to avoid
            division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias, starts at 0
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gain and bias."""
        centered = x - x.mean(-1, keepdim=True)
        denom = x.std(-1, keepdim=True) + self.eps
        scaled = self.a_2 * centered
        return scaled / denom + self.b_2


class SublayerConnection(nn.Module):
    """Residual wrapper that normalises the input before the sublayer.

    Computes ``x + dropout(sublayer(norm(x)))``; for code simplicity the norm
    is applied first rather than last.

    Args:
        size: Feature size handed to the layer norm.
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalised ``x`` and add the residual.

        ``sublayer`` must return a tensor that can be added to ``x``.
        """
        normed = self.norm(x)
        transformed = sublayer(normed)
        return x + self.dropout(transformed)


In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``w_2(dropout(relu(w_1(x))))``.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)