# 注意力机制的思想与原理

文字，全流程详细讲解，无代码 https://blog.csdn.net/benzhujie1245com/article/details/117173090

视频，原因讲解 https://www.bilibili.com/video/BV1dt4y1J7ov/

视频，讲解详细 https://www.bilibili.com/video/BV1v3411r78R/

文字，简单讲解，有代码 https://blog.csdn.net/qq_52785473/article/details/124537101

文字，简单讲解，有代码 https://blog.csdn.net/Datawhale/article/details/120320116


In [5]:
from IPython.display import Image
Image(url= "33.png")

In [2]:
Image(url= "34.png")

# transformer中的注意力机制的代码实现

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import copy

## transformer几个重要的功能函数

### masked_fill 掩码

mask（掩码、掩膜）是深度学习中的常见操作。简单而言，其相当于在原始张量上盖上一层掩膜，从而屏蔽或选择一些特定元素，因此常用于构建张量的过滤器（见下图）

按照上述定义，非线性激活函数Relu（根据输出的正负区间进行简单粗暴的二分）、dropout机制（根据概率进行二分）都可以理解为泛化的mask操作。

从任务适应性上，mask在图像和自然语言处理中都广为应用，其应用包括但不局限于：图像兴趣区提取、图像屏蔽、图像结构特征提取、语句padding对齐的mask、语言模型中sequence mask等。

从使用mask的具体流程上，其可以作用于数据的预处理（如原始数据的过滤）、模型中间层（如relu、drop等）和模型损失计算上（如padding序列的损失忽略）

https://aistudio.csdn.net/63aaf7f90d4fc52e3cfc4359.html

In [64]:
Image(url= "36.png")

In [4]:
tensor = torch.arange(0,16).view(4,4)
print('origin tensor:\n{}\n'.format(tensor))

mask = torch.eye(4,dtype=torch.bool)
print('mask tensor:\n{}\n'.format(mask))

tensor = tensor.masked_fill(mask,100)
print('filled tensor:\n{}'.format(tensor))


origin tensor:
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])

mask tensor:
tensor([[ True, False, False, False],
        [False,  True, False, False],
        [False, False,  True, False],
        [False, False, False,  True]])

filled tensor:
tensor([[100,   1,   2,   3],
        [  4, 100,   6,   7],
        [  8,   9, 100,  11],
        [ 12,  13,  14, 100]])


### Dropout

Dropout是一种常用的正则化方法，通过随机将部分神经元的输出置为0来减少过拟合

In [8]:
m = nn.Dropout(p=0.2)
input = torch.randn(20, 16)
output = m(input)
print(input[0])
print(output[0])

#有一部分的值变为了0，这些值大约占据总数的0.2。
#其它非0参数都除以0.8，使得值变大了。

tensor([-1.6486,  0.9010, -2.5143,  0.2972,  2.9130, -0.6835,  1.0512,  1.4107,
         0.1304,  1.6109, -0.0490, -3.6601, -0.3592,  0.8066, -2.2930, -0.7336])
tensor([-2.0607,  1.1262, -3.1429,  0.3716,  3.6412, -0.8543,  1.3140,  0.0000,
         0.1630,  2.0137, -0.0613, -0.0000, -0.4490,  1.0083, -2.8662, -0.9170])


In [36]:
-0.6835/0.8

-0.854375

### view

view squeeze transpose比较

https://blog.csdn.net/lsb2002/article/details/132905346

* 通过手工指定，将一个一维tensor变换为3*8维的tensor

* 如果某个参数为-1，则表示该维度取决于其它维度，由Pytorch自己补充

* 将tensor展平成一维

In [39]:
#通过手工指定，将一个一维tensor变换为3*8维的tensor

a1 = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
                   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
 
a2 = a1.view(4, 6)
print(a1)
print(a2)
print(a1.shape)
print(a2.shape)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24])
tensor([[ 1,  2,  3,  4,  5,  6],
        [ 7,  8,  9, 10, 11, 12],
        [13, 14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23, 24]])
torch.Size([24])
torch.Size([4, 6])


In [13]:
#如果某个参数为-1，则表示该维度取决于其它维度，由Pytorch自己补充
a3 = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
 
a4 = a3.view(4, -1)
a5 = a3.view(2, 3, -1)
a6 = a3.view(-1, 3, 2)
 
print(a3)
print(a4)
print(a5)
print(a6)
print(a3.shape)
print(a4.shape)
print(a5.shape)
print(a6.shape)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24])
tensor([[ 1,  2,  3,  4,  5,  6],
        [ 7,  8,  9, 10, 11, 12],
        [13, 14, 15, 16, 17, 18],
        [19, 20, 21, 22, 23, 24]])
tensor([[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]],

        [[13, 14, 15, 16],
         [17, 18, 19, 20],
         [21, 22, 23, 24]]])
tensor([[[ 1,  2],
         [ 3,  4],
         [ 5,  6]],

        [[ 7,  8],
         [ 9, 10],
         [11, 12]],

        [[13, 14],
         [15, 16],
         [17, 18]],

        [[19, 20],
         [21, 22],
         [23, 24]]])
torch.Size([24])
torch.Size([4, 6])
torch.Size([2, 3, 4])
torch.Size([4, 3, 2])


In [14]:
# 将tensor展平成一维
 
a7 = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                   [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]])
a8 = a6.view(-1)
print(a7)
print(a8)
print(a7.shape)
print(a8.shape)

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
        [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]])
tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24])
torch.Size([2, 12])
torch.Size([24])


### register_buffer

https://blog.csdn.net/devil_son1234/article/details/130699031

* Parameter与Buffer

模型保存下来的参数有两种：一种是需要更新的Parameter，另一种是不需要更新的buffer。在模型中，利用backward反向传播，可以通过requires_grad来得到buffer和parameter的梯度信息，但是利用optimizer进行更新的是parameter，buffer不会更新，这也是两者最重要的区别。这两种参数都存在于model.state_dict()的OrderedDict中，也会随着模型“移动”（model.cuda()）。

In [10]:
class my_model(nn.Module):
    def __init__(self):
        super(my_model, self).__init__()
        self.conv = nn.Conv2d(1, 1, 3, 1, 1)
        self.tensor = torch.randn(size=(1, 1, 5, 5))
        self.register_buffer('my_buffer', self.tensor)
 
    def forward(self, x):
        return self.conv(x) + self.my_buffer  # 这里不再是self.tensor
 
 
x = torch.randn(size=(1, 1, 5, 5))
x = x.to('cuda')
model = my_model().cuda()
model(x)
print(model.state_dict())
print('..........')
print(model.tensor)
print(model.my_buffer)

OrderedDict([('my_buffer', tensor([[[[-1.4281, -2.0015,  0.5312,  1.1952, -0.7626],
          [ 0.0796, -1.5467,  1.0079, -0.4127, -0.0910],
          [ 0.3273, -0.5740,  0.7187,  0.6561, -1.1638],
          [-0.1531,  1.1297,  0.7064,  1.2909,  0.3901],
          [ 0.7776,  0.5719,  0.8909,  0.0940,  0.8989]]]], device='cuda:0')), ('conv.weight', tensor([[[[ 0.2759, -0.3194,  0.2303],
          [ 0.0626, -0.3022,  0.3288],
          [-0.1622,  0.0643, -0.2257]]]], device='cuda:0')), ('conv.bias', tensor([0.1275], device='cuda:0'))])
..........
tensor([[[[-1.4281, -2.0015,  0.5312,  1.1952, -0.7626],
          [ 0.0796, -1.5467,  1.0079, -0.4127, -0.0910],
          [ 0.3273, -0.5740,  0.7187,  0.6561, -1.1638],
          [-0.1531,  1.1297,  0.7064,  1.2909,  0.3901],
          [ 0.7776,  0.5719,  0.8909,  0.0940,  0.8989]]]])
tensor([[[[-1.4281, -2.0015,  0.5312,  1.1952, -0.7626],
          [ 0.0796, -1.5467,  1.0079, -0.4127, -0.0910],
          [ 0.3273, -0.5740,  0.7187,  0.6561, 

### Variable

https://blog.csdn.net/Mr_zhuo_/article/details/108132061

pytorch两个基本对象：Tensor（张量）和Variable（变量）其中，tensor不能反向传播，variable可以反向传播。

In [19]:
#Variable(torch.zeros(8, 4, 4))

### torch.matmul

https://blog.csdn.net/weixin_44225182/article/details/126655303


各个相乘函数的比较

https://blog.csdn.net/jizhidexiaoming/article/details/82502724

如果两个参数都是二维的，则返回矩阵-矩阵乘积
也就是 正常的矩阵乘法 (m * n) * (n * k) = (m * k)

In [9]:
tensor1 = torch.Tensor([[1,2,3],
                        [1,2,3]])
tensor2 =torch.Tensor([[4,5],
                       [4,5],
                       [4,5]])
ans = torch.matmul(tensor1, tensor2)

print('ans :', ans)
print('ans.size :', ans.size())


ans : tensor([[24., 30.],
        [24., 30.]])
ans.size : torch.Size([2, 2])


### nn.ModuleList()

https://blog.csdn.net/AdamCY888/article/details/131270295

nn.ModuleList() 是 PyTorch 中的一个类，用于管理神经网络模型中的子模块列表。它允许我们将多个子模块组织在一起，并将它们作为整个模型的一部分进行管理和操作。


In [17]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

        self.module_list = nn.ModuleList([
            nn.Linear(10, 20),
            nn.ReLU(),
            nn.Linear(20, 10),
        ])

    def forward(self, x):
        for module in self.module_list:
            x = module(x)
        return x

model = MyModel()
input_tensor = torch.randn(32, 10)
output_tensor = model(input_tensor)


### contiguous

https://blog.csdn.net/m0_48241022/article/details/132804698

张量的连续性、contiguous函数

在pytorch中，tensor的实际数据以一维数组（storage）的形式存储于某个连续的内存中，以“行优先”进行存储
 tensor连续（contiguous）是指tensor的storage元素排列顺序与其按行优先时的元素排列顺序相同


 tensor不连续会导致某些操作无法进行，比如view()就无法进行。在上面的例子中：由于 b 是不连续的，所以对其进行view()操作会报错；b.view(3,3)没报错，因为b本身的shape就是(3,3)。

  tensor.contiguous()返回一个与原始tensor有相同元素的 “连续”tensor，如果原始tensor本身就是连续的，则返回原始tensor。

In [7]:
Image(url= "37.png")

In [3]:
a = torch.tensor([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])
print(a)
print(a.storage())
print(a.is_contiguous())  # a是连续的

 
b = a.t()  # b是a的转置
print(b)
print(b.storage())
print(b.is_contiguous())  # b是不连续的

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 9]
True
tensor([[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 9]
False


  print(a.storage())


In [8]:
c = b.contiguous()
print(b)
print(c)
print(b.storage())
print(c.storage())

tensor([[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]])
tensor([[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 9]
 1
 4
 7
 2
 5
 8
 3
 6
 9
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 9]


## transformer 主要类

### 词嵌入



In [40]:
#词嵌入
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
    # d_model:词嵌入维度
    # vocab:字典大小
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)


In [41]:
d_model = 512  # embedding_size
vocab = 1000  # 词典大小
x=torch.tensor([[100, 2, 421, 508], [491, 998, 1, 221]], dtype=torch.long)
emb = Embeddings(d_model, vocab)
embr = emb(x)
print(embr.shape)


torch.Size([2, 4, 512])


In [42]:
embr

tensor([[[  4.4606,  34.4301, -24.5994,  ...,  17.1663,  28.1237,  16.9237],
         [  5.1524,   9.8639,  31.0702,  ...,   3.0191,   9.8241,  26.0486],
         [ -6.2616,   7.7158,  23.8920,  ...,  -9.6690,  17.4992,  22.7776],
         [ 14.7686,  -5.9697,  17.2761,  ..., -19.2139, -36.0296, -19.4070]],

        [[-15.1041,  16.8019,  11.4768,  ...,  17.9029,  27.7799,   1.4143],
         [-17.6204,   1.9429,  16.7623,  ..., -51.7329, -12.3380,   1.6418],
         [ 13.7802,  -3.5118,   5.1110,  ...,   9.2191, -26.6751,  28.3472],
         [-21.7961, -42.0103, -23.3687,  ...,   6.7964, -30.1595,  46.5827]]],
       grad_fn=<MulBackward0>)

### 位置编码

In [44]:
#位置编码
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
    # d_model:词嵌入维度
    # dropout:置零比率
    # max_len:每个句子最大的长度
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0,  max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(1000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
        return self.dropout(x)

In [45]:
dropout = 0.1
max_len = 60
pe = PositionalEncoding(d_model, dropout, max_len)
pe_result = pe(embr)
print(pe_result.shape)

torch.Size([2, 4, 512])


In [46]:
pe_result

tensor([[[  4.9562,  39.3668, -27.3327,  ...,  20.1848,  31.2486,  19.9152],
         [  6.6598,  11.5602,  35.4411,  ...,   4.4657,  10.9168,   0.0000],
         [ -5.9470,   8.1107,  27.5802,  ...,  -9.6322,   0.0000,  26.4196],
         [ 16.5664,  -7.7330,  19.4397,  ..., -20.2377, -40.0294,  -0.0000]],

        [[-16.7823,  19.7799,  12.7519,  ...,  21.0032,   0.0000,   0.0000],
         [ -0.0000,   2.7591,  19.5435,  ...,  -0.0000, -13.7077,   2.9353],
         [ 16.3216,  -4.3644,   6.7124,  ...,  11.3546, -29.6367,  32.6080],
         [-24.0611, -47.7781, -25.7211,  ...,   8.6627, -33.5071,  52.8697]]],
       grad_fn=<MulBackward0>)

### 多头自注意力机制

In [50]:
#mask == 0

In [34]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)

    if dropout is not None:
        p_attn = dropout(p_attn)
    
    return torch.matmul(p_attn, value), p_attn

# 深层拷贝
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class MultiHeadedAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout=0.1):
        # head:代表几个头
        # embedding_dim:词嵌入维度
        # dropout:置0比率
        super(MultiHeadedAttention, self).__init__()

        # 确认embedding_dim能够被head整除
        assert embedding_dim % head == 0
        self.head = head
        self.d_k = embedding_dim // head
        # 获得4个线性层， 分别是Q、K、V、以及最终的输出的线形层
        self.linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(0)
        
        batch_size = query.size(0)

        print(query.shape)
        
        print(len(self.linears))

        # 经过线性层投影后分成head个注意力头
        query, key, value = [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2) for model, x in zip(self.linears, (query, key, value))]
        # 各自计算每个头的注意力
        print(query.shape)
        
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        # 转换回来
        print(x.shape)
        
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.d_k)
        # 经过最后一个线性层得到最终多头注意力机制的结果
        return self.linears[-1](x)


In [29]:
for i in zip([1,2,3,4],[2,2,2]):
    print(i)

(1, 2)
(2, 2)
(3, 2)


In [52]:
#mask

In [53]:
512/8

64.0

In [51]:
head = 8
embedding_dim = 512
dropout = 0.2
query = key = value = pe_result
mask = Variable(torch.zeros(8, 4, 4))
mha = MultiHeadedAttention(head, embedding_dim, dropout)
mha_result = mha(query, key, value, mask)
print(mha_result)
print(mha_result.shape)


torch.Size([2, 4, 512])
4
torch.Size([2, 8, 4, 64])
torch.Size([2, 8, 4, 64])
tensor([[[-8.2004,  0.5804,  1.3639,  ...,  1.7793, -2.9470, -1.8913],
         [-6.3798,  2.6990,  3.7775,  ...,  1.6971, -2.1978, -0.7955],
         [-6.0106,  2.9720,  4.2135,  ..., -2.0315, -5.0306, -4.9978],
         [-6.6885,  0.0595,  1.7017,  ...,  2.5037, -2.7890,  2.4970]],

        [[-2.7299, -2.4048,  3.6923,  ..., -7.7733,  1.3931,  1.7657],
         [ 3.8099, -1.5517,  0.7698,  ..., -3.5534,  0.2886,  2.9241],
         [-2.1036, -3.9115, -4.4982,  ..., -4.2154,  2.1434,  2.2444],
         [ 1.0188, -1.2110,  0.7608,  ..., -3.6656,  6.5101,  4.7269]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


In [27]:
query = key = value = pe_result
mask = Variable(torch.zeros(2, 4, 4))
attn, p_attn = attention(query, key, value,mask=mask)
# print(attn)
# print(attn.shape)
# print(p_attn)
# print(p_attn.shape)


### 前馈全连接层（PositionwiseFeedForward）

考虑注意力机制可能对复杂的情况拟合程度不够，因此增加两层网络来增强模型的能力。

前馈全连接层就是两次线性层+Relu激活

In [61]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.w2(self.dropout(F.relu(self.w1(x))))


In [62]:
d_model = 512
d_ff = 64
dropout = 0.2
x = mha_result
ff = PositionwiseFeedForward(d_model, d_ff, dropout=dropout)
ff_result = ff(x)
print(ff_result)
print(ff_result.shape)

tensor([[[ 1.1700e+00, -8.7976e-02, -1.8591e+00,  ..., -7.9013e-01,
          -1.3219e+00, -9.0196e-01],
         [-3.7141e-01,  6.4589e-01, -9.5103e-01,  ...,  7.5773e-01,
          -1.1638e+00, -2.1573e-01],
         [ 1.3626e+00, -4.1240e-01, -8.5201e-01,  ...,  1.1944e+00,
          -9.4036e-01,  4.4812e-01],
         [ 1.2279e-01, -5.7696e-01, -1.8156e+00,  ..., -3.2551e-01,
          -1.9518e+00,  6.0685e-01]],

        [[-1.6618e+00, -2.1277e+00, -1.0500e+00,  ..., -5.7305e-01,
           5.0484e-01, -4.1263e-01],
         [-1.0694e+00, -1.1956e+00, -1.0528e+00,  ..., -3.3320e-01,
          -7.3139e-01, -9.6569e-01],
         [-2.0961e+00, -2.2061e-01, -8.0619e-01,  ..., -2.7707e-03,
          -9.8822e-01, -4.2281e-01],
         [-2.9646e+00, -8.1585e-01, -1.3249e+00,  ...,  6.8580e-01,
          -6.6974e-01,  8.2907e-01]]], grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


### LayerNorm 规范化层

BatchNorm简单来说就是对一批样本按照每个特征维度进行归一化

Layer Norm是对每个单词的Embedding做归一化

https://blog.csdn.net/qq_43827595/article/details/121877901

https://liumin.blog.csdn.net/article/details/85075706

In [61]:
Image(url= "35.png")

In [64]:
x

tensor([[[-8.2004,  0.5804,  1.3639,  ...,  1.7793, -2.9470, -1.8913],
         [-6.3798,  2.6990,  3.7775,  ...,  1.6971, -2.1978, -0.7955],
         [-6.0106,  2.9720,  4.2135,  ..., -2.0315, -5.0306, -4.9978],
         [-6.6885,  0.0595,  1.7017,  ...,  2.5037, -2.7890,  2.4970]],

        [[-2.7299, -2.4048,  3.6923,  ..., -7.7733,  1.3931,  1.7657],
         [ 3.8099, -1.5517,  0.7698,  ..., -3.5534,  0.2886,  2.9241],
         [-2.1036, -3.9115, -4.4982,  ..., -4.2154,  2.1434,  2.2444],
         [ 1.0188, -1.2110,  0.7608,  ..., -3.6656,  6.5101,  4.7269]]],
       grad_fn=<ViewBackward0>)

In [71]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a2 = nn.Parameter(torch.ones(features))
        self.b2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(-1, keepdim = True)
        std = x.std(-1, keepdim = True)
        return self.a2 * (x - mean) / (std + self.eps) + self.b2

In [70]:
ln = LayerNorm(512)
lnn = nn.LayerNorm(512)
ln_result = ln(x)
lnn_result = lnn(x)
print(ln_result)
print(lnn_result)

tensor([[[-1.9218,  0.0708,  0.2487,  ...,  0.3429, -0.7296, -0.4901],
         [-1.5170,  0.5434,  0.7881,  ...,  0.3160, -0.5679, -0.2497],
         [-1.5732,  0.7361,  1.0552,  ..., -0.5502, -1.3212, -1.3128],
         [-1.5733, -0.0602,  0.3080,  ...,  0.4879, -0.6989,  0.4864]],

        [[-0.6454, -0.5691,  0.8614,  ..., -1.8287,  0.3220,  0.4094],
         [ 0.8695, -0.3313,  0.1886,  ..., -0.7796,  0.0808,  0.6711],
         [-0.5172, -0.9272, -1.0602,  ..., -0.9961,  0.4457,  0.4686],
         [ 0.1812, -0.2806,  0.1277,  ..., -0.7889,  1.3183,  0.9490]]],
       grad_fn=<AddBackward0>)
tensor([[[-1.9237,  0.0709,  0.2489,  ...,  0.3433, -0.7304, -0.4906],
         [-1.5185,  0.5439,  0.7889,  ...,  0.3163, -0.5685, -0.2499],
         [-1.5747,  0.7368,  1.0563,  ..., -0.5508, -1.3225, -1.3141],
         [-1.5748, -0.0603,  0.3083,  ...,  0.4883, -0.6996,  0.4868]],

        [[-0.6460, -0.5697,  0.8623,  ..., -1.8305,  0.3223,  0.4098],
         [ 0.8704, -0.3317,  0.1888,  ..

### 子层连接结构(SublayerConnection)

Add&Norm

In [59]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout) 
        self.size = size
    
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))


In [60]:
size = 512
dropout = 0.2
head = 8
d_model = 512
x = pe_result
mask = Variable(torch.zeros(8, 4, 4))
self_attn = MultiHeadedAttention(head, d_model)
sublayer = lambda x: self_attn(x, x, x, mask)
sc = SublayerConnection(size, dropout)
sc_result = sc(x, sublayer)
print(sc_result)
print(sc_result.shape)


tensor([[[ 0.3507],
         [ 0.6553],
         [-1.0369],
         [ 0.2879]],

        [[ 0.2382],
         [-0.2214],
         [ 0.7459],
         [-0.4876]]], grad_fn=<MeanBackward1>)
tensor([[[24.9129],
         [23.3701],
         [24.4109],
         [22.9386]],

        [[24.8099],
         [24.7052],
         [24.3432],
         [23.9425]]], grad_fn=<StdBackward0>)
Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1