In [1]:
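# Warm-up: the self-attention mechanism.
# Q, K and V are all obtained from the same input X; the sequence length is unchanged [32,512]. What is the dimension of a single token vector?
# How is multiplying Q (q1, q2, q3) with K (every key: k1, k2, k3) expressed? As Q times the transpose of K (Q @ K^T), which gives the relevance of each q_i (e.g. q1) to every k_j.
# The relevance scores are divided by sqrt(d_k) and normalized with softmax to obtain probability weights (a1, a2, a3).
# The output is the weighted sum under those probabilities. Note that it is NOT a1 * q1 + a2 * q2 + a3 * q3: the weights are applied to V, i.e. a1 * v1 + a2 * v2 + a3 * v3.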

In [15]:
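# Implementation of multi-head self-attention
# Input  x: [B,T,D]
# Output y: [B,T,D]; each element of the output sequence carries information about its relation to the other elements of the length-T sequence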
import torch
import torch.nn.functional as F
from torch import  layer_norm,nn
import math

class MySelfAttention(nn.Module):
    """Multi-head self-attention over a sequence, with a LayerNorm on the input.

    The input is layer-normalized once, projected to queries, keys and values
    by three bias-free linear layers, split into ``num_head`` heads and
    combined with scaled dot-product attention. There is no output projection
    and no residual connection: the concatenated head outputs are returned
    as they are.

    Args:
        latent_dim: Size ``D`` of the last input dimension. Must be divisible
            by ``num_head``.
        num_head: Number of attention heads ``H``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``num_head`` is not positive or does not divide
            ``latent_dim``.
    """

    def __init__(self,latent_dim,num_head,dropout):
        super().__init__()
        # Fail early: otherwise D // H silently floors in the scale factor and
        # the per-head .view() only blows up later inside forward().
        if num_head <= 0 or latent_dim % num_head != 0:
            raise ValueError(
                f"latent_dim ({latent_dim}) must be divisible by a positive "
                f"num_head ({num_head})"
            )
        self.num_head = num_head
        self.query = nn.Linear(latent_dim,latent_dim, bias=False)
        self.key = nn.Linear(latent_dim,latent_dim, bias=False)
        self.value = nn.Linear(latent_dim,latent_dim, bias=False)
        self.norm = nn.LayerNorm(latent_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Apply multi-head self-attention.

        Args:
            x: Tensor of shape ``[B, T, D]`` with ``D == latent_dim``.

        Returns:
            Tensor of shape ``[B, T, D]``. Along the last axis the ``H`` head
            outputs (each of width ``D // H``) are concatenated head by head.
        """
        B, T, D = x.shape
        H = self.num_head
        head_dim = D // H

        # 1. Compute Q, K, V.
        # The three projections share the same normalized input, so normalize
        # once instead of once per projection.
        x_norm = self.norm(x)
        # Each projection is [B,T,D]; split the feature axis into heads: [B,T,H,D'].
        Q = self.query(x_norm).view(B,T,H,head_dim)
        K = self.key(x_norm).view(B,T,H,head_dim)
        V = self.value(x_norm).view(B,T,H,head_dim)

        # 2. Attention weights.
        # attention[b,n,m,h] = <Q[b,n,h,:], K[b,m,h,:]> / sqrt(D'); n indexes
        # queries and m indexes keys, so softmax runs over the key axis (dim=2).
        attention = torch.einsum('bnhd,bmhd->bnmh',Q,K) / math.sqrt(head_dim)
        weight = self.dropout(F.softmax(attention,dim=2))

        # 3. Weighted sum of V.
        # The result must be laid out as [B,T,H,D'] ('bnhd') before flattening
        # to [B,T,D]. Emitting 'bndh' also reshapes without error, but it
        # interleaves the features of different heads instead of concatenating
        # the heads, which is inconsistent with how Q/K/V were split above.
        y = torch.einsum('bnmh,bmhd->bnhd',weight,V).reshape(B,T,D)
        return y

# Dummy batch: 32 sequences, 100 time steps each, 512-dimensional embeddings.
input_tensor = torch.randn(32, 100, 512)

# Self-attention smoke test (currently disabled):
# sa = MySelfAttention(512,8,0.1)
# output_tensor = sa(input_tensor)
# print(output_tensor)

# Global average pooling.
# nn.AvgPool1d pools along the last axis, so with kernel_size=512 every
# 512-wide embedding is collapsed to its mean: [32,100,512] -> [32,100,1].
gap = nn.AvgPool1d(kernel_size=512)
output_tensor = gap(input_tensor)

# Adaptive pooling variants kept for reference (disabled):
# gaap = nn.AdaptiveAvgPool1d(1)
# aap = nn.AdaptiveAvgPool1d(256)
# output_tensor_2 = gaap(input_tensor)

# Small MLP on the pooled scalar: 1 -> 512 (ReLU) -> 512 (sigmoid).
fc1 = nn.Linear(in_features=1, out_features=512)
relu = nn.ReLU()
fc2 = nn.Linear(in_features=512, out_features=512)
sigmoid = nn.Sigmoid()

y = fc1(output_tensor)
y = relu(y)
y = fc2(y)
y = sigmoid(y)
print(y)

tensor([[[0.5328, 0.5225, 0.4589,  ..., 0.4284, 0.5289, 0.5732],
         [0.5318, 0.5257, 0.4594,  ..., 0.4305, 0.5265, 0.5766],
         [0.5328, 0.5228, 0.4590,  ..., 0.4286, 0.5286, 0.5736],
         ...,
         [0.5331, 0.5204, 0.4583,  ..., 0.4274, 0.5306, 0.5706],
         [0.5333, 0.5184, 0.4578,  ..., 0.4268, 0.5321, 0.5686],
         [0.5329, 0.5220, 0.4588,  ..., 0.4282, 0.5293, 0.5726]],

        [[0.5327, 0.5229, 0.4590,  ..., 0.4286, 0.5285, 0.5737],
         [0.5328, 0.5227, 0.4590,  ..., 0.4285, 0.5287, 0.5734],
         [0.5316, 0.5265, 0.4595,  ..., 0.4310, 0.5260, 0.5774],
         ...,
         [0.5329, 0.5215, 0.4586,  ..., 0.4279, 0.5298, 0.5719],
         [0.5329, 0.5218, 0.4587,  ..., 0.4281, 0.5295, 0.5723],
         [0.5315, 0.5266, 0.4595,  ..., 0.4311, 0.5259, 0.5776]],

        [[0.5320, 0.5251, 0.4593,  ..., 0.4301, 0.5269, 0.5760],
         [0.5328, 0.5223, 0.4589,  ..., 0.4283, 0.5291, 0.5729],
         [0.5333, 0.5176, 0.4577,  ..., 0.4267, 0.5327, 0.

### 位置编码
### mask注意力机制，能否和运动序列结合