# transformer代码讲解

文字，全流程详细讲解，无代码 https://blog.csdn.net/benzhujie1245com/article/details/117173090

视频，原因讲解 https://www.bilibili.com/video/BV1dt4y1J7ov/

视频，讲解详细 https://www.bilibili.com/video/BV1v3411r78R/

文字，简单讲解，有代码 https://blog.csdn.net/qq_52785473/article/details/124537101

文字，简单讲解，有代码 https://blog.csdn.net/Datawhale/article/details/120320116

讲解更详细  https://wmathor.com/index.php/archives/1438/


从零实现Transformer的简易版与强大版：从300多行到3000多行：

讲解更完整 https://blog.csdn.net/v_JULY_v/article/details/130090649


In [1]:
from IPython.display import Image
Image(url= "29.png",width=500)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import math
from torch.autograd import Variable
import matplotlib.pyplot as plt
import numpy as np
import copy

## transformer 主要类

### 词嵌入（Embeddings）



In [2]:
#词嵌入
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
    # d_model:词嵌入维度
    # vocab:字典大小
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)


In [3]:
d_model = 512  # embedding_size
vocab = 1000  # 词典大小
x=torch.tensor([[100, 2, 421, 508], [491, 998, 1, 221]], dtype=torch.long)
emb = Embeddings(d_model, vocab)
embr = emb(x)
print(embr.shape)


torch.Size([2, 4, 512])


In [42]:
embr

tensor([[[  4.4606,  34.4301, -24.5994,  ...,  17.1663,  28.1237,  16.9237],
         [  5.1524,   9.8639,  31.0702,  ...,   3.0191,   9.8241,  26.0486],
         [ -6.2616,   7.7158,  23.8920,  ...,  -9.6690,  17.4992,  22.7776],
         [ 14.7686,  -5.9697,  17.2761,  ..., -19.2139, -36.0296, -19.4070]],

        [[-15.1041,  16.8019,  11.4768,  ...,  17.9029,  27.7799,   1.4143],
         [-17.6204,   1.9429,  16.7623,  ..., -51.7329, -12.3380,   1.6418],
         [ 13.7802,  -3.5118,   5.1110,  ...,   9.2191, -26.6751,  28.3472],
         [-21.7961, -42.0103, -23.3687,  ...,   6.7964, -30.1595,  46.5827]]],
       grad_fn=<MulBackward0>)

### 位置编码（PositionalEncoding）

In [4]:
#位置编码
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
    # d_model:词嵌入维度
    # dropout:置零比率
    # max_len:每个句子最大的长度
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0,  max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(1000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False)
        return self.dropout(x)

In [5]:
dropout = 0.1
max_len = 60
pe = PositionalEncoding(d_model, dropout, max_len)
pe_result = pe(embr)
print(pe_result.shape)

torch.Size([2, 4, 512])


In [46]:
pe_result

tensor([[[  4.9562,  39.3668, -27.3327,  ...,  20.1848,  31.2486,  19.9152],
         [  6.6598,  11.5602,  35.4411,  ...,   4.4657,  10.9168,   0.0000],
         [ -5.9470,   8.1107,  27.5802,  ...,  -9.6322,   0.0000,  26.4196],
         [ 16.5664,  -7.7330,  19.4397,  ..., -20.2377, -40.0294,  -0.0000]],

        [[-16.7823,  19.7799,  12.7519,  ...,  21.0032,   0.0000,   0.0000],
         [ -0.0000,   2.7591,  19.5435,  ...,  -0.0000, -13.7077,   2.9353],
         [ 16.3216,  -4.3644,   6.7124,  ...,  11.3546, -29.6367,  32.6080],
         [-24.0611, -47.7781, -25.7211,  ...,   8.6627, -33.5071,  52.8697]]],
       grad_fn=<MulBackward0>)

### 多头自注意力机制（MultiHeadedAttention-attention）

In [50]:
#mask == 0

In [18]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)

    if dropout is not None:
        p_attn = dropout(p_attn)
    
    return torch.matmul(p_attn, value), p_attn

# 深层拷贝
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class MultiHeadedAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout=0.1):
        # head:代表几个头
        # embedding_dim:词嵌入维度
        # dropout:置0比率
        super(MultiHeadedAttention, self).__init__()

        # 确认embedding_dim能够被head整除
        assert embedding_dim % head == 0
        self.head = head
        self.d_k = embedding_dim // head
        # 获得4个线性层， 分别是Q、K、V、以及最终的输出的线形层
        self.linears = clones(nn.Linear(embedding_dim, embedding_dim), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(0)
        
        batch_size = query.size(0)

        # 经过线性层投影后分成head个注意力头
        query, key, value = [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2) for model, x in zip(self.linears, (query, key, value))]
        # 各自计算每个头的注意力
        
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        # 转换回来
        
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.head * self.d_k)
        # 经过最后一个线性层得到最终多头注意力机制的结果
        return self.linears[-1](x)


In [29]:
for i in zip([1,2,3,4],[2,2,2]):
    print(i)

(1, 2)
(2, 2)
(3, 2)


In [52]:
#mask

In [53]:
512/8

64.0

In [7]:
head = 8
embedding_dim = 512
dropout = 0.2
query = key = value = pe_result
mask = Variable(torch.zeros(8, 4, 4))
mha = MultiHeadedAttention(head, embedding_dim, dropout)
mha_result = mha(query, key, value, mask)
print(mha_result)
print(mha_result.shape)


torch.Size([2, 4, 512])
4
torch.Size([2, 8, 4, 64])
torch.Size([2, 8, 4, 64])
tensor([[[ 3.1447,  9.3652, -0.8106,  ..., -1.9810,  7.4806, -0.8098],
         [ 4.6991,  9.4880, -0.0394,  ..., -1.9257,  5.2841, -2.6955],
         [ 3.0011,  9.2706,  2.6056,  ..., -6.5442,  4.7360,  2.8949],
         [ 4.4883,  8.8427,  0.3545,  ..., -6.7603,  7.7433,  0.1657]],

        [[-2.8519, -5.7107, -1.0561,  ...,  7.5895, -1.4115, -0.8601],
         [-2.4921, -3.4672, -4.0760,  ...,  2.5844,  1.5122, -3.2994],
         [ 0.7264, -6.5059, -3.1966,  ...,  9.5092, -1.2292, -1.1438],
         [-0.9436, -8.2308, -1.7429,  ...,  7.9383, -0.5190,  0.6261]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


In [8]:
query = key = value = pe_result
mask = Variable(torch.zeros(2, 4, 4))
attn, p_attn = attention(query, key, value,mask=mask)
# print(attn)
# print(attn.shape)
# print(p_attn)
# print(p_attn.shape)


### 前馈全连接层（PositionwiseFeedForward）

考虑注意力机制可能对复杂的情况拟合程度不够，因此增加两层网络来增强模型的能力。

前馈全连接层就是两次线性层+Relu激活

In [9]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.w2(self.dropout(F.relu(self.w1(x))))


In [10]:
d_model = 512
d_ff = 64
dropout = 0.2
x = mha_result
ff = PositionwiseFeedForward(d_model, d_ff, dropout=dropout)
ff_result = ff(x)
print(ff_result)
print(ff_result.shape)

tensor([[[-0.6185,  0.0881,  1.4086,  ...,  1.2355, -1.5647,  1.1489],
         [-2.3992, -0.9414, -0.4582,  ...,  0.4506, -0.5344,  1.8814],
         [-0.5688,  0.0891,  1.4188,  ...,  1.1826, -0.4016,  0.9184],
         [-0.6219, -1.2090,  1.4474,  ...,  0.3424, -0.5402,  1.8289]],

        [[-1.1074,  2.0183,  0.5329,  ..., -1.3109,  0.8419,  0.2912],
         [-2.1584,  3.1729,  0.0619,  ..., -1.6472,  1.3028,  0.3951],
         [-1.6058,  2.4022,  0.3588,  ..., -0.4772,  0.4304, -0.1591],
         [-0.6955,  1.6537,  0.6962,  ..., -0.1930,  0.5402,  0.2190]]],
       grad_fn=<ViewBackward0>)
torch.Size([2, 4, 512])


### 规范化层（LayerNorm）

BatchNorm简单来说就是对一批样本按照每个特征维度进行归一化

Layer Norm是对每个单词的Embedding做归一化

https://blog.csdn.net/qq_43827595/article/details/121877901

https://liumin.blog.csdn.net/article/details/85075706

In [14]:
Image(url= "44.png")

In [61]:
Image(url= "35.png")

In [64]:
x

tensor([[[-8.2004,  0.5804,  1.3639,  ...,  1.7793, -2.9470, -1.8913],
         [-6.3798,  2.6990,  3.7775,  ...,  1.6971, -2.1978, -0.7955],
         [-6.0106,  2.9720,  4.2135,  ..., -2.0315, -5.0306, -4.9978],
         [-6.6885,  0.0595,  1.7017,  ...,  2.5037, -2.7890,  2.4970]],

        [[-2.7299, -2.4048,  3.6923,  ..., -7.7733,  1.3931,  1.7657],
         [ 3.8099, -1.5517,  0.7698,  ..., -3.5534,  0.2886,  2.9241],
         [-2.1036, -3.9115, -4.4982,  ..., -4.2154,  2.1434,  2.2444],
         [ 1.0188, -1.2110,  0.7608,  ..., -3.6656,  6.5101,  4.7269]]],
       grad_fn=<ViewBackward0>)

In [11]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a2 = nn.Parameter(torch.ones(features))
        self.b2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(-1, keepdim = True)
        std = x.std(-1, keepdim = True)
        return self.a2 * (x - mean) / (std + self.eps) + self.b2

In [70]:
ln = LayerNorm(512)
lnn = nn.LayerNorm(512)
ln_result = ln(x)
lnn_result = lnn(x)
print(ln_result)
print(lnn_result)

tensor([[[-1.9218,  0.0708,  0.2487,  ...,  0.3429, -0.7296, -0.4901],
         [-1.5170,  0.5434,  0.7881,  ...,  0.3160, -0.5679, -0.2497],
         [-1.5732,  0.7361,  1.0552,  ..., -0.5502, -1.3212, -1.3128],
         [-1.5733, -0.0602,  0.3080,  ...,  0.4879, -0.6989,  0.4864]],

        [[-0.6454, -0.5691,  0.8614,  ..., -1.8287,  0.3220,  0.4094],
         [ 0.8695, -0.3313,  0.1886,  ..., -0.7796,  0.0808,  0.6711],
         [-0.5172, -0.9272, -1.0602,  ..., -0.9961,  0.4457,  0.4686],
         [ 0.1812, -0.2806,  0.1277,  ..., -0.7889,  1.3183,  0.9490]]],
       grad_fn=<AddBackward0>)
tensor([[[-1.9237,  0.0709,  0.2489,  ...,  0.3433, -0.7304, -0.4906],
         [-1.5185,  0.5439,  0.7889,  ...,  0.3163, -0.5685, -0.2499],
         [-1.5747,  0.7368,  1.0563,  ..., -0.5508, -1.3225, -1.3141],
         [-1.5748, -0.0603,  0.3083,  ...,  0.4883, -0.6996,  0.4868]],

        [[-0.6460, -0.5697,  0.8623,  ..., -1.8305,  0.3223,  0.4098],
         [ 0.8704, -0.3317,  0.1888,  ..

### 子层连接结构(SublayerConnection)

Add&Norm

In [2]:
Image(url= "38.png")

In [12]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout) 
        self.size = size
    
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))


In [13]:
size = 512
dropout = 0.2
head = 8
d_model = 512
x = pe_result
mask = Variable(torch.zeros(8, 4, 4))
self_attn = MultiHeadedAttention(head, d_model)
sublayer = lambda x: self_attn(x, x, x, mask)
sc = SublayerConnection(size, dropout)
sc_result = sc(x, sublayer)
print(sc_result)
print(sc_result.shape)


torch.Size([2, 4, 512])
4
torch.Size([2, 8, 4, 64])
torch.Size([2, 8, 4, 64])
tensor([[[-7.6168e+00, -2.2202e-01, -3.4862e+01,  ..., -1.2145e+01,
           2.2603e+01, -1.2610e+01],
         [ 5.2091e+01, -8.4343e+00, -3.5232e+01,  ...,  2.0129e+01,
           1.1804e+01,  8.5235e+00],
         [ 1.3720e+01, -5.2680e+00, -7.8298e+00,  ..., -2.2894e+01,
           7.0936e+00,  2.1066e+01],
         [-3.3744e+01, -3.4042e+01, -2.4341e+01,  ...,  3.5039e+00,
          -9.1709e+00,  1.7512e-02]],

        [[ 5.8780e+01, -2.4038e+01, -4.8911e-02,  ..., -6.9139e+00,
          -4.7905e+01, -1.3536e+01],
         [ 4.7775e+00,  1.3581e+01,  3.4360e+00,  ..., -1.1574e+01,
          -9.6948e-02,  3.6499e+01],
         [ 2.0096e+01, -2.7572e+01,  6.4582e+00,  ..., -3.8138e+01,
          -2.2617e+00, -7.6446e+01],
         [-4.3623e+01, -5.4308e-01,  1.4093e+00,  ...,  2.2701e+01,
           8.3426e+00, -1.8410e-01]]], grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


In [5]:
Image(url= "48.png")

## transformer的二级部件

* Encoder-Decoder架构

编码器-解码器（Encoder-Decoder）是深度学习模型的抽象概念。许多模型都基于这一架构，比如CNN，RNN，LSTM和Transformer等。

编码器（Encoder）：负责将输入（Input）转化为特征（Feature）

解码器（Decoder）：负责将特征（Feature）转化为目标（Target）

https://blog.csdn.net/deer2019530/article/details/129675690

### 1.编码器层（EncoderLayer）

In [3]:
Image(url= "38.png")

In [14]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        # size:词嵌入的维度
        # self_attn：代表输入的多头子注意力层的实例化对象
        # feed_forward：代表前馈全连接层的实例化对象
        # dropout：置0比例
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size
    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)


In [15]:
size = 512
head = 8
d_model = 512
d_ff = 64
x = pe_result
dropout = 0.2
self_attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
mask = Variable(torch.zeros(8, 4, 4))
el = EncoderLayer(size, self_attn, ff, dropout)
el_result = el(x, mask)
print(el_result)
print(el_result.shape)

torch.Size([2, 4, 512])
4
torch.Size([2, 8, 4, 64])
torch.Size([2, 8, 4, 64])
tensor([[[-7.5962e+00,  5.7254e-01, -3.5566e+01,  ..., -1.2890e+01,
           2.2956e+01, -1.2507e+01],
         [ 5.2098e+01, -8.1339e+00, -3.4996e+01,  ...,  1.9243e+01,
           1.2127e+01,  7.4026e+00],
         [ 1.3874e+01, -5.0351e+00, -8.0736e+00,  ..., -2.4052e+01,
           6.9527e+00,  2.0552e+01],
         [-3.3589e+01, -3.3747e+01, -2.4778e+01,  ...,  3.1240e+00,
          -9.1623e+00, -5.1010e-01]],

        [[ 5.8880e+01, -2.3874e+01,  4.2584e-02,  ..., -6.7574e+00,
          -4.7951e+01, -1.3427e+01],
         [ 4.5244e+00,  1.3642e+01,  3.0564e+00,  ..., -1.1101e+01,
           1.3809e-01,  3.6584e+01],
         [ 1.9917e+01, -2.7515e+01,  6.9420e+00,  ..., -3.7486e+01,
          -1.3902e+00, -7.6213e+01],
         [-4.3678e+01, -3.0355e-01,  1.4843e+00,  ...,  2.3051e+01,
           8.2299e+00,  8.0651e-02]]], grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


### 2.编码器（Encoder）

In [6]:
Image(url= "39.jpg")

In [19]:
# 编码器
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


In [20]:
size = 512
head = 8
d_model = 512
d_ff = 64
c = copy.deepcopy
dropout = 0.2

attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
mask = Variable(torch.zeros(8, 4, 4))
layer = EncoderLayer(size, c(attn), c(ff), dropout)
N = 8
mask = Variable(torch.zeros(8, 4, 4))
en = Encoder(layer, N)
en_result = en(x, mask)
print(en_result)
print(en_result.shape)

tensor([[[-0.4271, -0.1136, -1.5222,  ..., -0.6932,  0.9212, -0.5832],
         [ 2.0813, -0.3794, -1.6242,  ...,  0.6505,  0.5845,  0.3873],
         [ 0.2288, -0.2102, -0.4917,  ..., -1.0011,  0.3007,  0.7645],
         [-1.6307, -1.4349, -1.0386,  ...,  0.0529, -0.3985,  0.0364]],

        [[ 2.2717, -1.0465,  0.0164,  ..., -0.4038, -2.1660, -0.6705],
         [ 0.1267,  0.6160,  0.2929,  ..., -0.5215, -0.0237,  1.5602],
         [ 0.7160, -1.1489,  0.2235,  ..., -1.6382, -0.2661, -3.0683],
         [-1.8700, -0.0800,  0.2219,  ...,  1.0834,  0.4636,  0.0974]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


### 3.解码器层（DecoderLayer）

In [8]:
Image(url= "40.png")

In [3]:
Image(url= "46.gif")

In [21]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, source_mask, target_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, target_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, source_mask))
        return self.sublayer[2](x, self.feed_forward)


In [22]:
head = 8
size = d_model = 512
d_ff = 64
dropout = 0.2
self_attn =  src_attn = MultiHeadedAttention(head, d_model, dropout)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
x = pe_result
memory = en_result
mask = Variable(torch.zeros(8, 4, 4))
source_mask = target_mask = mask # 这里为了方便演示就直接设置一样了
dl = DecoderLayer(size, self_attn, src_attn, ff, dropout)
dl_result = dl(x, memory, source_mask, target_mask)
print(dl_result)
print(dl_result.shape)


tensor([[[-7.9105e+00, -2.2174e-01, -3.5165e+01,  ..., -1.2805e+01,
           2.3126e+01, -1.2007e+01],
         [ 5.1976e+01, -7.9442e+00, -3.5792e+01,  ...,  2.0362e+01,
           1.2573e+01,  7.9624e+00],
         [ 1.4152e+01, -5.5714e+00, -7.6872e+00,  ..., -2.2731e+01,
           7.5578e+00,  2.0932e+01],
         [-3.3293e+01, -3.3924e+01, -2.4315e+01,  ...,  2.9771e+00,
          -9.0354e+00,  8.2886e-02]],

        [[ 5.8963e+01, -2.3734e+01,  1.4181e-01,  ..., -6.5148e+00,
          -4.7695e+01, -1.3804e+01],
         [ 4.4570e+00,  1.3936e+01,  3.1438e+00,  ..., -1.0971e+01,
          -5.5008e-02,  3.6598e+01],
         [ 2.0076e+01, -2.7106e+01,  6.5314e+00,  ..., -3.7407e+01,
          -2.0241e+00, -7.6543e+01],
         [-4.3527e+01, -3.7362e-02,  1.7245e+00,  ...,  2.3904e+01,
           8.9378e+00, -3.7832e-01]]], grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


### 4.解码器（Decoder）

In [24]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, memory, source_mask, target_mask):
        # x:词嵌入维度
        # memory：代表编码器的输出张量
        # source_mask：原数据的掩码张量
        # target_mask：目标数据的掩码张量
        for layer in self.layers:
            x = layer(x, memory, source_mask, target_mask)
        return self.norm(x)

In [25]:
size = 512
head = 8
d_model = 512
d_ff = 64
c = copy.deepcopy
dropout = 0.2
attn = MultiHeadedAttention(head, d_model)
ff = PositionwiseFeedForward(d_model, d_ff, dropout)
layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
memory = en_result
mask = Variable(torch.zeros(8, 4, 4))
source_mask = target_mask = mask 
N = 8
x = pe_result

de = Decoder(layer, N)
de_result = de(x, memory, source_mask, target_mask)
print(de_result)
print(de_result.shape)


tensor([[[-0.1914,  0.0316, -1.4516,  ..., -0.5109,  0.8120, -0.6192],
         [ 2.4436, -0.1856, -1.5101,  ...,  1.0533,  0.3636,  0.4281],
         [ 0.7144, -0.0735, -0.3538,  ..., -0.8280,  0.1390,  0.9766],
         [-1.3158, -1.3548, -1.1717,  ...,  0.2616, -0.5626,  0.0988]],

        [[ 2.2384, -1.1134, -0.1828,  ..., -0.3058, -1.9598, -0.8514],
         [ 0.2237,  0.6105,  0.0993,  ..., -0.4678, -0.0176,  1.4192],
         [ 0.9140, -1.1424,  0.1541,  ..., -1.2749, -0.0882, -3.3717],
         [-1.8234,  0.0832, -0.0134,  ...,  1.0313,  0.4380, -0.2029]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


### 5.输出层（Generator）

In [9]:
Image(url= "41.png")

In [26]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        self.project = nn.Linear(d_model, vocab_size)
    def forward(self, x):
        return F.softmax(self.project(x), dim=-1)

In [27]:
vocab_size = 1000
gen = Generator(d_model, vocab_size)
x = de_result
gen_result = gen(x)
print(gen_result)
print(gen_result.shape)


tensor([[[0.0031, 0.0003, 0.0004,  ..., 0.0008, 0.0005, 0.0014],
         [0.0010, 0.0012, 0.0005,  ..., 0.0005, 0.0009, 0.0004],
         [0.0014, 0.0027, 0.0004,  ..., 0.0013, 0.0024, 0.0006],
         [0.0015, 0.0016, 0.0011,  ..., 0.0011, 0.0007, 0.0014]],

        [[0.0008, 0.0003, 0.0004,  ..., 0.0006, 0.0015, 0.0006],
         [0.0016, 0.0005, 0.0002,  ..., 0.0003, 0.0013, 0.0008],
         [0.0012, 0.0005, 0.0033,  ..., 0.0003, 0.0006, 0.0018],
         [0.0014, 0.0004, 0.0018,  ..., 0.0004, 0.0009, 0.0004]]],
       grad_fn=<SoftmaxBackward0>)
torch.Size([2, 4, 1000])


### 6.输入层

## transformer的一级部件

### 1.编码器-解码器

In [10]:
Image(url= "39.jpg")

In [28]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        # encoder:编码器对象
        # decoder:解码器对象
        # source_embed:原数据词嵌入
        # target_embed:目标数据词嵌入
        # generator：输出部分类别生成器
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = source_embed
        self.tgt_embed = target_embed
        self.generator = generator
    def forward(self, source, target, source_mask, target_mask):
        return self.decode(self.encode(source, source_mask), source_mask, target, target_mask)
    
    def encode(self, source, source_mask):
        return self.encoder(self.src_embed(source), source_mask)
    
    def decode(self, memory,  source_mask, target, target_mask):
        return self.decoder(self.tgt_embed(target), memory, source_mask, target_mask)


In [29]:
vocab_size = 1000
d_model = 512
encoder = en
decoder = de
source_embed = nn.Embedding(vocab_size, d_model)
target_embed = nn.Embedding(vocab_size, d_model)
generator = gen
source = target = Variable(torch.LongTensor([[100, 2, 421, 508], [491, 998, 1, 221]]))
source_mask = target_mask = Variable(torch.zeros(8, 4, 4))
ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, generator)
ed_result = ed(source, target, source_mask, target_mask)
print(ed_result)
print(ed_result.shape)


tensor([[[-0.1339,  0.2593,  0.4469,  ...,  0.0245,  0.2096, -0.2820],
         [-0.0375, -0.2829,  0.6640,  ..., -1.2519, -0.4895, -0.2747],
         [ 0.7554, -1.0355,  0.7525,  ..., -0.2328,  0.6143, -0.7466],
         [-0.3477, -0.0398,  0.6659,  ..., -0.0829, -0.1654, -0.8066]],

        [[ 0.6578,  0.8913,  0.9278,  ...,  1.4095, -0.3353, -1.8564],
         [-0.3979,  1.1808,  0.5054,  ...,  1.4843, -0.7904, -2.4899],
         [ 0.2102,  0.9135,  0.5119,  ...,  1.8622, -0.2029, -1.5313],
         [ 0.3884,  1.8151,  1.0496,  ...,  1.0963, -1.1170, -2.5009]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])


### tramsformer中的掩码

In [12]:
Image(url= "42.png")

In [13]:
Image(url= "43.png")

### transformer的实现

In [30]:
def make_transformer_model(source_vocab, target_vocab, N=6, d_model=512, d_ff=64, head=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(head, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
    Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N), 
    Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N), 
    nn.Sequential(Embeddings(d_model, source_vocab), c(position)),
    nn.Sequential(Embeddings(d_model, target_vocab), c(position)),
    Generator(d_model, target_vocab))

    for p in model.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform(p)
    return model


In [31]:
source_vocab = 11
target_vocab = 11
N = 6
res = make_transformer_model(source_vocab, target_vocab, N)
print(res)


EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w1): Linear(in_features=512, out_features=64, bias=True)
          (w2): Linear(in_features=64, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0-1): 2 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0-5): 6 x DecoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_featur

  nn.init.xavier_uniform(p)


## 繁荣的transformer 家族

https://arxiv.org/pdf/2304.13712.pdf


* 深入浅出Prompt Learning要旨及常用方法
* llm lora 大模型微调

In [4]:
Image(url= "47.png")