In [1]:
import torch
import numpy
from torch import nn
from torch.nn import functional as TF

## 1. 定义常量

In [2]:
# Batch size
BATCH_SIZE = 2
# Vocabulary size (number of distinct tokens)
TOKEN_CNT = 8
# Maximum sentence length, for input sentences and label sentences respectively
MAX_INPUT_LEN = 5
MAX_LABEL_LEN = 5

In [3]:
# Feature dimension (used as the width of every embedding vector)
FEATURE_DIM = 8  # The original paper uses 512; 8 is used here so the tensors stay readable

## 2. 手写训练数据集（演示用）

### 2.1 生成句子长度列表

In [4]:
# Input data: one batch of 2 sentences, with lengths 2 and 4.
# torch.tensor(..., dtype=...) builds the int32 tensor directly instead of
# creating a float32 tensor first and casting it afterwards.
train_input_lengths = torch.tensor([2, 4], dtype=torch.int32)
# Label data: one batch of 2 sentences, with lengths 4 and 3.
train_label_lengths = torch.tensor([4, 3], dtype=torch.int32)

print(train_input_lengths)
print(train_label_lengths)

tensor([2, 4], dtype=torch.int32)
tensor([4, 3], dtype=torch.int32)


### 2.2 随机生成句子

句子由token组成，为了方便，这里的token就是一个int数字

In [5]:
def _random_padded_sentence(length, max_len):
    """Build one random sentence and right-pad it with 0 up to ``max_len``.

    Args:
        length: number of real (non-padding) tokens, as a Python int.
        max_len: total length of the returned tensor.

    Returns:
        A 1-D integer tensor of ``max_len`` elements: ``length`` random tokens
        followed by ``max_len - length`` zeros.
    """
    # Tokens are drawn from [1, TOKEN_CNT), which keeps 0 free for padding.
    sentence = torch.randint(1, TOKEN_CNT, (length,))
    return TF.pad(sentence, (0, max_len - length))


# .tolist() yields plain Python ints, so sizes and pad widths are not 0-dim tensors.
train_input = [
    _random_padded_sentence(length, MAX_INPUT_LEN)
    for length in train_input_lengths.tolist()
]

# Labels are padded to their own maximum length, MAX_LABEL_LEN (not MAX_INPUT_LEN).
train_label = [
    _random_padded_sentence(length, MAX_LABEL_LEN)
    for length in train_label_lengths.tolist()
]

print(train_input)
print(train_label)

[tensor([7, 2, 0, 0, 0]), tensor([2, 4, 1, 6, 0])]
[tensor([2, 3, 4, 6, 0]), tensor([2, 2, 2, 0, 0])]


### 2.3 把多个句子拼接构成输入矩阵和标签矩阵

#### 2.4 先把每个句子变成二维的矩阵形式

In [6]:
# Turn every sentence into a one-row matrix by adding a leading axis.
# squeeze() first removes any size-1 axes, then unsqueeze(dim=0) adds exactly one
# back in front. The lists are updated in place via slice assignment.
train_input[:] = [sentence.squeeze().unsqueeze(dim=0) for sentence in train_input]
train_label[:] = [sentence.squeeze().unsqueeze(dim=0) for sentence in train_label]

print(train_input)
print(train_label)

[tensor([[7, 2, 0, 0, 0]]), tensor([[2, 4, 1, 6, 0]])]
[tensor([[2, 3, 4, 6, 0]]), tensor([[2, 2, 2, 0, 0]])]


#### 2.5 再把所有矩阵拼接

In [7]:
# Concatenate the one-row matrices along axis 0, producing a single matrix
# for the inputs and a single matrix for the labels (one sentence per row).
train_input, train_label = (
    torch.cat(rows, dim=0) for rows in (train_input, train_label)
)

print(train_input)
print(train_label)

tensor([[7, 2, 0, 0, 0],
        [2, 4, 1, 6, 0]])
tensor([[2, 3, 4, 6, 0],
        [2, 2, 2, 0, 0]])


## 3. 构造Embedding

### 3.1 获取Embedding表

In [9]:
# One extra row on top of TOKEN_CNT is allocated for the 0 produced by padding.
embedding_rows = TOKEN_CNT + 1

# Separate tables for input and label tokens; the input table is created first.
# The identifier `lable_embedding_table` (sic) is kept as spelled because the
# lookup cell further down refers to it by this name.
input_embedding_table = nn.Embedding(num_embeddings=embedding_rows, embedding_dim=FEATURE_DIM)
lable_embedding_table = nn.Embedding(num_embeddings=embedding_rows, embedding_dim=FEATURE_DIM)

for table in (input_embedding_table, lable_embedding_table):
    print(table.weight)

Parameter containing:
tensor([[-0.2832,  0.6215, -0.2966,  0.2469, -1.4012,  1.3872,  1.5670, -1.3480],
        [-0.3355, -0.9772,  0.7343, -0.9183,  0.8904, -0.6822, -1.5822, -0.2810],
        [-0.0872, -0.6031,  1.0068,  0.5891,  1.3712,  1.3158, -0.8492, -1.4613],
        [ 1.8162, -0.2163, -0.2869, -0.1422, -0.8720, -1.2279, -0.3913,  1.2112],
        [ 1.7345,  0.8102, -0.3895,  0.0219, -0.6082, -0.3863,  0.0518,  0.3908],
        [ 0.7476,  0.0781,  0.5560,  0.7909, -0.0520,  0.4376,  0.2725,  0.1683],
        [-0.0133,  0.2394,  0.1681,  0.3647, -0.5152, -0.2608, -0.6800,  1.4019],
        [-2.2764,  0.8400,  0.2562,  0.6510,  1.9867,  0.9069, -0.2101,  0.1026],
        [-0.2368, -0.4331, -0.2906,  0.3106,  0.2850, -0.9016,  0.3648, -0.5448]],
       requires_grad=True)
Parameter containing:
tensor([[-0.5264, -2.1155,  0.6598,  0.6040,  0.1247, -0.8928,  0.8802, -0.2922],
        [-1.8382, -1.1926, -1.0516,  0.3331,  0.0288, -0.3170,  0.6016, -0.0354],
        [-0.5501,  0.3884,

### 3.2 将输入和输出的token转为Embedding

In [11]:
# Replace every token id with its embedding vector, i.e. the row of the
# corresponding embedding table selected by that id.
# NOTE: train_input / train_label are rebound here from token-id matrices to
# embedding tensors, so re-run the cells above before executing this cell again.
train_input = input_embedding_table(train_input)
train_label = lable_embedding_table(train_label)

print(train_input)
print(train_label)

tensor([[[-2.2764,  0.8400,  0.2562,  0.6510,  1.9867,  0.9069, -0.2101,
           0.1026],
         [-0.0872, -0.6031,  1.0068,  0.5891,  1.3712,  1.3158, -0.8492,
          -1.4613],
         [-0.2832,  0.6215, -0.2966,  0.2469, -1.4012,  1.3872,  1.5670,
          -1.3480],
         [-0.2832,  0.6215, -0.2966,  0.2469, -1.4012,  1.3872,  1.5670,
          -1.3480],
         [-0.2832,  0.6215, -0.2966,  0.2469, -1.4012,  1.3872,  1.5670,
          -1.3480]],

        [[-0.0872, -0.6031,  1.0068,  0.5891,  1.3712,  1.3158, -0.8492,
          -1.4613],
         [ 1.7345,  0.8102, -0.3895,  0.0219, -0.6082, -0.3863,  0.0518,
           0.3908],
         [-0.3355, -0.9772,  0.7343, -0.9183,  0.8904, -0.6822, -1.5822,
          -0.2810],
         [-0.0133,  0.2394,  0.1681,  0.3647, -0.5152, -0.2608, -0.6800,
           1.4019],
         [-0.2832,  0.6215, -0.2966,  0.2469, -1.4012,  1.3872,  1.5670,
          -1.3480]]], grad_fn=<EmbeddingBackward0>)
tensor([[[-0.5501,  0.3884, -0.5186,

## 4. 位置编码

位置编码矩阵中，偶数列（下标为 $2i$）用 sin，奇数列（下标为 $2i+1$）用 cos：
$$
\begin{equation}
\begin{split}
\text{PE}_{pos, 2i} &= \sin{(pos/10000^{2i/d_{\text{model}}})} \\
\text{PE}_{pos, 2i+1} &= \cos{(pos/10000^{2i/d_{\text{model}}})}
\end{split}
\end{equation}
$$
其中，pos 表示行（token 在句子中的位置），$2i$ 和 $2i+1$ 表示列（特征维度的下标），$d_{\text{model}}$ 为特征维度大小（即上文的 FEATURE_DIM）