In [1]:
import torch
import numpy
from torch import nn
from torch.nn import functional as TF

## 1. 定义常量

In [2]:
# Number of sentences in one batch
BATCH_SIZE = 2
# Vocabulary size
TOKEN_CNT = 8
# Maximum sentence length for inputs and for labels
MAX_INPUT_LEN, MAX_LABEL_LEN = 5, 5

In [3]:
# Feature dimension (second argument of the embedding tables, width of the positional encoding)
FEATURE_DIM = 8  # The original paper uses 512; 8 is used here so the printed tensors stay readable

## 2. 手写训练数据集（演示用）

### 2.1 生成句子长度列表

In [4]:
# One batch of 2 input sentences, with lengths 2 and 4
train_input_lengths = torch.tensor([2, 4], dtype=torch.int32)
# One batch of 2 label sentences, with lengths 4 and 3
train_label_lengths = torch.tensor([4, 3], dtype=torch.int32)

print(train_input_lengths, train_label_lengths, sep="\n")

tensor([2, 4], dtype=torch.int32)
tensor([4, 3], dtype=torch.int32)


### 2.2 随机生成句子

句子由token组成，为了方便，这里的token就是一个int数字

In [5]:
# Randomly generate one sentence per requested length and right-pad it with
# zeros up to the fixed maximum length. Real token ids are drawn from
# [1, TOKEN_CNT), so id 0 is only ever produced by padding.
# Lengths are converted to plain Python ints before being used as sizes/pad
# widths, and no loop variable named `input` is left behind to shadow the builtin.
train_input = [
    TF.pad(torch.randint(1, TOKEN_CNT, (n,)), (0, MAX_INPUT_LEN - n))
    for n in map(int, train_input_lengths)
]

# Labels are padded to MAX_LABEL_LEN (not MAX_INPUT_LEN), so the two limits
# can be changed independently of each other.
train_label = [
    TF.pad(torch.randint(1, TOKEN_CNT, (n,)), (0, MAX_LABEL_LEN - n))
    for n in map(int, train_label_lengths)
]

print(train_input)
print(train_label)

[tensor([1, 5, 0, 0, 0]), tensor([1, 1, 7, 2, 0])]
[tensor([3, 7, 3, 6, 0]), tensor([3, 7, 4, 0, 0])]


### 2.3 把多个句子拼接构成输入矩阵和标签矩阵

#### 2.3.1 先把每个句子变成二维的矩阵形式

In [6]:
# Turn every sentence into a one-row matrix: squeeze() first drops any size-1
# axes, then unsqueeze(0) adds a single leading axis. The lists are updated in place.
train_input[:] = [sentence.squeeze().unsqueeze(0) for sentence in train_input]
train_label[:] = [sentence.squeeze().unsqueeze(0) for sentence in train_label]

print(train_input)
print(train_label)

[tensor([[1, 5, 0, 0, 0]]), tensor([[1, 1, 7, 2, 0]])]
[tensor([[3, 7, 3, 6, 0]]), tensor([[3, 7, 4, 0, 0]])]


#### 2.3.2 再把所有矩阵拼接

In [7]:
# Concatenate the per-sentence one-row matrices along dim 0, so each list
# becomes a single matrix with one row per sentence.
# NOTE(review): both names are rebound from a list to a tensor here, so this cell
# presumably cannot be re-run on its own without re-running the cells above — confirm.
train_input = torch.cat(train_input, dim=0)
train_label = torch.cat(train_label, dim=0)

print(train_input)
print(train_label)

tensor([[1, 5, 0, 0, 0],
        [1, 1, 7, 2, 0]])
tensor([[3, 7, 3, 6, 0],
        [3, 7, 4, 0, 0]])


## 3. 构造Embedding

### 3.1 获取Embedding表

In [8]:
# One extra embedding row is added, intended for the 0 id produced by padding.
# NOTE(review): sentences are sampled from [1, TOKEN_CNT) and padded with 0, so
# TOKEN_CNT rows would presumably already cover every id — confirm the +1 is needed.
# `lable` is a misspelling of `label`; the name is kept because the lookup cell uses it.
input_embedding_table = nn.Embedding(TOKEN_CNT + 1, FEATURE_DIM)
lable_embedding_table = nn.Embedding(TOKEN_CNT + 1, FEATURE_DIM)
print(input_embedding_table.weight)
print(lable_embedding_table.weight)

Parameter containing:
tensor([[-3.7422e-01,  6.7983e-01,  6.0466e-01,  5.3488e-01,  5.1561e-01,
          3.8306e-01,  6.5662e-01, -6.1498e-02],
        [-1.7563e-01,  1.7750e+00, -6.9537e-02, -7.2879e-01,  1.6425e-01,
         -1.5033e+00,  4.5809e-01, -1.4157e-01],
        [ 9.2038e-02,  3.8760e-01, -5.3536e-01,  4.4507e-01, -4.2964e-01,
         -9.5809e-01,  3.2246e-02,  8.2909e-01],
        [ 1.6994e-03, -6.3266e-01, -2.7601e-01, -1.8993e+00, -8.6873e-01,
         -7.3563e-01,  1.4079e+00, -1.6753e-01],
        [-1.6914e+00, -6.0648e-01, -1.2424e+00,  1.7541e+00,  2.4180e-02,
         -7.3011e-01,  2.6621e-03, -1.9021e+00],
        [ 4.1080e-01,  3.2339e+00,  5.5507e-01, -2.8856e+00,  1.6935e-01,
         -2.4707e-01, -4.7805e-01, -4.4455e-01],
        [-9.7700e-01, -4.0286e-01,  3.1571e-01,  3.1108e-01,  8.5578e-01,
          8.1497e-01, -2.5994e-01,  1.2496e-01],
        [ 4.3483e-01, -3.8921e-01,  1.0953e+00,  3.9218e-01, -2.8754e-01,
         -3.2304e-01,  7.3571e-01, -1.3373e

### 3.2 将输入和输出的token转为Embedding

In [9]:
# Look up an embedding vector for every token id; inputs and labels each use
# their own table.
# NOTE(review): both names are rebound from token-id matrices to embedding tensors,
# so this cell presumably cannot be re-run on its own — confirm.
train_input = input_embedding_table(train_input)
train_label = lable_embedding_table(train_label)

print(train_input)
print(train_label)

tensor([[[-0.1756,  1.7750, -0.0695, -0.7288,  0.1643, -1.5033,  0.4581,
          -0.1416],
         [ 0.4108,  3.2339,  0.5551, -2.8856,  0.1693, -0.2471, -0.4781,
          -0.4445],
         [-0.3742,  0.6798,  0.6047,  0.5349,  0.5156,  0.3831,  0.6566,
          -0.0615],
         [-0.3742,  0.6798,  0.6047,  0.5349,  0.5156,  0.3831,  0.6566,
          -0.0615],
         [-0.3742,  0.6798,  0.6047,  0.5349,  0.5156,  0.3831,  0.6566,
          -0.0615]],

        [[-0.1756,  1.7750, -0.0695, -0.7288,  0.1643, -1.5033,  0.4581,
          -0.1416],
         [-0.1756,  1.7750, -0.0695, -0.7288,  0.1643, -1.5033,  0.4581,
          -0.1416],
         [ 0.4348, -0.3892,  1.0953,  0.3922, -0.2875, -0.3230,  0.7357,
          -1.3373],
         [ 0.0920,  0.3876, -0.5354,  0.4451, -0.4296, -0.9581,  0.0322,
           0.8291],
         [-0.3742,  0.6798,  0.6047,  0.5349,  0.5156,  0.3831,  0.6566,
          -0.0615]]], grad_fn=<EmbeddingBackward0>)
tensor([[[-0.2452,  0.1869,  0.3548,

## 4. 位置编码

偶数列（特征维度下标为偶数）用sin，奇数列（特征维度下标为奇数）用cos：
$$
\begin{equation}
\begin{split}
\text{PE}_{pos, 2i} &= \sin{(pos/10000^{2i/d_{\text{model}}})} \\
\text{PE}_{pos, 2i+1} &= \cos{(pos/10000^{2i/d_{\text{model}}})}
\end{split}
\end{equation}
$$
其中，pos表示行（token在句子中的位置），2i和2i+1表示列（特征维度的下标）

In [13]:
# Row index = position in the sentence, shaped as a column so it broadcasts over the feature columns.
pos_mat = torch.arange(MAX_INPUT_LEN).unsqueeze(1)
# Even feature indices 2i, shaped as a row.
i_mat = torch.arange(0, FEATURE_DIM, 2).unsqueeze(0)
# Denominator 10000^(2i / d_model), shared by each sin/cos column pair.
frac_bottom = torch.pow(10000, i_mat / FEATURE_DIM)
# Broadcast angle pos / 10000^(2i / d_model): one row per position, one column per pair.
op = pos_mat / frac_bottom

# Interleave sin (even columns) with cos (odd columns): stacking on a new last
# axis and flattening it gives [sin_0, cos_0, sin_1, cos_1, ...] in every row.
pe = torch.stack((torch.sin(op), torch.cos(op)), dim=-1).reshape(MAX_INPUT_LEN, FEATURE_DIM)

print(pe)

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]])
