In [1]:
import torch
import numpy
from torch import nn
from torch.nn import functional as TF

1. 定义常量

In [2]:
# Batch大小
BATCH_SIZE = 2
# 词袋模型：单词表中的单词总数
TOKEN_CNT = 8
# 句子的最大长度
MAX_SENTENCE_LEN = 5

In [3]:
# 模型维度（特征维度）大小
MODEL_DIM = 8  # 原论文是512，演示起见，这里定义成8

2. 演示起见，手写训练数据集

In [6]:
"""生成一个batch的随机句子当做训练集
   句子由token组成，有独热编码和词袋模型两种表示token的方式
   这里采用词袋模型，也就是一个词用一个独特的整数表示
"""

# Batch size为2，也就是2个句子
# 输入的句子长度分别为2,4
train_input_lengths = torch.Tensor([2, 4]).to(torch.int32)
# 标签的句子长度分别为4,3
train_label_lengths = torch.Tensor([4, 3]).to(torch.int32)

train_input_sentences = []
for length in train_input_lengths:
    # 按照长度随机生成句子
    input = torch.randint(1, TOKEN_CNT, (length,))
    # 将句子padding到最大长度
    input = TF.pad(input, (0, MAX_SENTENCE_LEN - length))
    train_input_sentences.append(input)

train_label_sentences = []
for length in train_label_lengths:
    # 按照长度随机生成句子
    label = torch.randint(1, TOKEN_CNT, (length,))
    # 将句子padding到最大长度
    label = TF.pad(label, (0, MAX_SENTENCE_LEN - length))
    train_label_sentences.append(label)

print(f"训练集输入句子：{train_input_sentences}")
print(f"训练集标签句子：{train_label_sentences}")

训练集输入句子：[tensor([4, 5, 0, 0, 0]), tensor([2, 2, 7, 3, 0])]
训练集标签句子：[tensor([2, 5, 5, 2, 0]), tensor([2, 2, 5, 0, 0])]


In [8]:
"""把句子列表变成矩阵
   把每个句子先变成二维矩阵，再拼接到一起成为真正的输入矩阵和标签矩阵
"""

for index, value in enumerate(train_input_sentences):
    value = torch.squeeze(value)
    train_input_sentences[index] = torch.unsqueeze(value, dim=0)

for index, value in enumerate(train_label_sentences):
    value = torch.squeeze(value)
    train_label_sentences[index] = torch.unsqueeze(value, dim=0)

train_input_mat = torch.cat(train_label_sentences, dim=0)
train_label_mat = torch.cat(train_label_sentences, dim=0)

print(f"训练集输入矩阵：\n{train_input_mat}")
print(f"训练集标签矩阵：\n{train_label_mat}")

训练集输入矩阵：
tensor([[2, 5, 5, 2, 0],
        [2, 2, 5, 0, 0]])
训练集标签矩阵：
tensor([[2, 5, 5, 2, 0],
        [2, 2, 5, 0, 0]])


3. 构造Embedding

In [9]:
# num_embedding参数+1是因为多了个pad出来的0
input_embedding_table = nn.Embedding(TOKEN_CNT + 1, MODEL_DIM)
lable_embedding_table = nn.Embedding(TOKEN_CNT + 1, MODEL_DIM)

print(input_embedding_table.weight)
print(lable_embedding_table.weight)

Parameter containing:
tensor([[ 0.0425,  1.9791, -1.9645,  1.2165, -0.9497,  1.1116,  0.1297, -2.3036],
        [-0.8854, -0.0480,  2.0325, -0.9215,  0.4905,  0.4984, -0.5039,  1.7998],
        [ 1.4225, -1.4392, -1.0206,  1.2507,  1.1182, -0.3806,  0.3524, -1.0706],
        [ 0.0544, -1.0229,  0.0530,  1.1227,  0.0159,  0.7875, -0.1771,  0.0470],
        [-1.1928,  0.1478, -1.7844, -0.5550, -0.7484,  0.3757,  0.5352, -0.3112],
        [-1.3368, -0.4440, -0.8991,  0.1203,  0.9217,  1.0369, -0.7691, -0.5544],
        [ 1.6839,  2.7043, -0.1043,  1.4168, -2.1272,  0.1547, -0.8379, -0.1797],
        [ 0.4358, -1.0582,  1.1119,  0.1677, -0.3939,  1.6558, -0.2100,  0.8067],
        [ 0.4582, -0.3723, -0.2612,  1.2082,  0.3122, -0.7487, -0.3969, -1.3711]],
       requires_grad=True)
Parameter containing:
tensor([[-0.2170,  1.0516,  1.5410, -1.2915, -0.8874,  0.8474,  0.2192,  0.7642],
        [-0.0463,  1.3378,  0.4566,  0.2533,  0.5210, -1.1080,  0.3959,  0.2111],
        [ 0.1288, -0.0469,

In [10]:
train_input_embedding = input_embedding_table(train_input_mat)
train_label_embedding = lable_embedding_table(train_label_mat)

print(train_input_embedding)
print(train_label_embedding)

tensor([[[ 1.4225, -1.4392, -1.0206,  1.2507,  1.1182, -0.3806,  0.3524,
          -1.0706],
         [-1.3368, -0.4440, -0.8991,  0.1203,  0.9217,  1.0369, -0.7691,
          -0.5544],
         [-1.3368, -0.4440, -0.8991,  0.1203,  0.9217,  1.0369, -0.7691,
          -0.5544],
         [ 1.4225, -1.4392, -1.0206,  1.2507,  1.1182, -0.3806,  0.3524,
          -1.0706],
         [ 0.0425,  1.9791, -1.9645,  1.2165, -0.9497,  1.1116,  0.1297,
          -2.3036]],

        [[ 1.4225, -1.4392, -1.0206,  1.2507,  1.1182, -0.3806,  0.3524,
          -1.0706],
         [ 1.4225, -1.4392, -1.0206,  1.2507,  1.1182, -0.3806,  0.3524,
          -1.0706],
         [-1.3368, -0.4440, -0.8991,  0.1203,  0.9217,  1.0369, -0.7691,
          -0.5544],
         [ 0.0425,  1.9791, -1.9645,  1.2165, -0.9497,  1.1116,  0.1297,
          -2.3036],
         [ 0.0425,  1.9791, -1.9645,  1.2165, -0.9497,  1.1116,  0.1297,
          -2.3036]]], grad_fn=<EmbeddingBackward0>)
tensor([[[ 0.1288, -0.0469, -0.4642,

## 4. 位置编码

偶数位置用sin，奇数位置用cos：
$$
\begin{equation}
\begin{split}
\text{PE}_{pos, 2i} &= \sin{(pos/10000^{2i/d_{\text{model}}})} \\
\text{PE}_{pos, 2i+1} &= \cos{(pos/10000^{2i/d_{\text{model}}})}
\end{split}
\end{equation}
$$
其中，pos表示行，i表示列

In [None]:
pos_mat = torch.arange(MAX_SENTENCE_LEN).reshape((-1, 1))
i_mat = torch.arange(0, MODEL_DIM, 2).reshape((1, -1))
frac_bottom = torch.pow(10000, i_mat / MODEL_DIM)
op = pos_mat / frac_bottom

pe = torch.zeros(MAX_SENTENCE_LEN, MODEL_DIM)
# 偶数列
pe[:, 0::2] = torch.sin(op)
# 基数列
pe[:, 1::2] = torch.cos(op)

print(pe)

In [None]:
pe_embedding = nn.Embedding(MAX_SENTENCE_LEN, MODEL_DIM)
pe_embedding.weight = nn.Parameter(pe, requires_grad=False)

print(pe_embedding.weight)
