In [None]:
'''
知识点：
self-embedding
position-embediing
深入理解softmax
encoder的mask
'''

In [75]:
import numpy as np
import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

# 关于word Embedding 以序列建模为例
# 考虑source sentence 和 target sentence
# 构建序列 序列的字符以其在词表中的索引的形式表示
batch_size = 2

# 模型的特征维度，可以理解为每一个字对应的Embedding后的向量维度
model_dim = 8

# 单词表大小
max_num_src_words = 8
max_num_tgt_words = 8

# 序列最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

# 随机生成batch中每个句子的长度
# src_len = torch.randint(2,5,(batch_size,))
# tgt_len = torch.randint(2,5,(batch_size,))
src_len = torch.Tensor([2,4]).to(torch.int32)
tgt_len = torch.Tensor([4,3]).to(torch.int32)

print(src_len) #源长度的序列
print(tgt_len) #目标长度的序列

tensor([2, 4], dtype=torch.int32)
tensor([4, 3], dtype=torch.int32)


In [76]:
# 生成每个句子的具体内容，具体内容由单词索引表示
src_seq = [torch.randint(1,max_num_src_words,(L,)) for L in src_len]
tgt_seq = [torch.randint(1,max_num_tgt_words,(L,)) for L in tgt_len]
print('==========词向量生成==========')
print(src_seq)
print(tgt_seq)

# 还需要对每一个seq进行pad，来填充短句子
for i in range(batch_size):
    src_seq[i] = F.pad(src_seq[i],(0,max_src_seq_len-src_len[i]))
    tgt_seq[i] = F.pad(tgt_seq[i],(0,max_tgt_seq_len-tgt_len[i]))
print('==========序列填充==========')
print(src_seq)
print(tgt_seq)

for i in range(batch_size):
    # unsqueeze()函数起升维的作用,参数表示在哪个地方加一个维度。
    # 例如由[4]变成[1,4]
    src_seq[i] = torch.unsqueeze(src_seq[i],0)
    tgt_seq[i] = torch.unsqueeze(tgt_seq[i],0)
src_seq = torch.cat(src_seq)
tgt_seq = torch.cat(tgt_seq)
print('==========向量拼接==========')
print(src_seq)
print(tgt_seq)

[tensor([1, 5]), tensor([6, 2, 2, 7])]
[tensor([3, 5, 1, 3]), tensor([3, 7, 6])]
[tensor([1, 5, 0, 0, 0]), tensor([6, 2, 2, 7, 0])]
[tensor([3, 5, 1, 3, 0]), tensor([3, 7, 6, 0, 0])]
tensor([[1, 5, 0, 0, 0],
        [6, 2, 2, 7, 0]])
tensor([[3, 5, 1, 3, 0],
        [3, 7, 6, 0, 0]])


In [77]:
# 构造Embedding
# 总共有1-8 8个可能的单词，但是还有一个负责padding的0，所以要加一个0，也就是8+1=9
# 相当于就是一个9*8的矩阵
src_embedding_table = nn.Embedding(max_num_src_words+1,model_dim)
tgt_embedding_table = nn.Embedding(max_num_src_words+1,model_dim)
print('==========构造嵌入表==========')
print(src_embedding_table.weight)
print(tgt_embedding_table.weight)

Parameter containing:
tensor([[-1.4095e-01,  7.1341e-02,  5.8432e-01, -2.9907e-01, -1.7384e+00,
         -4.7295e-01,  2.8286e-01, -3.2231e-01],
        [-2.1713e+00,  5.4870e-03,  2.9440e-02,  2.2757e-01,  4.2504e-01,
          1.7870e+00,  3.9924e-01, -1.4182e+00],
        [-9.5577e-01, -3.4160e-01,  2.2756e-01, -1.6129e-01,  4.5111e-01,
          1.2678e+00,  1.9698e+00, -1.3986e+00],
        [ 1.0459e+00,  6.1831e-02, -2.8123e-01, -8.2985e-01, -7.1830e-01,
         -3.4776e-01,  8.8192e-01, -1.6952e+00],
        [ 1.0538e+00,  9.5340e-01,  1.1411e+00, -8.5860e-01, -3.1489e-01,
          1.6556e+00, -1.0655e+00, -5.3791e-01],
        [-1.0360e-03,  7.9850e-02, -1.2795e+00, -2.9511e-01,  2.4285e-01,
         -8.4747e-03, -3.9911e-01,  5.8132e-01],
        [-3.1552e-01, -2.7018e-01,  5.2580e-01,  2.0464e+00, -2.9935e-01,
          5.6005e-01,  8.9928e-01, -2.6875e-01],
        [-1.3401e-01, -1.8950e+00, -7.1632e-01, -4.3917e-01,  6.6857e-01,
          1.0991e+00,  2.5450e-02, -6.5047e

In [78]:
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)
print('==========根据嵌入表将seq转换为向量==========')
print(src_seq)
print(src_embedding)
print(tgt_seq)
print(tgt_embedding)

tensor([[1, 5, 0, 0, 0],
        [6, 2, 2, 7, 0]])
tensor([[[-2.1713e+00,  5.4870e-03,  2.9440e-02,  2.2757e-01,  4.2504e-01,
           1.7870e+00,  3.9924e-01, -1.4182e+00],
         [-1.0360e-03,  7.9850e-02, -1.2795e+00, -2.9511e-01,  2.4285e-01,
          -8.4747e-03, -3.9911e-01,  5.8132e-01],
         [-1.4095e-01,  7.1341e-02,  5.8432e-01, -2.9907e-01, -1.7384e+00,
          -4.7295e-01,  2.8286e-01, -3.2231e-01],
         [-1.4095e-01,  7.1341e-02,  5.8432e-01, -2.9907e-01, -1.7384e+00,
          -4.7295e-01,  2.8286e-01, -3.2231e-01],
         [-1.4095e-01,  7.1341e-02,  5.8432e-01, -2.9907e-01, -1.7384e+00,
          -4.7295e-01,  2.8286e-01, -3.2231e-01]],

        [[-3.1552e-01, -2.7018e-01,  5.2580e-01,  2.0464e+00, -2.9935e-01,
           5.6005e-01,  8.9928e-01, -2.6875e-01],
         [-9.5577e-01, -3.4160e-01,  2.2756e-01, -1.6129e-01,  4.5111e-01,
           1.2678e+00,  1.9698e+00, -1.3986e+00],
         [-9.5577e-01, -3.4160e-01,  2.2756e-01, -1.6129e-01,  4.5111e-0

In [79]:
# further：word_embedding是可训练的？

In [80]:
# 构造position embedding
pos_mat = torch.arange(max_position_len).reshape(-1,1)
i_mat = torch.arange(0,model_dim,2).reshape(1,-1)/model_dim
i_mat = torch.pow(10000,i_mat)
pe_embedding_table = torch.zeros(max_position_len,model_dim)
pe_embedding_table[:,0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:,1::2] = torch.cos(pos_mat / i_mat)
print(pos_mat)
print(i_mat)
print('==========Position Embedding 表的构造==========')
print(pe_embedding_table)

tensor([[0],
        [1],
        [2],
        [3],
        [4]])
tensor([[   1.,   10.,  100., 1000.]])
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]])


In [81]:
# 将刚刚得到的数组转换为一个nn.Embedding
pe_embedding = nn.Embedding(max_position_len,model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table,requires_grad=True)
print(pe_embedding.weight)

Parameter containing:
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]], requires_grad=True)


In [88]:
# 现在应该传入位置的索引而不是某个词的索引
src_pos = torch.arange(max_src_seq_len)
tgt_pos = torch.arange(max_tgt_seq_len)
print(src_pos)
print(tgt_pos)
src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)
print(src_pe_embedding)
print(tgt_pe_embedding)

tensor([0, 1, 2, 3, 4])
tensor([0, 1, 2, 3, 4])
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]], grad_fn=<EmbeddingBackward0>)
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,

In [89]:
# softmax演示
score = torch.randn(5)
prob = F.softmax(score,dim=0)
print(score)
print(prob)

tensor([0.2522, 0.3805, 0.7832, 1.1428, 0.0763])
tensor([0.1406, 0.1598, 0.2391, 0.3426, 0.1179])


In [97]:
# 是否归一化对softmax后的影响
alpha1 = 0.1
alpha2 = 5
prob1 = F.softmax(score*alpha1,dim=0)
prob2 = F.softmax(score*alpha2*alpha2,dim=0)
print(prob1)
print(prob2)

tensor([0.1944, 0.1969, 0.2050, 0.2125, 0.1910])
tensor([2.1414e-10, 5.2918e-09, 1.2466e-04, 9.9988e-01, 2.6381e-12])


In [100]:
def softmax_func(score):
    return F.softmax(score,dim=0)

jaco_mat1 = torch.autograd.functional.jacobian(softmax_func,score*alpha1)
jaco_mat2 = torch.autograd.functional.jacobian(softmax_func,score*alpha2)
print(jaco_mat1)
print(jaco_mat2)

tensor([[ 0.1566, -0.0383, -0.0399, -0.0413, -0.0371],
        [-0.0383,  0.1582, -0.0404, -0.0419, -0.0376],
        [-0.0399, -0.0404,  0.1630, -0.0436, -0.0392],
        [-0.0413, -0.0419, -0.0436,  0.1674, -0.0406],
        [-0.0371, -0.0376, -0.0392, -0.0406,  0.1545]])
tensor([[ 9.5768e-03, -1.7761e-04, -1.3301e-03, -8.0303e-03, -3.8815e-05],
        [-1.7761e-04,  1.8029e-02, -2.5262e-03, -1.5251e-02, -7.3719e-05],
        [-1.3301e-03, -2.5262e-03,  1.1863e-01, -1.1422e-01, -5.5208e-04],
        [-8.0303e-03, -1.5251e-02, -1.1422e-01,  1.4083e-01, -3.3331e-03],
        [-3.8815e-05, -7.3719e-05, -5.5208e-04, -3.3331e-03,  3.9977e-03]])


In [107]:
# 构造encoder的mask，一般而言mask放在softmax中
# mask的shape应该是[batch_size,max_src_len,max_src_len],值为1或者为-inf
valid_encoder_pos = [torch.ones(L) for L in src_len]
print('==========合法的位置用1表示==========')
print(valid_encoder_pos)

print('==========填充不合法的位置为0==========')
for i in range(batch_size):
    valid_encoder_pos[i] = F.pad(valid_encoder_pos[i],(0,max(src_len)-src_len[i]))
print(valid_encoder_pos)

for i in range(batch_size):
    valid_encoder_pos[i] = torch.unsqueeze(valid_encoder_pos[i],0)
valid_encoder_pos = torch.cat(valid_encoder_pos)
print('==========拼接好的矩阵==========')
print(valid_encoder_pos)

valid_encoder_pos = torch.unsqueeze(valid_encoder_pos,2)
print('==========扩维后的矩阵==========')
print(valid_encoder_pos.shape)
print(valid_encoder_pos)

[tensor([1., 1.]), tensor([1., 1., 1., 1.])]
[tensor([1., 1., 0., 0.]), tensor([1., 1., 1., 1.])]
tensor([[1., 1., 0., 0.],
        [1., 1., 1., 1.]])
torch.Size([2, 4, 1])
tensor([[[1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.]]])


In [109]:
'''
计算两个tensor的矩阵乘法，torch.bmm(a,b),tensor a 的size为(b,h,w),tensor b的size为(b,w,m) 也就是说两个tensor的第一维是相等的，然后第一个数组的第三维和第二个数组的第二维度要求一样，对于剩下的则不做要求，输出维度 （b,h,m）

输出为该矩阵的含义是:当这个句子中的词的数量为2的时候，第一个字和第一个字是有关联的，同理第一个字和第二个字也是有关联的，但是第一个字和第三个字是没有关联的，因为第三个字压根不存在
tensor([[[1., 1., 0., 0.],
         [1., 1., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],
'''
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos,valid_encoder_pos.transpose(1,2))
print(valid_encoder_pos_matrix.shape)
print(valid_encoder_pos_matrix)

torch.Size([2, 4, 4])
tensor([[[1., 1., 0., 0.],
         [1., 1., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])


In [114]:
invalid_encoder_pos_matrix = 1-valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
print('==========转换为非法矩阵==========')
print(invalid_encoder_pos_matrix)
print('==========将非法矩阵转换为bool形式==========')
print(mask_encoder_self_attention)

score = torch.randn(batch_size,max(src_len),max(src_len))
# 这个-1e9如果换成numpy.inf那么softmax之后就会有nan出现
masked_score = score.masked_fill(mask_encoder_self_attention,-1e9)
print('==========查看score值==========')
print(score)
print('==========查看mask后的score值==========')
print(masked_score)

prob3 = F.softmax(masked_score,dim = 2)
print('==========查看softmax之后的概率值==========')
print(prob3)

tensor([[[0., 0., 1., 1.],
         [0., 0., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
tensor([[[False, False,  True,  True],
         [False, False,  True,  True],
         [ True,  True,  True,  True],
         [ True,  True,  True,  True]],

        [[False, False, False, False],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]])
tensor([[[-2.1512e+00,  1.3586e+00,  5.5207e-01,  4.6500e-02],
         [ 3.6031e-01, -3.7255e-01,  7.7691e-01,  4.7475e-01],
         [ 1.7037e-03, -6.1950e-01, -2.4835e-01, -3.8190e-02],
         [-6.6644e-01, -2.6302e-01,  3.9302e-01, -2.6971e-01]],

        [[ 1.5728e+00,  2.2050e-01, -4.5096e-01, -7.0078e-01],
         [ 4.2933e-01, -1.5702e+00,  8.0941e-01,  1.4206e-01],
         [-1.9292e+00, -4.0121e-01, -1.7111e+00,  4.5773e-01],
         [ 2.0755e-01