In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## nn.Embedding介绍
- [https://pytorch.org/docs/stable/nn.html](https://pytorch.org/docs/stable/nn.html)
----
- pytorch里面实现word embedding是通过一个模块（类）来实现的:nn.Embedding。Embedding的作用就是将词语向量化，通常会将词语表示为一个连续向量。
- 官方介绍
    - 一个简单的查找表，用于存储固定字典和大小的嵌入。
    - 此模块通常用于存储单词嵌入并使用索引检索它们。模块的输入是索引列表，输出是相应的词嵌入。
    
- CLASS torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None)

- 参数：
    - num_embeddings (int) – 词典的大小，也就是词典有多少个词。比如你有一个30000的词典，这里就是30000.

    - embedding_dim (int) – the size of each embedding vector。将词语表示成embedding_dim维的向量。指定向量的维度。

    - padding_idx (int, optional) – If given, pads the output with the embedding vector at padding_idx (initialized to zeros) whenever it encounters the index.设置padding_idx后，padding_idx处的嵌入向量将初始化为全零。但是，请注意，之后可以修改该向量，例如，使用定制的初始化方法，从而改变用于填充输出的向量。该向量的梯度始终为零。

    - max_norm (float, optional) – If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm.
    - norm_type (float, optional) – The p of the p-norm to compute for the max_norm option. Default 2.
    - scale_grad_by_freq (boolean, optional) – If given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False.
    - sparse (bool, optional) – If True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients.
- 看个例子吧

In [2]:
# Vocabulary: every word is mapped to an integer index; 'hello' is represented by 0.
word2idx = {'hello': 0, 'world': 1, '!': 2}

# Embedding layer: 3 is the vocabulary size and 5 the dimension of each vector,
# i.e. a 3x5 lookup matrix.
# For 1000 words with 100-dimensional vectors you would build nn.Embedding(1000, 100).
# The layer is a table from word index to word vector; a word's vector is found by lookup.
embeds = nn.Embedding(3, 5)

# How do we look up the vector for 'hello'?

# Look it up by the word's index in the vocabulary; the index of 'hello' is 0.
# nn.Embedding accepts a plain integer (long) index tensor. Wrapping it in
# torch.autograd.Variable is not needed: Variable is deprecated and tensors
# take part in autograd directly.
hello_idx = torch.tensor([word2idx['hello']], dtype=torch.long)
hello_embed = embeds(hello_idx)

# Print the <initial, untrained> vector for 'hello'.
print(hello_embed)

tensor([[ 0.3746, -1.0797, -0.6317, -0.6281, -0.4120]],
       grad_fn=<EmbeddingBackward>)


## 注意
- 注意这里的词向量的建立只是**初始的词向量**，并没有经过任何修改优化，我们需要建立神经网络通过learning修改word embedding里面的参数使得word embedding每一个词向量能够表示每一个不同的词。

## nn.LSTM

In [3]:
# LSTM mapping 10 input features to a 20-dimensional hidden state.
rnn = nn.LSTM(input_size=10, hidden_size=20)

In [4]:
# Random input laid out as (seq_len=5, batch=3, input_size=10).
input_data = torch.randn((5, 3, 10))

In [5]:
input_data

tensor([[[ 0.5035, -0.3076,  0.3162,  1.4244, -0.8264, -0.3948,  0.6733,
           0.6104, -1.1864, -0.1052],
         [ 0.2803, -0.5760,  0.8076,  1.1587,  0.6862, -1.7525,  0.8406,
           0.8473,  0.6313, -0.6193],
         [ 0.2741,  1.3035, -0.0708, -0.0701,  1.9604, -0.5892,  0.0282,
          -0.3905, -1.4863,  0.7503]],

        [[ 0.0970, -1.4411,  1.6571, -0.3589, -0.1051, -0.3100,  1.3421,
           0.8556,  0.5329, -0.2677],
         [-1.9752, -0.2660, -0.9931,  0.3350,  0.1206, -1.0908, -0.5099,
          -0.7131,  0.1717,  0.4139],
         [ 0.0510, -0.8117, -0.4690,  1.0724, -0.3611, -2.0950, -1.0926,
           0.2660, -0.8654, -1.2987]],

        [[ 0.0899,  1.7766, -1.2969, -1.2072,  0.8000, -0.9159, -0.1823,
          -1.8170,  0.8746, -1.1954],
         [-1.3784,  0.0154, -1.2797,  0.3126, -2.1986,  0.6790,  1.2085,
          -0.7803,  0.4920,  0.8820],
         [ 0.6599, -0.3139,  0.5517, -1.3554, -0.1092, -0.2548, -0.5972,
           1.1814, -0.5728, -0.0523

In [6]:
# h0 = torch.randn(1, 3, 20)
# h0

In [7]:
# c0 = torch.randn(1, 3, 20)
# c0

In [8]:
# Run the whole sequence through the LSTM. No initial (h0, c0) pair is passed,
# so nn.LSTM falls back to its default initial state.
# `output` holds the hidden state at every time step; (hn, cn) are the final
# hidden and cell states.
output, (hn, cn) = rnn(input_data)

In [9]:
output.size()

torch.Size([5, 3, 20])

In [10]:
output[0][0]

tensor([-0.0960, -0.1280, -0.1632, -0.0876,  0.1671, -0.0493, -0.0749,  0.0547,
        -0.1365, -0.0356, -0.0913,  0.0701,  0.0188,  0.0768,  0.0356, -0.1786,
        -0.0018,  0.0364, -0.0717,  0.0409], grad_fn=<SelectBackward>)

In [11]:
hn.size()

torch.Size([1, 3, 20])

In [12]:
cn.size()

torch.Size([1, 3, 20])

In [13]:
# Fully connected layer applied on top of the LSTM outputs (20 -> 20 features).
fc = nn.Linear(in_features=20, out_features=20)

In [14]:
out = fc(output)

In [15]:
out.size()

torch.Size([5, 3, 20])

In [16]:
out[0][0]

tensor([ 0.0896, -0.0571, -0.0152, -0.0691, -0.0199, -0.0406,  0.2308, -0.2142,
        -0.1767, -0.0639, -0.0081,  0.1343, -0.1078, -0.2064, -0.1733, -0.1182,
         0.1361, -0.2758,  0.1620,  0.1076], grad_fn=<SelectBackward>)

In [17]:
# Same LSTM sizes as before, but inputs/outputs are laid out batch-first.
rnn2 = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)

In [18]:
# Input laid out as (batch=32, seq_len=5, word_vec_dim=10).
# The hidden/cell states use the layout (layer_num, batch, out_dim).
input_data2 = torch.randn((32, 5, 10))

In [19]:
out, (hn, cn) = rnn2(input_data2)

In [20]:
print(out.size(), hn.size(), cn.size())

torch.Size([32, 5, 20]) torch.Size([1, 32, 20]) torch.Size([1, 32, 20])


In [21]:
# Insert a leading axis of size 1: shape (2,) -> (1, 2).
a = torch.unsqueeze(torch.randn(2), dim=0)

In [22]:
a

tensor([[-0.4607, -0.4461]])

## 两个形状相同的tensor相乘（逐元素乘法）

In [23]:
# Two integer matrices with the same shape (2, 3).
a = torch.tensor([[1, 2, 3], [2, 3, 4]])
b = torch.tensor([[2, 2, 2], [2, 2, 1]])

In [24]:
# `*` on two tensors is the element-wise (Hadamard) product, not a matrix product.
c = torch.mul(a, b)
c

tensor([[2, 4, 6],
        [4, 6, 4]])

In [25]:
a = torch.randn(2, 5, 10)
a

tensor([[[ 1.0323,  0.1239,  1.1077,  0.2164,  0.4767,  0.4025,  0.2399,
          -0.1330,  0.5671,  0.1008],
         [ 0.0843,  0.7463,  0.6705,  0.1488,  0.6736, -0.2464, -0.5746,
           0.6086,  0.9737,  0.0711],
         [ 1.8035,  0.8987, -0.6655, -1.0680, -0.5026,  0.5141,  1.4208,
           0.2101,  0.4502,  1.3422],
         [ 1.0504,  1.3057,  0.2206, -1.7455, -1.2488, -0.4145,  0.8532,
           0.8481,  0.6439,  0.8128],
         [ 0.7198,  0.6862,  0.3618,  0.1609, -0.0922, -0.3626,  0.5601,
          -0.0230, -1.4171, -0.9066]],

        [[ 1.2316,  0.4093, -0.0454, -0.0887, -0.0333, -0.4516,  0.8812,
           0.6673, -2.0286,  0.5245],
         [-0.6719, -1.0660,  1.0303, -2.0076, -0.1587, -0.7049, -0.4192,
          -1.6840, -0.0913, -0.8211],
         [-0.0361,  1.6295, -0.2098, -0.5681,  0.6524,  0.3682, -1.0326,
           0.8690, -0.8374, -0.7785],
         [-1.1685, -0.4528,  0.8089, -0.6248, -1.4823,  1.7067,  0.0343,
          -0.1233, -0.4694,  0.7405],

In [26]:
# Summing over axis 2 removes that axis: (2, 5, 10) -> (2, 5).
c = a.sum(dim=2)
c.size()

torch.Size([2, 5])

In [27]:
x = torch.randn([2,3,5])
x 

tensor([[[-0.1212, -1.1875, -0.8154, -0.2603,  0.1208],
         [-1.7234,  0.9401,  0.5191, -1.1418,  0.0485],
         [-0.5145,  0.5186,  0.4932,  0.5364, -0.8625]],

        [[-0.2743, -0.4564,  1.8824,  0.5533,  2.1877],
         [-0.9605,  1.0729,  0.4791, -0.9585, -0.0132],
         [-1.3529, -0.8772,  0.2921, -0.8205, -0.1080]]])

In [28]:
# Element-wise mask that marks the strictly positive entries of x.
mask = x.gt(0)
mask

tensor([[[0, 0, 0, 0, 1],
         [0, 1, 1, 0, 1],
         [0, 1, 1, 1, 0]],

        [[0, 0, 1, 1, 1],
         [0, 1, 1, 0, 0],
         [0, 0, 1, 0, 0]]], dtype=torch.uint8)

In [29]:
# Selecting with a mask flattens the chosen elements into a 1-D tensor.
d = torch.masked_select(x, mask)
d

tensor([0.1208, 0.9401, 0.5191, 0.0485, 0.5186, 0.4932, 0.5364, 1.8824, 0.5533,
        2.1877, 1.0729, 0.4791, 0.2921])

In [30]:
import torch.nn.functional as F

In [31]:
s = F.logsigmoid(d)

In [32]:
s.mean()   # 得到一个数字

tensor(-0.4262)

In [33]:
# Mean of log(1 - sigmoid(d)). Because 1 - sigmoid(x) == sigmoid(-x), this equals
# logsigmoid(-d). The fused form stays finite for large d, whereas
# log(1 - sigmoid(d)) becomes log(0) = -inf once sigmoid(d) rounds to 1.
F.logsigmoid(-d).mean()

tensor(-1.1681)

## nn.Parameter

In [34]:
?nn.Parameter

In [38]:
# Random 1-D vector with embed_dim entries.
embed_dim = 10
data = torch.randn((embed_dim,))
data.size()

torch.Size([10])

In [39]:
u = nn.Parameter(data)

In [40]:
u

Parameter containing:
tensor([-0.6100,  0.1809,  0.7675, -0.5735,  0.1123,  0.4894, -0.8654, -1.0640,
        -1.2618, -0.8684], requires_grad=True)

In [41]:
embed = torch.randn(4, 8, 10)  #[batch, seq_len, embed]
embed

tensor([[[ 0.5720, -0.4873, -0.1925, -0.1972, -0.0626,  1.5768,  1.0680,
           1.2321, -0.4564, -0.0703],
         [ 0.5284, -0.7402,  0.4011,  0.0832, -1.9947, -0.9723,  0.9434,
           0.4962,  0.3930,  0.5200],
         [ 0.2165,  0.9300,  1.4055, -0.9435, -0.1495, -0.5030,  0.3633,
           0.9413,  1.4385, -0.7606],
         [ 0.2950,  0.9037, -0.9711,  0.0556, -1.5093, -1.8391, -0.6188,
          -1.2471, -0.8059, -0.6884],
         [ 1.0337, -0.6484,  2.1221,  0.9859,  2.1802, -2.2094, -0.0289,
           0.8381,  0.3553,  0.2861],
         [-0.1302,  1.4063,  0.4567, -1.1085,  1.9764, -0.1228,  0.9814,
           0.9537, -0.6450,  0.7175],
         [-0.1446, -0.1117,  0.0594, -1.7827, -0.4155, -0.0715,  0.0213,
           0.7226,  0.6909,  1.2786],
         [ 0.6332,  0.4564, -0.1636,  0.3665,  0.6061, -0.0314, -1.5227,
           0.0369,  1.3917, -0.5979]],

        [[-1.3836, -0.8652, -1.3644, -0.8995, -0.2949,  1.2752,  0.1303,
          -0.1627, -0.0075,  0.5403],

In [46]:
# Broadcast the vector u (shape [10]) to the shape of embed, [batch, seq_len, embed].
# expand_as() returns a view that reuses u's storage; repeat() would instead
# materialise batch * seq_len physical copies of the same vector.
x = u.expand_as(embed)
x.size()

torch.Size([4, 8, 10])

In [70]:
# Cosine similarity between every word embedding and x, computed over the
# embedding axis (the last one); the result has one score per token: [batch, seq_len].
cos = F.cosine_similarity(embed, x, dim=-1)
cos

tensor([[-0.2212, -0.4839, -0.1610,  0.2061, -0.2068, -0.0250, -0.2900, -0.0944],
        [ 0.0531, -0.2761, -0.0455,  0.3600, -0.0608, -0.4872, -0.3845,  0.6251],
        [ 0.0520,  0.1996, -0.5176, -0.5650,  0.6570, -0.0836,  0.0536, -0.3301],
        [-0.3879,  0.2709,  0.0507,  0.6181,  0.4528, -0.0298,  0.2379, -0.4050]],
       grad_fn=<DivBackward0>)

In [77]:
# Softmax along dim 1: one row per sequence, and each element of a row is the
# weight of the corresponding word.
alpha = torch.softmax(cos, dim=1)
alpha

tensor([[0.1154, 0.0888, 0.1226, 0.1769, 0.1171, 0.1404, 0.1077, 0.1310],
        [0.1270, 0.0913, 0.1150, 0.1725, 0.1133, 0.0740, 0.0820, 0.2249],
        [0.1307, 0.1515, 0.0739, 0.0705, 0.2393, 0.1141, 0.1309, 0.0892],
        [0.0724, 0.1398, 0.1122, 0.1979, 0.1677, 0.1035, 0.1353, 0.0711]],
       grad_fn=<SoftmaxBackward>)

In [113]:
# Spot check of the broadcast multiply: the first two features of embed[0, 0]
# are both scaled by the same scalar weight d[0, 0, 0].
d = alpha.unsqueeze(-1)
print(embed[0, 0, 0] * d[0, 0, 0], embed[0, 0, 1] * d[0, 0, 0])

tensor(0.0660, grad_fn=<MulBackward0>) tensor(-0.0562, grad_fn=<MulBackward0>)


In [110]:
embed*alpha.unsqueeze(2)

tensor([[[ 0.0660, -0.0562, -0.0222, -0.0228, -0.0072,  0.1820,  0.1233,
           0.1422, -0.0527, -0.0081],
         [ 0.0469, -0.0657,  0.0356,  0.0074, -0.1770, -0.0863,  0.0837,
           0.0440,  0.0349,  0.0462],
         [ 0.0265,  0.1140,  0.1723, -0.1157, -0.0183, -0.0617,  0.0445,
           0.1154,  0.1763, -0.0932],
         [ 0.0522,  0.1599, -0.1718,  0.0098, -0.2671, -0.3254, -0.1095,
          -0.2207, -0.1426, -0.1218],
         [ 0.1210, -0.0759,  0.2485,  0.1154,  0.2553, -0.2587, -0.0034,
           0.0981,  0.0416,  0.0335],
         [-0.0183,  0.1975,  0.0641, -0.1557,  0.2776, -0.0173,  0.1378,
           0.1339, -0.0906,  0.1008],
         [-0.0156, -0.0120,  0.0064, -0.1921, -0.0448, -0.0077,  0.0023,
           0.0779,  0.0744,  0.1378],
         [ 0.0830,  0.0598, -0.0214,  0.0480,  0.0794, -0.0041, -0.1995,
           0.0048,  0.1823, -0.0783]],

        [[-0.1756, -0.1098, -0.1732, -0.1142, -0.0374,  0.1619,  0.0165,
          -0.0207, -0.0010,  0.0686],

In [90]:
# Weighted sum over the sequence axis: scale every word vector by its weight,
# then reduce dim 1, [batch, seq_len, embed] -> [batch, embed].
y = (embed * alpha.unsqueeze(-1)).sum(dim=1)
y

tensor([[ 0.3618,  0.3213,  0.3114, -0.3055,  0.0978, -0.5791,  0.0793,  0.3957,
          0.2237,  0.0167],
        [-0.1417, -0.2927, -0.7323, -0.2796, -0.4680, -0.2061, -0.6581, -0.1812,
         -0.8672,  0.0082],
        [-0.1451,  0.0247,  0.1265, -0.2745,  0.0626, -0.3589, -0.2194, -0.0770,
          0.2479, -0.5162],
        [-0.4050, -0.7885,  0.4427, -0.3342, -0.5260,  0.2249,  0.0418, -0.6697,
         -0.5975,  0.1450]], grad_fn=<SumBackward2>)

In [91]:
y.size()

torch.Size([4, 10])

In [92]:
# y has shape [4, 10]; dim 1 has size 10, not 1, so squeeze(1) has nothing to
# remove and the result has the same shape and values as y.
y.squeeze(1)

tensor([[ 0.3618,  0.3213,  0.3114, -0.3055,  0.0978, -0.5791,  0.0793,  0.3957,
          0.2237,  0.0167],
        [-0.1417, -0.2927, -0.7323, -0.2796, -0.4680, -0.2061, -0.6581, -0.1812,
         -0.8672,  0.0082],
        [-0.1451,  0.0247,  0.1265, -0.2745,  0.0626, -0.3589, -0.2194, -0.0770,
          0.2479, -0.5162],
        [-0.4050, -0.7885,  0.4427, -0.3342, -0.5260,  0.2249,  0.0418, -0.6697,
         -0.5975,  0.1450]], grad_fn=<SqueezeBackward1>)

In [97]:
# Keep a handle on the projection layer. Building nn.Linear(10, 1) inline and
# calling it immediately discards the module, so its weights could never be
# passed to an optimizer, saved, or applied again.
score_layer = nn.Linear(10, 1)
fx = score_layer(y)
fx

tensor([[0.4853],
        [0.5154],
        [0.4474],
        [0.1978]], grad_fn=<AddmmBackward>)

In [101]:
# Squash the linear output into a number between 0 and 1.
score = fx.sigmoid()
score

tensor([[0.6190],
        [0.6261],
        [0.6100],
        [0.5493]], grad_fn=<SigmoidBackward>)

In [106]:
# Round to the nearest integer. Note: torch.round sends exact .5 ties to the
# nearest even integer ("round half to even"), not always upwards.
torch.round(score)

tensor([[1.],
        [1.],
        [1.],
        [1.]], grad_fn=<RoundBackward>)

## torch.nn.functional.cosine_similarity计算方法

In [48]:
# Two random batches, each holding two 3-dimensional vectors.
x1, x2 = torch.randn(2, 3), torch.randn(2, 3)
print(x1, x2)

tensor([[ 0.3437,  0.3895,  1.4366],
        [-3.1125, -0.4749, -0.6235]]) tensor([[-0.9672,  0.4503, -0.3470],
        [ 0.2393,  1.8958, -1.3947]])


In [49]:
F.cosine_similarity(x1, x2)

tensor([-0.3825, -0.1021])

In [64]:
# Manual cosine similarity for the first pair of rows, following the formula
# documented for F.cosine_similarity:
#   x1 . x2 / (max(||x1||, eps) * max(||x2||, eps)),  eps = 1e-8
# eps is a lower bound on each norm (it is not added to the denominator).
# torch.dot replaces Python's built-in sum(), which walks the tensor one
# element at a time.
torch.dot(x1[0], x2[0]) / (x1[0].norm().clamp(min=1e-8) * x2[0].norm().clamp(min=1e-8))

tensor(-0.3825)

## 截断函数
----
- torch.ceil(input, out=None)   #向上取整：返回不小于input的最小整数
- torch.floor(input, out=None)  #向下取整：返回不大于input的最大整数

- torch.round(input, out=None)  #返回最接近的整数；恰好为.5时向最近的偶数舍入（round half to even），并非严格的四舍五入

- torch.trunc(input, out=None)  #返回整数部分数值
- torch.frac(tensor, out=None)  #返回小数部分数值

- torch.fmod(input, divisor, out=None)  #返回input/divisor的余数，结果的符号与被除数input相同（C语言风格）
- torch.remainder(input, divisor, out=None)  #返回input/divisor的余数，结果的符号与除数divisor相同（与Python的%一致）