In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

## nn.Embedding介绍
- [https://pytorch.org/docs/stable/nn.html](https://pytorch.org/docs/stable/nn.html)
----
- pytorch里面实现word embedding是通过一个模块（类）来实现的：nn.Embedding。Embedding的作用就是将词语向量化，通常会将词语表示为一个连续向量。
- 官方介绍
    - 一个简单的查找表，用于存储固定字典和大小的嵌入。
    - 此模块通常用于存储单词嵌入并使用索引检索它们。模块的输入是索引列表，输出是相应的词嵌入。
    
- CLASS torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None)

- 参数：
    - num_embeddings (int) – 词典的大小，也就是词典有多少个词。比如你有一个30000的词典，这里就是30000.

    - embedding_dim (int) – the size of each embedding vector。将词语表示成embedding_dim维的向量。指定向量的维度。

    - padding_idx (int, optional) – If given, pads the output with the embedding vector at padding_idx (initialized to zeros) whenever it encounters the index. 设置padding_idx后，padding_idx处的嵌入向量将初始化为全零。但是，请注意，之后可以修改该向量，例如，使用定制的初始化方法，从而改变用于填充输出的向量。该向量的梯度始终为零。

    - max_norm (float, optional) – If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm.
    - norm_type (float, optional) – The p of the p-norm to compute for the max_norm option. Default 2.
    - scale_grad_by_freq (boolean, optional) – If given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False.
    - sparse (bool, optional) – If True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients.
- 看个例子吧

In [2]:
# Toy vocabulary: every word is mapped to an integer index, e.g. 'hello' is
# represented by 0.
word2idx = {'hello': 0, 'world': 1, '!': 2}

# Embedding layer: 3 is the vocabulary size and 5 the dimension of each word
# vector, so the layer owns a 3x5 weight matrix.
# For a 1000-word vocabulary with 100-dimensional vectors you would write
# nn.Embedding(1000, 100).
# The layer works as a lookup table from word index to word vector.
embeds = nn.Embedding(3, 5)

# How do we look up the vector of 'hello'?

# Look it up through the word's index in the vocabulary ('hello' -> 0).
# The index has to be an integer (long) tensor.
hello_idx = torch.tensor([word2idx['hello']], dtype=torch.long)

# No Variable wrapper is needed: Variable is deprecated and plain tensors take
# part in autograd directly, so the index tensor is passed to the layer as-is.
hello_embed = embeds(hello_idx)

# Print the *initial* (randomly initialised, untrained) vector of 'hello'.
print(hello_embed)

tensor([[-0.5467,  0.0368,  0.5616, -0.4377, -1.5678]],
       grad_fn=<EmbeddingBackward>)


## 注意
- 注意这里的词向量的建立只是**初始的词向量**，并没有经过任何修改优化，我们需要建立神经网络通过learning修改word embedding里面的参数使得word embedding每一个词向量能够表示每一个不同的词。

## nn.LSTM

In [3]:
# LSTM taking 10 input features per step and producing a 20-dimensional hidden state.
rnn = nn.LSTM(input_size=10, hidden_size=20)

In [4]:
# Random input laid out as (seq_len, batch, input_size).
input_data = torch.randn((5, 3, 10))

In [5]:
# Display the random input tensor created in the previous cell.
input_data

tensor([[[ 1.0176e+00,  2.1074e+00,  1.0802e+00,  1.6588e+00, -3.0665e-01,
          -1.8412e-01, -1.1473e+00, -1.1237e+00, -8.2876e-02, -5.1729e-01],
         [ 6.3862e-01,  1.1659e-01, -9.6650e-02, -1.1714e+00,  1.4072e-01,
          -1.4833e+00, -2.3820e-01,  1.1966e+00,  4.0544e-01,  1.5256e-01],
         [-1.5074e-01, -2.1995e+00, -8.5176e-01, -2.5534e+00,  4.7603e-01,
          -6.3260e-01, -5.6957e-01,  1.4899e-02, -1.1266e+00,  1.2446e+00]],

        [[ 1.7352e-03, -5.7774e-01, -6.1097e-01, -1.8013e-02, -1.3437e+00,
           9.2696e-01,  1.9702e+00, -4.6417e-01, -1.8800e+00, -1.9298e-01],
         [-9.0322e-02, -9.4674e-01, -5.4999e-01, -1.7946e+00, -8.9853e-01,
          -1.0969e+00,  1.5586e+00,  1.6646e+00, -2.8987e-01, -1.5016e+00],
         [-4.1151e-01, -7.2954e-04,  2.2337e-01, -9.8386e-01,  4.4936e-01,
           9.2989e-01, -6.5802e-01,  1.5812e+00, -1.0958e+00, -6.1788e-02]],

        [[-1.4971e+00,  3.9179e-01,  5.6520e-01, -6.3076e-01,  1.5079e-01,
           3.32

In [6]:
# Optional explicit initial hidden state, kept disabled: the LSTM call in this
# notebook passes no initial state.
# The (1, 3, 20) shape presumably follows (num_layers, batch, hidden_size);
# it matches the hn.size() shown later.
# h0 = torch.randn(1, 3, 20)
# h0

In [7]:
# Optional explicit initial cell state, kept disabled: the LSTM call in this
# notebook passes no initial state.
# The (1, 3, 20) shape presumably follows (num_layers, batch, hidden_size);
# it matches the cn.size() shown later.
# c0 = torch.randn(1, 3, 20)
# c0

In [8]:
# Run the LSTM over the whole input without passing an initial (h0, c0);
# the nn.LSTM documentation states that both default to zeros in that case.
# `output` holds the per-step outputs, `hn` / `cn` the final hidden and cell states.
output, (hn, cn) = rnn(input_data)

In [9]:
# Shape of the per-step LSTM outputs: (seq_len, batch, hidden_size).
output.shape

torch.Size([5, 3, 20])

In [10]:
# Hidden vector of the first time step for the first sequence in the batch.
output[0, 0]

tensor([-0.0705,  0.0806, -0.0004,  0.1699, -0.0639,  0.2456,  0.1708, -0.0649,
         0.0175, -0.1028,  0.0625, -0.2196, -0.0772,  0.1272,  0.2425, -0.0047,
        -0.1266, -0.1277,  0.1271, -0.1731], grad_fn=<SelectBackward>)

In [11]:
# Shape of the final hidden state.
hn.shape

torch.Size([1, 3, 20])

In [12]:
# Shape of the final cell state.
cn.shape

torch.Size([1, 3, 20])

In [13]:
# Fully connected layer that maps the 20 LSTM features to 20 output features.
fc = nn.Linear(in_features=20, out_features=20)

In [14]:
# Apply the linear layer to the LSTM output; the size printed in the next cell
# shows the leading (5, 3) dimensions are kept and only the last one is mapped.
out = fc(output)

In [15]:
# Shape after the linear layer.
out.shape

torch.Size([5, 3, 20])

In [16]:
# Projected vector of the first time step for the first sequence in the batch.
out[0, 0]

tensor([-0.0397, -0.1044, -0.3107,  0.1052, -0.0923,  0.2201,  0.2007, -0.1051,
         0.2590, -0.0161, -0.0058, -0.0728, -0.0446,  0.1594, -0.0887, -0.1233,
         0.0534,  0.2600, -0.2403, -0.2578], grad_fn=<SelectBackward>)

In [17]:
# Same LSTM sizes as before, but expecting batch-first input.
rnn2 = nn.LSTM(input_size=10, hidden_size=20, batch_first=True)

In [18]:
# Input laid out as (batch, seq_len, word_vec_dim) for the batch-first LSTM.
# The h/c states it returns are laid out as (num_layers, batch, out_dim).
input_data2 = torch.randn((32, 5, 10))

In [19]:
# Run the batch-first LSTM; no initial state is passed.
# NOTE: this rebinds `out`, `hn` and `cn`, which earlier cells already assigned.
out, (hn, cn) = rnn2(input_data2)

In [20]:
# Shapes of the batch-first output and of the final hidden / cell states.
print(out.shape, hn.shape, cn.shape)

torch.Size([32, 5, 20]) torch.Size([1, 32, 20]) torch.Size([1, 32, 20])


In [21]:
# Two random values with a leading dimension of size 1 added in front.
a = torch.randn(2).unsqueeze(dim=0)

In [22]:
# Show `a`; the unsqueeze(0) in the previous cell gave it a leading dimension of size 1.
a

tensor([[-0.5343,  1.1623]])

## 两个形状相同的tensor逐元素相乘

In [23]:
# Two integer tensors with the same (2, 3) shape.
a = torch.tensor([[1, 2, 3], [2, 3, 4]])
b = torch.tensor([[2, 2, 2], [2, 2, 1]])

In [24]:
# `*` multiplies the two tensors element by element (not a matrix product).
c = a * b
c

tensor([[2, 4, 6],
        [4, 6, 4]])

In [25]:
# Rebind `a` to a random tensor of shape (2, 5, 10) and display it.
a = torch.randn((2, 5, 10))
a

tensor([[[ 0.3521,  0.3300,  1.8152,  0.2920, -0.3818,  1.3390, -0.6421,
          -0.5181, -0.6548, -1.3408],
         [-1.4215,  1.0278, -0.3704,  1.1732,  0.5826, -1.0977, -0.1229,
           0.7543,  0.1068, -0.9253],
         [ 0.2701, -0.0540,  0.5477,  1.9520,  1.1553,  2.0301, -1.3622,
           0.6728,  1.1550, -0.5492],
         [-0.9010,  0.9355, -1.2293, -0.6187, -0.3745,  1.3888, -0.1698,
           1.5389,  0.2122,  0.3061],
         [-0.8861, -0.2416,  0.1654, -0.4934, -0.3101, -0.0811,  1.3597,
          -1.3795,  0.5985,  1.5860]],

        [[ 0.9069,  0.8176, -1.4616, -1.4370,  1.6841,  0.0073,  2.1817,
          -0.2038,  0.0430,  0.4026],
         [ 0.4725,  1.8585, -0.1899, -1.5201, -1.4479,  1.2178, -2.5778,
          -0.9094,  0.6352,  1.1748],
         [-1.1253,  0.7802,  0.1728,  1.6081, -0.3923, -2.0803,  0.3759,
          -0.8075,  1.2691, -0.1470],
         [ 0.3799, -1.0446,  1.4070,  0.6546, -0.2464,  0.8234,  1.1070,
          -0.3188, -0.4323,  1.3733],

In [26]:
# Sum over the last axis (dim 2), which removes that dimension.
c = a.sum(dim=2)
c.shape

torch.Size([2, 5])

In [27]:
# Random tensor of shape (2, 3, 5) used for the masking example below.
x = torch.randn(2, 3, 5)
x

tensor([[[ 1.0955, -0.0487,  1.8012, -0.2057, -0.9501],
         [ 0.1482,  0.0884,  0.2197,  0.1176, -0.3785],
         [-0.1032, -0.3398,  1.2709, -0.1059, -0.3674]],

        [[-0.4318, -0.9047, -0.2204, -1.1353, -2.1245],
         [ 1.2986,  1.5762, -0.6559,  0.8941,  0.4001],
         [ 0.8741,  0.1495,  1.0028,  1.4448, -0.0584]]])

In [28]:
# Element-wise comparison: the mask marks the strictly positive entries of x.
mask = x > 0
mask

tensor([[[1, 0, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [0, 0, 1, 0, 0]],

        [[0, 0, 0, 0, 0],
         [1, 1, 0, 1, 1],
         [1, 1, 1, 1, 0]]], dtype=torch.uint8)

In [29]:
d = x[mask]    # mask indexing flattens the selected entries into a 1-D tensor
d

tensor([1.0955, 1.8012, 0.1482, 0.0884, 0.2197, 0.1176, 1.2709, 1.2986, 1.5762,
        0.8941, 0.4001, 0.8741, 0.1495, 1.0028, 1.4448])

In [30]:
import torch.nn.functional as F

In [31]:
# Element-wise log(sigmoid(d)).
s = F.logsigmoid(d)

In [32]:
s.mean()   # collapses to a single scalar (0-dim tensor)

tensor(-0.3977)

In [33]:
# Mean of log(1 - sigmoid(d)), evaluated as logsigmoid(-d).
# The two are mathematically identical because 1 - sigmoid(x) == sigmoid(-x),
# but this form never takes log(0): for large d, sigmoid(d) rounds to 1 in
# floating point and the naive expression would yield -inf.
F.logsigmoid(-d).mean()

tensor(-1.2231)