# PyTorch Embedding

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
# 词嵌入调用 torch.nn.Embedding(m, n) 就可以了，
# m 表示单词的总数目，n 表示词嵌入的维度，
# 其实词嵌入就相当于是一个大矩阵，矩阵的每一行表示一个单词
embeds = nn.Embedding(2, 5) # 2 个单词，维度 5

In [3]:
# 通过 weight 得到了整个词嵌入的矩阵，
# 这个矩阵是一个可以改变的 parameter，在网络的训练中会不断更新，
# 同时词嵌入的数值可以直接进行修改，比如我们可以读入一个预训练好的词嵌入
embeds.weight

Parameter containing:
 0.4225 -0.0938 -0.7500 -0.0262 -0.4031
-1.1038  0.5376 -0.7610 -0.3041 -0.2923
[torch.FloatTensor of size 2x5]

In [4]:
# 直接手动修改词嵌入的值
embeds.weight.data = torch.ones(2, 5)
embeds.weight

Parameter containing:
 1  1  1  1  1
 1  1  1  1  1
[torch.FloatTensor of size 2x5]

In [5]:
# 访问第 50 个词的词向量
embeds = nn.Embedding(100, 10)
single_word_embed = embeds(Variable(torch.LongTensor([50])))

In [6]:
single_word_embed

Variable containing:
 0.1141  0.1282  0.9838 -0.0017 -1.0133  0.3654  0.1225  0.0286 -0.5818 -1.5523
[torch.FloatTensor of size 1x10]

## 数据集

In [7]:
# CONTEXT_SIZE 表示由前面几个单词来预测这个单词
CONTEXT_SIZE = 2 # 依据的单词数
# EMBEDDING_DIM 表示词嵌入的维度。
EMBEDDING_DIM = 10 # 词向量的维度

# 使用莎士比亚的诗
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [8]:
# 建立训练集，将单词三个分组，前面两个作为输入，最后一个作为预测的结果。
trigram = [((test_sentence[i], test_sentence[i+1]), test_sentence[i+2]) 
            for i in range(len(test_sentence)-2)]

# 总的数据量
len(trigram)

113

In [9]:
# 取出第一个数据看看
trigram[0]

(('When', 'forty'), 'winters')

In [10]:
# 建立每个词与数字的编码，据此构建词嵌入
vocb = set(test_sentence) # 使用 set 将重复的元素去掉
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [11]:
word_to_idx

{"'This": 59,
 'And': 56,
 'How': 58,
 'If': 96,
 'Proving': 5,
 'Shall': 76,
 'Then': 73,
 'This': 30,
 'Thy': 21,
 'To': 70,
 'Were': 31,
 'When': 41,
 'Where': 12,
 'Will': 85,
 'a': 89,
 'all': 0,
 'all-eating': 74,
 'an': 54,
 'and': 65,
 'answer': 80,
 'art': 42,
 'asked,': 28,
 'be': 27,
 'beauty': 82,
 "beauty's": 52,
 'being': 2,
 'besiege': 83,
 'blood': 87,
 'brow,': 1,
 'by': 33,
 'child': 53,
 'cold.': 92,
 'couldst': 3,
 'count,': 49,
 'days;': 29,
 'deep': 68,
 "deserv'd": 22,
 'dig': 13,
 "excuse,'": 86,
 'eyes,': 62,
 'fair': 19,
 "feel'st": 45,
 'field,': 7,
 'forty': 25,
 'gazed': 39,
 'held:': 48,
 'his': 9,
 'in': 72,
 'it': 50,
 'lies,': 11,
 'livery': 69,
 'lusty': 81,
 'made': 57,
 'make': 77,
 'mine': 67,
 'more': 47,
 'much': 46,
 'my': 63,
 'new': 26,
 'now,': 43,
 'of': 36,
 'old': 18,
 'old,': 64,
 'on': 34,
 'own': 38,
 'praise': 60,
 'praise.': 16,
 'proud': 91,
 'say,': 32,
 'see': 20,
 'shall': 24,
 'shame,': 71,
 'small': 15,
 'so': 94,
 'succession': 

In [12]:
idx_to_word

{0: 'all',
 1: 'brow,',
 2: 'being',
 3: 'couldst',
 4: 'treasure',
 5: 'Proving',
 6: 'to',
 7: 'field,',
 8: 'worth',
 9: 'his',
 10: 'thine!',
 11: 'lies,',
 12: 'Where',
 13: 'dig',
 14: 'succession',
 15: 'small',
 16: 'praise.',
 17: 'where',
 18: 'old',
 19: 'fair',
 20: 'see',
 21: 'Thy',
 22: "deserv'd",
 23: 'sum',
 24: 'shall',
 25: 'forty',
 26: 'new',
 27: 'be',
 28: 'asked,',
 29: 'days;',
 30: 'This',
 31: 'Were',
 32: 'say,',
 33: 'by',
 34: 'on',
 35: 'thou',
 36: 'of',
 37: 'thine',
 38: 'own',
 39: 'gazed',
 40: 'within',
 41: 'When',
 42: 'art',
 43: 'now,',
 44: 'trenches',
 45: "feel'st",
 46: 'much',
 47: 'more',
 48: 'held:',
 49: 'count,',
 50: 'it',
 51: 'warm',
 52: "beauty's",
 53: 'child',
 54: 'an',
 55: "youth's",
 56: 'And',
 57: 'made',
 58: 'How',
 59: "'This",
 60: 'praise',
 61: 'were',
 62: 'eyes,',
 63: 'my',
 64: 'old,',
 65: 'and',
 66: 'use,',
 67: 'mine',
 68: 'deep',
 69: 'livery',
 70: 'To',
 71: 'shame,',
 72: 'in',
 73: 'Then',
 74: 'all-ea

## 定义N-Gram模型

In [13]:
# 定义模型
class n_gram(nn.Module):
    def __init__(self, vocab_size, context_size=CONTEXT_SIZE, n_dim=EMBEDDING_DIM):
        super(n_gram, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, n_dim)
        self.classify = nn.Sequential(
            nn.Linear(context_size * n_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, vocab_size)
        )
        
    def forward(self, x):
        voc_embed = self.embed(x) # 得到词嵌入
        voc_embed = voc_embed.view(1, -1) # 将两个词向量拼在一起
        out = self.classify(voc_embed)
        return out

## 训练模型

In [14]:
# 最后输出是条件概率，相当于是一个分类问题，可以使用交叉熵来衡量误差
net = n_gram(len(word_to_idx))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2, weight_decay=1e-5)

In [15]:
for e in range(100):
    train_loss = 0
    for word, label in trigram: # 使用前 100 个作为训练集
        word = Variable(torch.LongTensor([word_to_idx[i] for i in word])) # 将两个词作为输入
        label = Variable(torch.LongTensor([word_to_idx[label]]))
        # 前向传播
        out = net(word)
        loss = criterion(out, label)
        train_loss += loss.data[0]
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (e + 1) % 20 == 0:
        print('epoch: {}, Loss: {:.6f}'.format(e + 1, train_loss / len(trigram)))

epoch: 20, Loss: 0.749247
epoch: 40, Loss: 0.139854
epoch: 60, Loss: 0.090571
epoch: 80, Loss: 0.073060
epoch: 100, Loss: 0.063460


## 进行测试

In [16]:
net = net.eval()

# 测试一下结果
word, label = trigram[19]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = net(word)
pred_label_idx = out.max(1)[1].data[0]
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ('so', 'gazed')
label: on
()
real word is on, predicted word is on


In [17]:
word, label = trigram[75]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = net(word)
pred_label_idx = out.max(1)[1].data[0]
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: ("'This", 'fair')
label: child
()
real word is child, predicted word is child
