# WORD EMBEDDINGS: ENCODING LEXICAL SEMANTICS
## Word Embeddings in Pytorch

In [51]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

"""
在神经网络中，参数默认是进行随机初始化的。如果不设置的话每次训练时的初始化都是随机的，导致结果不确定。如果设置初始化，则每次初始化都是固定的。

如果使用多个GPU，应该使用torch.cuda.manual_seed_all()为所有的GPU设置种子。
"""

# 设置CPU的的随机数固定，使得紧跟着的rand()函数生成的值是固定的随机
torch.manual_seed(1)
torch.rand(5)

tensor([0.7576, 0.2793, 0.4031, 0.7347, 0.0293])

In [44]:
torch.manual_seed(1)
torch.rand(5)

tensor([0.7576, 0.2793, 0.4031, 0.7347, 0.0293])

In [55]:
torch.rand(5)

tensor([0.3138, 0.1980, 0.4162, 0.2843, 0.3398])

In [56]:
# Sets the seed for generating random numbers. Returns a `torch.Generator` object.
torch.manual_seed(1)

<torch._C.Generator at 0x7fc5784c5270>

In [73]:
word_to_ix = {"hello": 0, "world": 1}

# 2 words in vocab, 5 dimensional embeddings
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)

# 获取单词‘hello’的索引（int 0）并将其转为tensor类型
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

lookup_tensor,type(lookup_tensor),lookup_tensor.shape

(tensor([0]), torch.Tensor, torch.Size([1]))

In [72]:
hello_embed = embeds(lookup_tensor)
hello_embed,hello_embed.shape

(tensor([[-0.8923, -0.0583, -0.1955, -0.9656,  0.4224]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([1, 5]))

In [74]:
hello_embed.view((1,-1))
hello_embed,hello_embed.shape

(tensor([[-0.8923, -0.0583, -0.1955, -0.9656,  0.4224]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([1, 5]))

In [88]:
lookup_tensor1 = torch.tensor([0,1], dtype=torch.long)
all_embed = embeds(lookup_tensor1)
all_embed,all_embed.shape

(tensor([[ 3.5870, -1.8313,  1.5987, -1.2770,  0.3255],
         [-0.4791,  1.3790,  2.5286,  0.4107, -0.9880]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([2, 5]))

## An Example: N-Gram Language Modeling
Recall that in an n-gram language model, given a sequence of words $w$, we want to compute $P(w_{i}|w_{i-1},w_{i-2},...,w_{i-n+1})$ Where $w_{i}$ is the $i$th word of the sequence.

In [75]:
"""
In this example, we will compute the loss function on some training examples and update the parameters with backpropagation.
"""

# 上下文的大小为2，即目标词的标签是其左右的两个词
CONTEXT_SIZE = 2

# 词嵌入的维数是10
EMBEDDING_DIM = 10

In [76]:
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

In [77]:
test_sentence,len(test_sentence)

(['When',
  'forty',
  'winters',
  'shall',
  'besiege',
  'thy',
  'brow,',
  'And',
  'dig',
  'deep',
  'trenches',
  'in',
  'thy',
  "beauty's",
  'field,',
  'Thy',
  "youth's",
  'proud',
  'livery',
  'so',
  'gazed',
  'on',
  'now,',
  'Will',
  'be',
  'a',
  "totter'd",
  'weed',
  'of',
  'small',
  'worth',
  'held:',
  'Then',
  'being',
  'asked,',
  'where',
  'all',
  'thy',
  'beauty',
  'lies,',
  'Where',
  'all',
  'the',
  'treasure',
  'of',
  'thy',
  'lusty',
  'days;',
  'To',
  'say,',
  'within',
  'thine',
  'own',
  'deep',
  'sunken',
  'eyes,',
  'Were',
  'an',
  'all-eating',
  'shame,',
  'and',
  'thriftless',
  'praise.',
  'How',
  'much',
  'more',
  'praise',
  "deserv'd",
  'thy',
  "beauty's",
  'use,',
  'If',
  'thou',
  'couldst',
  'answer',
  "'This",
  'fair',
  'child',
  'of',
  'mine',
  'Shall',
  'sum',
  'my',
  'count,',
  'and',
  'make',
  'my',
  'old',
  "excuse,'",
  'Proving',
  'his',
  'beauty',
  'by',
  'succession',
  

In [78]:
"""
 we should tokenize the input, but we will ignore that for now build a list of tuples.

Each tuple is ([ word_i-CONTEXT_SIZE, ..., word_i-1 ], target word)

按照预设置的上下文大小构建元组（创建训练数据集）
"""
ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]

# Print the first 3, just so you can see what they look like.
ngrams[:3]

[(['forty', 'When'], 'winters'),
 (['winters', 'forty'], 'shall'),
 (['shall', 'winters'], 'besiege')]

In [79]:
"""
构建每个单词的索引序列
"""

# 使用set()函数，返回一个无序不重复元素集
vocab = set(test_sentence)
# 单词数量为97
len(vocab)

97

In [80]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
word_to_ix

{'thine!': 0,
 'How': 1,
 'when': 2,
 'warm': 3,
 'worth': 4,
 'to': 5,
 'shall': 6,
 "youth's": 7,
 'use,': 8,
 'dig': 9,
 'own': 10,
 'old': 11,
 'Will': 12,
 'lusty': 13,
 'Thy': 14,
 'Then': 15,
 'more': 16,
 'lies,': 17,
 'on': 18,
 'by': 19,
 'field,': 20,
 'Were': 21,
 'thou': 22,
 'This': 23,
 'cold.': 24,
 'deep': 25,
 'besiege': 26,
 'proud': 27,
 "deserv'd": 28,
 'blood': 29,
 'sum': 30,
 'brow,': 31,
 'so': 32,
 'thy': 33,
 'now,': 34,
 'praise.': 35,
 'art': 36,
 'succession': 37,
 "beauty's": 38,
 'trenches': 39,
 'were': 40,
 'livery': 41,
 'where': 42,
 'eyes,': 43,
 'count,': 44,
 'To': 45,
 'within': 46,
 'thine': 47,
 'old,': 48,
 'made': 49,
 'praise': 50,
 'all': 51,
 'much': 52,
 'of': 53,
 'days;': 54,
 'in': 55,
 'sunken': 56,
 'Shall': 57,
 'If': 58,
 'winters': 59,
 'a': 60,
 "excuse,'": 61,
 'the': 62,
 'his': 63,
 'say,': 64,
 'held:': 65,
 'an': 66,
 'small': 67,
 'weed': 68,
 'my': 69,
 "'This": 70,
 'answer': 71,
 'fair': 72,
 'And': 73,
 'thriftless': 74

In [81]:
"""
构建模型
- 非常简单的模型，只是用来展示embedding层的使用方法
- 用的是很简单的线性层
"""

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()

        # 将vocab_size个单词嵌入维数为embedding_dim的空间，嵌入后的词向量表示为torch.Size([1, embedding_dim]))
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # 这里每次训练的词汇数（上下文数量）为context_size，故输入tensor形状为context_size * embedding_dim
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)

        # 输出的结果大小为97，即单词的个数
        self.linear2 = nn.Linear(128, vocab_size)


    def forward(self, inputs):

        # inputs.shape=torch.size([2])（context_size=2）
        # self.embeddings(inputs).shape=torch.size([2,10])
        # view相当于numpy里的resize
        # view((1,-1))表示resize为1行的矩阵，-1表示列数自适应
        # embeds.shape=torch.size([1,20])
        embeds = self.embeddings(inputs).view((1, -1))


        out = F.relu(self.linear1(embeds))

        out = self.linear2(out)

        # 在softmax的结果上再做多一次log运算
        log_probs = F.log_softmax(out, dim=1)

        return log_probs

In [84]:
"""
定义训练所需组件
- 模型
- 损失函数
- 优化器
"""

# 存储loss的列表
losses = []

# The negative log likelihood loss
loss_function = nn.NLLLoss()

# 训练模型
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

# 优化器
optimizer = optim.SGD(model.parameters(), lr=0.001)

model

NGramLanguageModeler(
  (embeddings): Embedding(97, 10)
  (linear1): Linear(in_features=20, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=97, bias=True)
)

In [83]:
for epoch in range(10):
    total_loss = 0

    # 遍历训练集
    for context, target in ngrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words into integer indices and wrap them in tensors)
        # 获取context的词汇在word_to_ix的索引，并将其转为tensor类型
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        print(f'context_idxs: {context_idxs}, context_idxs.shape: {context_idxs.shape}')

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a new instance, you need to zero out the gradients from the old instance
        # 梯度清零
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)

# The loss decreased every iteration over the training data!
print(losses)


context_idxs: tensor([85, 83]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([59, 85]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([ 6, 59]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([26,  6]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([33, 26]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([31, 33]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([73, 31]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([ 9, 73]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([25,  9]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([39, 25]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([55, 39]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([33, 55]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([38, 33]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([20, 38]), context_idxs.shape: torch.Size([2])
context_idxs: tensor([14, 20]), context_idxs.sha

In [85]:
# To get the embedding of a particular word, e.g. "beauty"
print(model.embeddings.weight[word_to_ix["beauty"]])

tensor([-1.0484, -0.0432,  1.2019, -0.0697,  1.0654, -0.8067, -0.1626, -0.7073,
         0.2308, -1.5493], grad_fn=<SelectBackward0>)


## Exercise: Computing Word Embeddings: Continuous Bag-of-Words(CBOW)
CBOW是continuous bag of words的缩写，中文译为“连续词袋模型”。它是一种用于生成词向量的神经网络模型，由Tomas Mikolov等人于2013年提出 。词向量是一种将单词表示为固定长度的实数向量的方法，可以捕捉单词之间的语义和语法关系。

CBOW的基本思想是，给定一个单词的上下文（即窗口内的其他单词），预测该单词本身。例如，对于句子“The cat climbed up the tree”，如果窗口大小为5，那么当中心单词为“climbed”时，上下文单词为“The”、“cat”、“up”和“the”。CBOW模型要求根据这四个上下文单词，计算出“climbed”的概率分布。

![](../../img/1_1.png)


In [90]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# 按照空格划分单词并保存在列表中
raw_text

['We',
 'are',
 'about',
 'to',
 'study',
 'the',
 'idea',
 'of',
 'a',
 'computational',
 'process.',
 'Computational',
 'processes',
 'are',
 'abstract',
 'beings',
 'that',
 'inhabit',
 'computers.',
 'As',
 'they',
 'evolve,',
 'processes',
 'manipulate',
 'other',
 'abstract',
 'things',
 'called',
 'data.',
 'The',
 'evolution',
 'of',
 'a',
 'process',
 'is',
 'directed',
 'by',
 'a',
 'pattern',
 'of',
 'rules',
 'called',
 'a',
 'program.',
 'People',
 'create',
 'programs',
 'to',
 'direct',
 'processes.',
 'In',
 'effect,',
 'we',
 'conjure',
 'the',
 'spirits',
 'of',
 'the',
 'computer',
 'with',
 'our',
 'spells.']

In [91]:
# By deriving a set from `raw_text`, we deduplicate the array
# 返回一个无序且不含重复元素的集合
vocab = set(raw_text)
vocab

{'As',
 'Computational',
 'In',
 'People',
 'The',
 'We',
 'a',
 'about',
 'abstract',
 'are',
 'beings',
 'by',
 'called',
 'computational',
 'computer',
 'computers.',
 'conjure',
 'create',
 'data.',
 'direct',
 'directed',
 'effect,',
 'evolution',
 'evolve,',
 'idea',
 'inhabit',
 'is',
 'manipulate',
 'of',
 'other',
 'our',
 'pattern',
 'process',
 'process.',
 'processes',
 'processes.',
 'program.',
 'programs',
 'rules',
 'spells.',
 'spirits',
 'study',
 'that',
 'the',
 'they',
 'things',
 'to',
 'we',
 'with'}

In [92]:
# 为每个单词生成索引
word_to_ix = {word: i for i, word in enumerate(vocab)}
word_to_ix

{'is': 0,
 'things': 1,
 'are': 2,
 'conjure': 3,
 'pattern': 4,
 'evolve,': 5,
 'The': 6,
 'idea': 7,
 'of': 8,
 'about': 9,
 'Computational': 10,
 'process.': 11,
 'processes': 12,
 'we': 13,
 'to': 14,
 'computer': 15,
 'that': 16,
 'inhabit': 17,
 'with': 18,
 'our': 19,
 'manipulate': 20,
 'process': 21,
 'People': 22,
 'spirits': 23,
 'spells.': 24,
 'study': 25,
 'create': 26,
 'As': 27,
 'directed': 28,
 'computational': 29,
 'called': 30,
 'abstract': 31,
 'by': 32,
 'a': 33,
 'rules': 34,
 'the': 35,
 'processes.': 36,
 'they': 37,
 'effect,': 38,
 'other': 39,
 'We': 40,
 'programs': 41,
 'computers.': 42,
 'data.': 43,
 'In': 44,
 'beings': 45,
 'direct': 46,
 'program.': 47,
 'evolution': 48}

In [93]:
"""
生成训练样本
- CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
"""
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (
        [raw_text[i - j - 1] for j in range(CONTEXT_SIZE)]
        + [raw_text[i + j + 1] for j in range(CONTEXT_SIZE)]
    )
    target = raw_text[i]
    data.append((context, target))

# 显示前5个训练样本
data[:5]

[(['are', 'We', 'to', 'study'], 'about'), (['about', 'are', 'study', 'the'], 'to'), (['to', 'about', 'the', 'idea'], 'study'), (['study', 'to', 'idea', 'of'], 'the'), (['the', 'study', 'of', 'a'], 'idea')]


In [None]:
"""
Create your model and train. Here are some functions to help you make the data ready for use by your module.
"""

class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass

In [None]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

make_context_vector(data[0][0], word_to_ix)  # example