In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data

import d2lzh_pytorch as d2l
# Show the installed PyTorch version so the recorded outputs can be tied to it.
print(torch.__version__)

1.4.0


# 处理数据集

In [2]:
# The PTB training split has to be present in ./data/ before anything else runs.
assert 'ptb.train.txt' in os.listdir("./data/")

# Read the file once; every line is treated as one sentence.
with open('./data/ptb.train.txt', 'r') as f:
    lines = f.readlines()

# Tokenise by splitting each line on whitespace.
raw_dataset = list(map(str.split, lines))

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [3]:
# Preview the first three sentences: token count plus the first five tokens.
for sentence in raw_dataset[:3]:
    print('#tokens:', len(sentence), sentence[:5])

#tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
#tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
#tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


## 建立词语索引

In [4]:
# To keep the computation simple, only words that occur at least 5 times
# in the dataset are kept.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

In [5]:
# Map every kept word to an integer index (and back).
idx_to_token = list(counter.keys())
token_to_idx = dict(zip(idx_to_token, range(len(idx_to_token))))

# Re-encode each sentence as indices; words missing from the vocabulary are dropped.
dataset = [
    [token_to_idx[tk] for tk in st if tk in token_to_idx]
    for st in raw_dataset
]
num_tokens = sum(map(len, dataset))
'# tokens: %d' % num_tokens

'# tokens: 887100'

## 二次采样

In [6]:
def discard(idx):
    """Randomly decide whether one occurrence of word index ``idx`` is dropped.

    Reads the module-level ``counter``, ``idx_to_token`` and ``num_tokens``.
    Returns True when a uniform draw from [0, 1] is smaller than
    ``1 - sqrt(1e-4 / count * num_tokens)``, where ``count`` is the word's
    frequency in ``counter``; frequent words are therefore dropped more often.
    """
    keep_threshold = math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)
    return random.uniform(0, 1) < 1 - keep_threshold

# Apply the random discard decision to every token occurrence, sentence by sentence.
subsampled_dataset = [
    list(filter(lambda tk: not discard(tk), st)) for st in dataset
]
# Token count after subsampling; compare it with the count before subsampling.
'# tokens: %d' % sum(map(len, subsampled_dataset))

'# tokens: 375787'

In [7]:
# Compare how often a word (e.g. 'the', 'join') occurs before and after subsampling.
def compare_counts(token):
    """Format the occurrence counts of ``token`` before and after subsampling.

    Counts are taken over the module-level ``dataset`` (before) and
    ``subsampled_dataset`` (after), using the index from ``token_to_idx``.
    """
    token_idx = token_to_idx[token]
    before = sum(st.count(token_idx) for st in dataset)
    after = sum(st.count(token_idx) for st in subsampled_dataset)
    return '# %s: before = %d, after = %d' % (token, before, after)
# One very frequent word and one rare word, for contrast.
for token in ('the', 'join'):
    print(compare_counts(token))

# the: before = 50770, after = 2162
# join: before = 45, after = 45


## 提取中心词和背景词

In [8]:
# Words whose distance to the center word does not exceed the context window
# size are that center word's context words. For every center word the window
# size is sampled uniformly at random from the integers 1..max_window_size.

def get_centers_and_contexts(dataset, max_window_size):
    """Extract every center word together with its context words.

    Args:
        dataset: list of sentences, each a list of word indices.
        max_window_size: upper bound (inclusive) of the randomly drawn
            window size.

    Returns:
        ``(centers, contexts)`` where ``contexts[i]`` is the list of words
        surrounding ``centers[i]`` within the window drawn for it.
    """
    centers, contexts = [], []
    for sentence in dataset:
        n_tokens = len(sentence)
        # A sentence needs at least two words to form a center/context pair.
        if n_tokens < 2:
            continue
        centers.extend(sentence)
        for pos in range(n_tokens):
            radius = random.randint(1, max_window_size)
            first = max(0, pos - radius)
            stop = min(n_tokens, pos + 1 + radius)
            # Every position in the window except the center itself.
            contexts.append(
                [sentence[j] for j in range(first, stop) if j != pos])
    return centers, contexts

In [9]:
# Build a toy dataset of two sentences with 7 and 3 words. With a maximum
# context window of 2, print every center word and its context words.
tiny_dataset = [[*range(7)], [*range(7, 10)]]
print('dataset', tiny_dataset)
tiny_centers, tiny_contexts = get_centers_and_contexts(tiny_dataset, 2)
for center, context in zip(tiny_centers, tiny_contexts):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [4, 6]
center 6 has contexts [5]
center 7 has contexts [8]
center 8 has contexts [7, 9]
center 9 has contexts [8]


In [10]:
# The experiment uses a maximum context window of 5. Extract all center
# words and their context words from the subsampled dataset.
all_centers, all_contexts = get_centers_and_contexts(
    dataset=subsampled_dataset, max_window_size=5)

# 负采样

In [11]:
# For each pair of center word and context word, K noise words are sampled at
# random (K=5 in this experiment). As suggested by the paper, the sampling
# probability P(w) of a noise word is the ratio of w's frequency to the total
# frequency, raised to the power 0.75.

def get_negatives(all_contexts, sampling_weights, K):
    """Sample noise-word indices for every center word.

    Args:
        all_contexts: list with one list of context-word indices per center word.
        sampling_weights: relative sampling weight per word index; position
            ``i`` is the weight of index ``i``.
        K: number of noise words drawn for each context word.

    Returns:
        A list parallel to ``all_contexts``. Entry ``j`` holds
        ``len(all_contexts[j]) * K`` sampled indices, none of which appears
        in ``all_contexts[j]``.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Exclude this center word's own context words. The set is built from
        # the loop variable `contexts`, once per center word, so the function
        # does not depend on any module-level name.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # The candidate buffer is used up: draw a fresh batch of
                # indices according to sampling_weights. Drawing a large
                # batch at once is cheaper than drawing one index at a time.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)

    return all_negatives

# Noise-word weight of every vocabulary entry: its frequency to the power 0.75.
sampling_weights = [
    freq ** 0.75 for freq in map(counter.__getitem__, idx_to_token)
]
all_negatives = get_negatives(all_contexts, sampling_weights, K=5)

#  读取数据

In [12]:
# Bundle all center words (all_centers) with the context words (all_contexts)
# and noise words (all_negatives) of each center word. A Dataset class is
# defined first.
class MyDataset(torch.utils.data.Dataset):
    """Index-aligned (center, contexts, negatives) examples.

    The three sequences are indexed in parallel: item ``i`` is the tuple
    ``(centers[i], contexts[i], negatives[i])``.
    """

    def __init__(self, centers, contexts, negatives):
        # Parallel indexing only makes sense when all three have equal length.
        n_examples = len(centers)
        assert n_examples == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        return len(self.centers)

    def __getitem__(self, index):
        example = (self.centers[index],
                   self.contexts[index],
                   self.negatives[index])
        return example

In [13]:
# Mini-batch reading

def batchify(data):
    """Collate (center, context, negative) examples into padded tensors.

    Args:
        data: sequence of ``(center, context, negative)`` items, where
            ``context`` and ``negative`` are lists of word indices.

    Returns:
        A tuple of four tensors:
        centers reshaped to one column; contexts followed by negatives,
        right-padded with 0 to the longest example; a mask with 1 on real
        entries and 0 on padding; labels with 1 on context positions and 0
        everywhere else.
    """
    max_len = max(len(ctx) + len(neg) for _, ctx, neg in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        n_ctx = len(context)
        n_real = n_ctx + len(negative)
        n_pad = max_len - n_real
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * n_pad)
        masks.append([1] * n_real + [0] * n_pad)
        labels.append([1] * n_ctx + [0] * (max_len - n_ctx))

    return (
        torch.tensor(centers).view(-1, 1),
        torch.tensor(contexts_negatives),
        torch.tensor(masks),
        torch.tensor(labels),
    )


In [14]:
batch_size = 512
# No loader worker processes on Windows (win32); 4 everywhere else.
if sys.platform.startswith('win32'):
    num_workers = 0
else:
    num_workers = 4

# NOTE: this rebinds `dataset`, which earlier cells used for the list of
# index-encoded sentences; from here on the name refers to a MyDataset.
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=batchify,
    num_workers=num_workers,
)

# Look at the first mini-batch only and print the shape of each field.
for batch in data_iter:
    for name, data in zip(('centers', 'contexts_negatives', 'masks', 'labels'), batch):
        print(name, 'shape:', data.shape)
    break
    

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


# 模型

通过使用嵌入层和小批量乘法来实现跳字模型

##  嵌入层

获取词嵌入的层称为嵌入层，在PyTorch中可以通过创建nn.Embedding实例得到。嵌入层的权重是一个矩阵，其行数为词典大小（num_embeddings），列数为每个词向量的维度（embedding_dim）。

In [17]:
# A small embedding layer: 20 rows (one per index), 4 columns (vector size).
embed = nn.Embedding(20, 4)
embed.weight

Parameter containing:
tensor([[ 8.2566e-01, -4.1418e-01,  1.0827e+00,  3.8593e-01],
        [ 1.0893e+00,  6.8264e-01, -6.2649e-01, -2.7596e-01],
        [-4.7299e-01,  4.5971e-01,  5.7199e-01, -6.5727e-01],
        [-2.6608e-01,  4.8379e-01, -2.0680e-01,  8.4419e-01],
        [ 3.4442e-01,  3.2321e-01,  1.9970e+00, -6.3670e-01],
        [ 8.4116e-01,  8.2674e-01,  6.8717e-01, -3.6522e+00],
        [ 7.8292e-01,  1.1777e+00,  1.5296e+00, -2.8856e-01],
        [ 4.5455e-01,  1.1677e+00,  1.3419e+00,  1.8962e+00],
        [ 1.1185e+00,  5.8836e-01, -1.3153e-01,  2.6126e-01],
        [ 2.0977e-03, -6.4240e-01,  1.8078e+00, -1.7651e+00],
        [ 9.6602e-01, -7.5056e-01,  1.3155e+00, -2.5733e-01],
        [ 5.2213e-01, -1.5378e+00, -3.1501e+00,  9.6825e-02],
        [ 4.8643e-01,  2.9936e-01, -9.8870e-01,  1.7854e+00],
        [ 6.3611e-02,  1.1025e+00, -1.9858e-02, -7.3321e-01],
        [-5.6354e-01, -1.6684e-01, -6.1249e-02,  7.2817e-01],
        [ 5.5789e-01, -5.1981e-01,  5.3205e-01, 

In [18]:
# Look up a 2x3 batch of indices; each index is replaced by its embedding row.
token_indices = [[1, 2, 3], [4, 5, 6]]
x = torch.tensor(token_indices, dtype=torch.long)
embed(x)

tensor([[[ 1.0893,  0.6826, -0.6265, -0.2760],
         [-0.4730,  0.4597,  0.5720, -0.6573],
         [-0.2661,  0.4838, -0.2068,  0.8442]],

        [[ 0.3444,  0.3232,  1.9970, -0.6367],
         [ 0.8412,  0.8267,  0.6872, -3.6522],
         [ 0.7829,  1.1777,  1.5296, -0.2886]]], grad_fn=<EmbeddingBackward>)

##  小批量乘法

In [19]:
# bmm multiplies the matrices of two mini-batches pairwise:
# (2, 1, 4) x (2, 4, 6) -> (2, 1, 6).
X = torch.ones(2, 1, 4)
Y = torch.ones(2, 4, 6)
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

## 跳字模型的前向计算

在前向计算中，跳字模型的输入包含中心词索引center以及连结的背景词与噪声词索引contexts_and_negatives。其中center变量的形状为（批量大小，1），而contexts_and_negatives的形状为（批量大小，max_len）。这两个变量先通过词嵌入层分别由词索引变换为词向量，再通过小批量乘法得到形状为（批量大小，1，max_len）的输出。输出中的每个元素是中心词向量与背景词向量或噪声词向量的内积。

In [20]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Skip-gram forward pass: batched inner products between embeddings.

    ``center`` is embedded with ``embed_v`` and ``contexts_and_negatives``
    with ``embed_u``. The last two axes of the second result are swapped so
    that a batched matrix product yields, per example, the inner product of
    the center vector with every context/noise vector.
    """
    center_vecs = embed_v(center)
    other_vecs_t = embed_u(contexts_and_negatives).permute(0, 2, 1)
    return torch.bmm(center_vecs, other_vecs_t)

# 训练模型

## 二元交叉熵损失函数

In [23]:
class SigmodBinaryCrossEntropyLoss(nn.Module):
    """Element-wise sigmoid binary cross-entropy, averaged over dim 1.

    The optional mask is passed as the per-element ``weight`` of
    ``binary_cross_entropy_with_logits``, so positions with mask 0
    contribute 0 to the sum. The mean is still taken over the full width
    of dim 1, masked positions included.
    """

    def __init__(self):
        super(SigmodBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        """Return the per-example loss.

        Args:
            inputs: logits, shape (batch_size, len).
            targets: labels with the same shape as ``inputs``.
            mask: optional tensor with the same shape as ``inputs``; when
                omitted (None) every element is weighted equally.

        Returns:
            Tensor of shape (batch_size,) holding the mean over dim 1.
        """
        inputs, targets = inputs.float(), targets.float()
        # The signature allows mask=None, so only convert a mask that exists.
        weight = None if mask is None else mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction="none", weight=weight)
        return res.mean(dim=1)

loss = SigmodBinaryCrossEntropyLoss()

我们可以通过掩码变量指定小批量中参与损失函数计算的部分预测值和标签：当掩码为1时，相应位置的预测值和标签将参与损失函数的计算；当掩码为0时，相应位置的预测值和标签则不参与损失函数的计算。

In [24]:
pred = torch.tensor([[1.5, 0.3, -1, 2],
                     [1.1, -0.6, 2.2, 0.4]])
# 1 marks a context word, 0 marks a noise word.
label = torch.tensor([[1, 0, 0, 0],
                      [1, 1, 0, 0]])
# Mask variable: the last position of the second row is excluded.
mask = torch.tensor([[1, 1, 1, 1],
                     [1, 1, 1, 0]])
# Rescale the per-row mean so it averages over the unmasked positions only.
valid_counts = mask.float().sum(dim=1)
loss(pred, label, mask) * mask.shape[1] / valid_counts

tensor([0.8740, 1.2100])

In [25]:
def sigmd(x):
    """Return -log(sigmoid(x)), which equals log(1 + exp(-x)).

    Evaluated as ``max(-x, 0) + log1p(exp(-|x|))`` so that the exponent
    passed to ``math.exp`` is never positive; the direct form
    ``-log(1 / (1 + exp(-x)))`` raises OverflowError for strongly
    negative ``x``.
    """
    return max(-x, 0.0) + math.log1p(math.exp(-abs(x)))
# Hand-check of the two masked losses. Note that 1 - sigmoid(x) = sigmoid(-x),
# so the sign of the logit is flipped for positions labelled 0.
print('%.4f' % (sum(map(sigmd, (1.5, -0.3, 1, -2))) / 4))
print('%.4f' % (sum(map(sigmd, (1.1, -0.6, -2.2))) / 3))

0.8740
1.2100


## 初始化模型参数

In [26]:
embed_size = 100
# Two embedding tables over the same vocabulary, both of width embed_size.
net = nn.Sequential(
    nn.Embedding(len(idx_to_token), embed_size),
    nn.Embedding(len(idx_to_token), embed_size),
)

## 定义训练函数

In [29]:
def train(net, lr, num_epochs):
    """Train the two embedding layers in ``net`` with Adam.

    Runs on CUDA when available, otherwise on CPU. Batches come from the
    module-level ``data_iter``; ``net[0]`` and ``net[1]`` are passed to
    ``skip_gram`` as the center and context/noise embeddings. After each
    epoch the mean batch loss and the elapsed time are printed.

    Args:
        net: container whose items 0 and 1 are the embedding layers.
        lr: learning rate for Adam.
        num_epochs: number of passes over ``data_iter``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    for epoch in range(1, num_epochs + 1):
        epoch_start = time.time()
        loss_total, num_batches = 0.0, 0

        for batch in data_iter:
            center, context_negative, mask, label = (t.to(device) for t in batch)
            pred = skip_gram(center, context_negative, net[0], net[1])
            per_example = loss(pred.view(label.shape), label, mask)
            # Rescale by width / number of unmasked entries so each example
            # is averaged over its unmasked positions, then average the batch.
            batch_loss = (per_example * mask.shape[1] / mask.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_total += batch_loss.cpu().item()
            num_batches += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch, loss_total / num_batches, time.time() - epoch_start))
            

In [30]:
# Learning rate 0.01, 10 epochs.
train(net, lr=0.01, num_epochs=10)

train on cpu
epoch 1, loss 1.95, time 121.09s
epoch 2, loss 0.62, time 113.57s
epoch 3, loss 0.45, time 113.87s
epoch 4, loss 0.39, time 122.97s
epoch 5, loss 0.37, time 136.42s
epoch 6, loss 0.35, time 133.66s
epoch 7, loss 0.34, time 148.57s
epoch 8, loss 0.33, time 148.22s
epoch 9, loss 0.32, time 124.64s
epoch 10, loss 0.32, time 158.88s


# 应用词嵌入模型

In [34]:
def get_similar_tokens(query_token, k, embed):
    """Print tokens whose embeddings are closest to ``query_token`` by cosine.

    Cosine similarity is computed between the query's row of
    ``embed.weight`` and every row; 1e-9 is added under the square root to
    avoid dividing by zero. The top ``k + 1`` hits are taken and the first
    one is skipped (presumably the query token itself), so ``k`` lines are
    printed. Token lookup uses the module-level ``token_to_idx`` and
    ``idx_to_token``.
    """
    weights = embed.weight.data
    query_vec = weights[token_to_idx[query_token]]
    dots = torch.matmul(weights, query_vec)
    squared_norms = torch.sum(weights * weights, dim=1) * torch.sum(query_vec * query_vec)
    cos = dots / (squared_norms + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    for i in topk.cpu().numpy()[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))


In [43]:
# Three nearest neighbours of 'chip' in the first embedding table of net.
get_similar_tokens(query_token='chip', k=3, embed=net[0])

cosine sim=0.450: intel
cosine sim=0.413: chicago-based
cosine sim=0.410: edison
