<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/02121.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 10.3 word2vec 的实现

In [0]:
import collections 
import math 
import random 
import time 
import os 
import numpy as np 
import torch 
from torch import nn 
import torch.utils.data as Data 
import d2l 
import zipfile

### 10.3.1 处理数据集

In [2]:
!mkdir ../data

mkdir: cannot create directory ‘../data’: File exists


In [0]:
!git clone https://github.com/d2l-ai/d2l-zh.git

In [0]:
!cp d2l-zh/data/ptb.zip ../data

In [0]:
with zipfile.ZipFile('../data/ptb.zip', 'r') as zin:
    zin.extractall('../data/')

In [6]:
with open('../data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = [st.split() for st in lines]

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [7]:
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


#### 1 建立词语索引


In [0]:
counter = collections.Counter([tk for st in raw_dataset for tk in st])
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

In [9]:
idx_to_token = [tk for tk, _ in counter.items()]
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
num_tokens = sum([len(st) for st in dataset])
'# tokens: %d' % num_tokens

'# tokens: 887100'

#### 2 二次采样

In [10]:
def discard(idx):
    return random.uniform(0, 1) < 1 - math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)

subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 375563'

In [11]:
def compare_counts(token):
    return '# %s: before=%d, after=%d' %(token, 
        sum([st.count(token_to_idx[token]) for st in dataset]), 
        sum([st.count(token_to_idx[token]) for st in subsampled_dataset]))

compare_counts('the')

'# the: before=50770, after=2081'

In [12]:
compare_counts('join')

'# join: before=45, after=45'

#### 3 提取中心词和背景词

In [0]:
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:
            continue 
        centers += st 
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size), 
                        min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

In [14]:
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2, 3]
center 2 has contexts [1, 3]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [0]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

### 10.3.2 负采样

In [0]:
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                i, neg_candidates = 0, random.choices(population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1 
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives 

sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

### 10.3.3 读取数据

In [0]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers 
        self.contexts = contexts 
        self.negatives = negatives 

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])

    def __len__(self):
        return len(self.centers)

In [0]:
def batchify(data):
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives), 
        torch.tensor(masks), torch.tensor(labels))


In [19]:
batch_size = 512 
num_workers = 4 
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, collate_fn=batchify, num_workers=num_workers)
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'], batch):
        print(name, 'shape:', data.shape)
    break 

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


### 10.3.4 跳字模型

#### 1 嵌入层

In [20]:
embed = nn.Embedding(num_embeddings=20, embedding_dim=4)
embed.weight

Parameter containing:
tensor([[ 0.5857, -0.8756,  0.6874, -1.0600],
        [ 2.3237,  1.0624,  1.2328,  0.6584],
        [ 0.7717,  1.0390,  1.4787, -0.3867],
        [-1.0761,  0.2477,  0.4813,  0.3817],
        [-1.1075,  0.2145,  0.5586,  1.1737],
        [ 0.1053, -0.2418, -0.2784,  0.1152],
        [ 0.6647,  0.2206,  1.9034, -0.0731],
        [-0.6615,  0.4606, -1.0465,  1.5683],
        [ 1.1895,  1.4421, -0.8648,  0.6017],
        [-0.6986, -1.2590,  0.7220,  0.0954],
        [ 0.8325,  1.0354,  0.1956, -0.0661],
        [-0.7696,  0.2404, -0.7201, -0.0711],
        [-0.1542,  0.1029,  0.1461, -0.5842],
        [-0.2405,  0.3505,  0.0712,  1.3604],
        [ 0.1141,  0.8417, -0.8556, -0.0872],
        [-0.8149, -0.5558,  0.9528,  0.7065],
        [ 0.3150,  0.1334, -0.9347, -0.4403],
        [ 0.3200,  1.0298,  0.5296, -0.1633],
        [-1.1286,  0.8492, -1.5572, -0.6401],
        [ 1.3195, -1.2292,  0.6236, -0.1975]], requires_grad=True)

In [21]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
embed(x)

tensor([[[ 2.3237,  1.0624,  1.2328,  0.6584],
         [ 0.7717,  1.0390,  1.4787, -0.3867],
         [-1.0761,  0.2477,  0.4813,  0.3817]],

        [[-1.1075,  0.2145,  0.5586,  1.1737],
         [ 0.1053, -0.2418, -0.2784,  0.1152],
         [ 0.6647,  0.2206,  1.9034, -0.0731]]], grad_fn=<EmbeddingBackward>)

#### 2 小批量乘法

In [22]:
X = torch.ones((2, 1, 4))
Y = torch.ones((2, 4, 6))
torch.bmm(X, Y).shape 

torch.Size([2, 1, 6])

#### 3 跳字模型前向计算

In [0]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_and_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1))
    return pred

### 10.3.5 训练模型

#### 1 二元交叉熵损失函数

In [0]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none', weight=mask)
        return res.mean(dim=1)

loss = SigmoidBinaryCrossEntropyLoss()

In [25]:
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])
loss(pred, label, mask) * mask.shape[1] / mask.float().sum(dim=1)

tensor([0.8740, 1.2100])

In [26]:
def sigmd(x):
    return -math.log(1 / (1 + math.exp(-x)))

print('%.7f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4))
print('%.4f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))

0.8739896
1.2100


#### 2 初始化模型参数

In [0]:
embed_size = 100 
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size), 
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size), 
)

#### 3 定义训练函数

In [0]:
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('training on', device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0 
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = (loss(pred.view(label.shape), label, mask) * 
                 mask.shape[1] / mask.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1 
        print('epoch %d, loss %.2f, time %.2fs' 
          % (epoch + 1, l_sum / n, time.time() - start))

In [29]:
train(net, 0.01, 10)

training on cuda
epoch 1, loss 1.97, time 6.20s
epoch 2, loss 0.62, time 6.16s
epoch 3, loss 0.45, time 6.20s
epoch 4, loss 0.40, time 6.09s
epoch 5, loss 0.37, time 6.01s
epoch 6, loss 0.35, time 6.11s
epoch 7, loss 0.34, time 6.08s
epoch 8, loss 0.33, time 6.10s
epoch 9, loss 0.32, time 6.09s
epoch 10, loss 0.32, time 6.07s


### 10.3.6 应用词嵌入模型

In [30]:
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data 
    x = W[token_to_idx[query_token]]
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net[0])

cosine sim=0.482: software
cosine sim=0.475: armonk
cosine sim=0.462: lavelle
