In [71]:
%%javascript
$.getScript("../../j_n_contents.js")

<IPython.core.display.Javascript object>

<div id="toc">
</div>

# 10.3 word2vec 的实现

In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data

sys.path.append("..") 
import d2lzh_pytorch as d2l
print(torch.__version__)

1.2.0


In [3]:
# The PTB training split must already be present in ../../data/ptb.
assert 'ptb.train.txt' in os.listdir("../../data/ptb")
with open('../../data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
# One sentence per line; each sentence becomes a list of whitespace-separated tokens.
raw_dataset = list(map(str.split, lines))

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [6]:
# Preview the first three sentences: token count followed by the first five tokens.
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [7]:
# Count how many times each token occurs across all sentences.
counter = collections.Counter(tk for st in raw_dataset for tk in st)

In [9]:
# Keep only the tokens that occur at least 5 times (result is a plain dict).
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

In [13]:
# Vocabulary in the iteration order of `counter`; a token's index is its position in this list.
idx_to_token = list(counter.keys())
# Reverse lookup: token -> index.
token_to_idx = dict(zip(idx_to_token, range(len(idx_to_token))))


In [43]:
# Encode every sentence as a list of vocabulary indices, silently dropping
# tokens that are not in token_to_idx.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset]
# Total number of tokens that remain after encoding.
num_tokens = sum(len(st) for st in dataset)

In [45]:
# Randomly discard high-frequency words (subsampling).
def discard(idx):
    """Decide at random whether one occurrence of the word with index ``idx`` is dropped.

    Reads the notebook-level ``counter``, ``idx_to_token`` and ``num_tokens``.
    Returns True when a uniform draw from [0, 1] is below
    ``1 - sqrt(1e-4 / count * num_tokens)``, where ``count`` is the word's
    occurrence count; the more frequent the word, the larger that threshold.
    """
    # Draw first so the random stream is consumed before the lookups.
    draw = random.uniform(0, 1)
    token_count = counter[idx_to_token[idx]]
    drop_threshold = 1 - math.sqrt(1e-4 / token_count * num_tokens)
    return draw < drop_threshold

# Apply the random discard rule to every token occurrence, sentence by sentence.
subsampled_dataset = [
    [tk for tk in st if not discard(tk)]
    for st in dataset
]
'# tokens: %d' % sum(map(len, subsampled_dataset))

'# tokens: 375534'

In [46]:
def compare_counts(token):
    """Return a summary string with the occurrence count of ``token`` before and after subsampling.

    Counts are taken over the notebook-level ``dataset`` (before) and
    ``subsampled_dataset`` (after). Raises KeyError if ``token`` is not in
    ``token_to_idx``.
    """
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

# Occurrences of 'the' before vs. after subsampling.
compare_counts('the')

'# the: before=50770, after=2038'

In [47]:
# Occurrences of 'join' before vs. after subsampling.
compare_counts('join')

'# join: before=45, after=45'

In [48]:
def get_centers_and_contexts(dataset, max_window_size):
    """Extract all center words and their context words from ``dataset``.

    Args:
        dataset: iterable of sentences, each an indexable sequence of tokens.
        max_window_size: upper bound (inclusive) of the per-center window size,
            which is drawn uniformly from 1..max_window_size.

    Returns:
        (centers, contexts): ``centers`` is the flat list of every token of
        every sentence with at least two tokens; ``contexts[i]`` is the list of
        tokens within the sampled window around ``centers[i]``, excluding the
        center position itself.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least two words to form a center/context pair.
        if length < 2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            left = max(0, pos - radius)
            right = min(length, pos + 1 + radius)
            # Every position in the window except the center itself.
            contexts.append([sentence[j] for j in range(left, right) if j != pos])
    return centers, contexts

In [49]:
# Toy corpus of two "sentences" with 7 and 3 token indices respectively.
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
# Print every center word with the context words sampled for it (max window size 2).
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [50]:
# Extract center/context pairs from the subsampled corpus with a maximum window size of 5.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [56]:
def get_negatives(all_contexts, sampling_weights, K):
    """Sample K noise (negative) word indices per context word.

    Args:
        all_contexts: iterable of context lists, one per center word; entries
            must be hashable word indices.
        sampling_weights: relative sampling weight of each word index; index
            ``j`` is drawn with probability proportional to
            ``sampling_weights[j]``.
        K: number of noise words to draw for each context word.

    Returns:
        A list with one entry per element of ``all_contexts``; entry ``i`` is a
        list of ``len(all_contexts[i]) * K`` word indices, none of which occurs
        in ``all_contexts[i]``.

    Note:
        The loop does not terminate if every word with a non-zero weight
        appears in some non-empty context list.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the membership set once per center word instead of once per candidate.
        context_set = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Draw candidates in large batches (weighted by sampling_weights):
                # one big random.choices call is far cheaper than many small ones.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in context_set:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Sampling weight of each vocabulary word: its occurrence count raised to the power 0.75.
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
# Draw 5 noise words per context word for every center word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [61]:
def batchify(data):
    """Collate function for DataLoader (``collate_fn``).

    Args:
        data: list of length batch_size whose elements are the
            ``(center, context, negative)`` triples returned by ``__getitem__``;
            ``context`` and ``negative`` are lists.

    Returns:
        A 4-tuple of tensors:
        centers (batch, 1); contexts_negatives (batch, max_len), i.e. context
        followed by negative, right-padded with 0; masks (batch, max_len) with
        1 on real entries and 0 on padding; labels (batch, max_len) with 1 on
        context positions and 0 elsewhere. ``max_len`` is the longest
        context+negative length in the batch.
    """
    max_len = max(len(context) + len(negative) for _, context, negative in data)
    centers = []
    contexts_negatives = []
    masks = []
    labels = []
    for center, context, negative in data:
        n_context = len(context)
        n_valid = n_context + len(negative)
        n_pad = max_len - n_valid
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * n_pad)
        masks.append([1] * n_valid + [0] * n_pad)
        labels.append([1] * n_context + [0] * (max_len - n_context))
    centers_t = torch.tensor(centers).view(-1, 1)
    return (centers_t, torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))

In [62]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset whose items are ``(center, contexts, negatives)`` triples."""

    def __init__(self, centers, contexts, negatives):
        """Store three index-aligned sequences; they must have equal length."""
        assert len(centers) == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        """Number of center words (and therefore of triples)."""
        return len(self.centers)

    def __getitem__(self, index):
        """Return the triple stored at position ``index``."""
        fields = (self.centers, self.contexts, self.negatives)
        return tuple(field[index] for field in fields)

batch_size = 512
# No loader subprocesses on Windows; 4 worker processes elsewhere.
num_workers = 0 if sys.platform.startswith('win32') else 4

# NOTE(review): this rebinds `dataset`, which earlier cells use for the
# index-encoded sentence lists; re-running those cells after this one would
# see the MyDataset object instead.
dataset = MyDataset(all_centers, 
                    all_contexts, 
                    all_negatives)
# Shuffled mini-batches; batchify pads each batch and builds masks and labels.
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify, 
                            num_workers=num_workers)
# Inspect the tensor shapes of the first mini-batch only.
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


In [63]:
# Embedding table with 20 rows (one per index) of dimension 4; show its weight matrix.
embed = nn.Embedding(20, 4)
embed.weight

Parameter containing:
tensor([[ 1.3311e+00, -7.8351e-01, -2.5407e-01, -7.3469e-01],
        [ 1.4853e+00,  9.5053e-01, -1.8310e+00,  3.2580e-01],
        [-8.8290e-01, -1.2237e+00, -3.9640e-01, -5.4609e-01],
        [-3.4685e-02, -8.2243e-01, -1.1334e+00,  9.6499e-01],
        [ 6.8404e-01,  7.7964e-01,  1.2325e+00, -1.3945e+00],
        [-1.5299e+00, -1.6869e+00, -1.2476e-01, -2.7307e-01],
        [-1.0258e+00, -2.5105e-01, -1.9814e-01,  1.9693e+00],
        [-2.3895e-01, -6.1635e-01,  2.8653e-01, -6.9716e-02],
        [-1.6077e-03, -1.6470e-01,  3.0593e-02, -1.0363e+00],
        [ 1.5744e+00, -5.2243e-01, -3.0018e-01, -1.2966e+00],
        [-2.1367e-01,  1.2058e+00, -9.9999e-01, -2.5739e-01],
        [ 7.4509e-01, -3.1585e-01,  1.2845e+00,  1.5918e-01],
        [-6.8050e-01,  1.8151e+00, -1.1791e+00, -2.5597e-01],
        [-1.4263e-01,  3.1252e-01,  1.1629e+00, -3.1424e-01],
        [-1.2961e-01, -8.4303e-01,  2.2926e-01, -1.0526e+00],
        [ 9.2904e-02, -1.4031e+00, -6.3834e-01, 

In [64]:
# Look up the embedding rows for indices 1..6; each index in x is replaced by its row of embed.weight.
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
embed(x)

tensor([[[ 1.4853,  0.9505, -1.8310,  0.3258],
         [-0.8829, -1.2237, -0.3964, -0.5461],
         [-0.0347, -0.8224, -1.1334,  0.9650]],

        [[ 0.6840,  0.7796,  1.2325, -1.3945],
         [-1.5299, -1.6869, -0.1248, -0.2731],
         [-1.0258, -0.2510, -0.1981,  1.9693]]], grad_fn=<EmbeddingBackward>)

In [65]:
# Batched matrix multiplication: (2, 1, 4) x (2, 4, 6) -> (2, 1, 6).
X = torch.ones(2, 1, 4)
Y = torch.ones(2, 4, 6)
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

In [67]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Skip-gram forward pass: inner products of center and context/noise vectors.

    Args:
        center: index tensor passed to ``embed_v``.
        contexts_and_negatives: index tensor passed to ``embed_u``.
        embed_v: embedding layer for center words.
        embed_u: embedding layer for context and noise words.

    Returns:
        ``bmm(v, u^T)``, where the last two dimensions of ``u`` are swapped.
        Both embedded tensors must be 3-D for ``torch.bmm``; with ``center`` of
        shape (batch, 1) and ``contexts_and_negatives`` of shape (batch, L) the
        result has shape (batch, 1, L).
    """
    center_vecs = embed_v(center)
    other_vecs = embed_u(contexts_and_negatives)
    # Swap the last two axes so every center vector is dotted with every context/noise vector.
    return torch.bmm(center_vecs, other_vecs.transpose(1, 2))

In [None]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Sigmoid binary cross-entropy on logits, averaged over dimension 1."""

    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        """
        inputs – logits, Tensor of shape (batch_size, len)
        targets – Tensor of the same shape as inputs
        mask – optional Tensor broadcastable to inputs, used as the per-element
               weight of the loss (0 removes an element's contribution).
               None (the default) leaves every element unweighted.

        Returns a Tensor of shape (batch_size,): the mean of the (weighted)
        element-wise losses over dimension 1. Masked-out positions still count
        in the denominator of that mean.
        """
        inputs, targets = inputs.float(), targets.float()
        # The signature advertises mask=None, so only convert a mask that was actually given.
        weight = None if mask is None else mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, weight=weight, reduction="none")
        return res.mean(dim=1)

loss = SigmoidBinaryCrossEntropyLoss()