In [2]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data

import d2lzh_pytorch as d2l
# Show which PyTorch version this notebook is running against.
print(torch.__version__)

1.4.0


# 1 处理数据集

In [3]:
# Fail fast if the PTB training file is not present under ./data/.
assert 'ptb.train.txt' in os.listdir("./data/")

# Read every line of the file, then split each line on whitespace so that
# raw_dataset is a list of sentences, each sentence a list of token strings.
with open('./data/ptb.train.txt', 'r') as f:
    lines = f.readlines()
raw_dataset = [line.split() for line in lines]

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [4]:
# Preview the first three sentences: token count and the first five tokens.
for st in raw_dataset[:3]:
    print('#tokens:', len(st), st[:5])

#tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
#tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
#tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


## 1.1 建立词语索引

In [6]:
# To keep computation simple, keep only the words that occur at least
# 5 times in the dataset. `counter` maps each kept token to its count.
counter = {
    token: freq
    for token, freq in collections.Counter(
        tk for sentence in raw_dataset for tk in sentence).items()
    if freq >= 5
}

In [7]:
# Map every kept token to an integer index; idx_to_token is the inverse lookup.
idx_to_token = list(counter)
token_to_idx = dict(zip(idx_to_token, range(len(idx_to_token))))
# Re-encode each sentence as indices, silently dropping tokens that are not
# in the vocabulary (i.e. those filtered out of `counter`).
dataset = [[token_to_idx[word] for word in sentence if word in token_to_idx]
           for sentence in raw_dataset]
num_tokens = sum(map(len, dataset))
'# tokens: %d' % num_tokens

'# tokens: 887100'

## 1.2 二次采样

In [10]:
def discard(idx):
    """Randomly decide whether to drop one occurrence of the token at index `idx`.

    Draws a uniform number in [0, 1] and returns True when it is below
    1 - sqrt(1e-4 / count * num_tokens), where `count` is the token's
    frequency in `counter`. When that threshold is not positive the token
    is never discarded.
    """
    draw = random.uniform(0, 1)
    keep_ratio = math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)
    return draw < 1 - keep_ratio

# Apply the random discard decision independently to every token occurrence.
subsampled_dataset = [[idx for idx in sentence if not discard(idx)]
                      for sentence in dataset]
# Subsampling removes roughly half of the tokens.
'# tokens: %d' % sum(len(sentence) for sentence in subsampled_dataset)

'# tokens: 375789'

In [14]:
# Compare how often 'the' and 'join' occur in the dataset before and after
# subsampling.
def compare_counts(token):
    """Return a summary string with the occurrence count of `token` in
    `dataset` (before subsampling) and in `subsampled_dataset` (after)."""
    idx = token_to_idx[token]
    before = sum(sentence.count(idx) for sentence in dataset)
    after = sum(sentence.count(idx) for sentence in subsampled_dataset)
    return '# %s: before = %d, after = %d' % (token, before, after)

print(compare_counts('the'))
print(compare_counts('join'))

# the: before = 50770, after = 2104
# join: before = 45, after = 45


## 1.3 提取中心词和背景词

In [17]:
# Words whose distance from the center word does not exceed the context
# window size are taken as its context words. For every center word the
# window size is sampled uniformly at random from the integers
# 1..max_window_size (the maximum context window).

def get_centers_and_contexts(dataset, max_window_size):
    """Extract all center words and their context words.

    Args:
        dataset: iterable of sentences, each a sequence of tokens.
        max_window_size: upper bound (inclusive) for the randomly drawn
            per-center window size.

    Returns:
        A pair (centers, contexts) of parallel lists: centers[i] is a token
        and contexts[i] is the list of tokens within the drawn window around
        it, clipped to the sentence boundaries and excluding the center itself.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least two words to form a center/context pair.
        if length < 2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            # Skip the center position so it is not its own context word.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

In [18]:
# Build a toy dataset of two sentences with 7 and 3 tokens respectively.
# With a maximum context window of 2, print every center word together with
# its context words.
tiny_dataset = [list(range(start, stop)) for start, stop in ((0, 7), (7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(
        *get_centers_and_contexts(tiny_dataset, max_window_size=2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2, 3]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [4, 6]
center 6 has contexts [5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [19]:
# For the experiment the maximum context window size is 5. Extract every
# center word and its context words from the subsampled dataset.
all_centers, all_contexts = get_centers_and_contexts(
    subsampled_dataset, max_window_size=5)

# 2 负采样