## 9.3 word2vec的实现

In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data
import d2lzh as d2l

In [2]:
with open('data/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
# Every line is treated as one sentence; split it on whitespace into tokens.
raw_dataset = [sentence.split() for sentence in lines]
'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [3]:
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [4]:
# "tk" is short for "token": count every token, then keep only the
# tokens that occur at least 5 times.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

In [5]:
# Vocabulary in both directions: index -> token and token -> index.
idx_to_token = list(counter)
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
# Re-encode every sentence as indices, silently dropping out-of-vocabulary tokens.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
num_tokens = sum(len(st) for st in dataset)
'# tokens: %d' % num_tokens

'# tokens: 887100'

In [6]:
def discard(idx):
    """Randomly decide whether the token with index `idx` is dropped.

    A uniform sample from [0, 1] is compared against
    1 - sqrt(1e-4 / f), where f = counter[token] / num_tokens is the
    token's relative frequency. Returns True when the token is dropped.
    """
    keep_ratio = math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)
    return random.uniform(0, 1) < 1 - keep_ratio
subsampled_dataset = []
for st in dataset:
    subsampled_dataset.append([tk for tk in st if not discard(tk)])
'# tokens: %d' % sum(len(st) for st in subsampled_dataset)

'# tokens: 375413'

In [8]:
def compare_counts(token):
    """Return a string with the count of `token` before and after subsampling."""
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)
compare_counts('the')

'# the: before=50770, after=2131'

In [9]:
compare_counts('join')

'# join: before=45, after=45'

In [10]:
def get_centers_and_contexts(dataset, max_window_size):
    """Extract all center words and their context words.

    Args:
        dataset: list of sentences, each a list of token indices.
        max_window_size: upper bound of the window radius; the radius for
            each center word is drawn uniformly from [1, max_window_size].

    Returns:
        (centers, contexts): `centers` is a flat list of center words and
        `contexts[i]` is the list of context words of `centers[i]`.
    """
    centers, contexts = [], []
    for sentence in dataset:
        n = len(sentence)
        # A sentence needs at least 2 words to form a center/context pair.
        if n < 2:
            continue
        # Every word of a long-enough sentence serves as a center word.
        centers.extend(sentence)
        for pos in range(n):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(n, pos + 1 + radius)
            # The center word itself is excluded from its context.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

In [11]:
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2]
center 2 has contexts [0, 1, 3, 4]
center 3 has contexts [2, 4]
center 4 has contexts [3, 5]
center 5 has contexts [3, 4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [12]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [16]:
def get_negatives(all_contexts, sampling_weights, K):
    """Sample noise (negative) words for every center word.

    Args:
        all_contexts: list where each item is the list of context words of
            one center word.
        sampling_weights: one sampling weight per vocabulary index (the
            caller passes word frequency raised to the power 0.75).
        K: number of noise words drawn per context word.

    Returns:
        A list parallel to `all_contexts`; item i holds
        len(all_contexts[i]) * K noise-word indices, none of which is one
        of that item's context words.
    """
    all_negatives, neg_candidates, i = [], [], 0
    # All vocabulary indices.
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set once per center word instead of once per
        # candidate; membership results are unchanged.
        excluded = set(contexts)
        needed = len(contexts) * K
        negatives = []
        while len(negatives) < needed:
            if i == len(neg_candidates):
                # Candidates are drawn in large batches (1e5 at a time)
                # according to the weights, which is much cheaper than
                # drawing one index per call.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5)
                )
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

In [17]:
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [22]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset of (center, contexts, negatives) triples stored in three parallel lists."""

    def __init__(self, centers, contexts, negatives):
        # The three lists must line up one-to-one.
        assert len(centers) == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        return len(self.centers)

    def __getitem__(self, index):
        return self.centers[index], self.contexts[index], self.negatives[index]

In [23]:
def batchify(data):
    """Collate (center, context, negative) triples into padded tensors.

    Each row concatenates context and noise words and is right-padded with
    0 to the longest row of the batch.

    Returns:
        (centers, contexts_negatives, masks, labels) where `centers` has
        shape (batch, 1); `masks` is 1 on real entries and 0 on padding;
        `labels` is 1 on context words and 0 elsewhere.
    """
    max_len = max(len(ctx) + len(neg) for _, ctx, neg in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        n_ctx = len(context)
        n_valid = n_ctx + len(negative)
        n_pad = max_len - n_valid
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * n_pad)
        masks.append([1] * n_valid + [0] * n_pad)
        labels.append([1] * n_ctx + [0] * (max_len - n_ctx))
    return (
        torch.tensor(centers).view(-1, 1),
        torch.tensor(contexts_negatives),
        torch.tensor(masks),
        torch.tensor(labels),
    )

In [24]:
batch_size = 512
num_workers = 4
# NOTE: this rebinds the module-level name `dataset`, which until now held
# the index-encoded sentences.
dataset = MyDataset(all_centers, all_contexts, all_negatives)
# Pass the `num_workers` variable instead of repeating the literal, so that
# changing it above actually takes effect.
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True, collate_fn=batchify, num_workers=num_workers)
# Print the shapes of the first batch only.
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


In [25]:
embed = nn.Embedding(num_embeddings=20, embedding_dim=4)
embed.weight

Parameter containing:
tensor([[-7.5014e-01, -6.7758e-01,  1.3905e-01, -3.2504e-01],
        [ 4.6432e-01, -1.2951e+00, -6.3469e-01,  1.7867e+00],
        [-1.4255e+00, -5.1740e-01, -1.8344e+00, -8.6178e-01],
        [-4.1812e-01,  9.8486e-01,  8.6274e-01, -1.8278e-01],
        [-4.8235e-02, -4.4251e-01,  1.8103e+00,  1.4304e-03],
        [ 6.0103e-01,  5.2687e-01, -7.9238e-01, -3.8206e-01],
        [-6.4648e-01,  6.1382e-01, -2.6217e-01,  1.8242e+00],
        [ 2.1424e-01, -1.2573e+00,  9.9863e-01,  4.7190e-01],
        [-1.4787e-01, -1.3340e+00,  9.4021e-03,  2.0213e-01],
        [-2.3015e-01, -5.4096e-01,  7.9691e-01,  5.0277e-01],
        [-2.4606e-01, -1.6335e+00, -9.0530e-03, -5.7450e-01],
        [ 1.7134e+00, -1.9645e+00, -1.8626e-01,  8.8031e-01],
        [ 1.7762e+00, -1.2961e-01, -1.2376e+00, -5.2555e-01],
        [ 1.7810e-01,  1.5218e+00, -2.2159e-01,  2.5675e-01],
        [ 1.5030e+00,  2.1536e-01,  1.1065e+00, -3.9987e-01],
        [ 3.3883e-01, -5.9900e-01, -4.5714e-01, 

In [26]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
embed(x)

tensor([[[ 4.6432e-01, -1.2951e+00, -6.3469e-01,  1.7867e+00],
         [-1.4255e+00, -5.1740e-01, -1.8344e+00, -8.6178e-01],
         [-4.1812e-01,  9.8486e-01,  8.6274e-01, -1.8278e-01]],

        [[-4.8235e-02, -4.4251e-01,  1.8103e+00,  1.4304e-03],
         [ 6.0103e-01,  5.2687e-01, -7.9238e-01, -3.8206e-01],
         [-6.4648e-01,  6.1382e-01, -2.6217e-01,  1.8242e+00]]],
       grad_fn=<EmbeddingBackward>)

In [27]:
X = torch.ones((2, 1, 4))
Y = torch.ones((2, 4, 6))
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

In [28]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Skip-gram forward pass: inner products of center and context/noise vectors.

    Every word has two representations (as a center word and as a context
    word), hence the two embedding layers `embed_v` and `embed_u`.

    Returns the batched matrix product of the center vectors with the
    transposed context/noise vectors.
    """
    center_vecs = embed_v(center)
    other_vecs = embed_u(contexts_and_negatives)
    # Swap the last two axes of the context/noise vectors so bmm yields one
    # score per context/noise word.
    return torch.bmm(center_vecs, other_vecs.transpose(1, 2))

In [30]:
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Element-wise sigmoid binary cross-entropy, averaged over dim 1."""
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        """
        inputs - Tensor of logits, shape: (batch_size, len)
        targets - Tensor of the same shape as inputs
        mask - optional Tensor of the same shape used as per-element weight
            (0 removes an element's contribution); None means no weighting

        Returns a Tensor of shape (batch_size,): the mean over dim 1 of the
        weighted losses. The mean divides by `len`, not by the mask sum.
        """
        inputs, targets = inputs.float(), targets.float()
        # `mask` defaults to None, so it must not be converted unconditionally.
        weight = None if mask is None else mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none', weight=weight)
        return res.mean(dim=1)
loss = SigmoidBinaryCrossEntropyLoss()

In [31]:
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# In `label`, 1 marks a context word and 0 marks a noise word
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
# Mask variable
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])
# Rescale the per-row mean so it is taken over the unmasked entries only
loss(pred, label, mask)*mask.shape[1]/mask.float().sum(dim=1)

tensor([0.8740, 1.2100])

In [32]:
def sigmd(x):
    """Return -log(sigmoid(x)), the binary cross-entropy of logit x with label 1."""
    sigmoid = 1 / (1 + math.exp(-x))
    return -math.log(sigmoid)
# Since 1 - sigmoid(x) = sigmoid(-x), a context word (label 1) contributes
# sigmd(x) and a noise word (label 0) contributes sigmd(-x).
print('%.4f' % ((sigmd(1.5)+sigmd(-0.3)+sigmd(1)+sigmd(-2))/4))
print('%.4f' % ((sigmd(1.1)+sigmd(-0.6)+sigmd(-2.2))/3))

0.8740
1.2100


In [33]:
embed_size = 100
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size), 
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
)

In [34]:
def train(net, lr, num_epochs):
    """Train the skip-gram model with Adam on the module-level `data_iter`.

    Args:
        net: container whose item 0 is the center-word embedding and item 1
            the context-word embedding.
        lr: learning rate for Adam.
        num_epochs: number of passes over `data_iter`.

    Prints the mean batch loss and the elapsed time after every epoch.
    """
    # Fall back to the CPU when no GPU is present instead of failing on a
    # hard-coded 'cuda'.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            # The mask keeps padding entries out of the loss; rescaling by
            # mask.shape[1] / mask.sum() turns the per-row mean into a mean
            # over unmasked entries. `l` is the average loss of the batch.
            l = (loss(pred.view(label.shape), label, mask) * mask.shape[1]/mask.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2f' % (epoch+1, l_sum/n, time.time()-start))

In [35]:
train(net, 0.01, 10)

epoch 1, loss 1.96, time 14.15
epoch 2, loss 0.62, time 13.38
epoch 3, loss 0.45, time 13.23
epoch 4, loss 0.39, time 13.57
epoch 5, loss 0.37, time 13.39
epoch 6, loss 0.35, time 13.27
epoch 7, loss 0.34, time 13.31
epoch 8, loss 0.33, time 13.50
epoch 9, loss 0.32, time 13.92
epoch 10, loss 0.32, time 13.38


In [36]:
def get_similar_tokens(query_tokens, k, embed):
    """Print the k tokens with the highest cosine similarity to `query_tokens`.

    Args:
        query_tokens: a single query token (looked up in `token_to_idx`).
        k: number of neighbours to print.
        embed: embedding layer whose weight matrix is searched.
    """
    W = embed.weight.data
    x = W[token_to_idx[query_tokens]]
    # 1e-9 is added for numerical stability
    denom = (torch.sum(W*W, dim=1)*(torch.sum(x*x)+1e-9)).sqrt()
    cos = torch.matmul(W, x)/denom
    nearest = torch.topk(cos, k=k+1)[1].cpu().numpy()
    # The top hit is skipped: it is expected to be the query token itself
    for i in nearest[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
get_similar_tokens('chip', 3, net[0])

cosine sim=0.486: bugs
cosine sim=0.484: computers
cosine sim=0.444: mips


## 9.6 求近义词和类比词

In [3]:
! pip install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 896 kB/s eta 0:00:011
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0


In [4]:
import torch
import torchtext.vocab as vocab
vocab.pretrained_aliases.keys()

dict_keys(['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d', 'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d', 'glove.twitter.27B.50d', 'glove.twitter.27B.100d', 'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d', 'glove.6B.200d', 'glove.6B.300d'])

In [5]:
[key for key in vocab.pretrained_aliases.keys() if 'glove' in key]

['glove.42B.300d',
 'glove.840B.300d',
 'glove.twitter.27B.25d',
 'glove.twitter.27B.50d',
 'glove.twitter.27B.100d',
 'glove.twitter.27B.200d',
 'glove.6B.50d',
 'glove.6B.100d',
 'glove.6B.200d',
 'glove.6B.300d']

In [6]:
cache_dir = 'data/glove'
glove = vocab.GloVe(name='6B', dim=50, cache=cache_dir)

data/glove/glove.6B.zip: 862MB [08:31, 1.69MB/s]                               
100%|█████████▉| 399999/400000 [00:26<00:00, 14880.24it/s]


In [7]:
print('一共包含%d个词。' % len(glove.stoi))

一共包含400000个词。


In [8]:
glove.stoi['beautiful'], glove.itos[3366]

(3366, 'beautiful')

In [9]:
def knn(W, x, k):
    """Return the k rows of W most cosine-similar to x.

    Returns:
        (topk, cos): numpy array of the k row indices in descending
        similarity order, and a list with their cosine similarities.
    """
    dots = torch.matmul(W, x.view((-1,)))
    # 1e-9 is added for numerical stability
    norms = (torch.sum(W*W, dim=1)+1e-9).sqrt()*torch.sum(x*x).sqrt()
    cos = dots / norms
    topk = torch.topk(cos, k=k)[1].cpu().numpy()
    return topk, [cos[i].item() for i in topk]

In [12]:
def get_similar_tokens(query_token, k, embed):
    """Print the k nearest neighbours of `query_token` among `embed.vectors`.

    `embed` must expose `vectors`, `stoi` and `itos` (as the GloVe object
    used in this notebook does).
    """
    query_vec = embed.vectors[embed.stoi[query_token]]
    topk, cos = knn(embed.vectors, query_vec, k+1)
    # The first hit is skipped: it is expected to be the query token itself
    for rank in range(1, len(topk)):
        print('cosine sim=%.3f: %s' % (cos[rank], (embed.itos[topk[rank]])))

In [13]:
get_similar_tokens('chip', 3, glove)

cosine sim=0.856: chips
cosine sim=0.749: intel
cosine sim=0.749: electronics


In [14]:
get_similar_tokens('baby', 3, glove)

cosine sim=0.839: babies
cosine sim=0.800: boy
cosine sim=0.792: girl


In [16]:
get_similar_tokens('beautiful', 3, glove)

cosine sim=0.921: lovely
cosine sim=0.893: gorgeous
cosine sim=0.830: wonderful


In [17]:
def get_analogy(token_a, token_b, token_c, embed):
    """Answer "a is to b as c is to ?" with the nearest neighbour of b - a + c."""
    vec_a, vec_b, vec_c = (embed.vectors[embed.stoi[t]] for t in (token_a, token_b, token_c))
    target = vec_b - vec_a + vec_c
    topk, _ = knn(embed.vectors, target, 1)
    return embed.itos[topk[0]]

In [18]:
get_analogy('man', 'woman', 'son', glove)

'daughter'

In [19]:
get_analogy('beijing', 'china', 'tokyo', glove)

'japan'

In [20]:
get_analogy('bad', 'worst', 'big', glove)

'biggest'

In [21]:
get_analogy('do', 'did', 'go', glove)

'went'

## 9.7 文本情感分类：使用循环神经网络

In [22]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import sys
import d2lzh as d2l
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = 'data'

In [23]:
fname = os.path.join(DATA_ROOT, 'aclImdb_v1.tar.gz')
if not os.path.exists(os.path.join(DATA_ROOT, 'aclImdb')):
    print('从压缩包解压...')
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

从压缩包解压...


In [24]:
from tqdm import tqdm
def read_imdb(folder='train', data_root='data/aclImdb'):
    """Read the reviews under data_root/folder/{pos,neg}.

    Returns a shuffled list of [review, label] pairs; each review is
    lower-cased with newlines removed, and the label is 1 for 'pos' and
    0 for 'neg'.
    """
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        score = 1 if label == 'pos' else 0
        # Read every file in the folder
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
            data.append([review, score])
    random.shuffle(data)
    return data
train_data, test_data = read_imdb('train'), read_imdb('test')

100%|██████████| 12500/12500 [00:00<00:00, 20482.38it/s]
100%|██████████| 12500/12500 [00:01<00:00, 11169.05it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21138.34it/s]
100%|██████████| 12500/12500 [00:00<00:00, 20492.99it/s]


In [25]:
def get_tokenized_imdb(data):
    """
    Split every review on single spaces and lower-case the pieces.

    data: list of [string, label]
    returns: list of token lists, one per review
    """
    return [[tok.lower() for tok in review.split(' ')] for review, _ in data]

In [26]:
def get_vocab_imdb(data):
    """Build a vocabulary from the reviews, keeping tokens seen at least 5 times."""
    counter = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        counter.update(tokens)
    return Vocab.Vocab(counter, min_freq=5)
vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)

('# words in vocab:', 46152)

In [27]:
def preprocess_imdb(data, vocab):
    """Turn reviews into a (num_reviews, 500) index tensor plus a label tensor.

    Every review is truncated, or right-padded with 0, to exactly 500 tokens.
    """
    max_l = 500
    def pad(x):
        if len(x) > max_l:
            return x[:max_l]
        return x + [0] * (max_l - len(x))
    rows = [pad([vocab.stoi[word] for word in words]) for words in get_tokenized_imdb(data)]
    features = torch.tensor(rows)
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [29]:
batch_size = 64
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [30]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

In [31]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier with two output classes."""
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional=True makes the LSTM bidirectional
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        # The first and last time-step outputs (2*num_hiddens each) are
        # concatenated and fed to the linear layer
        self.decoder = nn.Linear(4*num_hiddens, 2)
    def forward(self, inputs):
        """Map `inputs` of shape (batch size, number of words) to (batch size, 2)."""
        # The LSTM wants the sequence length as the first dimension, so the
        # input is transposed before the embedding lookup; the result has
        # shape (number of words, batch size, embedding size)
        embeddings = self.embedding(inputs.t())
        # Only the embeddings are passed in, so the LSTM starts from its
        # default state. `outputs` holds the last layer's hidden states at
        # every time step: (number of words, batch size, 2*num_hiddens),
        # the factor 2 coming from the two directions
        outputs, _ = self.encoder(embeddings)
        first_step, last_step = outputs[0], outputs[-1]
        # Shape (batch size, 4*num_hiddens)
        encoding = torch.cat((first_step, last_step), -1)
        return self.decoder(encoding)

In [32]:
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

In [33]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, 'glove'))

100%|█████████▉| 399999/400000 [00:43<00:00, 9198.13it/s] 


In [34]:
def load_pretrained_embedding(words, pretrained_vocab):
    """
    Collect the pretrained vectors for `words`.

    Returns a (len(words), dim) tensor whose row i is the pretrained vector
    of words[i]; words missing from `pretrained_vocab.stoi` keep an
    all-zero row. The number of such out-of-vocabulary words is printed
    when it is non-zero.
    """
    dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), dim)
    oov_count = 0
    for row, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
        else:
            embed[row, :] = pretrained_vocab.vectors[idx]
    # Number of words without a pretrained vector
    if oov_count > 0:
        print('There are %d oov words.' % oov_count)
    return embed
# Initialise the embedding layer with the pretrained vectors of the vocabulary
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
# The pretrained vectors are used as-is, so they are not updated during training
net.embedding.weight.requires_grad = False

There are 21202 oov words.


In [35]:
lr, num_epochs = 0.01, 5
# Embedding parameters that do not require gradients must be filtered out
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

Let's use 2 GPUs!
training on cuda
epoch 1, loss 0.6009, train acc 0.675, test acc 0.794, time 103.1 sec
epoch 2, loss 0.2098, train acc 0.811, test acc 0.838, time 96.3 sec
epoch 3, loss 0.1183, train acc 0.844, test acc 0.851, time 99.5 sec
epoch 4, loss 0.0772, train acc 0.870, test acc 0.858, time 98.7 sec
epoch 5, loss 0.0542, train acc 0.890, test acc 0.853, time 99.4 sec


In [36]:
def predict_sentiment(net, vocab, sentence):
    """
    Classify `sentence` (a list of words) as 'positive' or 'negative'.

    The words are mapped to indices with `vocab.stoi`, fed to `net` as a
    batch of one on the device of the net's first parameter, and the
    arg-max class decides the answer (class 1 is 'positive').
    """
    device = list(net.parameters())[0].device
    indices = [vocab.stoi[word] for word in sentence]
    batch = torch.tensor(indices, device=device).view((1, -1))
    label = torch.argmax(net(batch), dim=1)
    if label.item() == 1:
        return 'positive'
    return 'negative'

In [37]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [38]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'

## 9.8 文本情感分类：使用卷积神经网络（textCNN）

In [1]:
import os
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.nn.functional as F
import sys
import d2lzh as d2l
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_ROOT = 'data'

In [2]:
def corr1d(X, K):
    """One-dimensional cross-correlation of X with kernel K (no padding, stride 1)."""
    width = K.shape[0]
    Y = torch.zeros(X.shape[0] - width + 1)
    for start in range(len(Y)):
        window = X[start: start + width]
        Y[start] = (window * K).sum()
    return Y

In [4]:
X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [5]:
def corr1d_multi_in(X, K):
    """Multi-input-channel 1-D cross-correlation.

    Iterates over dimension 0 (the channel dimension) of X and K, computes
    the 1-D cross-correlation per channel, then sums the stacked results
    along dimension 0.
    """
    per_channel = [corr1d(x, k) for x, k in zip(X, K)]
    return torch.stack(per_channel).sum(dim=0)
X = torch.tensor([
    [0, 1, 2, 3, 4, 5, 6],
    [1, 2, 3, 4, 5, 6, 7],
    [2, 3, 4, 5, 6, 7, 8],
])
K = torch.tensor([
    [1, 2],
    [3, 4],
    [-1, -3],
])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])

In [6]:
class GlobalMaxPool1d(nn.Module):
    """Max-over-time pooling: take the maximum over the whole last dimension."""
    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        # return shape: (batch_size, channel, 1)
        seq_len = x.size(2)
        return F.max_pool1d(x, kernel_size=seq_len)

In [7]:
import torch.utils.data as Data
batch_size = 64
train_data = d2l.read_imdb('train', data_root=os.path.join(DATA_ROOT, 'aclImdb'))
test_data = d2l.read_imdb('test', data_root=os.path.join(DATA_ROOT, 'aclImdb'))
vocab = d2l.get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*d2l.preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*d2l.preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

100%|██████████| 12500/12500 [00:00<00:00, 22277.48it/s]
100%|██████████| 12500/12500 [00:00<00:00, 19936.30it/s]
100%|██████████| 12500/12500 [00:00<00:00, 23050.33it/s]
100%|██████████| 12500/12500 [00:00<00:00, 21506.03it/s]


In [8]:
class TextCNN(nn.Module):
    """textCNN: two embeddings, parallel 1-D convolutions, max-over-time pooling, linear output."""
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Second embedding layer, meant not to be trained (freezing it is
        # left to the caller)
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no weights, so one instance
        # is shared by all convolutions
        self.pool = GlobalMaxPool1d()
        # One 1-D convolution per (channels, kernel size) pair; the input
        # channels are the two concatenated embeddings
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=2*embed_size, out_channels=c, kernel_size=k)
            for c, k in zip(num_channels, kernel_sizes)
        ])
    def forward(self, inputs):
        # Concatenate the two embedding outputs, each of shape
        # (batch size, number of words, embedding size), along the embedding
        # dimension: (batch, seq_len, 2*embed_size)
        both = torch.cat((self.embedding(inputs),
                          self.constant_embedding(inputs)), dim=2)
        # Conv1d expects the channel dimension (here the embedding
        # dimension) before the sequence dimension
        embeddings = both.permute(0, 2, 1)
        # After max-over-time pooling every convolution yields a tensor of
        # shape (batch size, channels, 1); squeeze drops the last dimension
        # and the results are concatenated along the channel dimension
        pooled = []
        for conv in self.convs:
            pooled.append(self.pool(F.relu(conv(embeddings))).squeeze(-1))
        encoding = torch.cat(pooled, dim=1)
        # Apply dropout, then the fully connected output layer
        return self.decoder(self.dropout(encoding))

In [9]:
embed_size = 100
kernel_size = [3, 4, 5]
nums_channels = [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_size, nums_channels)

In [10]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, 'glove'))
net.embedding.weight.data.copy_(d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

There are 21202 oov words.
There are 21202 oov words.


In [11]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

Let's use 2 GPUs!
training on cuda
epoch 1, loss 0.4848, train acc 0.755, test acc 0.809, time 26.7 sec
epoch 2, loss 0.1607, train acc 0.861, test acc 0.870, time 21.1 sec
epoch 3, loss 0.0685, train acc 0.919, test acc 0.876, time 21.3 sec
epoch 4, loss 0.0294, train acc 0.958, test acc 0.864, time 21.0 sec
epoch 5, loss 0.0127, train acc 0.978, test acc 0.863, time 21.1 sec


In [12]:
d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [13]:
d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'

## 9.12 机器翻译

In [1]:
import collections
import os
import io
import math
import torch
from torch import nn
import torch.nn.functional as F
import torchtext.vocab as Vocab
import torch.utils.data as Data
import sys
import d2lzh as d2l
PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Record all tokens of one sequence in all_tokens (used later to build the
# vocabulary), then append EOS followed by PAD until the sequence has length
# max_seq_len, and store the padded sequence in all_seqs.
# Note: seq_tokens is extended in place.
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    all_tokens.extend(seq_tokens)
    n_pad = max_seq_len - len(seq_tokens) - 1
    seq_tokens.append(EOS)
    seq_tokens.extend([PAD] * n_pad)
    all_seqs.append(seq_tokens)
# Build a vocabulary from all tokens, then convert every sequence to
# indices and pack them into one Tensor.
def build_data(all_tokens, all_seqs):
    token_counts = collections.Counter(all_tokens)
    vocab = Vocab.Vocab(token_counts, specials=[PAD, BOS, EOS])
    indices = []
    for seq in all_seqs:
        indices.append([vocab.stoi[w] for w in seq])
    return vocab, torch.tensor(indices)

In [3]:
def read_data(max_seq_len):
    """Load the tab-separated parallel corpus as padded index tensors.

    Each line holds an input sequence and an output sequence separated by a
    tab. Pairs in which either side would exceed max_seq_len once EOS is
    appended are skipped.

    Returns:
        (in_vocab, out_vocab, dataset) where dataset is a TensorDataset of
        the input and output index tensors.
    """
    # "in" and "out" are short for input and output
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    # NOTE(review): no encoding is given, so the platform default is used — confirm
    with io.open('data/fr-en-small.txt') as f:
        lines = f.readlines()
    longest_allowed = max_seq_len - 1
    for line in lines:
        in_seq, out_seq = line.rstrip().split('\t')
        in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
        # Keep the pair only if both sides still fit after EOS is appended
        if len(in_seq_tokens) <= longest_allowed and len(out_seq_tokens) <= longest_allowed:
            process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
            process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
    in_vocab, in_data = build_data(in_tokens, in_seqs)
    out_vocab, out_data = build_data(out_tokens, out_seqs)
    return in_vocab, out_vocab, Data.TensorDataset(in_data, out_data)

In [4]:
max_seq_len = 7
in_vocab, out_vocab, dataset = read_data(max_seq_len)
dataset[0]

(tensor([ 5,  4, 45,  3,  2,  0,  0]), tensor([ 8,  4, 27,  3,  2,  0,  0]))

In [13]:
class Encoder(nn.Module):
    """Embedding followed by a multi-layer GRU."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, drop_prob=0, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, dropout=drop_prob)
    def forward(self, inputs, state):
        # inputs has shape (batch size, number of time steps). After the
        # lookup the batch and time-step dimensions are swapped so the GRU
        # receives (seq_len, batch_size, input_size)
        embedded = self.embedding(inputs.long())
        time_major = embedded.transpose(0, 1)
        return self.rnn(time_major, state)
    def begin_state(self):
        # With None as the initial hidden state PyTorch initialises it to zeros
        return None

In [14]:
encoder = Encoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
output, state = encoder(torch.zeros((4, 7)), state=encoder.begin_state())
# A GRU's state is the single tensor h, whereas an LSTM's is a tuple (h, c)
output.shape, state.shape

(torch.Size([7, 4, 16]), torch.Size([2, 4, 16]))

In [15]:
def attention_model(input_size, attention_size):
    """Build the attention scorer: bias-free Linear -> Tanh -> bias-free Linear to one score."""
    layers = [
        nn.Linear(input_size, attention_size, bias=False),
        nn.Tanh(),
        nn.Linear(attention_size, 1, bias=False),
    ]
    return nn.Sequential(*layers)

In [16]:
def attention_forward(model, enc_states, dec_state):
    """
    Compute the attention context vector.

    enc_states: (number of time steps, batch size, number of hidden units)
    dec_state: (batch size, number of hidden units)
    returns: (batch size, number of hidden units)
    """
    # Broadcast the decoder state to the shape of the encoder states and
    # concatenate the two along the feature dimension
    dec_states = dec_state.unsqueeze(0).expand_as(enc_states)
    scores = model(torch.cat((enc_states, dec_states), dim=2))
    # scores has shape (number of time steps, batch size, 1); softmax is
    # taken over the time-step dimension
    weights = F.softmax(scores, dim=0)
    # Weighted sum of the encoder states: the context vector
    context = (weights * enc_states).sum(dim=0)
    return context

In [17]:
seq_len, batch_size, num_hiddens = 10, 4, 8
model = attention_model(2*num_hiddens, 10)
enc_states = torch.zeros((seq_len, batch_size, num_hiddens))
dec_state = torch.zeros((batch_size, num_hiddens))
attention_forward(model, enc_states, dec_state).shape

torch.Size([4, 8])

In [18]:
class Decoder(nn.Module):
    """GRU decoder with attention that advances one time step per call."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, attention_size, drop_prob=0):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = attention_model(2*num_hiddens, attention_size)
        # The GRU input is the attention context c joined with the embedded
        # input, hence the size num_hiddens+embed_size
        rnn_input_size = num_hiddens + embed_size
        self.rnn = nn.GRU(rnn_input_size, num_hiddens, num_layers, dropout=drop_prob)
        self.out = nn.Linear(num_hiddens, vocab_size)
    def forward(self, cur_input, state, enc_states):
        """
        cur_input shape: (batch, )
        state shape: (num_layers, batch, num_hiddens)
        """
        # Attention context computed from the top layer's hidden state
        c = attention_forward(self.attention, enc_states, state[-1])
        # Join the embedded input and the context along the feature
        # dimension: (batch size, num_hiddens+embed_size)
        embedded = self.embedding(cur_input)
        # Add a time-step dimension of length 1 for the GRU
        rnn_input = torch.cat((embedded, c), dim=1).unsqueeze(0)
        output, state = self.rnn(rnn_input, state)
        # Drop the time-step dimension: (batch size, output vocabulary size)
        return self.out(output).squeeze(dim=0), state
    def begin_state(self, enc_state):
        # The encoder's final hidden state is used directly as the decoder's
        # initial hidden state
        return enc_state

In [19]:
def batch_loss(encoder, decoder, X, Y, loss):
    """Teacher-forced loss of one mini-batch, averaged over non-PAD label tokens.

    Args:
        encoder, decoder: the seq2seq modules.
        X: input indices, shape (batch, seq_len).
        Y: label indices, shape (batch, seq_len).
        loss: element-wise loss (one value per sample), e.g.
            CrossEntropyLoss(reduction='none').

    Returns:
        A tensor of shape (1,) holding the mean loss per non-PAD token.
    """
    batch_size = X.shape[0]
    # Create the helper tensors on the labels' device so the function also
    # works when model and data are not on the CPU.
    target_device = Y.device
    enc_state = encoder.begin_state()
    enc_outputs, enc_state = encoder(X, enc_state)
    # Initialise the decoder's hidden state
    dec_state = decoder.begin_state(enc_state)
    # The decoder input at the first time step is BOS
    dec_input = torch.tensor([out_vocab.stoi[BOS]]*batch_size, device=target_device)
    # The mask variable removes the loss of labels that are PAD
    mask, num_not_pad_tokens = torch.ones(batch_size, device=target_device), 0
    l = torch.tensor([0.0], device=target_device)
    # Y shape: (batch, seq_len); iterate over time steps
    for y in Y.permute(1, 0):
        dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)
        l = l + (mask*loss(dec_output, y)).sum()
        # Teacher forcing: the next input is the ground-truth token
        dec_input = y
        num_not_pad_tokens += mask.sum().item()
        # Everything after EOS is PAD; once EOS is seen the mask stays 0 for
        # the remaining steps
        mask = mask * (y != out_vocab.stoi[EOS]).float()
    return l / num_not_pad_tokens

In [20]:
def train(encoder, decoder, dataset, lr, batch_size, num_epochs):
    """Train encoder and decoder jointly, each with its own Adam optimizer.

    Prints the mean batch loss every 10th epoch.
    """
    enc_optimizer = torch.optim.Adam(encoder.parameters(), lr=lr)
    dec_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
    optimizers = (enc_optimizer, dec_optimizer)
    loss = nn.CrossEntropyLoss(reduction='none')
    data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(1, num_epochs + 1):
        l_sum = 0.0
        for X, Y in data_iter:
            for opt in optimizers:
                opt.zero_grad()
            l = batch_loss(encoder, decoder, X, Y, loss)
            l.backward()
            for opt in optimizers:
                opt.step()
            l_sum += l.item()
        if epoch % 10 == 0:
            print('epoch %d, loss %.3f' % (epoch, l_sum/len(data_iter)))

In [21]:
embed_size, num_hiddens, num_layers = 64, 64, 2
attention_size = 10
drop_prob = 0.5
lr = 0.01
batch_size = 2
num_epochs = 50
encoder = Encoder(len(in_vocab), embed_size, num_hiddens, num_layers, drop_prob)
decoder = Decoder(len(out_vocab), embed_size, num_hiddens, num_layers, attention_size, drop_prob)
train(encoder, decoder, dataset, lr, batch_size, num_epochs)

epoch 10, loss 0.462
epoch 20, loss 0.215
epoch 30, loss 0.114
epoch 40, loss 0.105
epoch 50, loss 0.039


In [22]:
def translate(encoder, decoder, input_seq, max_seq_len):
    """Greedily translate a space-separated input sentence.

    Decoding stops at the first predicted EOS or after max_seq_len steps.
    Both modules are switched to evaluation mode for the duration of the
    call (so dropout does not randomise the result) and gradient tracking
    is disabled; their previous training/eval modes are restored afterwards.

    Returns:
        The list of predicted output tokens (without EOS).
    """
    in_tokens = input_seq.split(' ')
    in_tokens += [EOS]+[PAD]*(max_seq_len-len(in_tokens)-1)
    # batch=1
    enc_input = torch.tensor([[in_vocab.stoi[tk] for tk in in_tokens]])
    previous_modes = [(module, module.training) for module in (encoder, decoder)]
    for module, _ in previous_modes:
        module.eval()
    output_tokens = []
    try:
        with torch.no_grad():
            enc_state = encoder.begin_state()
            enc_output, enc_state = encoder(enc_input, enc_state)
            dec_input = torch.tensor([out_vocab.stoi[BOS]])
            dec_state = decoder.begin_state(enc_state)
            for _ in range(max_seq_len):
                dec_output, dec_state = decoder(dec_input, dec_state, enc_output)
                pred = dec_output.argmax(dim=1)
                pred_token = out_vocab.itos[int(pred.item())]
                # The output sequence is complete as soon as EOS is predicted
                if pred_token == EOS:
                    break
                output_tokens.append(pred_token)
                # Feed the prediction back in as the next decoder input
                dec_input = pred
    finally:
        for module, was_training in previous_modes:
            module.train(was_training)
    return output_tokens

In [23]:
input_seq = 'ils regardent .'
translate(encoder, decoder, input_seq, max_seq_len)

['they', 'are', 'watching', '.']

In [27]:
def bleu(pred_tokens, label_tokens, k):
    """Compute the BLEU score of a prediction against a single reference.

    The score is the brevity penalty exp(min(0, 1 - len_label / len_pred))
    times the product over n = 1..k of p_n ** (0.5 ** n), where p_n is the
    clipped n-gram precision of the prediction.

    Args:
        pred_tokens: list of predicted tokens.
        label_tokens: list of reference tokens.
        k: longest n-gram order taken into account.

    Returns:
        A float in [0, 1]. An empty prediction, or one too short to contain
        any n-gram of some order n <= k, scores 0.0 (its precision for that
        order is treated as zero instead of dividing by zero).
    """
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    if len_pred == 0:
        return 0.0
    score = math.exp(min(0, 1-len_label/len_pred))
    for n in range(1, k+1):
        num_pred_ngrams = len_pred-n+1
        if num_pred_ngrams <= 0:
            return 0.0
        # n-grams are keyed by tuple; joining the tokens into one string
        # would make e.g. ('ab', 'c') and ('a', 'bc') collide.
        label_subs = collections.Counter(
            tuple(label_tokens[i: i+n]) for i in range(len_label-n+1)
        )
        num_matches = 0
        for i in range(num_pred_ngrams):
            ngram = tuple(pred_tokens[i: i+n])
            # Clipped counting: each reference n-gram can be matched only
            # as often as it occurs in the reference.
            if label_subs[ngram] > 0:
                num_matches += 1
                label_subs[ngram] -= 1
        score *= math.pow(num_matches/num_pred_ngrams, math.pow(0.5, n))
    return score

In [28]:
def score(input_seq, label_seq, k):
    """Translate `input_seq` and print its BLEU score against `label_seq` with the prediction."""
    pred_tokens = translate(encoder, decoder, input_seq, max_seq_len)
    bleu_value = bleu(pred_tokens, label_seq.split(' '), k)
    print('bleu %.3f, predict: %s' % (bleu_value, ' '.join(pred_tokens)))

In [29]:
score('ils regardent .', 'they are watching .', k=2)

bleu 1.000, predict: they are watching .


In [30]:
score('ils sont canadienne .', 'they are canadian .', k=2)

bleu 0.658, predict: they are arguing .
