In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

# Restrict CUDA to device index 0 for this process; set before the availability check below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Root directory for local corpora; the GloVe cache directory is created under it later in this notebook.
# NOTE(review): absolute Windows path - adjust it for the machine running the notebook.
DATA_ROOT = "E:\\corpus"

In [3]:
from tqdm import tqdm
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def read_imdb(folder='train', data_root="E:\\corpus\\aclImdb"):
    """Load one split of the IMDb review corpus as shuffled [review, label] pairs.

    Args:
        folder: Sub-directory of ``data_root`` to read (e.g. 'train' or 'test').
        data_root: Directory that contains the split folders; each split is
            expected to hold a 'pos' and a 'neg' sub-folder of review files.

    Returns:
        A list of ``[review, label]`` entries. ``review`` is the UTF-8 decoded
        file content with newlines removed and lower-cased; ``label`` is 1 for
        files under 'pos' and 0 for files under 'neg'. The list is shuffled
        with ``random.shuffle`` before it is returned.
    """
    samples = []
    # 'pos' is read before 'neg'; the order only matters for the pre-shuffle layout.
    for sentiment, target in (('pos', 1), ('neg', 0)):
        sentiment_dir = os.path.join(data_root, folder, sentiment)
        for name in tqdm(os.listdir(sentiment_dir)):
            path = os.path.join(sentiment_dir, name)
            # Read raw bytes and decode explicitly so the platform default encoding is never used.
            with open(path, 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    random.shuffle(samples)
    return samples

# Read both splits up front; read_imdb shows one progress bar per label folder it scans.
train_data, test_data = read_imdb('train'), read_imdb('test')
#train_data
# [["another reason to watch this delightful movie is florence rice. florence who? that was my first reaction as the opening credits ran on the screen. i soon found out who florence rice was, a real beauty who turns in a simply wonderful performance. as they all do in this gripping ensemble piece. from 1939, its a different time but therein lies the charm. it transports you into another world. it starts out as a light comedy but then turns very serious. florence rice runs the gamut from comedienne to heroine. she is absolutely delightful, at the same time strong, vulnerable evolving from a girl to a woman.watch her facial expressions at the end of the movie. she made over forty movies, and i am going to seek out the other thirty nine. alan marshal is of the flynn/gable mode and proves a perfect match for florence. buddy ebsen and una merkel provide some excellent comic moments, but the real star is florence rice. fans of 30's/40's movies, don't miss this one!",
#   1],
#  ["i gave timecop a perfect 10, i gave this 1<br /><br />it's story is very boring, and it has only little to do with the original timecop. lots of things from timecop was scrapped, and they put in new stupid stuff instead. this story is taking place in 2060 (if i remember correctly), but for some reason the timetraveling is now more dangerous :confused:<br /><br />and the action scenes are nothing to be happy about, well most of them aren't... only the first one is great... and there aren't many action scenes at all, and they're all pretty short<br /><br />at one point in the story, the main character travels through time about 5 times within a few minutes... no wait, make that two times...<br /><br />in short: don't waste time watching this movie, it's not worth it",
#   0]]

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:07<00:00, 1662.70it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:07<00:00, 1778.55it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:06<00:00, 2018.59it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:09<00:00, 1361.15it/s]


In [5]:
def get_tokenized_imdb(data):
    """Tokenize every review in ``data`` by splitting on single spaces.

    Args:
        data: list of [string, label]; only the string of each entry is used.

    Returns:
        A list with one token list per review, in input order. Tokens are
        lower-cased. Because the split uses the literal ' ' separator,
        consecutive spaces produce empty-string tokens.
    """
    tokenized = []
    for text, _label in data:
        pieces = text.split(' ')
        tokenized.append([piece.lower() for piece in pieces])
    return tokenized

# return 
# [['another',
#   'reason',
#     ...
#     'miss',
#   'this',
#   'one!'],
#  ['i',
#   'gave',
#   'timecop'
#   ...
#   'not',
#   'worth',
#   'it']]  

In [6]:
def get_vocab_imdb(data):
    """Build a vocabulary from the tokens of every review in ``data``.

    Args:
        data: list of [string, label] entries; tokenized with
            ``get_tokenized_imdb``.

    Returns:
        A ``Vocab.Vocab`` built from the token counts with ``min_freq=5``.
    """
    token_counts = collections.Counter()
    # One token list per review, e.g. ['i', 'cannot', 'believe', ..., 'taste', 'in', 'movies.'].
    # Author's notes for the training split: 25,000 reviews, 5,844,418 tokens in total.
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    # Example counts from the author's notes:
    # Counter({'i': 70477, 'cannot': 1089, 'believe': 2309, 'the': 322174, ...})
    # Per the author's note, the special tokens '<unk>' and '<pad>' are added to the vocabulary by default.
    return Vocab.Vocab(token_counts, min_freq=5)

# vocab: vocabulary object built from the training reviews
# Vocab.freqs: frequency of each token
# Vocab.stoi: index of each token
vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)


('# words in vocab:', 46152)

In [7]:
def preprocess_imdb(data, vocab, max_l=500):
    """Convert reviews into fixed-length index rows plus a label tensor.

    Args:
        data: list of [string, label] entries.
        vocab: object whose ``stoi`` maps a token to its integer index.
        max_l: length every review is truncated or right-padded to.
            Defaults to 500, the value previously hard-coded here.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is an integer tensor of
        shape ``(len(data), max_l)`` holding token indices, padded with 0.
        ``labels`` is a tensor built from the label of every entry.

    Raises:
        ValueError: if ``max_l`` is negative.
    """
    if max_l < 0:
        raise ValueError("max_l must be non-negative, got %r" % (max_l,))

    def pad(x):
        # Truncate long sequences; right-pad short ones with index 0.
        # NOTE(review): index 0 is presumably '<unk>' rather than '<pad>' with
        # torchtext's default specials - confirm before changing the pad value.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    # e.g. words == ['i', 'cannot', 'believe', 'the', ...] -> [9, 486, 250, 2, ...]
    rows = [pad([vocab.stoi[word] for word in words]) for words in tokenized_data]
    # The explicit dtype and shape keep an empty ``data`` from producing a float
    # tensor of shape (0,); for non-empty input the result is unchanged.
    features = torch.tensor(rows, dtype=torch.long).view(len(rows), max_l)
    labels = torch.tensor([score for _, score in data])
    return features, labels
# return tuple
# (tensor([[ 167,  307,    6,  ...,    0,    0,    0],
#          [   9,  441,    0,  ...,    0,    0,    0],
#          [2068,    2,   58,  ...,    0,    0,    0],
#          ...,
#          [   9,  585,    9,  ...,    0,    0,    0],
#          [  46,    9,   90,  ...,    0,    0,    0],
#          [   9,   98,   10,  ...,    0,    0,    0]]),
#  tensor([1, 0, 0,  ..., 0, 1, 0]))

In [8]:
# Mini-batch size shared by the training and test loaders.
batch_size = 64
# Wrap the (features, labels) tensors returned by preprocess_imdb as datasets.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
# Only the training loader is created with shuffle=True.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
# train_set:25000行，500列，25000个评论
# (tensor([[ 167,  307,    6,  ...,    0,    0,    0], 
#          [   9,  441,    0,  ...,    0,    0,    0],
#          [2068,    2,   58,  ...,    0,    0,    0],
#          ...,
#          [   9,  585,    9,  ...,    0,    0,    0],
#          [  46,    9,   90,  ...,    0,    0,    0],
#          [   9,   98,   10,  ...,    0,    0,    0]]),
#  tensor([1, 0, 0,  ..., 0, 1, 0]))

# train_iter.batch_sampler
#     >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
#     [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
#     >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
#     [[0, 1, 2], [3, 4, 5], [6, 7, 8]]

# len(train_iter.batch_sampler)
# 总共有391个batch
# 每个batch有两个tensor，一个为训练数据，大小为torch.Size([64, 500])，一个为标注，大小为torch.Size([64])
# [tensor([[ 5039,   274,    44,  ...,     0,     0,     0],
#          [  221, 37596,     7,  ...,     0,     0,     0],
#          [ 1309,  3580,   175,  ..., 15343,     8,     3],
#          ...,
#          [   52,     7,   143,  ...,     0,     0,     0],
#          [    9,    67,    89,  ...,     0,     0,     0],
#          [  415,   210,    70,  ...,     0,     0,     0]]),
#  tensor([1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
#          1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
#          0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0])]

In [9]:
# Peek at the first mini-batch only, to confirm the tensor shapes, then stop iterating.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
# Displayed cell value: the number of mini-batches in the training loader.
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

In [109]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier that emits two logits per sequence.

    Token indices are embedded and passed through a bidirectional LSTM. The
    encoder outputs at the first and the last time step are concatenated and
    mapped to two output units by a linear layer.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        """Create the embedding, encoder and decoder layers.

        Args:
            vocab: sized object; ``len(vocab)`` is the number of embedding rows.
            embed_size: dimension of each token embedding.
            num_hiddens: hidden units per LSTM direction.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional=True yields a bidirectional recurrent network, so every
        # time step outputs 2 * num_hiddens features.
        self.encoder = nn.LSTM(embed_size, num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        # The decoder consumes the outputs of two time steps (first and last),
        # each 2 * num_hiddens wide, hence 4 * num_hiddens input features.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Compute logits for a batch of index sequences.

        Args:
            inputs: index tensor of shape (batch size, number of tokens),
                e.g. torch.Size([64, 500]) in this notebook.

        Returns:
            Tensor of shape (batch size, 2).
        """
        # The LSTM takes the sequence length as the first dimension, so swap the
        # two axes before the lookup: (64, 500) -> (500, 64).
        time_major = inputs.permute(1, 0)
        # Shape: (number of tokens, batch size, embed_size), e.g. (500, 64, 100).
        embedded = self.embedding(time_major)
        # Keep only the per-step outputs, shape (number of tokens, batch size,
        # 2 * num_hiddens); the final (h, c) state is discarded.
        step_outputs = self.encoder(embedded)[0]
        first_step, last_step = step_outputs[0], step_outputs[-1]
        # Shape: (batch size, 4 * num_hiddens), e.g. (64, 400).
        features = torch.cat([first_step, last_step], dim=-1)
        return self.decoder(features)
    
# Embedding width, hidden units per LSTM direction, and number of stacked LSTM layers.
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
# GloVe '6B' vectors with dim=100, cached under DATA_ROOT/glove.
# NOTE(review): dim should stay equal to embed_size, because these vectors are
# copied into net.embedding later in the notebook - confirm if either changes.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))

In [23]:
def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the pretrained vectors that correspond to ``words``.

    Args:
        words: sequence of tokens; row ``i`` of the result belongs to
            ``words[i]`` (in this notebook: the 46152 entries of ``vocab.itos``).
        pretrained_vocab: object exposing ``stoi`` (token -> row index) and
            ``vectors`` (indexable collection of vectors).

    Returns:
        A float tensor of shape ``(len(words), vector_dim)``, where
        ``vector_dim`` is the length of ``pretrained_vocab.vectors[0]``.
        Rows of tokens missing from ``pretrained_vocab.stoi`` remain zero.

    Side effects:
        Prints the number of out-of-vocabulary tokens when there is at least one.
    """
    vector_dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), vector_dim)  # rows start as all zeros
    missing = 0  # out-of-vocabulary counter
    # Per the author's note, '<unk>' and '<pad>' are not in glove_vocab.stoi.
    for row, token in enumerate(words):
        try:
            embed[row, :] = pretrained_vocab.vectors[pretrained_vocab.stoi[token]]
        except KeyError:
            # Token unknown to the pretrained vocabulary: keep its zero row.
            missing += 1
    if missing > 0:
        print(f"There are {missing} oov words.")
    return embed

In [24]:
# Overwrite the embedding weights in place with the pretrained vectors, one row per vocab.itos entry.
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # the pretrained vectors are loaded directly, so they are not updated

There are 21202 oov words.


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2512,  0.6499, -0.2465,  ...,  0.0659, -0.9114,  0.4129],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1752,  0.1468, -0.0800,  ...,  0.1581, -0.6230, -0.2806]])

In [111]:
lr, num_epochs = 0.01, 5
# Filter out the embedding parameters that do not require gradients
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
# NOTE(review): d2l.train lives in d2lzh_pytorch and is not shown here; presumably it
# moves net and each batch to `device` - confirm against that package.
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.2409, train acc 0.905, test acc 0.877, time 43.9 sec
epoch 2, loss 0.0579, train acc 0.960, test acc 0.849, time 44.1 sec
epoch 3, loss 0.0179, train acc 0.982, test acc 0.857, time 44.1 sec
epoch 4, loss 0.0083, train acc 0.989, test acc 0.848, time 44.0 sec
epoch 5, loss 0.0059, train acc 0.991, test acc 0.856, time 43.9 sec


In [112]:
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    Args:
        net: model that maps an index tensor of shape (1, number of tokens)
            to a tensor of two scores per row.
        vocab: object whose ``stoi`` maps a token to its integer index.
        sentence: list of tokens (words).

    Returns:
        'positive' when the highest-scoring class index is 1, otherwise
        'negative'.
    """
    # Build the input on whatever device the model's parameters live on.
    device = next(net.parameters()).device
    indices = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    # Inference only: switch to eval mode and disable autograd so no graph is
    # built, then restore the mode the caller left the model in.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(indices.view((1, -1))), dim=1)
    finally:
        net.train(was_training)
    return 'positive' if label.item() == 1 else 'negative'

In [116]:
# Sanity-check the trained model on two hand-written sentences; the trailing comments give the recorded outputs.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) # positive
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) # negative

'positive'

'negative'

参考：http://tangshusen.me/Dive-into-DL-PyTorch/#/chapter10_natural-language-processing/10.7_sentiment-analysis-rnn