In [1]:
# -*- coding: utf-8 -*-
import torch
from torch import nn
from torch import optim
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.datasets import IMDB
import numpy as np
import collections
import os
import sys
import random
import tarfile
import time
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# 数据加载

In [2]:
# IMDB（Internet Movie Database）是一个来自互联网的电影数据库，
# 其中包含了50000条严重两极分化的电影评论。
# 数据集被划分为训练集和测试集，其中训练集和测试集中各有25000条评论，
# 并且训练集和测试集都包含50%的正面评论和50%的消极评论。
# 数据集下载地址：http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [3]:
# Seed every random number generator the notebook draws from so that results can be
# reproduced on the next run: `random` drives the shuffle in read_imdb, and torch
# drives random_split, weight initialisation and dropout.
random.seed(123)
np.random.seed(123)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(123)

<torch._C.Generator at 0x2c30ef7c7d0>

In [4]:
# Dataset location and batching.
DATA_ROOT = "./data"
batch_size = 10
# Every review is truncated or padded to this many tokens.
max_length = 256
# Use the first CUDA device when a GPU is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
def read_imdb(folder='train', data_root=DATA_ROOT+"/aclImdb"):
    """Load one split of the aclImdb corpus as shuffled [label, review] pairs.

    Args:
        folder: Sub-directory of ``data_root`` to read ('train' or 'test' in this notebook).
        data_root: Directory that contains the split folders.

    Returns:
        A list of ``[label, review]`` lists. ``label`` is 'pos' or 'neg' (the name of
        the folder the file was found in) and ``review`` is the UTF-8 decoded file
        content, lower-cased, with newline characters removed. The list is shuffled
        with the ``random`` module before it is returned.
    """
    samples = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        # os.listdir returns entries in arbitrary order; sorting makes the input to
        # the shuffle identical across runs and machines.
        for file_name in tqdm(sorted(os.listdir(folder_name))):
            with open(os.path.join(folder_name, file_name), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
            samples.append([label, review])
    random.shuffle(samples)
    return samples

In [6]:
# Read both splits: the training split first, then the test split.
train_data = read_imdb('train')
test_data = read_imdb('test')

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5723.30it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5619.67it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:01<00:00, 6577.07it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:01<00:00, 6440.26it/s]


# 数据预处理

In [7]:
# Tokenizer: torchtext's built-in 'basic_english' tokenizer.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [8]:
# Quick check of the tokenizer on a sample sentence.
print(tokenizer('here is the an example!'))

['here', 'is', 'the', 'an', 'example', '!']


In [9]:
# Token iterator
def yield_tokens(data_iter):
    """Lazily yield the token list of every (label, text) pair in ``data_iter``."""
    yield from (tokenizer(review) for _label, review in data_iter)

In [10]:
# Build the vocabulary from the token iterator over the training data.
# `specials` adds two custom entries: "<pad>" (padding placeholder) and
# "<unk>" (out-of-vocabulary words, i.e. words not present in the vocabulary).
vocab = torchtext.vocab.build_vocab_from_iterator(
    yield_tokens(train_data),
    specials=["<pad>", "<unk>"],
)
# Any token missing from the vocabulary is mapped to the index of "<unk>".
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

In [11]:
# Data-processing pipelines
def text_pipeline(x):
    """Tokenize a raw string and look the tokens up in the vocabulary."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Map the label 'pos' to 1 and any other value to 0."""
    if x == 'pos':
        return 1
    return 0

In [12]:
# Quick check of the text pipeline, including the "<pad>" special token.
print(text_pipeline('here is an example <pad> <pad>'))

[132, 10, 41, 465, 0, 0]


In [13]:
# Quick check of the label pipeline.
print(label_pipeline('pos'))

1


In [14]:
# Index list produced by the text pipeline for the "<pad>" token; it is repeated
# (list multiplication) to pad short reviews.
pad = text_pipeline('<pad>')

In [15]:
# Collate one batch, including the handling of variable-length reviews.
def collate_batch(batch):
    """Turn a list of (label, text) samples into three int64 tensors on ``device``.

    Returns:
        (labels, token_ids, lengths): labels from ``label_pipeline``; token ids
        truncated and padded to ``max_length`` entries per sample; and the number
        of tokens each sample had after truncation, before padding.
    """
    labels, padded_ids, lengths = [], [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        token_ids = text_pipeline(raw_text)[:max_length]
        lengths.append(len(token_ids))
        # Over-pad, then cut back so every row has exactly max_length entries.
        padded = token_ids + pad * max_length
        padded_ids.append(padded[:max_length])
    columns = (labels, padded_ids, lengths)
    tensors = [torch.tensor(column, dtype=torch.int64) for column in columns]
    return tuple(tensor.to(device) for tensor in tensors)

In [16]:
# Wrap the loaded samples as Dataset objects with to_map_style_dataset.
train_dataset = to_map_style_dataset(train_data)
test_dataset = to_map_style_dataset(test_data)
# Keep 95% of the training set for training and hold out the rest for evaluation.
num_train = int(len(train_dataset) * 0.95)
num_eval = len(train_dataset) - num_train
split_train_, split_eval_ = random_split(train_dataset, [num_train, num_eval])
# All loaders share the batch size and collate function; only training is shuffled.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_batch)
train_dataloader = DataLoader(split_train_, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(split_eval_, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# 模型定义

In [17]:
# Model definition
class BiLSTM(torch.nn.Module):
    """Embedding -> (bi)directional LSTM -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout_rate: Dropout probability used between LSTM layers, on the
            embeddings and on the final hidden state.
        pad_index: Vocabulary index used as ``padding_idx`` of the embedding.
        pretrained_embedding: Optional tensor copied into the embedding weights;
            when given, the embedding is frozen.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
                 dropout_rate, pad_index=0, pretrained_embedding=None):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional,
                            dropout=dropout_rate, batch_first=True)
        # A bidirectional LSTM contributes one hidden vector per direction.
        self.fc = torch.nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.init_weights(pretrained_embedding)

    def init_weights(self, pretrained_embedding):
        """Initialise the embedding (pretrained or uniform) and the linear layer."""
        initrange = 0.5
        # Identity check: comparing a tensor to None with `!=` is not a reliable test.
        if pretrained_embedding is not None:
            self.embedding.weight.data.copy_(pretrained_embedding)
            # The weights are loaded from pretrained vectors, so they are not updated.
            self.embedding.weight.requires_grad = False
        else:
            self.embedding.weight.data.uniform_(-initrange, initrange)
            self.embedding.weight.requires_grad = True
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, ids, length):
        """Return the class scores for a batch of padded token-id sequences.

        Args:
            ids: Token ids, batch-first (the LSTM is built with ``batch_first=True``).
            length: Tensor with the unpadded length of every sequence in ``ids``.

        Returns:
            The output of the linear layer applied to the final hidden state.
        """
        embedded = self.dropout(self.embedding(ids))
        # pack_padded_sequence expects the lengths on the CPU; enforce_sorted=False
        # lets the batch arrive in any length order.
        packed_embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.to("cpu"), batch_first=True,
                                                            enforce_sorted=False)
        # Only the final hidden state is used, so the per-step outputs are not unpacked.
        _, (hidden, _cell) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # The last two entries of `hidden` are the final states of the last
            # layer's two directions; concatenate them along the feature axis.
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
        prediction = self.fc(hidden)
        return prediction

In [18]:
# The sentiment-classification training set is not very large, so to counter
# over-fitting we can use word vectors pretrained on a much larger corpus as the
# feature vector of each word.
# Here we load 300-dimensional GloVe vectors for the words in `vocab`.
# Note: the dimension of the pretrained vectors must match the embedding_dim of
# the model's embedding layer.
# The first time the pretrained vectors are instantiated they are downloaded
# automatically into the folder given by `cache` (default: .vector_cache).
glove_vocab = Vocab.GloVe(name='6B', dim=300, cache="./model/glove")

In [19]:
# Next, these word vectors are used as the feature vector of every word in a review.
# They are no longer updated during training.
def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the vectors of ``words`` from a pretrained vocabulary.

    Args:
        words: Sequence of words; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: Object exposing ``stoi`` (word -> row index) and
            ``vectors`` (one row per pretrained word).

    Returns:
        A ``(len(words), vector_dim)`` tensor. Rows of words that are missing from
        ``pretrained_vocab.stoi`` stay all-zero; the number of such words is printed
        when it is non-zero.
    """
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # initialised to 0
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            # Count the miss (the original added 0, so the counter never moved).
            oov_count += 1
            continue
        embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

In [20]:
# Build the embedding matrix for the notebook vocabulary, one row per entry of
# vocab.get_itos(), from the GloVe vectors.
pretrained_embedding = load_pretrained_embedding(vocab.get_itos(), glove_vocab)

In [21]:
# Hyper-parameters
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 300
# The sentiment polarity has two classes, so output_dim is set to 2.
output_dim = 2
n_layers = 2
bidirectional = True
dropout_rate = 0.5
model = BiLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout_rate=dropout_rate,
    pretrained_embedding=pretrained_embedding,
)

# 模型训练

In [22]:
# Loss, optimiser and learning-rate schedule.
lr = 5e-4
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
# NOTE(review): no call to scheduler.step() is visible in this notebook - confirm
# whether the schedule is meant to be applied.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)

In [23]:
def get_accuracy(prediction, label):
    """Return the fraction of rows of ``prediction`` whose argmax equals ``label``.

    ``prediction`` must be two-dimensional (rows x classes); the result is a
    zero-dimensional tensor.
    """
    n_samples, _n_classes = prediction.shape
    hits = torch.eq(prediction.argmax(dim=-1), label).sum()
    return hits / n_samples

In [24]:
def train(dataloader, model, criterion, optimizer, device, epoch=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields ``(label, ids, length)`` batches.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        optimizer: Optimiser stepped once per batch.
        device: Device the model and every batch are moved to.
        epoch: Epoch number shown in the progress log. When omitted, a
            notebook-level variable named ``epoch`` is used if one exists
            (what existing callers rely on), otherwise 0.

    Returns:
        (train_losses, train_accs): the loss and accuracy of every batch.
    """
    if epoch is None:
        # Avoids a NameError when the function is called outside an epoch loop.
        epoch = globals().get('epoch', 0)
    model = model.to(device)
    model.train()
    train_losses = []
    train_accs = []
    total_acc, total_count = 0, 0
    log_interval = 50
    for idx, (label, ids, length) in enumerate(dataloader):
        label = label.to(device)
        ids = ids.to(device)
        length = length.to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)  # loss computation
        accuracy = get_accuracy(prediction, label)
        # Gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        train_accs.append(accuracy.item())
        total_acc += (prediction.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc / total_count))
            # The logged accuracy covers only the batches since the previous log line.
            total_acc, total_count = 0, 0
    return train_losses, train_accs

In [25]:
def test(dataloader, model, criterion, device, epoch=None):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        dataloader: Yields ``(label, ids, length)`` batches.
        model: Module called as ``model(ids, length)``.
        criterion: Loss called as ``criterion(prediction, label)``.
        device: Device the model and every batch are moved to.
        epoch: Epoch number shown in the progress log. When omitted, a
            notebook-level variable named ``epoch`` is used if one exists
            (what existing callers rely on), otherwise 0.

    Returns:
        (test_losses, test_accs): the loss and accuracy of every batch.
    """
    if epoch is None:
        # Avoids a NameError when the function is called outside an epoch loop.
        epoch = globals().get('epoch', 0)
    model = model.to(device)
    # Switch to inference mode
    model.eval()
    test_losses = []
    test_accs = []
    total_acc, total_count = 0, 0
    log_interval = 50
    with torch.no_grad():
        for idx, (label, ids, length) in enumerate(dataloader):
            label = label.to(device)
            ids = ids.to(device)
            length = length.to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)  # loss computation
            accuracy = get_accuracy(prediction, label)
            test_losses.append(loss.item())
            test_accs.append(accuracy.item())
            total_acc += (prediction.argmax(1) == label).sum().item()
            total_count += label.size(0)
            if idx % log_interval == 0 and idx > 0:
                print('| epoch {:3d} | {:5d}/{:5d} batches '
                      '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                                  total_acc / total_count))
                # The logged accuracy covers only the batches since the previous log line.
                total_acc, total_count = 0, 0
    return test_losses, test_accs

In [26]:
# Train for n_epochs, evaluating on the held-out split after every epoch and
# keeping the weights with the lowest mean evaluation loss.
best_eval_loss = float('inf')
train_loss_list = []
train_acc_list = []
eval_loss_list = []
eval_acc_list = []
n_epochs = 2
save_path = 'model/lstm/BiLSTM.pt'
# torch.save does not create missing directories; without this a fresh run would
# fail only after the first full epoch of training.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
for epoch in range(n_epochs):
    train_losses, train_accs = train(train_dataloader, model, criterion, optimizer, device)
    eval_losses, eval_accs = test(eval_dataloader, model, criterion, device)
    # Per-batch histories across all epochs.
    train_loss_list.extend(train_losses)
    train_acc_list.extend(train_accs)
    eval_loss_list.extend(eval_losses)
    eval_acc_list.extend(eval_accs)
    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = np.mean(train_accs)
    epoch_eval_loss = np.mean(eval_losses)
    epoch_eval_acc = np.mean(eval_accs)
    if epoch_eval_loss < best_eval_loss:
        best_eval_loss = epoch_eval_loss
        torch.save(model.state_dict(), save_path)
    print(f'epoch: {epoch + 1}')
    print(f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}')
    print(f'eval_loss: {epoch_eval_loss:.3f}, eval_acc: {epoch_eval_acc:.3f}')

| epoch   0 |    50/ 2375 batches | accuracy    0.527
| epoch   0 |   100/ 2375 batches | accuracy    0.530
| epoch   0 |   150/ 2375 batches | accuracy    0.546
| epoch   0 |   200/ 2375 batches | accuracy    0.558
| epoch   0 |   250/ 2375 batches | accuracy    0.606
| epoch   0 |   300/ 2375 batches | accuracy    0.594
| epoch   0 |   350/ 2375 batches | accuracy    0.530
| epoch   0 |   400/ 2375 batches | accuracy    0.610
| epoch   0 |   450/ 2375 batches | accuracy    0.600
| epoch   0 |   500/ 2375 batches | accuracy    0.606
| epoch   0 |   550/ 2375 batches | accuracy    0.538
| epoch   0 |   600/ 2375 batches | accuracy    0.618
| epoch   0 |   650/ 2375 batches | accuracy    0.616
| epoch   0 |   700/ 2375 batches | accuracy    0.682
| epoch   0 |   750/ 2375 batches | accuracy    0.584
| epoch   0 |   800/ 2375 batches | accuracy    0.664
| epoch   0 |   850/ 2375 batches | accuracy    0.640
| epoch   0 |   900/ 2375 batches | accuracy    0.662
| epoch   0 |   950/ 2375 ba

# 模型测试

## 测试集测试

In [27]:
# Load the saved model weights and evaluate them on the test set.
saved_state = torch.load("model/lstm/BiLSTM.pt")
model.load_state_dict(saved_state)
test_losses, test_accs = test(test_dataloader, model, criterion, device)
mean_test_loss = np.mean(test_losses)
mean_test_acc = np.mean(test_accs)
print(f'test_loss: {mean_test_loss:.3f}, test_acc: {mean_test_acc:.3f}')

| epoch   1 |    50/ 2500 batches | accuracy    0.806
| epoch   1 |   100/ 2500 batches | accuracy    0.832
| epoch   1 |   150/ 2500 batches | accuracy    0.848
| epoch   1 |   200/ 2500 batches | accuracy    0.820
| epoch   1 |   250/ 2500 batches | accuracy    0.824
| epoch   1 |   300/ 2500 batches | accuracy    0.822
| epoch   1 |   350/ 2500 batches | accuracy    0.816
| epoch   1 |   400/ 2500 batches | accuracy    0.834
| epoch   1 |   450/ 2500 batches | accuracy    0.840
| epoch   1 |   500/ 2500 batches | accuracy    0.844
| epoch   1 |   550/ 2500 batches | accuracy    0.830
| epoch   1 |   600/ 2500 batches | accuracy    0.844
| epoch   1 |   650/ 2500 batches | accuracy    0.846
| epoch   1 |   700/ 2500 batches | accuracy    0.832
| epoch   1 |   750/ 2500 batches | accuracy    0.830
| epoch   1 |   800/ 2500 batches | accuracy    0.800
| epoch   1 |   850/ 2500 batches | accuracy    0.848
| epoch   1 |   900/ 2500 batches | accuracy    0.828
| epoch   1 |   950/ 2500 ba

## 真实影评测试

In [28]:
def predict(text, text_pipeline):
    """Classify one raw review with the notebook-level ``model``.

    Args:
        text: Raw review string.
        text_pipeline: Callable turning a string into a list of token indices.

    Returns:
        The predicted class index plus one (an ``int``).

    Raises:
        ValueError: If ``text`` yields no tokens; a zero-length sequence cannot be
            packed for the LSTM.
    """
    processed_text = text_pipeline(text)[:max_length]
    if not processed_text:
        raise ValueError("text produced no tokens; cannot classify an empty review")
    # Build the input on whatever device the model currently lives on instead of
    # assuming it was moved to the CPU beforehand.
    model_device = next(model.parameters()).device
    # Dropout must be disabled for a deterministic prediction; the previous mode is
    # restored afterwards so calling predict() has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            length = torch.tensor([len(processed_text)], dtype=torch.int64)
            padded = (processed_text + pad * max_length)[:max_length]
            ids = torch.tensor([padded], dtype=torch.int64).to(model_device)
            output = model(ids, length)
    finally:
        model.train(was_training)
    return output.argmax(1).item() + 1

In [29]:
# predict() returns the class index plus one, so 1 is negative and 2 is positive.
review_label_dict = {1: "neg", 2: "pos"}

In [30]:
# Run the interactive predictions below on the CPU.
model = model.to("cpu")

In [31]:
# Try the model on a hand-written negative review.
review1 = "This movie is too rubbish, and I don't like it very much."
review1_label = review_label_dict[predict(review1, text_pipeline)]
print("This is a %s review" % review1_label)

This is a neg review


In [32]:
# Try the model on a hand-written positive review.
review2 = "this movie is great, and I love it very much."
review2_label = review_label_dict[predict(review2, text_pipeline)]
print("This is a %s review" % review2_label)

This is a pos review
