# **使用RNN进行情感分类**

In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys

In [2]:
# Add ../utils (relative to the working directory) to the module search path
# so that d2lzh, presumably located there, can be imported.
sys.path.append("../utils")
import d2lzh as d2l

In [3]:
# Pick the compute device: GPU 0 when CUDA is usable, otherwise the CPU.
# torch.cuda.set_device is only called after the availability check, because
# calling it on a machine without CUDA raises before the CPU fallback is reached.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# Root folder for the IMDB archive, its extracted contents and the GloVe cache
# (relative to the working directory).
DATA_ROOT = "../datasets/IMDB"

## **数据读取**

我们使用IMDB情感数据集。该数据集分为训练集和测试集两部分，各包含25000条电影评论，且正例与负例的数量相同。

### **读取数据**

In [5]:
# Path of the IMDB archive inside DATA_ROOT.
fname = os.path.join(DATA_ROOT, 'aclImdb_v1.tar.gz')
# Extract only when the 'aclImdb' folder is missing, so re-running the cell
# does not unpack the archive again.
if not os.path.exists(os.path.join(DATA_ROOT, 'aclImdb')):
    print("从压缩包解压.....")
    with tarfile.open(fname, 'r') as f:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use it with an archive from a trusted source.
        f.extractall(DATA_ROOT)

In [6]:
from tqdm import tqdm
def read_imdb(folder='train', data_root=None):
    """Load every review of one IMDB split together with its sentiment label.

    Args:
        folder: Name of the split sub-folder to read (e.g. 'train' or 'test').
        data_root: Directory that contains the split folders. When None (the
            default) it resolves to ``os.path.join(DATA_ROOT, 'aclImdb')``
            instead of a machine-specific absolute path, so the notebook runs
            on any checkout.

    Returns:
        A shuffled list of ``[review, label]`` pairs. ``review`` is the file
        content decoded as UTF-8, lower-cased, with newlines removed;
        ``label`` is 1 for files under 'pos' and 0 for files under 'neg'.
    """
    if data_root is None:
        # Resolved at call time so the function follows the notebook's DATA_ROOT.
        data_root = os.path.join(DATA_ROOT, 'aclImdb')
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    # Mix positive and negative reviews, which were appended label by label.
    random.shuffle(data)
    return data

# Load both splits; each call returns its reviews already shuffled.
train_data, test_data = read_imdb('train'), read_imdb('test')

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:04<00:00, 2648.05it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:03<00:00, 3158.94it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:03<00:00, 3128.54it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:04<00:00, 3120.83it/s]


In [7]:
# Number of training reviews that were loaded.
len(train_data)

25000

### **预处理数据**

预处理包括:
- 分词
- 创建字典
- 截断和补0

In [8]:
# Space-based tokenization
def get_tokenized_imdb(data):
    """Split every review into lower-cased tokens.

    data: list of [string, label] pairs; the labels are ignored.

    Returns one token list per review, in input order. Splitting is done on
    the single space character, so consecutive spaces yield empty tokens.
    """
    tokenized = []
    for review, _ in data:
        tokenized.append([piece.lower() for piece in review.split(' ')])
    return tokenized

# Build the vocabulary, dropping words that occur fewer than 5 times
def get_vacab_imdb(data):
    """Count the tokens of all reviews in `data` and build a torchtext Vocab.

    data: list of [string, label] pairs, tokenized with get_tokenized_imdb.

    Returns a Vocab constructed from the token counts with min_freq=5.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts, min_freq=5)  # provided by torchtext

# Build the vocabulary from the training reviews only, then display its size.
vocab = get_vacab_imdb(train_data)
len(vocab)

46152

In [9]:
# Truncate and zero-pad
def preprocess_imdb(data, vocab, max_l=500):
    """Turn reviews into fixed-length index tensors plus a label tensor.

    Args:
        data: list of [review, label] pairs.
        vocab: object whose ``stoi`` mapping converts a token to its index.
        max_l: length every review is truncated or padded to. Defaults to
            500, the value that was previously hard-coded in the body.

    Returns:
        (features, labels): ``features`` has one row of ``max_l`` token
        indices per review, ``labels`` holds the labels in the same order.
    """
    def pad(x):
        # Keep the first max_l indices, or right-pad with index 0 up to max_l.
        return x[:max_l] if max_l < len(x) else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in st]) for st in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

### **创建迭代器**

In [10]:
batch_size = 64
# Index-encode both splits with the training vocabulary and wrap the resulting
# (features, labels) tensors as datasets.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
# shuffle=True is passed for the training loader only.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [11]:
# Print the tensor shapes of the first training batch only, then display the
# number of batches per epoch as the cell's result.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

## **模型**

我们使用BiLSTM来进行分类

In [12]:
class BiRNN(nn.Module):
    """Bidirectional LSTM sentiment classifier.

    Token indices are embedded, passed through a multi-layer bidirectional
    LSTM, and the encoder outputs at the first and last time steps are
    concatenated and mapped to 2 class scores by a linear layer.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        """Create the layers; `vocab` is only used for its length."""
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Encoder output shape: (seq_len, batch, num_directions * hidden_size)
        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # The decoder input joins the outputs of the first and the last time
        # step, each 2 * num_hiddens wide (two directions).
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Map index batches of shape (batch, seq_len) to scores of shape (batch, 2)."""
        # The LSTM expects the sequence dimension first, so the batch is
        # transposed before the lookup: (seq_len, batch, embed_size).
        seq_first = inputs.permute(1, 0)
        embedded = self.embedding(seq_first)
        hidden_seq, _ = self.encoder(embedded)
        first_step, last_step = hidden_seq[0], hidden_seq[-1]
        features = torch.cat((first_step, last_step), -1)
        return self.decoder(features)

In [13]:
# Hyperparameters: embedding size 100, 100 hidden units per LSTM direction,
# 2 stacked LSTM layers.
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

## **加载预训练的词向量**

In [14]:
def load_pretrained_embedding(words, pretrained_vocab):
    """Collect the pretrained vector of each word into one matrix.

    words: sequence of tokens; row i of the result belongs to words[i].
    pretrained_vocab: object exposing `stoi` (token -> row index) and
        `vectors` (indexable by that row index).

    Returns a tensor of shape (len(words), vector_dim). Rows of words whose
    lookup raises KeyError stay zero; when there is at least one such word,
    their count is printed.
    """
    vector_dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), vector_dim)
    missing = 0
    for row, token in enumerate(words):
        try:
            source_row = pretrained_vocab.stoi[token]
            # Copy the pretrained vector into the row of this word.
            embed[row, :] = pretrained_vocab.vectors[source_row]
        except KeyError:
            missing += 1
    if missing > 0:
        print(f"There are {missing} oov words.")
    return embed

In [15]:
# 100-dimensional GloVe '6B' vectors, with DATA_ROOT/glove as the cache folder.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))

In [16]:
# Overwrite the embedding table with the pretrained vectors looked up for the
# vocabulary tokens, in vocab.itos order.
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # Pretrained vectors are used as loaded, so they are not updated

There are 21202 oov words.


## **训练**

In [19]:
lr, num_epochs = 0.01, 5
# Hand the optimizer only the trainable parameters; the frozen embedding
# weights (requires_grad=False) are filtered out.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
# NOTE(review): net is not moved to `device` in this notebook; presumably
# d2l.train does that, confirm in d2lzh.
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.3974, train acc 0.827, test acc 0.841, time 39.3 sec
epoch 2, loss 0.1739, train acc 0.852, test acc 0.836, time 39.5 sec
epoch 3, loss 0.1033, train acc 0.870, test acc 0.841, time 39.2 sec
epoch 4, loss 0.0691, train acc 0.887, test acc 0.852, time 40.3 sec
epoch 5, loss 0.0502, train acc 0.895, test acc 0.850, time 41.4 sec
