<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/02113.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 10.7 文本情感分类：使用循环神经网络

In [0]:
import collections 
import os 
import random 
import tarfile 
import torch 
from torch import nn 
import torchtext.vocab as Vocab 
import torch.utils.data as Data 
import d2l 
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 10.7.1 文本情感分类数据

#### 1 读取数据集

In [0]:
!mkdir -p ../data/calImdb

In [0]:
# mxnet is installed for its gluon download helper, which is imported in the next cell.
!pip install mxnet

In [0]:
from mxnet.gluon import utils as gutils

In [5]:
def download_imdb(data_dir='../data'):
    """Fetch the aclImdb_v1 archive and unpack it under ``data_dir``.

    The archive is retrieved with ``gutils.download``, which is handed the
    expected SHA-1 digest, and every member is then extracted into
    ``data_dir``.
    """
    archive_url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    expected_sha1 = '01ada507287d82875905620988597833ad4e0903'
    archive_path = gutils.download(archive_url, data_dir, sha1_hash=expected_sha1)
    with tarfile.open(archive_path, 'r') as archive:
        archive.extractall(data_dir)

# Download and unpack the dataset into the default ../data directory.
download_imdb()

Downloading ../data/aclImdb_v1.tar.gz from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz...


In [0]:
def read_imdb(folder='train', data_root='../data/aclImdb/'):
    """Load the labelled IMDb reviews of one split.

    Args:
        folder: Split sub-directory to read, e.g. 'train' or 'test'.
        data_root: Directory that contains the split sub-directories.
            Defaults to the path the notebook extracts the archive to.

    Returns:
        A shuffled list of ``[review, label]`` pairs. ``review`` is the
        lower-cased file content with newline characters removed; ``label``
        is 1 for files under ``pos`` and 0 for files under ``neg``.
    """
    data = []
    for label, target in (('pos', 1), ('neg', 0)):
        folder_name = os.path.join(data_root, folder, label)
        for file in os.listdir(folder_name):
            # Read raw bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding.
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
            data.append([review, target])
    # Mix positive and negative examples (uses the global `random` state).
    random.shuffle(data)
    return data

# Load both splits; each one is a shuffled list of [review, label] pairs.
train_data, test_data = read_imdb('train'), read_imdb('test')

#### 2 预处理数据

In [0]:
def get_tokenized_imdb(data):
    """Split every review on single spaces and lower-case each token.

    ``data`` is an iterable of ``(review, label)`` pairs; the labels are
    ignored. Returns one token list per review, in the same order.
    """
    tokenized = []
    for review, _label in data:
        tokenized.append(list(map(str.lower, review.split(' '))))
    return tokenized

In [9]:
def get_vocab_imdb(data):
    """Build a vocabulary from the tokens of ``data``.

    Token frequencies are counted over all tokenized reviews and passed to
    ``Vocab.Vocab`` with ``min_freq=5``.
    """
    token_counts = collections.Counter()
    for sentence in get_tokenized_imdb(data):
        token_counts.update(sentence)
    return Vocab.Vocab(token_counts, min_freq=5)

# Build the vocabulary from the training split only, then display its size.
vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)

('# words in vocab:', 46151)

In [0]:
def preprocess_imdb(data, vocab, max_l=500):
    """Convert reviews into fixed-length index tensors plus a label tensor.

    Args:
        data: Sequence of ``[review, label]`` pairs.
        vocab: Object whose ``stoi`` mapping turns a token into an index.
        max_l: Length every review is truncated or padded to (default 500).

    Returns:
        ``(features, labels)``: ``features`` holds one row of ``max_l`` token
        indices per review, ``labels`` holds one entry per review.
    """
    def pad(x):
        # Cut long sequences; right-pad short ones with index 0.
        # NOTE(review): whether 0 is a dedicated padding index depends on the
        # vocab's special tokens - confirm against the vocab in use.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

#### 3 创建数据迭代器

In [0]:
batch_size = 64
# Turn each split into a TensorDataset of (features, labels).
train_set, test_set = (
    Data.TensorDataset(*preprocess_imdb(split, vocab))
    for split in (train_data, test_data)
)
# Only the training batches are reshuffled between epochs.
train_iter = Data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size=batch_size)

In [12]:
# Peek at a single mini-batch to check the tensor shapes, then stop iterating.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break 
# Display the number of mini-batches the training loader yields.
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

### 10.7.2 使用循环神经网络的模型

In [0]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier that maps token indices to two logits.

    Args:
        vocab: Sized object; ``len(vocab)`` sets the embedding table size.
        embed_size: Dimension of each token embedding.
        num_hiddens: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.encoder = nn.LSTM(
            embed_size,
            num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # The decoder sees the first and last time steps side by side; each
        # one carries both directions, hence 4 * num_hiddens inputs.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Return class logits for ``inputs`` of shape (batch, seq_len)."""
        # The LSTM is not batch_first, so swap to (seq_len, batch) first.
        embedded = self.embedding(inputs.permute(1, 0))
        states, _ = self.encoder(embedded)
        first_step, last_step = states[0], states[-1]
        return self.decoder(torch.cat((first_step, last_step), -1))

In [0]:
# embed_size is 100 so the embedding layer can later be filled with the 100-dimensional GloVe vectors.
embed_size, num_hiddens, num_layers = 100, 100, 2 
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

#### 1 加载预训练的词向量

In [15]:
# Pre-trained 100-dimensional GloVe vectors ('6B' set); the recorded output shows the archive landing in .vector_cache.
glove_vocab = Vocab.GloVe(name='6B', dim=100)

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                           
100%|█████████▉| 398074/400000 [00:14<00:00, 27746.52it/s]

In [0]:
def load_pretrained_embedding(words, pretrained_vocab):
    """Build an embedding matrix for ``words`` from pre-trained vectors.

    Args:
        words: Sequence of tokens; row ``i`` of the result belongs to
            ``words[i]``.
        pretrained_vocab: Object exposing ``stoi`` (token -> row index) and
            ``vectors`` (one row per pre-trained token).

    Returns:
        A tensor of shape ``(len(words), vector_dim)``. Tokens missing from
        ``pretrained_vocab.stoi`` keep an all-zero row; when there are any,
        their number is printed.
    """
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            # No pre-trained vector for this token: leave its row at zero
            # and count it so the summary below is accurate.
            oov_count += 1
            continue
        embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        # The count must be interpolated into the %d placeholder.
        print('There are %d oov words.' % oov_count)
    return embed

# Fill the embedding layer with the GloVe vectors of the vocabulary's tokens.
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab)
)
# Freeze the embedding so gradients are not computed for it.
net.embedding.weight.requires_grad = False 

#### 2 训练并评价模型

In [18]:
lr, num_epochs = 0.01, 5 
# Hand Adam only the parameters that still require gradients; this leaves out the frozen embedding weights.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.5491, train acc 0.709, test acc 0.810, time 42.6 sec
epoch 2, loss 0.1984, train acc 0.825, test acc 0.830, time 42.3 sec
epoch 3, loss 0.1162, train acc 0.851, test acc 0.837, time 42.0 sec
epoch 4, loss 0.0775, train acc 0.869, test acc 0.844, time 41.9 sec
epoch 5, loss 0.0542, train acc 0.888, test acc 0.849, time 41.8 sec


In [0]:
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    ``sentence`` is a list of tokens. Each token is mapped to an index via
    ``vocab.stoi``, the indices are fed to ``net`` as a batch of one on the
    device of the model's first parameter, and the arg-max class decides
    the returned string (class 1 means 'positive').
    """
    param_device = list(net.parameters())[0].device
    token_ids = [vocab.stoi[token] for token in sentence]
    batch = torch.tensor(token_ids, device=param_device).view((1, -1))
    predicted = torch.argmax(net(batch), dim=1)
    if predicted.item() == 1:
        return 'positive'
    return 'negative'

In [20]:
# Try the trained model on a short sentence with a positive adjective.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [21]:
# Same sentence with a negative adjective, as a contrasting check.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'