<a href="https://colab.research.google.com/github/yananma/5_programs_per_day/blob/master/02112.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 10.8 文本情感分类：使用卷积神经网络 ( textCNN )

In [0]:
import os
import random
import tarfile

import torch
import torch.nn.functional as F
import torch.utils.data as Data
import torchtext.vocab as Vocab
from torch import nn

import d2l
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### 10.8.1 一维卷积层

In [0]:
def corr1d(X, K):
    """Compute the 1-D cross-correlation of signal ``X`` with kernel ``K``.

    Args:
        X: 1-D tensor holding the input sequence.
        K: 1-D tensor holding the kernel.

    Returns:
        A 1-D tensor of length ``len(X) - len(K) + 1`` created with
        ``torch.zeros`` (default dtype), whose i-th entry is the sum of the
        element-wise product of ``K`` with the window of ``X`` starting at ``i``.
    """
    width = K.shape[0]
    out_len = X.shape[0] - width + 1
    Y = torch.zeros(out_len)
    for start in range(out_len):
        window = X[start:start + width]
        Y[start] = (window * K).sum()
    return Y

In [3]:
# Slide a length-2 kernel over a length-7 sequence (6 valid positions).
X = torch.tensor([0, 1, 2, 3, 4, 5, 6])
K = torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [5]:
def corr1d_multi_in(X, K):
    """Multi-channel 1-D cross-correlation.

    ``X`` and ``K`` are iterated in lockstep along their first dimension;
    each (input row, kernel row) pair is passed to ``corr1d`` and the
    per-channel results are summed element-wise.

    Args:
        X: Tensor whose rows are the input channels.
        K: Tensor whose rows are the matching kernels.

    Returns:
        A 1-D tensor: the sum over channels of the single-channel results.
    """
    per_channel = []
    for x, k in zip(X, K):
        per_channel.append(corr1d(x, k))
    return torch.stack(per_channel).sum(dim=0)

# Three input channels, each paired with its own length-2 kernel.
X = torch.tensor([
    [0, 1, 2, 3, 4, 5, 6],
    [1, 2, 3, 4, 5, 6, 7],
    [2, 3, 4, 5, 6, 7, 8],
])
K = torch.tensor([
    [1, 2],
    [3, 4],
    [-1, -3],
])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])

### 10.8.2 时序最大池化层

In [0]:
class GlobalMaxPool1d(nn.Module):
    """Max-over-time pooling: reduce the last axis of a 3-D tensor to size 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply ``F.max_pool1d`` with a window spanning all of ``x``'s dim 2.

        Args:
            x: 3-D tensor; the pooling window equals ``x.shape[2]``.

        Returns:
            Tensor with the same first two dimensions as ``x`` and a last
            dimension of size 1.
        """
        full_length = x.shape[2]
        return F.max_pool1d(x, kernel_size=full_length)

### 10.8.3 读取和预处理 IMDb 数据集

In [0]:
# Create the data directory; exist_ok=True keeps this cell safe to re-run
# (a plain `mkdir` reports an error once the directory already exists).
os.makedirs('../data', exist_ok=True)

In [0]:
# mxnet is imported below for gluon.utils.download.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install mxnet

In [0]:
from mxnet.gluon import utils as gutils

In [0]:
# Root directory for downloaded assets; later cells join 'aclImdb' and 'glove' onto it.
DATA_ROOT = "../data"

In [0]:
def download_imdb(data_dir='../data'):
    """Download the IMDb sentiment archive and unpack it under ``data_dir``.

    The archive is fetched with ``gutils.download`` (passing the expected
    SHA-1 hash) and then extracted into ``data_dir``. Extraction is skipped
    when ``<data_dir>/aclImdb`` already exists, so re-running the notebook
    does not unpack the whole archive again. Delete that folder to force a
    fresh extraction.

    Args:
        data_dir: Directory that receives both the archive and its contents.

    Returns:
        None.
    """
    url = ('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    sha1 = '01ada507287d82875905620988597833ad4e0903'
    fname = gutils.download(url, data_dir, sha1_hash=sha1)
    # 'aclImdb' is the top-level folder the rest of the notebook reads from.
    if os.path.isdir(os.path.join(data_dir, 'aclImdb')):
        return
    with tarfile.open(fname, 'r') as f:
        f.extractall(data_dir)

# Fetch and unpack the dataset using the default data_dir ('../data').
download_imdb()

In [0]:
def read_imdb(folder='train', data_root='../data/aclImdb'):
    """Load one split of the IMDb reviews as shuffled ``[text, label]`` pairs.

    Args:
        folder: Sub-directory of ``data_root`` to read, e.g. ``'train'``.
        data_root: Directory containing the extracted dataset. The default
            is the location that was previously hard-coded.

    Returns:
        A list of ``[review, label]`` lists. ``review`` is the file content
        decoded as UTF-8, with newlines removed and lower-cased; ``label``
        is 1 for files under ``pos`` and 0 for files under ``neg``. The list
        is shuffled with ``random.shuffle`` before being returned.
    """
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        target = 1 if label == 'pos' else 0
        for file_name in os.listdir(folder_name):
            with open(os.path.join(folder_name, file_name), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
            data.append([review, target])
    random.shuffle(data)
    return data

In [22]:
batch_size = 64

# Read both splits from the extracted dataset folder.
train_data = d2l.read_imdb('train', data_root=os.path.join(DATA_ROOT, 'aclImdb'))
test_data = d2l.read_imdb('test', data_root=os.path.join(DATA_ROOT, 'aclImdb'))

# The vocabulary is built from the training split only.
vocab = d2l.get_vocab_imdb(train_data)

# Wrap the preprocessed tensors in datasets, then in mini-batch loaders.
train_set = Data.TensorDataset(*d2l.preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*d2l.preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size=batch_size)

100%|██████████| 12500/12500 [00:00<00:00, 44420.38it/s]
100%|██████████| 12500/12500 [00:00<00:00, 45405.60it/s]
100%|██████████| 12500/12500 [00:00<00:00, 47225.96it/s]
100%|██████████| 12500/12500 [00:00<00:00, 45261.04it/s]


### 10.8.4 textCNN 模型

In [0]:
class TextCNN(nn.Module):
    """Text classifier built from parallel 1-D convolutions over word embeddings.

    Two embedding tables of identical size are looked up and concatenated
    along the feature axis; each convolution is followed by ReLU and global
    max pooling, and the pooled features feed a 2-way linear decoder.
    """

    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        """
        Args:
            vocab: Object supporting ``len()``; its length sets the number of
                embedding rows.
            embed_size: Width of each embedding table.
            kernel_sizes: Kernel width for each convolution.
            num_channels: Output channels for each convolution, paired
                position-by-position with ``kernel_sizes``.
        """
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.constant_embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.pool = GlobalMaxPool1d()
        # Both embedding tables are concatenated, so each conv sees 2 * embed_size channels.
        self.convs = nn.ModuleList([
            nn.Conv1d(2 * embed_size, out_ch, width)
            for out_ch, width in zip(num_channels, kernel_sizes)
        ])

    def forward(self, inputs):
        """Map token indices to 2-class scores.

        Args:
            inputs: Integer index tensor; presumably (batch, seq_len), since
                the embeddings are concatenated on dim 2 and then permuted
                as a 3-D tensor.

        Returns:
            The decoder output, with 2 features per example.
        """
        trainable = self.embedding(inputs)
        frozen = self.constant_embedding(inputs)
        # Conv1d convolves over the last axis, so move the feature axis to position 1.
        stacked = torch.cat((trainable, frozen), dim=2).permute(0, 2, 1)
        pooled = []
        for conv in self.convs:
            activated = F.relu(conv(stacked))
            pooled.append(self.pool(activated).squeeze(-1))
        encoding = torch.cat(pooled, dim=1)
        return self.decoder(self.dropout(encoding))

In [0]:
# Model hyperparameters: embedding width, plus one (kernel width, channel count) pair per convolution.
embed_size = 100
kernel_sizes = [3, 4, 5]
nums_channels = [100, 100, 100]
net = TextCNN(vocab=vocab,
              embed_size=embed_size,
              kernel_sizes=kernel_sizes,
              num_channels=nums_channels)

#### 1 加载预训练的词向量

In [28]:
# Load the 100-dimensional GloVe '6B' vectors through torchtext, caching them under DATA_ROOT/glove.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, 'glove'))

../data/glove/glove.6B.zip: 862MB [06:30, 2.21MB/s]                          
100%|█████████▉| 398098/400000 [00:20<00:00, 18741.06it/s]

In [0]:
# Build the pretrained weight matrix once and copy it into both embedding tables.
# copy_ duplicates the values, so the two tables do not share storage.
pretrained_weights = d2l.load_pretrained_embedding(vocab.itos, glove_vocab)
net.embedding.weight.data.copy_(pretrained_weights)
net.constant_embedding.weight.data.copy_(pretrained_weights)
# Freeze the second table so only `net.embedding` is updated during training.
net.constant_embedding.weight.requires_grad = False

#### 2 训练并评价模型

In [34]:
lr = 0.001
num_epochs = 5
# Hand the optimizer only the parameters that still require gradients
# (the frozen constant_embedding weights are excluded).
optimizer = torch.optim.Adam(
    [param for param in net.parameters() if param.requires_grad],
    lr=lr,
)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4310, train acc 0.807, test acc 0.861, time 16.2 sec
epoch 2, loss 0.1223, train acc 0.901, test acc 0.869, time 16.2 sec
epoch 3, loss 0.0449, train acc 0.950, test acc 0.853, time 16.2 sec
epoch 4, loss 0.0178, train acc 0.975, test acc 0.856, time 16.2 sec
epoch 5, loss 0.0083, train acc 0.986, test acc 0.858, time 16.2 sec


In [35]:
# Spot-check the trained network on a short, pre-tokenised sentence.
d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [36]:
# Same spot-check with the final token swapped from 'great' to 'bad'.
d2l.predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'