# 文本情感分类：使用卷积神经网络（textCNN）

In [1]:
import os
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import  torch.nn.functional as F

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_ROOT = "../../data"
print(torch.__version__, device)

1.2.0 cuda


## 读取数据

In [2]:
import pandas as pd
import numpy as np

In [3]:
def loadfile():
    neg=pd.read_excel('D:/学习/研一下/大数据概论/neg_30000.xls',header=None)
    pos=pd.read_excel('D:/学习/研一下/大数据概论/pos_30000.xls',header=None)

    combined=np.concatenate((pos[0], neg[0]))
    # print(type(pos[0][0])) <str>
    y = np.concatenate((np.ones(len(pos),dtype=int), np.zeros(len(neg),dtype=int)))
    # pos 1, neg 0

    return combined, y

In [4]:
#构造数据
def data_classfier():
    combined,y = loadfile()
    data = combined
    labels = y
    print('Shape of data tensor:', len(data))
    print('Shape of label tensor:', len(labels))
    


    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)  #打乱
    data = data[indices]

    labels = labels[indices]
    
    VALIDATION_SPLIT = 0.2
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    return x_train,y_train,x_val,y_val

In [5]:
x_train,y_train,x_val,y_val = data_classfier()

Shape of data tensor: 60000
Shape of label tensor: 60000


In [6]:
print(type(x_train))
print(type(y_train))
print(type(x_val))
print(type(y_val))
print(len(x_train))
print(len(y_train))
print(len(x_val))
print(len(y_val))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
48000
48000
12000
12000


In [7]:
train_data = []
for i in  range(len(x_train)):
    train_data.append([])
    train_data[i].append(x_train[i])
    train_data[i].append(y_train[i])

In [8]:
test_data = []
for i in  range(len(x_val)):
    test_data.append([])
    test_data[i].append(x_val[i])
    test_data[i].append(y_val[i])

## 时序最大池化层

In [9]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
         # x shape: (batch_size, channel, seq_len)
        return F.max_pool1d(x, kernel_size=x.shape[2]) # shape: (batch_size, channel, 1)

## 预处理数据集

In [10]:
batch_size = 64
vocab = d2l.get_vocab_imdb(train_data)
train_set = Data.TensorDataset(*d2l.preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*d2l.preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

## textCNN模型

In [11]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        '''
        torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2.0,
        scale_grad_by_freq=False, sparse=False, _weight=None, device=None, dtype=None)
        num_embeddings (int) – size of the dictionary of embeddings
        embedding_dim (int) – the size of each embedding vector
        '''
        # 不参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # 时序最大池化层没有权重，所以可以共用一个实例
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        '''
        ModuleList Holds submodules in a list.
        '''
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels = 2*embed_size, 
                                        out_channels = c, 
                                        kernel_size = k))

    def forward(self, inputs):
        # 将两个形状是(批量大小, 词数, 词向量维度)的嵌入层的输出按词向量连结
        embeddings = torch.cat((
            self.embedding(inputs), 
            self.constant_embedding(inputs)), dim=2) # (batch, seq_len, 2*embed_size)
        # 根据Conv1D要求的输入格式，将词向量维，即一维卷积层的通道维(即词向量那一维)，变换到前一维
        embeddings = embeddings.permute(0, 2, 1) # 交换维度的函数
        # 对于每个一维卷积层，在时序最大池化后会得到一个形状为(批量大小, 通道大小, 1)的
        # Tensor。使用flatten函数去掉最后一维，然后在通道维上连结
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [12]:
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

### 加载预训练的词向量

In [13]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache="/Users/tangshusen/Datasets/glove")
net.embedding.weight.data.copy_(
    d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
# itos（） Returns：List mapping indices to tokens.
"""load_pretrained_embedding从预训练好的vocab中提取出words对应的词向量"""
net.constant_embedding.weight.data.copy_(
    d2l.load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

There are 15341 oov words.
There are 15341 oov words.


### 训练并评价模型

In [14]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in tqdm(data_iter, desc ='evaluate'):
            y = torch.tensor(y, dtype=torch.long)# 更改label类型
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # 改回训练模式
            else: # 自定义的模型, 3.13节之后不会用到, 不考虑GPU
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

In [15]:
import time
from tqdm import tqdm
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    best_acc = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        print(time.strftime("%Y-%m-%d \t %H:%M:%S", time.localtime()), end='\t')
        for X, y in tqdm(train_iter, desc ='train'):
            X = X.to(device)
            y = torch.tensor(y, dtype=torch.long)# 更改label类型
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        if test_acc > best_acc:
            best_acc = test_acc
#             print("saving", end='\t')
#             PATH = "./SaveModel/Glove_yelp_" + str(best_acc) + ".pth"
#             torch.save(net, PATH) 
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [16]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
2022-03-14 	 22:37:46	

  del sys.path[0]
train: 100%|██████████████████████████████████████████████████████████████████████| 750/750 [00:52<00:00, 14.26it/s]
  
evaluate: 100%|███████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 37.60it/s]


epoch 1, loss 0.2338, train acc 0.901, test acc 0.951, time 57.6 sec
2022-03-14 	 22:38:44	

train: 100%|██████████████████████████████████████████████████████████████████████| 750/750 [00:52<00:00, 14.37it/s]
evaluate: 100%|███████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 38.00it/s]


epoch 2, loss 0.0615, train acc 0.954, test acc 0.962, time 57.2 sec
2022-03-14 	 22:39:41	

train: 100%|██████████████████████████████████████████████████████████████████████| 750/750 [00:52<00:00, 14.38it/s]
evaluate: 100%|███████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 38.00it/s]


epoch 3, loss 0.0270, train acc 0.971, test acc 0.952, time 57.1 sec
2022-03-14 	 22:40:38	

train: 100%|██████████████████████████████████████████████████████████████████████| 750/750 [00:52<00:00, 14.34it/s]
evaluate: 100%|███████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 37.91it/s]


epoch 4, loss 0.0136, train acc 0.980, test acc 0.960, time 57.3 sec
2022-03-14 	 22:41:35	

train: 100%|██████████████████████████████████████████████████████████████████████| 750/750 [00:52<00:00, 14.36it/s]
evaluate: 100%|███████████████████████████████████████████████████████████████████| 188/188 [00:05<00:00, 37.50it/s]

epoch 5, loss 0.0079, train acc 0.986, test acc 0.962, time 57.2 sec





In [17]:
d2l.predict_sentiment(net, vocab, ['this', 'hospital', 'is', 'so', 'great'])

'positive'

In [18]:
d2l.predict_sentiment(net, vocab, ['this', 'hospital', 'is', 'so', 'bad'])

'negative'

In [19]:
print("请输入一句医疗评价的英文：")
s = input() 
s = [i for i in s.split()]
print(s)
print(d2l.predict_sentiment(net, vocab, s))

请输入一句医疗评价的英文：
I feel that this hospital is very general and the charge is very expensive. I won't come next time.
['I', 'feel', 'that', 'this', 'hospital', 'is', 'very', 'general', 'and', 'the', 'charge', 'is', 'very', 'expensive.', 'I', "won't", 'come', 'next', 'time.']
negative
