# RNN + GRU 实现文本情感分类

In [1]:
# Load the raw train / test splits
import pandas as pd
from nltk.tokenize import WordPunctTokenizer

# Both files are tab-separated with a header row; quoting=3 is csv.QUOTE_NONE.
_tsv_options = dict(header=0, delimiter="\t", quoting=3)
train_data = pd.read_csv('C:/Users/WYX/Desktop/test/data/train.tsv', **_tsv_options)
test_data = pd.read_csv('C:/Users/WYX/Desktop/test/data/test.tsv', **_tsv_options)

# Single tokenizer instance shared by the rest of the notebook
word_tokenizer = WordPunctTokenizer()
def textPreprocess(text):
    """Return *text* re-spaced so that its tokens are separated by one blank.

    The text is split with the module-level ``word_tokenizer`` and the
    resulting tokens are joined back together with single spaces.
    """
    return ' '.join(word_tokenizer.tokenize(text))

# Pre-process the text column of both splits
for _frame in (train_data, test_data):
    _frame['Sentence'] = _frame['Sentence'].apply(textPreprocess)

num_train_texts = train_data['Sentence'].size
num_test_texts = test_data['Sentence'].size

# Collect the lower-cased vocabulary over the train and test sentences
word_set = set()
for _frame in (train_data, test_data):
    for _sentence in _frame['Sentence']:
        word_set.update(w.lower() for w in word_tokenizer.tokenize(_sentence))
print(len(word_set))

30359


In [2]:
# Define load_glove_embedding: it can use either the pre-trained GloVe vectors or an embedding the author trained earlier with TextCNN
def load_glove_embedding(word_list, uniform_scale, dimension_size,
                         glove_path='C:/Users/WYX/Desktop/test/glove.840B.300d.txt',
                         cache_path='glove_embedding.npy'):
    """Build the embedding matrix and the word -> row map for a vocabulary.

    Args:
        word_list: iterable of vocabulary words. A ``set``/``frozenset`` is
            sorted first so the row order is reproducible between runs; any
            other iterable keeps its order (duplicates are dropped).
        uniform_scale: words missing from the vector file are initialised
            from ``uniform(-uniform_scale, uniform_scale)``.
        dimension_size: length of every embedding vector.
        glove_path: text file with one ``<token> <v1> ... <vN>`` entry per
            line. Any file in that layout can be passed here, e.g. a
            self-trained embedding instead of GloVe.
        cache_path: ``.npy`` file holding a previously built matrix. It is
            only read here; nothing in this function writes it.

    Returns:
        ``(word_vectors, word_to_index)`` where ``word_vectors`` has shape
        ``(number of words, dimension_size)`` and ``word_to_index`` maps each
        word to its row.
    """
    if isinstance(word_list, (set, frozenset)):
        # Set iteration order is not stable across interpreter runs.
        words = sorted(word_list)
    else:
        words = list(dict.fromkeys(word_list))

    # Built up front so it is also available when the cache is used.
    word_to_index = {word: row for row, word in enumerate(words)}
    expected_shape = (len(words), dimension_size)

    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        # NOTE(review): only the shape can be validated here; the cache is
        # assumed to have been built with the same word order - confirm.
        if cached.shape == expected_shape:
            print('Successfully load saved embedding!')
            return cached, word_to_index
        # Shape mismatch means the cache is stale: rebuild from the text file.

    wanted = set(words)
    glove_words = {}
    with open(glove_path, 'r', encoding='utf-8') as fopen:
        for line in fopen:
            fields = line.rstrip().split(' ')
            if len(fields) <= dimension_size:
                continue  # blank or malformed line
            # Some tokens contain spaces themselves, so take the vector from
            # the right-hand end and treat everything before it as the token.
            token = ' '.join(fields[:-dimension_size])
            if token in wanted:
                # Keep only vocabulary words; the full file is far larger.
                glove_words[token] = np.array(fields[-dimension_size:], dtype=np.float32)

    word_vectors = np.zeros(expected_shape)
    for word, row in word_to_index.items():
        if word in glove_words:
            word_vectors[row] = glove_words[word]
        elif word != '<pad>':
            word_vectors[row] = np.random.uniform(-uniform_scale, uniform_scale, dimension_size)
        # '<pad>' without a pre-trained vector keeps its all-zero row.

    return word_vectors, word_to_index

In [3]:
import os
import numpy as np
# Embedding matrix and word -> row lookup for the whole vocabulary
word_vectors, word_to_index = load_glove_embedding(
    word_list=word_set,
    uniform_scale=0.25,
    dimension_size=300,
)

In [4]:
# Disable GPU use. BaseGRU.forward passes this flag to initHidden, which only
# moves the initial hidden state to CUDA when it is True.
import torch
is_cuda = False

In [5]:
# 创建 RNN + GRU 类
import torch.nn as nn

class BaseGRU(nn.Module):
    """GRU sentence classifier on top of the pre-trained embedding matrix.

    The embedding table is initialised from the module-level ``word_vectors``
    array, fed through a (optionally bidirectional, multi-layer) GRU, and the
    GRU output at the last time step is projected to ``output_size`` classes
    followed by a log-softmax.

    Args:
        input_size: embedding dimension; must equal ``word_vectors.shape[1]``.
        hidden_size: GRU hidden size per direction.
        output_size: number of classes.
        num_layers: number of stacked GRU layers.
        dropout: dropout passed to ``nn.GRU``.
        bidirectional: whether the GRU runs in both directions.

    Raises:
        ValueError: if ``word_vectors`` is not a 2-D matrix whose second
            dimension equals ``input_size``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0, bidirectional=False):
        super(BaseGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1

        # Size the table from the matrix itself rather than from literals, so
        # a different vocabulary or embedding width does not break the copy.
        pretrained = torch.from_numpy(word_vectors)
        if pretrained.dim() != 2 or pretrained.size(1) != input_size:
            raise ValueError(
                'word_vectors must have shape (vocab, {}), got {}'.format(input_size, tuple(pretrained.shape))
            )
        self.embedding = nn.Embedding(pretrained.size(0), input_size)
        # copy_ converts the numpy dtype to the dtype of the embedding weight.
        self.embedding.weight.data.copy_(pretrained)

        self.gru = nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional, dropout=dropout)
        self.h2o = nn.Linear(self.num_directions * hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        """Return log-probabilities of shape (batch, output_size).

        ``inputs`` holds word indices with the batch on dimension 0
        (``lineToIndex`` produces shape (1, sequence length)).
        """
        hidden = self.initHidden(is_cuda, batch_size=inputs.size(0))
        line = self.embedding(inputs)
        # nn.GRU is built with the default batch_first=False, so move the
        # sequence dimension to the front.
        line = torch.transpose(line, 0, 1)
        output, hidden = self.gru(line, hidden)
        # NOTE(review): with bidirectional=True the backward direction has
        # only processed the final token at this position - confirm that
        # classifying from the last time step is intended.
        output = self.h2o(output[-1])
        output = self.softmax(output)
        return output

    def initHidden(self, is_cuda=True, batch_size=1):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The shape is (num_layers * num_directions, batch_size, hidden_size);
        the tensor is moved to CUDA when ``is_cuda`` is true.
        """
        hidden = torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size)
        if is_cuda:
            hidden = hidden.cuda()
        return hidden

In [6]:
# Prepare the training data: encode the labels as class ids, then keep the
# first 90% of the rows for training and the remainder for validation.
sentiment_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
train_data['Sentiment'] = train_data['Sentiment'].map(sentiment_to_id)

num_of_training_set = int(0.9 * num_train_texts)
training_set = train_data[:num_of_training_set]
validation_set = train_data[num_of_training_set:]
#print(training_set)

In [27]:
def lineToIndex(line):
    """Convert a sentence into a (1, number of tokens) tensor of word indices.

    The sentence is tokenized with ``word_tokenizer``, each token is
    lower-cased and looked up in ``word_to_index``; a token that is not in
    the map raises ``KeyError``.
    """
    indices = [word_to_index[token.lower()] for token in word_tokenizer.tokenize(line)]
    return torch.tensor([indices])
import random
def randomChoice(pdFrame):
    """Pick one random row of *pdFrame* and return it in tensor form.

    Args:
        pdFrame: DataFrame with a 'Sentence' column (text) and a 'Sentiment'
            column (integer class id).

    Returns:
        ``(idx, text_index, polarity_tensor)``: the positional row index, the
        word-index tensor produced by ``lineToIndex`` and a 1-element long
        tensor holding the class id.

    Raises:
        ValueError: if *pdFrame* has no rows.
    """
    if len(pdFrame) == 0:
        raise ValueError('cannot sample from an empty DataFrame')
    idx = random.randint(0, len(pdFrame) - 1)
    # Fetch the row once instead of repeating the positional lookup per column.
    row = pdFrame.iloc[idx]
    text_index = lineToIndex(row['Sentence'])
    polarity_tensor = torch.tensor([row['Sentiment']], dtype=torch.long)
    return idx, text_index, polarity_tensor

def polarityFromOutput(output):
    """Return the index of the highest-scoring class in the first row of *output*."""
    best_indices = output.topk(1)[1]
    return best_indices[0].item()

In [17]:
# 训练模型
import torch.optim as optim

# Negative log-likelihood loss; it is applied to the output of BaseGRU.forward,
# which ends in a LogSoftmax layer.
criterion = nn.NLLLoss()

# Step size read by train() for its manual gradient-descent update.
learning_rate = 0.05 # If you set this too high, it might explode. If too low, it might not learn


def train(model, category_tensor, line_tensor, weight_clip=0.1):
    """Run one plain gradient-descent update on a single example.

    Args:
        model: module mapping ``line_tensor`` to log-probabilities.
        category_tensor: long tensor with the target class id.
        line_tensor: word-index tensor for the sentence.
        weight_clip: accepted for backward compatibility; not used by the
            update.

    Returns:
        ``(output, loss)``: the model output for the example and the loss as
        a Python float. The step size is the module-level ``learning_rate``
        and the loss function is the module-level ``criterion``.
    """
    output = model(line_tensor)
    model.zero_grad()
    loss = criterion(output, category_tensor)
    loss.backward()
    for p in model.parameters():
        # Parameters that took no part in the forward pass have no gradient.
        if p.grad is not None:
            # Keyword form of add_; the positional (scalar, tensor) overload
            # is deprecated.
            p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

def _sampled_accuracy(model, frame, n_samples):
    """Return the fraction of ``n_samples`` random rows of *frame* classified correctly."""
    if n_samples <= 0:
        # Nothing sampled: report NaN instead of dividing by zero.
        return float('nan')
    correct = 0
    for _ in range(n_samples):
        _, text_index, polarity_tensor = randomChoice(frame)
        if polarityFromOutput(model(text_index)) == polarity_tensor.item():
            correct += 1
    return correct / n_samples


def evaluate(model, n_train_eval, n_test_eval, save_threshold):
    """Print the accuracy of *model* on random samples of both data splits.

    Args:
        model: module to evaluate; it is switched to eval mode and left
            there, so the caller must call ``model.train()`` to resume
            training.
        n_train_eval: number of random rows drawn from ``training_set``.
        n_test_eval: number of random rows drawn from ``validation_set``.
        save_threshold: accepted for backward compatibility; not used.
    """
    model.eval()
    # No gradients are needed for evaluation, so skip building the graph.
    with torch.no_grad():
        train_acc = _sampled_accuracy(model, training_set, n_train_eval)
        val_acc = _sampled_accuracy(model, validation_set, n_test_eval)
    print('train set acc: {} | val set acc {}'.format(train_acc, val_acc))

In [19]:
import time
import math

learning_rate = 0.05
n_hidden = 128
# 300-d embeddings, 3 sentiment classes, 2 stacked bidirectional GRU layers
model = BaseGRU(
    input_size=300,
    hidden_size=n_hidden,
    output_size=3,
    num_layers=2,
    dropout=0,
    bidirectional=True,
)

n_iters = 30000
print_every = plot_every = 1000

# Keep track of losses for plotting
current_loss = last_loss = 0
all_losses = [0]

def timeSince(since):
    """Format the time elapsed since the timestamp *since* as '<minutes>m <seconds>s'.

    *since* is a ``time.time()`` value; the current time is read inside the
    function.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

# Average loss of the previous reporting window; None until one has completed,
# so the first window is never compared against the 0 placeholder that seeds
# all_losses (that comparison always triggered a learning-rate decay).
previous_average = None

# `step` rather than `iter`, which would shadow the builtin.
for step in range(1, n_iters + 1):
    idx, text_index, polarity_tensor = randomChoice(training_set)
    _, loss = train(model, polarity_tensor, text_index)
    current_loss += loss

    if step % print_every == 0:
        evaluate(model, n_train_eval=500, n_test_eval=500, save_threshold=0.72)
        # evaluate() leaves the model in eval mode.
        model.train()
    if step % plot_every == 0:
        average_loss = current_loss / plot_every
        # Decay the learning rate when the average loss went up.
        if previous_average is not None and previous_average < average_loss:
            learning_rate *= 0.9
        previous_average = average_loss
        all_losses.append(average_loss)
        print('{} {}/{} loss: {}'.format(timeSince(start), step, n_iters, average_loss))
        current_loss = 0

train set acc: 0.518 | val set acc 0.504
3m 37s 1000/30000 loss: 0.9887931494340301
train set acc: 0.56 | val set acc 0.566
7m 2s 2000/30000 loss: 0.9400041382387281
train set acc: 0.55 | val set acc 0.502
10m 28s 3000/30000 loss: 0.9727532647494227


KeyboardInterrupt: 

In [None]:
# 由于个人机器能力有限 在保证 rnn 运行正确的基础上停止了长时间运行