<a href="https://colab.research.google.com/github/yixish/NLPLearning/blob/master/TextRCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the dataset and GloVe cache paths used
# in later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Print GPU / driver status for this runtime.
# NOTE(review): the absolute /opt/bin path is presumably specific to this Colab image - confirm.
!/opt/bin/nvidia-smi

Wed Oct 28 07:31:37 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |     10MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
import collections
import os
import random
import time
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import pandas as pd
def read_hotel(path='/content/gdrive/My Drive/dataset/Hotel_rating/train.csv', train_frac=0.8):
    '''
    Load the hotel-review CSV and split it into a training part and a held-out part.

    @params:
        path: CSV file with a `review` column (text) and a `rating` column (number)
        train_frac: fraction of the rows, counted from the top of the file, used for training
    @return:
        (train, held_out): two lists of [review, rating - 1] pairs. The rating is shifted
        down by one so that it can be used directly as a class index.
    '''
    train_df = pd.read_csv(path)

    reviews = train_df['review'].values
    ratings = train_df['rating'].values
    data = [[review, rating - 1] for review, rating in zip(reviews, ratings)]

    # Both slices must use the same cut point. Starting the held-out slice at 20%
    # (as the code did before) makes it overlap most of the training slice, which
    # leaks training samples into the evaluation.
    split = int(len(data) * train_frac)
    return data[:split], data[split:]

train_data, test_data = read_hotel()

# Show the label and the first 50 characters of the first five training samples.
for text, label in train_data[:5]:
    print(label, '\t', text[:50])

3 	 good place stay check rainforest biobay vieques/cu
4 	 great firstly did n't enjoy hong kong, 3 days quit
3 	 clean convenient hotel catedral ideally located ke
2 	 transport good class high excellent communications
3 	 stay happy la quinta, used stay travelodge street 


In [None]:
def get_tokenized(data):
    '''
    Split every review into lower-cased tokens.

    @params:
        data: list of [text, label] pairs; only the text is used
    @return: list with one token list per review. Each text is split on single
             space characters and every piece is lower-cased.
    '''
    tokenized = []
    for review, _ in data:
        tokenized.append([piece.lower() for piece in review.split(' ')])
    return tokenized

def get_vocab(data):
    '''
    Build the vocabulary of a data set.

    @params:
        data: list of [text, label] pairs, as accepted by get_tokenized
    @return: a Vocab instance built from the token counts of `data`, keeping
             only tokens that occur at least 5 times (min_freq=5)
    '''
    counter = collections.Counter()
    for tokens in get_tokenized(data):
        counter.update(tokens)
    return Vocab.Vocab(counter, min_freq=5)

# Build the vocabulary from the training split only and report its size.
vocab = get_vocab(train_data)
print('# words in vocab:', len(vocab))

# words in vocab: 14914


In [None]:
max_l = 500  # every review is truncated or padded with 0 to exactly this many tokens

def preprocess(data, vocab):
    '''
    Turn raw [text, label] pairs into index tensors.

    @params:
        data: list of [text, label] pairs (the raw data that was read in)
        vocab: vocabulary built on the training set; `vocab.stoi` maps token -> index
    @return:
        features: integer tensor of token indices, one row of length max_l per review
        labels: tensor holding the label of every review, in the same order
    '''
    def pad(x):
        # Cut long sequences, right-pad short ones with index 0.
        if len(x) > max_l:
            return x[:max_l]
        return x + [0] * (max_l - len(x))

    rows = []
    for words in get_tokenized(data):
        rows.append(pad([vocab.stoi[word] for word in words]))
    features = torch.tensor(rows)
    labels = torch.tensor([target for _, target in data])
    return features, labels

In [None]:
# Index-encode both splits with the training vocabulary and wrap them as datasets.
train_features, train_labels = preprocess(train_data, vocab)
test_features, test_labels = preprocess(test_data, vocab)
train_set = Data.TensorDataset(train_features, train_labels)
test_set = Data.TensorDataset(test_features, test_labels)

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size=batch_size)

# Peek at a single batch to check the tensor shapes, then report the batch count.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([64, 100]) y torch.Size([64])
#batches: 205


In [None]:
class TextRCNN(nn.Module):
    '''
    Recurrent-convolutional text classifier.

    Each token is represented by the concatenation of its LSTM output and its
    embedding, projected through a tanh layer, max-pooled over the sequence and
    finally mapped to `output_dim` class scores.

    @params:
        vocab: any sized object; only len(vocab) is used (number of embedding rows)
        embedding_dim: size of the token embeddings
        output_dim: number of output classes
        hidden_size: LSTM hidden size per direction
        num_layers: number of stacked LSTM layers
        bidirectional: whether the LSTM runs in both directions
        dropout: dropout probability, used on the embeddings and between LSTM layers
    '''

    def __init__(self, vocab,embedding_dim, output_dim, hidden_size, num_layers, bidirectional, dropout):
        super(TextRCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, dropout=dropout)
        # The LSTM emits hidden_size features per direction. Deriving the width
        # from `bidirectional` (instead of hard-coding 2) keeps the projection
        # consistent with the LSTM when bidirectional=False.
        num_directions = 2 if bidirectional else 1
        self.W2 = nn.Linear(num_directions * hidden_size + embedding_dim, hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        @params:
            x: integer tensor of token indices, [batch_size, seq_len]
        @return: class scores, [batch_size, output_dim]
        '''
        text = x.t()
        # text: [seq_len, batch_size]
        embedded = self.dropout(self.embedding(text))
        # embedded: [seq_len, batch_size, embedding_dim]

        outputs, _ = self.rnn(embedded)
        # outputs: [seq_len, batch_size, hidden_size * num_directions]

        outputs = outputs.permute(1, 0, 2)
        # outputs: [batch_size, seq_len, hidden_size * num_directions]

        embedded = embedded.permute(1, 0, 2)
        # embedded: [batch_size, seq_len, embedding_dim]

        x = torch.cat((outputs, embedded), 2)
        # x: [batch_size, seq_len, hidden_size * num_directions + embedding_dim]

        y2 = torch.tanh(self.W2(x)).permute(0, 2, 1)
        # y2: [batch_size, hidden_size * 2, seq_len]

        # Max over the whole sequence: the pooling window equals seq_len.
        y3 = F.max_pool1d(y2, y2.size()[2]).squeeze(2)
        # y3: [batch_size, hidden_size * 2]

        return self.fc(y3)

In [None]:
# Hyper-parameters: embedding size, LSTM hidden size per direction, number of LSTM layers.
embed_size, num_hiddens, num_layers = 100, 100, 2
out_dim = 5  # number of output classes the classifier scores
model = TextRCNN(vocab,embed_size, out_dim,num_hiddens, num_layers,True,0.5)  # bidirectional=True, dropout=0.5

In [None]:
# Load the 100-dimensional GloVe 6B vectors, using a Drive folder as the cache directory.
cache_dir = "/content/gdrive/My Drive/dataset/GloVe6B"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

def load_pretrained_embedding(words, pretrained_vocab):
    '''
    Build an embedding matrix for `words` from a set of pretrained vectors.

    @params:
        words: list of words to look up, in index order (an itos list)
        pretrained_vocab: object with `stoi` (word -> row) and `vectors` (row -> vector)
    @return:
        embed: float tensor with one row per word; rows of words whose lookup
               raises KeyError stay all-zero
    '''
    dim = pretrained_vocab.vectors[0].shape[0]
    embed = torch.zeros(len(words), dim)  # rows start at zero and are filled on a hit
    missing = 0  # number of out-of-vocabulary words
    for row, word in enumerate(words):
        try:
            source_row = pretrained_vocab.stoi[word]
            embed[row, :] = pretrained_vocab.vectors[source_row]
        except KeyError:
            missing += 1
    if missing:
        print("There are %d oov words." % missing)
    return embed

# Overwrite the randomly initialised embedding table with the GloVe vectors.
model.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
model.embedding.weight.requires_grad = False # the pretrained vectors are loaded as-is, so they are not updated

There are 4367 oov words.


In [None]:
def evaluate_accuracy(data_iter, net, device=None):
    '''
    Compute the classification accuracy of `net` over `data_iter`.

    @params:
        data_iter: iterable of (X, y) batches; y holds the class indices
        net: a torch.nn.Module, or a plain callable returning per-class scores.
             A plain callable that accepts `is_training` is called with is_training=False.
        device: device the batches are moved to (modules only); defaults to the
                device of the module's first parameter
    @return: fraction of samples whose argmax prediction equals the label
    @raises: ZeroDivisionError if `data_iter` yields no samples

    A module is switched to eval mode for the duration of the call and then put
    back into the mode it was in before, so evaluating a model that is already in
    eval mode no longer flips it into training mode.
    '''
    is_module = isinstance(net, torch.nn.Module)
    takes_flag = False
    was_training = False
    if is_module:
        if device is None:
            first_param = next(net.parameters(), None)
            if first_param is not None:
                device = first_param.device
        was_training = net.training
        net.eval()
    else:
        code = getattr(net, '__code__', None)
        takes_flag = code is not None and 'is_training' in code.co_varnames

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    if device is not None:
                        X, y = X.to(device), y.to(device)
                    y_hat = net(X)
                elif takes_flag:
                    y_hat = net(X, is_training=False)
                else:
                    y_hat = net(X)
                acc_sum += (y_hat.argmax(dim=1) == y).float().sum().cpu().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    '''
    Train `net` and print loss / accuracy statistics after every epoch.

    @params:
        train_iter: iterable of (X, y) training batches
        test_iter: iterable of (X, y) batches passed to evaluate_accuracy after each epoch
        net: the model; it is moved to `device`
        loss: callable loss(y_hat, y) returning a scalar tensor
        optimizer: optimizer over the parameters of `net`
        device: device to train on
        num_epochs: number of passes over `train_iter`
    @raises: ZeroDivisionError if `train_iter` yields no batches

    The printed loss is the mean batch loss of the current epoch.
    '''
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        # Make sure dropout etc. are active regardless of the mode the model was left in.
        net.train()
        # All running statistics, including the batch counter, restart every epoch.
        # A counter that keeps growing across epochs would divide this epoch's loss
        # sum by the cumulative batch count and make the loss look like it shrinks.
        train_l_sum, train_acc_sum, n, batch_count = 0.0, 0.0, 0, 0
        start = time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [None]:
# Adam only receives the parameters that still require gradients.
lr, num_epochs = 0.01, 10
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, test_iter, model, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 1.1483, train acc 0.498, test acc 0.576, time 6.5 sec
epoch 2, loss 0.5040, train acc 0.558, test acc 0.615, time 6.4 sec
epoch 3, loss 0.3292, train acc 0.567, test acc 0.625, time 6.4 sec
epoch 4, loss 0.2409, train acc 0.578, test acc 0.597, time 6.4 sec
epoch 5, loss 0.1931, train acc 0.580, test acc 0.614, time 6.4 sec
epoch 6, loss 0.1584, train acc 0.586, test acc 0.620, time 6.4 sec
epoch 7, loss 0.1367, train acc 0.584, test acc 0.633, time 6.4 sec
epoch 8, loss 0.1194, train acc 0.584, test acc 0.616, time 6.4 sec
epoch 9, loss 0.1071, train acc 0.578, test acc 0.604, time 6.3 sec
epoch 10, loss 0.0950, train acc 0.586, test acc 0.619, time 6.4 sec


In [None]:
def predict_sentiment(net, vocab, sentence):
    '''
    Predict the rating of a single tokenised review.

    @params:
        net: the trained model
        vocab: vocabulary built on this data set; `vocab.stoi` maps token -> index
        sentence: the review as a sequence of words; only the first max_l words are used
    @return: the predicted rating, i.e. the argmax class index + 1

    The model is run in eval mode with gradients disabled and afterwards put back
    into the mode it was in, so dropout cannot make the prediction random.
    '''
    device = list(net.parameters())[0].device  # device the model lives on
    # Lower-case the words like get_tokenized() does: the vocabulary is built from
    # lower-cased tokens, so any other casing would fall back to the unknown index.
    indices = [vocab.stoi[word.lower()] for word in sentence[:max_l]]
    if not indices:
        # An empty review is fed as a single index 0, the value preprocess() pads with.
        indices = [0]
    batch = torch.tensor(indices, device=device).view((1, -1))

    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(batch), dim=1)
    finally:
        net.train(was_training)
    return label.item()+1


In [None]:
# Read the test reviews and store the predicted rating of each one in a `rating` column.
test_df =  pd.read_csv('/content/gdrive/My Drive/dataset/Hotel_rating/test.csv');
reviews =  list(test_df['review'].values)  # NOTE(review): not referenced again in the visible cells
test_df['rating'] = test_df['review'].map(lambda x: predict_sentiment(model,vocab,x.split()))
test_df.describe()

Unnamed: 0,id,rating
count,4099.0,4099.0
mean,2049.0,3.982923
std,1183.423705,1.323734
min,0.0,1.0
25%,1024.5,4.0
50%,2049.0,4.0
75%,3073.5,5.0
max,4098.0,5.0


In [None]:
# Drop the raw text and write the remaining columns to Drive without a header row or index.
# NOTE(review): re-running this cell fails because `review` has already been dropped from test_df.
test_df = test_df.drop('review',axis =1 )
test_df.to_csv('/content/gdrive/My Drive/dataset/sub.csv', header=None,index=False)  