In [98]:
import collections
import os
import random
import math
import time
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [99]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True
# 设置随机数种子
setup_seed(20)

In [100]:
# def load_data():
#     data = []
#     sample_num = 25000

#     with open('./data/positive_process.data', 'r', encoding='utf-8') as f:
#         sentences = f.readlines()
#         for sentence in sentences[:sample_num]:
#             words = [x for x in sentence.strip().split('\t')]
#             data.append([words, 0])

#     with open('./data/negative_process.data', 'r', encoding='utf-8') as f:
#         sentences = f.readlines()
#         for sentence in sentences[:sample_num]:
#             words = [x for x in sentence.strip().split('\t')]
#             data.append([words, 1])

#     random.shuffle(data)
#     return data

from sklearn.utils import shuffle
path = "/home/zw/HDD/opinion/opinionExtraction/"
def load_data():
    data = []
    df = pd.read_csv(path + "non_advice.csv", encoding = "utf-8")
    for sentence,label in zip(df["comments"],df["label"]):
        words = sentence
        sentiment = label
        data.append([words,sentiment])
    random.shuffle(data)
    return data

train_data, test_data = train_test_split(load_data(), test_size=0.2)
# test_data, dev_data = train_test_split(test_data, test_size=0.5)
# train_data

In [101]:
def get_vocab(data):
    tokenized_data = [words for words, _ in data]
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)


vocab = get_vocab(train_data)
print('# words in vocab:', len(vocab))

# words in vocab: 1595


In [102]:
def preprocess(data, vocab):
    max_l = 100  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = [words for words, _ in data]
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [103]:
batch_size = 16
train_set = Data.TensorDataset(*preprocess(train_data, vocab))
test_set = Data.TensorDataset(*preprocess(test_data, vocab))
# dev_set = Data.TensorDataset(*preprocess(dev_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
# dev_iter = Data.DataLoader(dev_set, batch_size)

In [104]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([16, 100]) y torch.Size([16])
#batches: 946


In [105]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.embedding(inputs.permute(1, 0))
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings)  # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs

# class BiLSTM_Attention(nn.Module):
#     def __init__(self, vocab, embedding_dim, num_hiddens, num_layers):
#         super(BiLSTM_Attention, self).__init__()
#         self.embedding = nn.Embedding(len(vocab), embedding_dim)
#         # bidirectional设为True即得到双向循环神经网络
#         self.encoder = nn.LSTM(input_size=embedding_dim,
#                                hidden_size=num_hiddens,
#                                num_layers=num_layers,
#                                batch_first=True,
#                                bidirectional=True)
#         self.w_omega = nn.Parameter(torch.Tensor(
#             num_hiddens * 2, num_hiddens * 2))
#         self.u_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, 1))
#         self.decoder = nn.Linear(4*num_hiddens, 2)

#         nn.init.uniform_(self.w_omega, -0.1, 0.1)
#         nn.init.uniform_(self.u_omega, -0.1, 0.1)

#     def forward(self, inputs):
#         # inputs的形状是(seq_len,batch_size)
#         embeddings = self.embedding(inputs.permute(1, 0))
#         # 提取词特征，输出形状为(seq_len,batch_size,embedding_dim)
#         # rnn.LSTM只返回最后一层的隐藏层在各时间步的隐藏状态。
#         outputs, _ = self.encoder(embeddings)  # output, (h, c)
#         # outputs形状是(seq_len,batch_size, 2 * num_hiddens)
#         x = outputs.permute(1, 0, 2)
#         # x形状是(batch_size, seq_len, 2 * num_hiddens)
       
#         # Attention过程
#         u = torch.tanh(torch.matmul(x, self.w_omega))
#        # u形状是(batch_size, seq_len, 2 * num_hiddens)
#         att = torch.matmul(u, self.u_omega)
#        # att形状是(batch_size, seq_len, 1)
#         att_score = F.softmax(att, dim=1)
#        # att_score形状仍为(batch_size, seq_len, 1)
#         scored_x = x * att_score
#        # scored_x形状是(batch_size, seq_len, 2 * num_hiddens)
#         # Attention过程结束
       
#         feat = torch.sum(scored_x, dim=1)
#        # feat形状是(batch_size, 2 * num_hiddens)
#         outs = self.decoder(feat)
#        # out形状是(batch_size, 2)
#         return outs

In [106]:
cache = '.vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
glove_vocab = Vocab.Vectors(name='./douyin_work/sgns.renmin.bigram-char', cache=cache)

In [107]:
class BiLSTM_Attention(nn.Module):
    def __init__(self, vocab, embedding_dim, num_hiddens, num_layers):
        super(BiLSTM_Attention, self).__init__()
        # embedding之后的shape: torch.Size([200, 8, 300])
        self.embedding = nn.Embedding(len(vocab), embedding_dim)
        # self.embedding = self.embedding.from_pretrained(vocab.vectors, freeze=False)
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               batch_first=False,
                               bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.w_omega = nn.Parameter(torch.Tensor(
            num_hiddens * 2, num_hiddens * 2))
        self.u_omega = nn.Parameter(torch.Tensor(num_hiddens * 2, 1))
        self.decoder = nn.Linear(2*num_hiddens, 2)

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def forward(self, inputs):
        # inputs的形状是(seq_len,batch_size)
        embeddings = self.embedding(inputs.permute(1, 0))
        # print(embeddings.size())
        # 提取词特征，输出形状为(seq_len,batch_size,embedding_dim)
        # rnn.LSTM只返回最后一层的隐藏层在各时间步的隐藏状态。
        outputs, _ = self.encoder(embeddings)  # output, (h, c)
        # outputs形状是(seq_len,batch_size, 2 * num_hiddens)
        x = outputs.permute(1, 0, 2)
        # print(x.size())
        # x形状是(batch_size, seq_len, 2 * num_hiddens)
        
        # Attention过程
        u = torch.tanh(torch.matmul(x, self.w_omega))
       # u形状是(batch_size, seq_len, 2 * num_hiddens)
        att = torch.matmul(u, self.u_omega)
       # att形状是(batch_size, seq_len, 1)
        att_score = F.softmax(att, dim=1)
       # att_score形状仍为(batch_size, seq_len, 1)
        scored_x = x * att_score
       # scored_x形状是(batch_size, seq_len, 2 * num_hiddens)
        # Attention过程结束
        
        feat = torch.sum(scored_x, dim=1)
       # feat形状是(batch_size, 2 * num_hiddens)
        outs = self.decoder(feat)
        # print(outs.size())
       # out形状是(batch_size, 2)
        return outs
vocab.vectors

In [108]:
embed_size, num_hiddens, num_layers = 300, 300, 2
net = BiLSTM_Attention(vocab, embed_size, num_hiddens, num_layers)


In [109]:
def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # 初始化为0
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed


net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False  # 直接加载预训练好的, 所以不需要更新它

There are 34 oov words.


In [110]:
def evaluate(data_iter, net, device=None): # 测试函数
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    TP, TN, FP, FN = 0.0, 0.0, 0.0, 0.0
    with torch.no_grad():
        for X, y in data_iter: 
            net.eval()  # 评估模式, 这会关闭dropout
            # net(X.to(device)).argmax(dim=1)
            pred = net(X.to(device)).argmax(dim=1).cpu()
            test = y.to(device).cpu()
            net.train()  # 改回训练模式
            TP += ((pred == 1) & (test == 1)).cpu().sum()
            # TN    predict 和 label 同时为0
            TN += ((pred == 0) & (test == 0)).cpu().sum()
            # FN    predict 0 label 1
            FN += ((pred == 0) & (test == 1)).cpu().sum()
            # FP    predict 1 label 0
            FP += ((pred == 1) & (test == 0)).cpu().sum()
        p = TP / (TP + FP)
        r = TP / (TP + FN)
        F1 = 2 * r * p / (r + p)
        acc = (TP + TN) / (TP + TN + FP + FN)       
    return acc, p, r, F1
from sklearn.metrics import accuracy_score
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            net.eval()  # 评估模式, 这会关闭dropout
            acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
            net.train()  # 改回训练模式
            n += y.shape[0]

    return acc_sum/n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs): # 训练函数
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in tqdm(train_iter):
            # target = target.view(args.batch_size)
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            # print(y)
            # print(y_hat)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc, test_precision, test_R, test_F1 = evaluate(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, precision %.3f, R %.3f, F1 %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, test_precision, test_R, test_F1, time.time() - start))
        # print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
        #       % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

lr, num_epochs = 0.003, 20
# 要过滤掉不计算梯度的embedding参数
net.train()
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
#计算准确率

  0%|          | 4/946 [00:00<00:31, 30.28it/s]training on  cuda
100%|██████████| 946/946 [00:30<00:00, 31.52it/s]
  0%|          | 4/946 [00:00<00:29, 31.69it/s]epoch 1, loss 0.3324, train acc 0.842, test acc 0.941, precision 0.936, R 0.958, F1 0.947, time 32.0 sec
100%|██████████| 946/946 [00:32<00:00, 28.92it/s]
  0%|          | 3/946 [00:00<00:32, 29.21it/s]epoch 2, loss 0.0751, train acc 0.945, test acc 0.950, precision 0.940, R 0.972, F1 0.956, time 34.7 sec
100%|██████████| 946/946 [00:33<00:00, 28.40it/s]
  0%|          | 3/946 [00:00<00:32, 29.01it/s]epoch 3, loss 0.0331, train acc 0.965, test acc 0.964, precision 0.970, R 0.964, F1 0.967, time 35.3 sec
100%|██████████| 946/946 [00:33<00:00, 28.41it/s]
  0%|          | 4/946 [00:00<00:29, 32.10it/s]epoch 4, loss 0.0180, train acc 0.974, test acc 0.964, precision 0.975, R 0.959, F1 0.967, time 35.2 sec
100%|██████████| 946/946 [00:32<00:00, 29.03it/s]
  0%|          | 4/946 [00:00<00:27, 34.38it/s]epoch 5, loss 0.0104, train ac

In [111]:
# CNN
import collections
import os
import random
import math
import time
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from sklearn.model_selection import train_test_split

from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def load_data():
    data = []
    df = pd.read_csv("./douyin_f.csv", encoding = "utf-8")
    for sentence,label in zip(df["comments"],df["label"]):
        words = sentence
        sentiment = label
        data.append([words,sentiment])
    random.shuffle(data)
    return data

def get_vocab(data):
    tokenized_data = [words for words, _ in data]
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)


def preprocess(data, vocab):
    max_l = 100  # 将每条评论通过截断或者补0，使得长度变成500

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = [words for words, _ in data]
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels


def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # 初始化为0
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed


def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            net.eval()  # 评估模式, 这会关闭dropout
            acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
            net.train()  # 改回训练模式
            n += y.shape[0]
    return acc_sum / n


def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    opt_test_acc = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in tqdm(train_iter):
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

train_data, test_data = train_test_split(load_data(), test_size=0.2)
batch_size = 32
vocab = get_vocab(train_data)
train_set = Data.TensorDataset(*preprocess(train_data, vocab))
test_set = Data.TensorDataset(*preprocess(test_data, vocab))
# dev_set = Data.TensorDataset(*preprocess(dev_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)


class GlobalMaxPool1d(nn.Module): # 用一维池化层实现时序最大池化层
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()

    def forward(self, x):
        # x shape: (batch_size, channel, seq_len)
        # return shape: (batch_size, channel, 1)
        return F.max_pool1d(x, kernel_size=x.shape[2])


class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # 不参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # 时序最大池化层没有权重，所以可以共用一个实例
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, inputs):
        # 将两个形状是(批量大小, 词数, 词向量维度)的嵌入层的输出按词向量连结
        embeddings = torch.cat((
            self.embedding(inputs),
            self.constant_embedding(inputs)), dim=2)  # (batch, seq_len, 2*embed_size)
        # 根据Conv1D要求的输入格式，将词向量维，即一维卷积层的通道维(即词向量那一维)，变换到前一维
        embeddings = embeddings.permute(0, 2, 1)
        # 对于每个一维卷积层，在时序最大池化后会得到一个形状为(批量大小, 通道大小, 1)的
        # Tensor。使用flatten函数去掉最后一维，然后在通道维上连结
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # 应用丢弃法后使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs


embed_size, kernel_sizes, nums_channels = 300, [3, 4, 5], [300, 300, 300]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

cache = '.vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
glove_vocab = Vocab.Vectors(name='./douyin_work/sgns.renmin.bigram-char', cache=cache)
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

  6%|▌         | 12/206 [00:00<00:01, 112.25it/s]There are 17 oov words.
There are 17 oov words.
training on  cuda
100%|██████████| 206/206 [00:01<00:00, 112.78it/s]
  6%|▌         | 12/206 [00:00<00:01, 117.53it/s]epoch 1, loss 0.4713, train acc 0.773, test acc 0.805, time 2.0 sec
100%|██████████| 206/206 [00:01<00:00, 115.33it/s]
  6%|▌         | 12/206 [00:00<00:01, 112.87it/s]epoch 2, loss 0.1815, train acc 0.834, test acc 0.842, time 1.9 sec
100%|██████████| 206/206 [00:01<00:00, 111.52it/s]
  6%|▌         | 12/206 [00:00<00:01, 114.03it/s]epoch 3, loss 0.0944, train acc 0.873, test acc 0.847, time 2.0 sec
100%|██████████| 206/206 [00:01<00:00, 110.97it/s]
  6%|▌         | 12/206 [00:00<00:01, 112.17it/s]epoch 4, loss 0.0567, train acc 0.901, test acc 0.838, time 2.0 sec
100%|██████████| 206/206 [00:01<00:00, 111.08it/s]
epoch 5, loss 0.0374, train acc 0.929, test acc 0.846, time 2.0 sec


In [112]:
# import csv
# path = "/home/zw/HDD/opinion/opinionExtraction/"
# f = open(path + "non_advice.csv","r")
# positive = list()
# negetive = list()
# positive.append("label,negative_prob,positive_prob,confidence,comments")
# negetive.append("label,negative_prob,positive_prob,confidence,comments")
# for line in open(path + "gov_final.csv","r"):
#     line = f.readline()
#     str = line.split(",")
#     if str[0] == '1':
#         positive.append(line)
#     if str[0] == "0":
#         negetive.append(line)
# f.close()
# open(path + 'p_result.csv', 'w').write('%s' % "".join(positive))
# open(path + 'n_result.csv', 'w').write('%s' % "".join(negetive))
