# 读取数据

In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
import numpy as np
 
def loadData(filename):
    data = []
    fr = open(filename, 'r', encoding='utf8')
    for line in fr.readlines():                 #逐行读取
        lineArr = line.strip().split('\t')      #滤除行首行尾空格，以\t作为分隔符，对这行进行分解
        num = np.shape(lineArr)[0]     
        data.append(["".join(lineArr[0:num-1]), int(lineArr[num-1])])#这一行的除最后一个被添加为数据
        #labelMat.append(int(lineArr[num-1]))#这一行的最后一个数据被添加为标签
    return data

In [3]:
train_data = loadData("./data/train_data.txt")
test_data = loadData("./data/test_data.txt")

"test txt len: ", len(test_data), "train txt len: ", len(train_data)
for i in train_data[0:3]:
    print(i[0], i[1])

Charlie Hebdos Last Tweet Spoofed ISIS Leader AlBaghdadi  URL 0
Trocadero square in Paris evacuated Policemen pointing their gun URL 1
held by gunman at kosher supermarket in Paris as nd hostagetaking underway AP Gunman linked to Thursdays killing of policewoman 0


# 去停用词

In [4]:
#分词和去停用词
import pandas as pd
import nltk
from string import punctuation as enpunctuation
import re

In [5]:
zhonPunctuation = u'''＂ ＃ ＄ ％ ＆ ＇ （ ） ＊ ＋ ， － ／ ： ； ＜ ＝ ＞ ＠ ［ ＼ ］ ＾ ＿ ｀ ｛ ｜ ｝ ～ ｟ ｠ ｢ ｣ ､  〃 〈 〉 《 》 「 」 『 』 【 】 〔 〕 〖 〗 〘 〙 〚 〛 〜 〝 〞 〟  〾 〿 – — ‘ ’ ‛ “ ” „ ‟ … ‧ ﹏ ﹑ ﹔ · ！ ？ ｡ → 、 。'''
punctuations = set([str(i) for i in enpunctuation]) | set([str(i) for i in zhonPunctuation]) #去标点符号
srctweet1 = [nltk.word_tokenize(st[0]) for st in train_data if st[0] != '' and st[0] not in punctuations]
srctweet2 = [nltk.word_tokenize(st[0]) for st in test_data if st[0] != '' and st[0] not in punctuations]

In [6]:
stopwords=pd.read_csv("./data/stop_words-master/english.txt",index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')
stopwords.head()

Unnamed: 0,stopword
0,😳
1,😒
2,🙈
3,😳🙈
4,😬


In [7]:
#清除标点符号
def del_mark(word):
    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》《{\]}\[✏'
    return re.sub(r"[%s]+" %punc, "", word)

In [8]:
def drop_stopwords(srctweet,stopwords):
    srctweet_clean = []
    all_words = []
    for line in srctweet:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            word = del_mark(word)
            line_clean.append(word)
            all_words.append(word)##记录所有line_clean中的词
        srctweet_clean.append(line_clean)
    return srctweet_clean,all_words
    #print (contents_clean)
        

stopwords = stopwords.stopword.values.tolist()
srctweet1,all_words = drop_stopwords(srctweet1,stopwords)
srctweet2,_  = drop_stopwords(srctweet2,stopwords)
i=0
for line in srctweet1:
    train_data[i][0] = " ".join(line)
    i+=1
i=0
for line in srctweet2:
    test_data[i][0] = " ".join(line)
    i+=1
type(train_data[0]), train_data[1]

(list, ['Trocadero square Paris evacuated Policemen gun URL', 1])

In [9]:
def get_tokenized_imdb(data):
    """
    data: list of [string, label]
    """
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]


现在，我们可以根据分好词的训练数据集来创建词典了。我们在这里过滤掉了出现次数少于5的词

In [10]:
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)

vocab = get_vocab_imdb(train_data)
'# words in vocab:', len(vocab)

('# words in vocab:', 513)

因为每条评论长度不一致所以不能直接组合成小批量，我们定义preprocess_imdb函数对每条评论进行分词，并通过词典转换成词索引，然后通过截断或者补0来将每条评论长度固定成300。

In [11]:
def preprocess_imdb(data, vocab):
    max_l = 300  # 将每条评论通过截断或者补0，使得长度变成300

    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

# 创建数据迭代器

现在，我们创建数据迭代器。每次迭代将返回一个小批量的数据。

In [12]:
batch_size = 64
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)


打印第一个小批量数据的形状以及训练集中小批量的个数。

In [13]:
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
'#batches:', len(train_iter)


X torch.Size([64, 300]) y torch.Size([64])


('#batches:', 30)

# 使用循环神经网络模型

在这个模型中，每个词先通过嵌入层得到特征向量。然后，我们使用双向循环神经网络对特征序列进一步编码得到序列信息。最后，我们将编码的序列信息通过全连接层变换为输出。具体来说，我们可以将双向长短期记忆在最初时间步和最终时间步的隐藏状态连结，作为特征序列的表征传递给输出层分类。在下面实现的BiRNN类中，Embedding实例即嵌入层，LSTM实例即为序列编码的隐藏层，Linear实例即生成分类结果的输出层。

In [14]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.decoder = nn.Linear(4*num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.embedding(inputs.permute(1, 0))
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings) # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs


创建一个含两个隐藏层的双向循环神经网络

In [15]:
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

# 读取词嵌入

In [16]:
token_to_idx_list = loadData("./data/token_to_idx.txt")
token_to_idx = {}
for item in token_to_idx_list:
    dic = {item[0]: item[1]}
    token_to_idx.update(dic)

token_to_idx['happening']

419

In [17]:
embed_size = 100
net1 = nn.Sequential(
    nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim=embed_size),
    nn.Embedding(num_embeddings=len(token_to_idx), embedding_dim=embed_size)
)
net1 = torch.load("./data/net.pt")

In [18]:
net1[0].weight.data.shape, len(token_to_idx)

(torch.Size([2327, 100]), 2327)

In [19]:
def load_pretrained_embedding(words, pretrained_vocab):
    """从预训练好的vocab中提取出words对应的词向量"""
    W = pretrained_vocab.weight.data
    embed = torch.zeros(len(words),  W.shape[1]) # 初始化为0
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = token_to_idx[word]
            embed[i, :] = W[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, net1[0]))
net.embedding.weight.requires_grad = False # 直接加载预训练好的, 所以不需要更新它

There are 157 oov words.


# 训练并评价模型

In [None]:
lr, num_epochs = 0.01, 10
# 要过滤掉不计算梯度的embedding参数
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.2419, train acc 0.903, test acc 0.847, time 1.9 sec
epoch 2, loss 0.0853, train acc 0.943, test acc 0.856, time 1.8 sec
epoch 3, loss 0.0426, train acc 0.955, test acc 0.856, time 1.8 sec
epoch 4, loss 0.0245, train acc 0.970, test acc 0.856, time 1.8 sec
epoch 5, loss 0.0195, train acc 0.973, test acc 0.852, time 1.8 sec


In [None]:
torch.save(net, './data/LSTM_net_loss=0.0135_acc=0.876.pt')

In [28]:
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    #return 'positive' if label.item() == 1 else 'negative'
    return int(1) if label.item() == 1 else int(0)

# 测试数据

In [57]:
type(test_data[0][0]), type(test_data[0][1])

(str, int)

In [42]:
#分词和去停用词
import pandas as pd
import nltk
from string import punctuation as enpunctuation
import re

In [38]:
zhonPunctuation = u'''＂ ＃ ＄ ％ ＆ ＇ （ ） ＊ ＋ ， － ／ ： ； ＜ ＝ ＞ ＠ ［ ＼ ］ ＾ ＿ ｀ ｛ ｜ ｝ ～ ｟ ｠ ｢ ｣ ､  〃 〈 〉 《 》 「 」 『 』 【 】 〔 〕 〖 〗 〘 〙 〚 〛 〜 〝 〞 〟  〾 〿 – — ‘ ’ ‛ “ ” „ ‟ … ‧ ﹏ ﹑ ﹔ · ！ ？ ｡ → 、 。'''
punctuations = set([str(i) for i in enpunctuation]) | set([str(i) for i in zhonPunctuation]) #去标点符号
srctweet = [nltk.word_tokenize(st[0]) for st in test_data if st[0] != '' and st[0] not in punctuations]

In [43]:
stopwords=pd.read_csv("./data/stop_words-master/english.txt",index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')
stopwords.head()

Unnamed: 0,stopword
0,😳
1,😒
2,🙈
3,😳🙈
4,😬


In [40]:
#清除标点符号
def del_mark(word):
    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》《{\]}\[✏'
    return re.sub(r"[%s]+" %punc, "", word)

In [26]:
def drop_stopwords(srctweet,stopwords):
    srctweet_clean = []
    all_words = []
    for line in srctweet:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            word = del_mark(word)
            line_clean.append(word)
            all_words.append(word)##记录所有line_clean中的词
        srctweet_clean.append(line_clean)
    return srctweet_clean,all_words
    #print (contents_clean)
        

stopwords = stopwords.stopword.values.tolist()
srctweet_clean,all_words = drop_stopwords(srctweet,stopwords)
type(srctweet_clean[0]), srctweet_clean[0]

AttributeError: 'list' object has no attribute 'stopword'

In [29]:
net2 = torch.load(r"D:\NoteBook\RumerDetection\data\LSTM_net_loss=0.0023_acc=0.856.pt")
net2.embedding.weight.requires_grad

False

In [30]:
test_result = []
for item in test_data:
    test_result.append([item, predict_sentiment(net2, vocab, item[0].split())])

In [20]:
type(test_result[0]), type(test_result[0])

(list, list)

In [31]:
count = 0
for i in range(0, len(test_result)):
    if int(test_result[i][1]) ^ test_data[i][1] == 0:
        count += 1
'正确数:', count, '总数: ', len(test_result), '正确率：', 1.0 * count/len(test_result) * 100

('正确数:', 163, '总数: ', 209, '正确率：', 77.99043062200957)

In [23]:
torch.save(net, './data/LSTM_net_loss=0.0135_acc=0.876.pt')

In [89]:
for i in range(0, len(test_result)):
    if int(test_result[i][1]) != test_data[i][1]:
        print('test_data: ', srctweet_clean[i], ', ', 'lable: ', test_result[i][1], '\n')

test_data:  ['UPDATE', 'Reports', 'Sydney', 'Opera', 'House', 'evacuated', 'News'] ,  lable:  0 

test_data:  ['Sydney', 'siege', 'police', 'enter', 'cafe', 'hostages', 'flee', 'scene', 'gunfire', 'heard', 'reports', 'injured', 'person', 'URL'] ,  lable:  0 

test_data:  ['Map', 'locating', 'Paris', 'offices', 'satirical', 'magazine', 'Charlie', 'Hebdo', 'AFP', 'URL'] ,  lable:  1 

test_data:  ['At', 'hostages', 'flee', 'lindtcafe', 'Sydney', 'URL'] ,  lable:  0 

test_data:  ['Plane', 'crashes', 'southern', 'France', 'board', 'Germanwings', 'budget', 'airline', 'flying', 'Barcelona', 'Dusseldorf', 'URL'] ,  lable:  0 

test_data:  ['COMING', 'UP', 'LIVE', 'Ottawa', 'shooting', 'Stephen', 'Harper', 'nation', 'URL'] ,  lable:  0 

test_data:  ['NHL', 'postpones', 'Wednesdays', 'LeafsSenators', 'tragedy', 'Ottawa', 'URL'] ,  lable:  0 

test_data:  ['BREAKING', 'Police', 'entered', 'Sydney', 'cafe', 'loud', 'bangs', 'flashes', 'stream', 'sydneysiege'] ,  lable:  0 

test_data:  ['German