In [1]:
import numpy as np
import pandas as pd
import re

# 导入词向量模型，https://github.com/Embedding/Chinese-Word-Vectors

In [2]:
from gensim.models import KeyedVectors

In [3]:
# Load the pre-trained Chinese word vectors from a text-format
# (binary=False) word2vec file. unicode_errors="ignore" is passed so that
# tokens which fail to decode do not abort the load.
cn_model = KeyedVectors.load_word2vec_format(
    './embeddings/sgns.weibo.bigram',
    binary=False,
    unicode_errors="ignore",
)

In [46]:
# Look up the row index of a sample word once and reuse it, so the vector
# printed below always belongs to that word instead of to a hard-coded row
# number that only matches for one particular embedding file.
sample_index = cn_model.vocab['心情'].index
print(sample_index)
print(cn_model.vectors[sample_index].shape)
print(cn_model.vectors[sample_index])

666
(300,)
[ 0.867238  0.447538 -0.491654  0.442702  0.3249   -0.135766 -0.040395
 -0.004786 -0.542705  0.473954 -0.087931  0.182698 -0.513613  0.468792
  0.414553  0.623913 -0.409261 -0.469407  0.514468 -0.918192 -0.359212
 -0.827985  0.474507  0.150142  0.21658   0.598472 -1.086412  0.492395
  0.741502  1.547145 -0.094865  0.318981 -0.01839  -0.410346 -0.449421
  0.148408  0.723303  0.504586  0.969177  0.276921 -0.1546   -0.646725
 -0.534673  0.905004  0.544035 -0.060806 -0.958402 -1.208919  0.5922
  0.007131 -0.072191  0.461774  0.70442   0.669932  0.091214 -1.372572
 -0.262632 -0.21109  -0.204028  0.773246 -0.257194  0.018511 -0.320097
  0.878415 -0.310128 -0.234066  0.032051  0.122899  0.215627  0.026293
  0.643676  0.119276 -0.8249   -0.01256  -0.516165  0.1447    0.302147
 -0.372154  0.149966 -0.677081  0.186576  0.717744  0.631457  0.328822
 -0.404428  0.051025  1.094605  0.154441  0.315256  0.43526   0.546499
  0.232901 -0.792701  0.644786  1.588248 -0.143094  0.900562 -0.4694

# 读取数据

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Read the tab-separated dataset; the two named columns are the label
# (is_not_rumor) and the post text (content).
weibo = pd.read_csv(
    './data/all_data.txt',
    sep='\t',
    names=['is_not_rumor', 'content'],
    encoding='utf-8',
)
weibo.head()

Unnamed: 0,is_not_rumor,content
0,0,【李登辉今天凌晨心脏病复发身亡】台北快讯：原国民党、台联党主席，有“台独教父”之称的李登辉，...
1,1,有一男生，平时老在一起玩，关系很好，喜欢他很久了。昨天表白，跟他说喜欢他，他笑了一下问我，喜...
2,0,【央视员工爆料：广西惊现帝王局长】 广西都安民政局长黄某一个人吃着509份底保，九套房子，6...
3,0,朋友们远离穿这种图案服装的人，发给你的家人、爱人、朋友，如果见到请立即报警。 ​
4,1,中国好声音都长得像xx什么的。。。（精选超像+搞笑） 来自 人人网 李梓瑶。


In [6]:
# Plain Python lists of the post texts and their labels
content = weibo['content'].tolist()
label = weibo['is_not_rumor'].tolist()

In [33]:
# Show the first sample as "<label>\t<text>"
'\t'.join([str(label[0]), content[0]])

'0\t【李登辉今天凌晨心脏病复发身亡】台北快讯：原国民党、台联党主席，有“台独教父”之称的李登辉，与今天凌晨在其家中因心脏病复发抢救无效死亡。'

# 分词和tokenize，https://github.com/lancopku/PKUSeg-python

In [7]:
import pkuseg

In [8]:
# One stop word per line; quoting=3 is csv.QUOTE_NONE, so quote characters
# in the file are read literally. The result is a plain list of entries.
stopwords = pd.read_csv(
    "./stopwords/stopwords.txt",
    index_col=False,
    sep="\t",
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)['stopword'].tolist()

In [9]:
# Create the PKUSeg word segmenter with its 'web' model; this one instance
# is reused for every text that is segmented in this notebook.
seg = pkuseg.pkuseg(model_name='web')

In [10]:
# Convert every post into a list of embedding-vocabulary indices.
#
# Steps per text: strip punctuation/whitespace, segment into words, drop
# stop words, then map each remaining word to its index in cn_model.
# Words missing from the embedding vocabulary are mapped to index 0.

# Raw string: the pattern contains regex escapes such as \s and \. that are
# invalid *string* escapes and trigger warnings in a normal string literal.
punct_pattern = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")

# Set membership is O(1); testing against the list rescans it for every word.
stopword_set = set(stopwords)

train_tokens = []
for text in content:
    words = seg.cut(punct_pattern.sub("", text))

    token_ids = []
    for word in words:
        if word in stopword_set:
            continue
        try:
            # Map the word to its index in the embedding vocabulary
            token_ids.append(cn_model.vocab[word].index)
        except KeyError:
            # Out-of-vocabulary words are represented by index 0
            token_ids.append(0)
    train_tokens.append(token_ids)

In [11]:
# Length (token count) of every tokenised sample
num_tokens = np.array([len(sample) for sample in train_tokens])
# Sequence length used for padding: mean length plus two standard deviations,
# truncated to an integer
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

58

In [12]:
# Bring every sample to exactly max_tokens entries. With padding='pre' and
# truncating='pre', short samples get filler values at the front and long
# samples lose their leading tokens, so the end of each text is kept.
train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                            padding='pre', truncating='pre')

In [34]:
# Display the padded index matrix (one row per sample) as a sanity check
train_pad

array([[    0,     0,     0, ..., 14638, 12594,  2143],
       [    0,     0,     0, ...,   120,   120,  2664],
       [    0,     0,     0, ..., 19557,    68, 27912],
       ...,
       [    0,     0,     0, ...,  1289,     0,   696],
       [    0,     0,     0, ...,   569,   791,     0],
       [    0,     0,     0, ...,     0,     0,     0]])

# 生成词向量

In [13]:
# Use only the first 50k vocabulary entries (the original note describes
# these as the 50k most frequently used words)
num_words = 50000
# Every word is represented by a vector of length 300
embedding_dim = 300
# Row r holds the vector of the word whose vocabulary index is r
embedding_matrix = np.array(
    [cn_model[cn_model.index2word[row]] for row in range(num_words)],
    dtype='float32',
)

In [14]:
# Indices outside the truncated embedding matrix are replaced by 0 (in place)
beyond_vocab = train_pad >= num_words
train_pad[beyond_vocab] = 0
# Labels as a NumPy array, aligned row-for-row with train_pad
train_target = np.asarray(label)

# 训练

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples as a test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    train_pad, train_target, test_size=0.1, random_state=12
)

In [17]:
# Sequential model: frozen pre-trained embeddings -> two stacked
# bidirectional LSTMs -> small dense head with a sigmoid output.
model = Sequential([
    # Embedding layer initialised from embedding_matrix and not trained
    Embedding(num_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=max_tokens,
              trainable=False),
    # First BiLSTM returns the full sequence so the second one can consume it
    Bidirectional(LSTM(units=64, return_sequences=True)),
    Bidirectional(LSTM(units=32, return_sequences=False)),
    # Fully connected layers; the single sigmoid unit yields a score in [0, 1]
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=1e-3)



In [18]:
import os

In [19]:
# Location of the saved weights; if an '.index' file exists for this path,
# the previously saved weights are loaded into the model.
checkpoint_save_path = "./checkpoint/rumor_LSTM.ckpt"
has_saved_weights = os.path.exists(checkpoint_save_path + '.index')
if has_saved_weights:
    print('----------load the model----------')
    model.load_weights(checkpoint_save_path)

In [20]:
# Callback that writes the weights only (save_weights_only=True) and only
# when val_loss improves (save_best_only=True)
checkpoint = ModelCheckpoint(
    filepath=checkpoint_save_path,
    monitor='val_loss',
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
)

In [21]:
# Stop training when val_loss has not improved for 5 epochs
earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Multiply the learning rate by 0.1 (never going below 1e-8) when val_loss
# stops improving; patience=0 means no grace epochs are granted
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    min_lr=1e-8,
    patience=0,
    verbose=1,
)

# Callbacks handed to model.fit. The `checkpoint` callback is deliberately
# left out of this list (it was disabled in the original cell).
callbacks = [earlystopping, lr_reduction]

In [22]:
# Binary classification setup: binary cross-entropy loss, accuracy reported
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [23]:
# Train for at most 20 epochs with batches of 128 samples. validation_split=0.1
# sets aside 10% of X_train/y_train for validation, which provides the
# val_loss monitored by the callbacks passed in `callbacks`.
model.fit(X_train, y_train,validation_split=0.1,epochs=20,batch_size=128,callbacks=callbacks)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 2744 samples, validate on 305 samples
Epoch 1/20
Epoch 2/20
Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 00011: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 12/20
Epoch 00012: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 13/20
Epoch 00013: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 14/20
Epoch 00014: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 15/20
Epoch 00015: ReduceLROnPlateau reducing learning rate to 1e-08.
Epoch 00015: early stopping


<tensorflow.python.keras.callbacks.History at 0x18410c4a3c8>

# 保存模型

In [24]:
# Persist the trained model to an .h5 file in the working directory.
# NOTE(review): the "58" in the file name presumably refers to the max_tokens
# value computed above — confirm before reusing with a different length.
model.save('LSTM_rumor_model_58.h5')

In [25]:
# Evaluate on the held-out test split; with metrics=['accuracy'] the second
# entry of the result is the accuracy
result = model.evaluate(X_test, y_test)
print('Accuracy:' + format(result[1], '.2%'))

Accuracy:87.61%


In [26]:
def predict_rumor_LSTM(text,label):
    """Print the model's rumor / non-rumor prediction for a single text.

    The text is preprocessed the same way as the training data: punctuation
    and whitespace are stripped, the text is segmented with ``seg``, stop
    words are removed, each word is mapped to its index in ``cn_model``
    (0 for unknown words and for indices >= ``num_words``), and the result
    is padded/truncated to ``max_tokens``.

    Args:
        text: The raw post text to classify.
        label: The true class, 0 (rumor) or 1 (non-rumor); it is only used
            to print the ground truth next to the prediction.

    Returns:
        None. The input text, the true and predicted class, and the raw
        model output are printed.

    Raises:
        KeyError: If ``label`` is neither 0 nor 1.
    """
    print(text)
    # Raw string: the pattern holds regex escapes (\s, \., ...) that are
    # invalid string escapes and trigger warnings in a normal literal.
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "",text)
    words = [word for word in seg.cut(text) if word not in stopwords]

    token_ids = []
    for word in words:
        try:
            index = cn_model.vocab[word].index
        except KeyError:
            # Word is not in the embedding vocabulary
            index = 0
        # Clamp with the same limit that was applied to the training data
        token_ids.append(index if index < num_words else 0)

    tokens_pad = pad_sequences([token_ids], maxlen=max_tokens,
                               padding='pre', truncating='pre')

    dic={0:'谣言',1:'非谣言'}
    # Single sample, single sigmoid unit -> scalar score
    coef = model.predict(x=tokens_pad)[0][0]
    # Scores of at least 0.5 are reported as class 1 (non-rumor)
    predicted = dic[1] if coef >= 0.5 else dic[0]
    print('真实是'+dic[label],'预测是'+predicted,'output=%.2f'%coef)
    print('---------------------------------------------')

In [27]:
# Hand-picked sample posts and their true labels (0 = rumor, 1 = non-rumor)
test_list = [
    '兴仁县今天抢小孩没抢走，把孩子母亲捅了一刀，看见这车的注意了，真事，车牌号辽HFM055！！！！！赶紧散播！ 都别带孩子出去瞎转悠了 尤其别让老人自己带孩子出去 太危险了 注意了！！！！辽HFM055北京现代朗动，在各学校门口抢小孩！！！110已经 证实！！全市通缉！！',
    '重庆真实新闻:2016年6月1日在重庆梁平县袁驿镇发生一起抢儿童事件，做案人三个中年男人，在三中学校到镇街上的一条小路上，把小孩直接弄晕(儿童是袁驿新幼儿园中班的一名学生)，正准备带走时被家长及时发现用棒子赶走了做案人，故此获救！请各位同胞们以此引起非常重视，希望大家有爱心的人传递下',
    '@尾熊C 要提前预习育儿知识的话，建议看一些小巫写的书，嘻嘻',
]
test_label = [0, 0, 1]
# Run the predictor on each post together with its true label
for sample_text, sample_label in zip(test_list, test_label):
    predict_rumor_LSTM(sample_text, sample_label)

兴仁县今天抢小孩没抢走，把孩子母亲捅了一刀，看见这车的注意了，真事，车牌号辽HFM055！！！！！赶紧散播！ 都别带孩子出去瞎转悠了 尤其别让老人自己带孩子出去 太危险了 注意了！！！！辽HFM055北京现代朗动，在各学校门口抢小孩！！！110已经 证实！！全市通缉！！
真实是谣言 预测是谣言 output=0.10
---------------------------------------------
重庆真实新闻:2016年6月1日在重庆梁平县袁驿镇发生一起抢儿童事件，做案人三个中年男人，在三中学校到镇街上的一条小路上，把小孩直接弄晕(儿童是袁驿新幼儿园中班的一名学生)，正准备带走时被家长及时发现用棒子赶走了做案人，故此获救！请各位同胞们以此引起非常重视，希望大家有爱心的人传递下
真实是谣言 预测是谣言 output=0.11
---------------------------------------------
@尾熊C 要提前预习育儿知识的话，建议看一些小巫写的书，嘻嘻
真实是非谣言 预测是非谣言 output=0.54
---------------------------------------------
