In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import jieba
import re

In [7]:
# Load the raw training and test tables into DataFrames.
# NOTE(review): this cell reads from 'data/', while later cells in this notebook
# read and write under '../data/' — confirm the intended working directory.
train=pd.read_csv('data/train.csv')
test=pd.read_csv('data/test_dataset.csv')

In [8]:
def seg_words(text):
    """Clean ``text`` and segment it into words with jieba.

    Every character that is not a CJK ideograph in U+4E00..U+9FA5, an ASCII
    letter or an ASCII digit is replaced by a single space, and leading and
    trailing whitespace is stripped before segmentation.

    Args:
        text: The string to clean and segment.

    Returns:
        The iterable of tokens produced by ``jieba.cut`` in accurate mode
        (``cut_all=False``).
    """
    cleaned = re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9]', ' ', text).strip()
    return jieba.cut(cleaned, cut_all=False)

In [9]:
# Segment the `content` column of both tables into space-separated tokens.
# Missing values are replaced by an empty string first: str(NaN) is "nan",
# which survives the cleaning regex and would be added to the text as a word.
train['content_seg'] = (
    train['content'].fillna('').astype(str).apply(lambda x: " ".join(seg_words(x)))
)
test['content_seg'] = (
    test['content'].fillna('').astype(str).apply(lambda x: " ".join(seg_words(x)))
)
train[['content', 'content_seg']].head()

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/j3/8hf614ps7xx_nl_knd25th4r0000gn/T/jieba.cache
Loading model cost 0.840 seconds.
Prefix dict has been built successfully.


Unnamed: 0,content,content_seg
0,回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：,回复 新浪 网友 对 博文 国家文物局 限制 鉴宝 节目 现场 估价 转 的 评论 ...
1,//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....,分享 网易 新闻 发生 在 昆明 的 火锅店 老板 辱 滇门 云南 人该...
2,西宁城管围殴民警扬言要把警察打死|西宁城管围...,西宁 城管 围殴 民警 扬言 要 把 警察 打死 西宁 城管 围
3,【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...,川航 航班 因 驾驶舱 风挡 破裂 安全 备降 成都 今天上午 6 26 从 重庆江...
4,支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】,支持 郑强 贵州大学 校长 回应 空姐 言论 常给 她们 写 感谢信


In [10]:
# Preview the first rows of the test table, including the new `content_seg` column.
test.head()

Unnamed: 0,id,content,picture_lists,category,comment_2,comment_all,content_seg
0,fb7abd30e64904d4aec80913f39c2a4d,因为带了口罩和害怕唾沫的关系，现在街上即便行人不少也基本只有脚步声，在打印店打印东西，店里同...,,疫情,,,因为 带 了 口罩 和 害怕 唾沫 的 关系 现在 街上 即便 行人 不少 也 基本 只...
1,33862231893127fa396812b4fa9cc709,台词汇编457（@万善公）两周！/钟南山院士呼吁：解决疫情最快，成本最低的方式就是全中国人民...,,疫情,,,台词 汇编 457 万善公 两周 钟南山 院士 呼吁 解决 疫情 最...
2,0fe350647b75a2729c9b6d5604fa4baa,从武汉撤回的日本人，迎接他们的是每人一台救护车，206人=206台救护车，进行隔离。这就是连...,,疫情,,,从 武汉 撤回 的 日本 人 迎接 他们 的 是 每人 一台 救护车 206 人 ...
3,7de07ad7a1eacff14b0ab88303bde62b,武汉地铁。钟院士的防病毒高招: 各位去医院或其他公共场合之前用淡盐水漱一下咽喉部位，回...,,疫情,,,武汉 地铁 钟 院士 的 防病毒 高招 各位 去 医院 或 其他 公...
4,73952631593ee8f33a9b42bd66caaf96,重要的事情说三遍！钟南山院士呼吁： 钟南山院士呼吁： 钟南山院士呼吁：​目...,,疫情,,,重要 的 事情 说 三遍 钟南山 院士 呼吁 钟南山 院士 呼吁 ...


In [11]:
# Build segmented "content + comments" columns for both the training and the
# test table.


def _segment_with_comments(frame, comment_col):
    """Return `content` joined with `comment_col`, cleaned and segmented.

    Missing values become empty strings (otherwise astype(str) turns NaN into
    the literal word "nan"), and a space separates the two parts so the last
    word of the content cannot merge with the first word of the comments.
    """
    joined = (
        frame["content"].fillna("").astype(str)
        + " "
        + frame[comment_col].fillna("").astype(str)
    )
    return joined.apply(lambda x: " ".join(seg_words(x)))


train["content_comment_2c_seg"] = _segment_with_comments(train, "comment_2c")
train["content_comment_all_seg"] = _segment_with_comments(train, "comment_all")

# The test table names this column `comment_2`, not `comment_2c` as in train.
test["content_comment_2c_seg"] = _segment_with_comments(test, "comment_2")
test["content_comment_all_seg"] = _segment_with_comments(test, "comment_all")

In [None]:
def preprocess(train_seg, test_seg, num_words = 10000, maxlen = None):
    """Turn segmented text into padded integer sequences.

    A Keras ``Tokenizer`` is fitted on the training text only and then applied
    to both the training and the test text, so both share one vocabulary.

    Args:
        train_seg: Iterable (e.g. a pandas Series) of space-separated
            training texts.
        test_seg: Iterable of space-separated test texts.
        num_words: Vocabulary size passed to the tokenizer.
        maxlen: Length every sequence is padded/truncated to. When ``None``
            (the default) the longest tokenised training sequence is used.
            The previous implementation used the character length of the
            longest string here, which counts characters and spaces rather
            than words and therefore padded far more than necessary.

    Returns:
        ``(train_sequence_pad, test_sequence_pad, word_index)``: the two padded
        arrays and the tokenizer's word -> index dictionary.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_seg)  # builds the vocabulary
    sequences_train = tokenizer.texts_to_sequences(train_seg)

    if maxlen is None:
        # Measure length in tokens, after encoding, not in characters.
        maxlen = max((len(seq) for seq in sequences_train), default=0)

    # Keras needs equal-length inputs, so pad with the reserved index 0.
    train_sequence_pad = tf.keras.preprocessing.sequence.pad_sequences(
        sequences_train, maxlen=maxlen, value=0
    )
    # word_index is only populated after fit_on_texts has been called.
    word_index = tokenizer.word_index
    print('train_sequence_pad shape:',train_sequence_pad.shape)

    # Encode the test text with the same tokenizer and the same length.
    sequences_test = tokenizer.texts_to_sequences(test_seg)
    test_sequence_pad = tf.keras.preprocessing.sequence.pad_sequences(
        sequences_test, maxlen=maxlen, value=0
    )
    print('test_sequence_pad shape:',test_sequence_pad.shape)
    return train_sequence_pad, test_sequence_pad, word_index

In [None]:
# Tokenise and pad the segmented `content` text of the train and test tables.
# NOTE(review): `word_index_cotent` looks like a typo for "content"; the name is
# read again in a later cell, so it is left unchanged here.
(
    train_content_sequence_pad,
    test_content_sequence_pad,
    word_index_cotent,
) = preprocess(train_seg=train["content_seg"], test_seg=test["content_seg"])

In [None]:
# Tokenise and pad the segmented "content + comment_2c" text (train and test).
# A separate tokenizer is fitted inside preprocess, so this vocabulary is
# independent of the content-only one.
(
    train_content_comment_2c_sequence_pad,
    test_content_comment_2c_sequence_pad,
    word_index_with_comment_2c,
) = preprocess(
    train_seg=train["content_comment_2c_seg"], test_seg=test["content_comment_2c_seg"]
)

In [None]:
# Tokenise and pad the segmented "content + comment_all" text (train and test).
# NOTE(review): `test_content__comment_all_sequence_pad` has a double underscore,
# unlike the sibling variable names — probably a typo; confirm before renaming.
(
    train_content_comment_all_sequence_pad,
    test_content__comment_all_sequence_pad,
    word_index_with_comment_all,
) = preprocess(
    train_seg=train["content_comment_all_seg"], test_seg=test["content_comment_all_seg"]
)

In [None]:
# Display the word -> index dictionary fitted on the content-only text.
word_index_cotent

In [None]:

# Persist the padded `content` sequences and the `label` column as .npy files.
np.save( '../data/train_content_sequence_pad.npy',train_content_sequence_pad)
np.save("../data/test_content_sequence_pad.npy", test_content_sequence_pad)
np.save( "../data/train_label.npy",train["label"].to_numpy())

In [None]:
# Persist the segmented text columns themselves.
# NOTE(review): these are presumably object-dtype arrays of strings, which
# np.save stores via pickle — loading them would then need allow_pickle=True.
np.save( "../data/test_text.npy", test["content_seg"].to_numpy())
np.save( "../data/train_text.npy",
    train[
        ["content_seg", "content_comment_2c_seg", "content_comment_all_seg"]
    ].to_numpy(),
)

In [None]:
# Display the `label` column as a NumPy array.
train["label"].to_numpy()

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from matplotlib.pyplot import imshow

In [None]:
# Locations of the raw data. Paths are built with os.path.join instead of
# string concatenation, which previously produced doubled separators such as
# '../data//train_images/'.
data_dir = '../data/'
# The trailing '' keeps a trailing separator on the directory names, because
# later cells build image paths as `train_images_dir + filename`.
train_images_dir = os.path.join(data_dir, 'train_images', '')
test_images_dir = os.path.join(data_dir, 'test_images', '')
train_file = os.path.join(data_dir, 'train.csv')
test_file = os.path.join(data_dir, 'test_dataset.csv')

# os.listdir returns entries in arbitrary order; sort for reproducible lists.
train_images_filenames = [
    os.path.join(train_images_dir, filename)
    for filename in sorted(os.listdir(train_images_dir))
]
test_images_filenames = [
    os.path.join(test_images_dir, filename)
    for filename in sorted(os.listdir(test_images_dir))
]

# Target (height, width) every image is resized to.
FIG_SIZE = (10,10)

In [None]:

def preprocess_image(image, target_fig_size):
    """Decode an encoded image, resize it and scale pixel values to [0, 1].

    Args:
        image: Encoded image file contents, as returned by ``tf.io.read_file``.
        target_fig_size: ``(height, width)`` to resize the image to.

    Returns:
        A tensor of shape ``(height, width, 3)`` with values in ``[0, 1]``.

    ``tf.image.decode_image`` detects the file type automatically. By default
    it returns a 4-D ``[num_frames, height, width, 3]`` tensor for every GIF,
    which cannot be reshaped into a single image by the callers.
    ``expand_animations=False`` makes it return a 3-D tensor for all file types
    and keeps only the first frame of an animated GIF.
    """
    img_tensor = tf.image.decode_image(image, channels=3, expand_animations=False)
    img_final = tf.image.resize(img_tensor, list(target_fig_size))
    return img_final / 255.0  # normalize to [0,1] range

def load_and_preprocess_image(path, target_fig_size):
    """Read the file at ``path`` and run its bytes through ``preprocess_image``.

    Args:
        path: Location of the image file, passed to ``tf.io.read_file``.
        target_fig_size: Forwarded unchanged to ``preprocess_image``.

    Returns:
        Whatever ``preprocess_image`` returns for the file contents.
    """
    return preprocess_image(tf.io.read_file(path), target_fig_size)

In [None]:
# Target image size used by the feature-extraction loops below; this repeats
# the value already assigned to FIG_SIZE in the paths cell above.
FIG_SIZE = (10,10)

In [None]:
# Build one image tensor per training row and stack them into `feature`
# with shape (n_rows, FIG_SIZE[0], FIG_SIZE[1], 3).
#  * rows without pictures get an all-zero image;
#  * rows with pictures use only the first file listed in `picture_lists`;
#  * files that cannot be read or decoded also fall back to the zero image.
# The frames are collected in a list and concatenated once: calling tf.concat
# inside the loop copies the whole accumulated tensor on every iteration.
# Rows are visited by position, which matches the default RangeIndex that
# pd.read_csv produces.
zero_tensor = tf.zeros([1, *FIG_SIZE, 3])  # defined up front so the except branch can always use it
frames = []
count = []  # indices of the rows processed so far
for i, picture_lists in enumerate(train['picture_lists']):
    if pd.isna(picture_lists):
        frames.append(zero_tensor)
        print(i, 'done zero')
    else:
        first_image_name = picture_lists.split('\t')[0]
        first_image_path = train_images_dir + first_image_name
        try:
            tf_tensor = load_and_preprocess_image(first_image_path, FIG_SIZE)
            frames.append(tf.reshape(tf_tensor, [1, *FIG_SIZE, 3]))
            print(i, first_image_path, 'done jpg')
        except Exception as err:
            # Best effort: keep the row, but report why the image was skipped.
            frames.append(zero_tensor)
            print(i, first_image_path, 'done other:', err)
    count.append(i)

if frames:
    feature = tf.concat(frames, axis=0)
else:
    feature = tf.zeros([0, *FIG_SIZE, 3])

In [None]:
# Persist the stacked training image tensor as a .npy file.
np.save('../data/train_image_feature.npy', feature)

In [None]:
# Build one image tensor per test row and stack them into `test_feature`
# with shape (n_rows, FIG_SIZE[0], FIG_SIZE[1], 3).
#  * rows without pictures get an all-zero image;
#  * rows with pictures use only the first file listed in `picture_lists`;
#  * files that cannot be read or decoded also fall back to the zero image.
# The frames are collected in a list and concatenated once: calling tf.concat
# inside the loop copies the whole accumulated tensor on every iteration.
# Rows are visited by position, which matches the default RangeIndex that
# pd.read_csv produces.
zero_tensor = tf.zeros([1, *FIG_SIZE, 3])  # defined up front so the except branch can always use it
test_frames = []
count = []  # indices of the rows processed so far
for i, picture_lists in enumerate(test['picture_lists']):
    if pd.isna(picture_lists):
        test_frames.append(zero_tensor)
        print(i, picture_lists)
    else:
        first_image_name = picture_lists.split('\t')[0]
        first_image_path = test_images_dir + first_image_name
        try:
            tf_tensor = load_and_preprocess_image(first_image_path, FIG_SIZE)
            test_frames.append(tf.reshape(tf_tensor, [1, *FIG_SIZE, 3]))
            print(i, first_image_path)
        except Exception as err:
            # Best effort: keep the row, but report why the image was skipped.
            test_frames.append(zero_tensor)
            print(i, first_image_path, 'failed:', err)
    count.append(i)

if test_frames:
    test_feature = tf.concat(test_frames, axis=0)
else:
    test_feature = tf.zeros([0, *FIG_SIZE, 3])

In [None]:
# Persist the stacked test image tensor as a .npy file.
np.save('../data/test_image_feature.npy',test_feature)

In [None]:
# Save the training labels: the 'ncw_label', 'fake_label' and 'real_label' columns.
np.save( '../data/train_labels.npy', train[['ncw_label','fake_label','real_label']].to_numpy()) # the saved labels are in one-hot form

## 建模部分

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import jieba
import re
import time

In [None]:
# Reload the training table for the modelling part and preview it.
df=pd.read_csv('../data/train.csv')
df.head()

In [None]:
import jieba
# Replace `content` by its jieba segmentation, joined with spaces.
# jieba.cut only accepts strings, so missing values are turned into '' first
# instead of raising on a float NaN.
df['content'] = (
    df['content'].fillna('').astype(str).apply(lambda x: " ".join(jieba.cut(x)))
)
df.head()

In [12]:
# Keep only the segmented text and the three label columns for modelling.
df1 = df[['content', 'ncw_label','fake_label','real_label']]

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
maxlen = 100  # every text is padded/truncated to this many tokens
max_words = 10000  # vocabulary size given to the tokenizer and the embedding

In [None]:
# Fit a tokenizer (constructed with num_words=max_words) on the segmented
# text and encode every text as a sequence of word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df1.content)
sequences = tokenizer.texts_to_sequences(df1.content)
word_index = tokenizer.word_index
# Pad/truncate every sequence to exactly `maxlen` entries.
data = pad_sequences(sequences, maxlen=maxlen)
# One row per sample holding the three label columns.
labels = np.array(df1[['ncw_label','fake_label','real_label']])

In [None]:
# Random 80/20 split of the samples into training and validation parts.
#
# `data` and `labels` are deliberately NOT reordered in place: later cells pass
# them to combined_model.fit together with `train_image_feature`, which is
# stored in file order, so shuffling them here would pair each text with the
# image of a different sample. Indexing with a permutation also makes this cell
# safe to re-run, and the fixed seed makes the split reproducible.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
training_samples = int(len(indices) * .8)
validation_samples = len(indices) - training_samples

train_idx = indices[:training_samples]
valid_idx = indices[training_samples:]
X_train = data[train_idx]
y_train = labels[train_idx]
X_valid = data[valid_idx]
y_valid = labels[valid_idx]

In [None]:
#!pip install gensim

In [None]:
from gensim.models import KeyedVectors

### 加载预训练好的词向量

In [None]:
# Load the pretrained word vectors (word2vec text format) and show their
# dimensionality. `vector_size` is used instead of indexing through
# `zh_model.vocab`, an attribute that gensim 4 removed.
zh_model = KeyedVectors.load_word2vec_format('../data/zh.vec')
zh_model.vector_size

In [None]:
# Dimensionality of the pretrained vectors (`.vocab` no longer exists in
# gensim 4, `vector_size` is available in both old and new versions).
embedding_dim = zh_model.vector_size
# Start from uniform random values in [-1, 1); rows of words that have a
# pretrained vector are overwritten in the next cell.
embedding_matrix = np.random.rand(max_words, embedding_dim)
embedding_matrix = (embedding_matrix - 0.5) * 2

In [None]:
# Copy the pretrained vector of every in-vocabulary word into its row.
for word, i in word_index.items():
    if i >= max_words:
        # Outside the rows of embedding_matrix.
        continue
    try:
        embedding_matrix[i] = zh_model.get_vector(word)
    except KeyError:
        # No pretrained vector for this word: keep its random initialisation.
        # Only the "missing key" case is ignored, so any other error surfaces.
        continue
            

### LSTM提取文本特征

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM,Dropout

units = 32  # size of the LSTM state

# Text classifier: frozen pretrained embedding -> LSTM -> small dense head
# -> 3-way softmax.
model = Sequential()
# input_length gives the first layer a known input shape, so the model is
# built (and the embedding weights exist) as soon as the layer is added.
# Without it the Sequential model stays unbuilt, and both set_weights() and
# summary() below fail.
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(LSTM(units))
model.add(Dropout(0.2))
model.add(Dense(24,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

# Load the pretrained vectors into the embedding and freeze them.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()



In [None]:
%%time
# Compile and train the LSTM text classifier, then save it to disk.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])
# Stop when 'val_acc' has not improved for 10 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=10)
# validation_split=0.2 holds out part of X_train for validation; the
# X_valid / y_valid arrays created by the split cell are not used by this call.
history = model.fit(X_train, y_train,
                    epochs=30,
                    batch_size=32,
                    validation_split=0.2,callbacks=[early_stop])
model.save("mymodel.h5")

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by the last fit() call.
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# One figure for accuracy, one for loss; both drawn the same way.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for panel_no, (train_curve, valid_curve, train_label, valid_label, title) in enumerate(panels):
    if panel_no > 0:
        plt.figure()  # start a new figure for every panel after the first
    plt.plot(epochs, train_curve, label=train_label)
    plt.plot(epochs, valid_curve, 'b', label=valid_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Load the image features written by the earlier np.save cell.
# NOTE(review): allow_pickle=True is only required for object arrays;
# presumably this file holds a plain float array — confirm and drop the flag.
train_image_feature = np.load( '../data/train_image_feature.npy', allow_pickle=True)
#test_image_feature = np.load('../data/test_image_feature.npy', allow_pickle=True)

In [None]:
# Text branch input: padded sequences of word indices, embedded with the
# frozen pretrained vectors.
# The sequences are fixed-length (pad_sequences pads them to `maxlen`), so the
# input shape is tied to that constant instead of a hard-coded 100.
train_content_sequence = keras.Input(
    shape=(maxlen,)
)
Embedding_layer = keras.layers.Embedding(
    max_words, embedding_dim, weights=[embedding_matrix], trainable=False
)(train_content_sequence)

In [None]:
# Text branch: LSTM(16) over the embedded sequence -> Dense(8, relu)
# -> Dense(3, softmax) class probabilities.
lstm_text=tf.keras.layers.LSTM(16)(Embedding_layer)
lstm_text= tf.keras.layers.Dense(8,activation='relu')(lstm_text)
lstm_test_pr=tf.keras.layers.Dense(3,activation="softmax")(lstm_text)
text_model= keras.Model(inputs=train_content_sequence, outputs=lstm_test_pr)

In [None]:
### CNN提取图片特征

In [None]:
# Image branch: three 2x2 convolutions, each followed by dropout, then flatten.
# The input shape (10, 10, 3) matches FIG_SIZE plus three colour channels.
image_input = keras.Input(shape=(10, 10, 3))  
#image_feature = keras.layers.Dropout(0.2)(image_input)
# First convolution uses padding="same"; the next two use the default padding.
image_feature = keras.layers.Conv2D(15, kernel_size = 2, padding="same",activation = 'relu')(image_input)
image_feature = keras.layers.Dropout(0.2)(image_feature)
image_feature = keras.layers.Conv2D(5, kernel_size = 2, activation = 'relu')(image_feature)
image_feature = keras.layers.Dropout(0.2)(image_feature)
image_feature = keras.layers.Conv2D(5, kernel_size = 2, activation = 'relu')(image_feature)
image_feature = keras.layers.Dropout(0.2)(image_feature)
#image_feature = keras.layers.MaxPooling2D(2)(image_feature)
image_feature = keras.layers.Flatten()(image_feature)

In [None]:
# Classification head of the image branch: Dense(8, relu) -> dropout
# -> Dense(3, softmax), wrapped into its own model.
x = keras.layers.Dense(8, activation="relu")(image_feature)
x = keras.layers.Dropout(0.2)(x)
#x = keras.layers.Dense(8, activation="relu")(x)
im_pred = keras.layers.Dense(3, activation="softmax", name="pred")(x)
img_model=keras.Model(inputs=image_input, outputs=im_pred)
img_model.summary()

### 文本与图片融合多模态

In [None]:
# Late fusion: concatenate the class probabilities predicted by the text model
# and by the image model, then classify again on top of them.
merged_out = keras.layers.concatenate([text_model.output, img_model.output])
merged_out=keras.layers.Dropout(0.2)(merged_out)
merged_out =keras.layers.Dense(3, activation='softmax')(merged_out)

combined_model = keras.Model([text_model.input,img_model.input], merged_out)
# summary() prints the table itself and returns None, so it is not wrapped in print().
combined_model.summary()

# The last layer already applies softmax, so the loss receives probabilities,
# not logits: from_logits must be False. With from_logits=True the loss would
# apply a second softmax to the model output.
combined_model.compile(optimizer=keras.optimizers.RMSprop(1e-3),
              loss=keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
import time
%%time
early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)
history =  combined_model.fit([data, train_image_feature],
                    labels, epochs=30, batch_size=32, validation_split=0.2,verbose=1,callbacks= [early_stop])

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by the last fit() call.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)


def _draw_curves(train_curve, valid_curve, train_label, valid_label, title):
    # Draw a training/validation pair of curves on the current figure.
    plt.plot(epochs, train_curve, label=train_label)
    plt.plot(epochs, valid_curve, 'b', label=valid_label)
    plt.title(title)
    plt.legend()


_draw_curves(acc, val_acc, 'Training acc', 'Validation acc',
             'Training and validation accuracy')
plt.figure()  # the loss curves go on a second figure
_draw_curves(loss, val_loss, 'Training loss', 'Validation loss',
             'Training and validation loss')
plt.show()

In [None]:
###  BiLSTM+ATTENTION多模态

In [None]:
from keras.layers import Bidirectional
# BiLSTM + self-attention text branch.
#
# The previous version applied Attention to the 2-D output of a Dense layer
# and never connected the result to the classifier, so the attention layer had
# no effect on the model. Here the BiLSTM returns the whole sequence, dot-product
# self-attention is computed over its time steps, and the pooled attention
# output is actually fed to the classifier.
bilstm_seq = keras.layers.Bidirectional(
    keras.layers.LSTM(16, return_sequences=True)
)(Embedding_layer)  # (batch, time steps, 32)
attention = keras.layers.Attention(use_scale=False)([bilstm_seq, bilstm_seq])
# Average over time so both representations become one vector per sample.
Input_layer = tf.keras.layers.Concatenate()([
    keras.layers.GlobalAveragePooling1D()(bilstm_seq),
    keras.layers.GlobalAveragePooling1D()(attention),
])
lstm_text1 = keras.layers.Dense(8,activation='relu')(Input_layer)
lstm_test_pr = keras.layers.Dense(3,activation="softmax")(lstm_text1)
text_model1 = keras.Model(inputs=train_content_sequence ,outputs=lstm_test_pr)
text_model1.summary()

In [None]:
# Image branch for the second multimodal model. Same layout as the earlier
# image model, except that the 2x2 max-pooling before Flatten is enabled here.
image_input = keras.Input(shape=(10, 10, 3))  
#image_feature = keras.layers.Dropout(0.2)(image_input)
image_feature = keras.layers.Conv2D(15, kernel_size = 2, padding="same",activation = 'relu')(image_input)
image_feature = keras.layers.Dropout(0.2)(image_feature)
image_feature = keras.layers.Conv2D(5, kernel_size = 2, activation = 'relu')(image_feature)
image_feature = keras.layers.Dropout(0.2)(image_feature)
image_feature = keras.layers.Conv2D(5, kernel_size = 2, activation = 'relu')(image_feature)
image_feature = keras.layers.Dropout(0.2)(image_feature)
image_feature = keras.layers.MaxPooling2D(2)(image_feature)
image_feature = keras.layers.Flatten()(image_feature)
# Classification head: Dense(8, relu) -> dropout -> Dense(3, softmax).
x = keras.layers.Dense(8, activation="relu")(image_feature)
x = keras.layers.Dropout(0.2)(x)
#x = keras.layers.Dense(8, activation="relu")(x)
im_pred = keras.layers.Dense(3, activation="softmax", name="pred")(x)
# This rebinds `img_model`, replacing the model defined in the earlier cell.
img_model=keras.Model(inputs=image_input, outputs=im_pred)
img_model.summary()

In [None]:
# Late fusion: concatenate the class probabilities predicted by the
# BiLSTM text model and by the image model, then classify again on top of them.
merged_out = keras.layers.concatenate([text_model1.output, img_model.output])
merged_out =keras.layers.Dense(3, activation='softmax')(merged_out)
combined_model = keras.Model([text_model1.input,img_model.input], merged_out)
# summary() prints the table itself and returns None, so it is not wrapped in print().
combined_model.summary()

# The last layer already applies softmax, so the loss receives probabilities,
# not logits: from_logits must be False. With from_logits=True the loss would
# apply a second softmax to the model output.
combined_model.compile(optimizer=keras.optimizers.RMSprop(1e-3),
              loss=keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
%%time
# Monitor the held-out accuracy, as the text-only model's callback does:
# training accuracy keeps rising while a model overfits, so it is not a
# useful stopping signal.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10)
# The text sequences, image features and labels are matched row by row, so all
# three arrays must be in the same sample order.
history =  combined_model.fit([data, train_image_feature],
                    labels, epochs=30, batch_size=32, validation_split=0.2,verbose=1,callbacks= [early_stop])

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by the last fit() call.
acc, val_acc, loss, val_loss = [
    history.history[name]
    for name in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
]
epochs = range(1, len(acc) + 1)

# Accuracy on the first figure.
for curve, style, legend_text in (
    (acc, '', 'Training acc'),
    (val_acc, 'b', 'Validation acc'),
):
    plt.plot(epochs, curve, style, label=legend_text)
plt.title('Training and validation accuracy')
plt.legend()

# Loss on a second figure.
plt.figure()
for curve, style, legend_text in (
    (loss, '', 'Training loss'),
    (val_loss, 'b', 'Validation loss'),
):
    plt.plot(epochs, curve, style, label=legend_text)
plt.title('Training and validation loss')
plt.legend()

plt.show()