In [None]:
%matplotlib inline
import os
import re
import collections
import pickle
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, Lambda, LSTM, Embedding, Conv1D, TimeDistributed, Add
from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [None]:
def load(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def dump(value, path):
    """Pickle ``value`` to ``path``, creating missing parent directories.

    Args:
        value: any picklable object.
        path: destination file path. It may be a bare file name, in which
            case the file is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # dirname() is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(value, f)
          
def save_model(model, path):
    """Save ``model`` to ``path`` via ``model.save``, creating parent directories.

    Args:
        model: any object exposing a ``save(path)`` method (here, a Keras model).
        path: destination path. It may be a bare file name, in which case no
            directory is created.
    """
    parent = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent:
        os.makedirs(parent, exist_ok=True)
    model.save(path)

In [None]:
# Directory holding the pickled dataset, and directory the trained model is
# written to. Both end with '/' because file names are appended directly.
dataset_path = '../data/dataset/'
model_path = '../model/fake_news_classifier_model/'

# Lookup tables; only their lengths are used in the cells visible here.
user2ind = load(f'{dataset_path}user2ind.pkl')
eid2ind = load(f'{dataset_path}eid2ind.pkl')

# Event ids iterated over by the training and evaluation loops.
eid_train = load(f'{dataset_path}eid_train.pkl')
eid_test = load(f'{dataset_path}eid_test.pkl')

# X and dict_ are loaded but not referenced by the cells visible here.
X = load(f'{dataset_path}X.pkl')
# Per-event arrays, indexed by event id in the training loop:
#   X_dict[eid]    -> input for the main (LSTM) branch
#   y_dict[eid]    -> label for the event
#   subX_dict[eid] -> input for the sub (scoring) branch
X_dict = load(f'{dataset_path}X_dict.pkl')
y_dict = load(f'{dataset_path}y_dict.pkl')
dict_ = load(f'{dataset_path}dict_.pkl')
subX_dict = load(f'{dataset_path}subX_dict.pkl')

In [None]:
# Notebook-wide configuration read by the model-building and training cells.
'''
matrix_main is used for LSTM input.
matrix_sub is used for the scoring module.
'''
# Best test accuracy seen so far; the training cell compares against it and
# overwrites it whenever an epoch scores higher.
acc=0

# Sizes of the loaded lookup tables (not referenced by the cells visible here).
nb_users = len(user2ind)
nb_events = len(eid2ind)
# Width of the last axis of the main-branch Input layer.
nb_features = 2+20+100    # (#temporal, #user, #doc)
# Number of units in the main-branch LSTM.
dim_hidden = 50

In [None]:
##### Main part #####
# Main branch: variable-length sequence of nb_features-wide vectors -> fixed 100-d vector.
inputs = Input(shape=(None, nb_features))
# Apply the same 100-unit tanh Dense layer to every timestep.
emb_out = TimeDistributed(Dense(100, activation='tanh'))(inputs)    # W_e
emb_out = Dropout(0.2)(emb_out)
# return_sequences=False: only the LSTM output for the last timestep is kept.
lstm_out = LSTM(dim_hidden, activation='tanh', return_sequences=False)(emb_out)    #(None, dim_hidden)
lstm_out = Dense(100, activation='tanh')(lstm_out)     # (None, 100) W_r
lstm_out = Dropout(0.2)(lstm_out)

In [None]:
##### Sub part #####
# Sub branch: scores every row of the sub input, then averages the scores.
nb_score = 1
nb_expand = 100
# Feature width of the sub-branch input. It was previously referenced without
# ever being defined, which raises NameError on a fresh kernel. The training
# loop feeds subX_dict[eid][np.newaxis, :, :] into sub_input, so the last axis
# of those arrays is the width this layer must accept.
# NOTE(review): assumes every array in subX_dict has the same last-axis size.
nb_feature_sub = int(next(iter(subX_dict.values())).shape[-1])
sub_input = Input(shape=(None, nb_feature_sub))
# Per-row expansion to nb_expand tanh units, with L2 weight regularisation.
user_vec = TimeDistributed(Dense(nb_expand, activation='tanh',
                                 kernel_regularizer=keras.regularizers.l2(0.01)))(sub_input)   # (None, None, nb_expand)
# Per-row sigmoid score.
sub_h = TimeDistributed(Dense(nb_score, activation='sigmoid'))(user_vec)    # (None, None, nb_score)
# Mean over axis 1 collapses the variable-length axis into one score per sample.
z = Lambda(lambda x: K.mean(x, axis=1), output_shape=lambda s: (s[0], s[2]))(sub_h)    #(None, nb_score)

In [None]:
##### Concatenate #####
# Reduce the main branch to a single sigmoid unit, then add the sub-branch
# mean score element-wise.
# NOTE(review): out1 and z are both sigmoid-derived, so their sum lies in
# [0, 2] rather than [0, 1]; it is later used with binary_crossentropy and a
# 0.5 threshold -- confirm this is intended.
out1 = keras.layers.Dense(1, activation='sigmoid')(lstm_out)
concat_out = Add()([out1, z])
# concat_out = merge([rnn_out, z], mode='concat', concat_axis=1)
# concat_out = concatenate([rnn_out, z], axis=1)

##### Classifier #####
# outputs = Dense(1, activation='sigmoid')(concat_out)
# outputs = Dense(1, activation='sigmoid')(concat_out)
# No extra classifier layer is applied: the summed score is the model output.
outputs = concat_out

##### Model #####
# All four models are built from the same layer objects and so share weights:
#   hvector - both inputs -> summed score (same inputs/outputs as `model`)
#   zscore  - sub input   -> per-row sigmoid scores (before averaging)
#   model   - both inputs -> summed score; this is the one compiled and trained
#   uvector - sub input   -> per-row expanded vectors
hvector = Model(inputs=[inputs, sub_input], outputs=concat_out)
zscore = Model(inputs=sub_input, outputs=sub_h)
model = Model(inputs=[inputs, sub_input], outputs=outputs)
uvector = Model(inputs=sub_input, outputs=user_vec)

In [None]:
##### Compile #####
# Use `learning_rate` instead of the deprecated `lr` alias, and drop `decay`:
# decay=0.0 was a no-op and newer Keras optimizers reject that keyword.
adam = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=adam, loss='binary_crossentropy')
print("Model is compiled.")
model.summary()

In [None]:
# Show the labels collected by the most recent evaluation pass.
# `y_test` is first assigned inside the training/evaluation cell further down,
# so a bare reference here raises NameError under Restart & Run All. The lookup
# displays the value when it exists and displays nothing otherwise.
globals().get('y_test')

In [None]:
# Show the thresholded predictions from the most recent evaluation pass.
# `preds` is first assigned inside the training/evaluation cell further down,
# so a bare reference here raises NameError under Restart & Run All. The lookup
# displays the value when it exists and displays nothing otherwise.
globals().get('preds')

In [None]:
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix

### Training... ###
# Each epoch fits the model on every training event one sample at a time
# (sequence lengths differ between events, so each event is its own batch),
# then scores the test events and checkpoints the model when test accuracy
# improves on the best value held in `acc`.
nb_epoch = 30

# NOTE(review): range(nb_epoch+1) runs nb_epoch+1 passes (ep = 0..nb_epoch);
# kept as-is because it may be deliberate so the last pass is a verbose one.
for ep in range(nb_epoch+1):
    print("{} epoch!!!!!!!!".format(ep))
    # Per-sample Keras logging only on every 10th epoch.
    fit_verbose = 2 if ep % 10 == 0 else 0

    ##### Looping for eid_train #####
    losses = []
    for eid in eid_train:

        print(f'eid:{eid}')

        trainX = X_dict[eid].astype(np.float32)
        sub_trainX = subX_dict[eid].astype(np.float32)
        trainY = y_dict[eid]

        # np.newaxis adds the leading batch axis (batch of one event).
        h = model.fit([trainX[np.newaxis,:,:], sub_trainX[np.newaxis,:,:]], np.array([trainY]), batch_size=1, epochs=1, verbose=fit_verbose)

        losses.append(h.history['loss'][0])
    print("%% mean loss : {}".format(np.mean(losses)))

    ### Evaluation ###
    preds = []
    y_test = []

    for eid in eid_test:

        testX = X_dict[eid].astype(np.float32)
        sub_testX = subX_dict[eid].astype(np.float32)

        y_test.append(y_dict[eid])

        pred = model.predict([testX[np.newaxis,:,:], sub_testX[np.newaxis,:,:]], verbose=0)
        preds.append(pred[0,0])

    preds = np.array(preds)
    # NOTE(review): the model output is the sum of two sigmoid-derived scores,
    # so it can exceed 1; confirm that 0.5 is the intended decision threshold.
    preds = preds>0.5
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
    # y_test/preds; without it the 4-way unpack raises ValueError.
    # NOTE(review): assumes the labels in y_dict are 0/1 -- confirm.
    tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()
    epoch_acc = (tp+tn)/(tp+tn+fp+fn)
    print("%%% Test results {} samples %%%".format(len(y_test)))
    print("accuracy: {}".format(epoch_acc))
    # Each metric is reported as "<positive class> / <negative class>".
    print("precision : {:.4f} / {:.4f}".format(tp/(tp+fp), tn/(fn+tn)))
    print("recall : {:.4f} / {:.4f}".format(tp/(tp+fn), tn/(fp+tn)))
    print("F1 score : {:.4f} / {:.4f}".format(2*tp/(2*tp+fp+fn), 2*tn/(2*tn+fp+fn)))

    if acc < epoch_acc:
        acc = epoch_acc
        print("%%%%%%%%%% Save model\t acc:{} %%%%%%%%%%%%".format(acc))
        # The message above used to be printed without anything being saved.
        # Checkpoint the best-so-far model under its own file name so a later
        # save of the final-epoch model cannot overwrite it.
        save_model(model, model_path+'fake_news_classifier_best.h5')

In [None]:
# Write `model` to <model_path>fake_news_classifier.h5 (save_model creates the
# directory if needed). This stores the model in whatever state it is in when
# the cell runs, i.e. after the last epoch executed, which is not necessarily
# the epoch with the highest test accuracy.
save_model(model, model_path+'fake_news_classifier.h5')