## Check GPU

In [1]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [2]:
import pandas as pd
import nltk
import cv2
import keras
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback, TensorBoard
from keras.layers import Input, Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D, BatchNormalization, Flatten, Dense, Dropout, Reshape, Concatenate, Masking
from keras.regularizers import l2
from keras.utils import Sequence, to_categorical
from keras.optimizers import Adam
from keras.backend import epsilon
from keras.models import model_from_json
from keras.models import Model, Sequential
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback

tbc=TensorBoardColab()

Using TensorFlow backend.


Wait for 8 seconds...
TensorBoard link:
https://17d1ecb5.ngrok.io


## Parameters

In [0]:
TRAIN_TSV = "train2c.tsv"
VAL_TSV = "val2.tsv"
TEST_TSV = "test2.tsv"

pathf = "/content/drive/My Drive/fake_news/LIAR-PLUS-master/dataset/"

In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Read Data

In [5]:
train_pd = pd.read_csv(pathf+TRAIN_TSV, sep='\t')
train_pd = train_pd.fillna('None')
print(train_pd.shape)
print(train_pd.Label.value_counts())

val_pd = pd.read_csv(pathf+VAL_TSV, sep='\t')
val_pd = val_pd.fillna('None')
print(val_pd.shape)
print(val_pd.Label.value_counts())

test_pd = pd.read_csv(pathf+TEST_TSV, sep='\t')
test_pd = test_pd.fillna('None')
print(test_pd.shape)
print(test_pd.Label.value_counts())

(10237, 16)
half-true      2114
false          1994
mostly-true    1961
true           1675
barely-true    1654
pants-fire      839
Name: Label, dtype: int64
(1284, 16)
false          263
mostly-true    251
half-true      248
barely-true    237
true           169
pants-fire     116
Name: Label, dtype: int64
(1267, 16)
half-true      265
false          249
mostly-true    241
barely-true    212
true           208
pants-fire      92
Name: Label, dtype: int64


## Check discrepancies in Column

*   Data Cleaned Manually




In [0]:
def clean_dataset(column, max_len):
  count = 0
  for i in range(train_pd[column].shape[0]):
    if type(train_pd[column][i]) == float:
      continue
    sent = train_pd[column][i].split(' ')
    if len(sent) > max_len:
      print(train_pd[column][i]+'\n')
      print(str(i))
      count = count + 1
  print(str(count) + '\n')

In [0]:
clean_dataset("Statement", 70)
clean_dataset("Subject", 20)
clean_dataset("Speaker", 10)
clean_dataset("Job Title", 20)
clean_dataset("Context", 30)
clean_dataset("Justification", 300)

## Convert Labels to floats

In [0]:
def convert_labels(data):
  print(data.Label.shape)
  data_y = np.zeros((data.Label.shape[0],))
  for i in range(data.Label.shape[0]):
    if data.Label[i] == 'pants-fire':
      data_y[i] = 0;
    elif data.Label[i] == 'false':
      data_y[i] = 0;
    elif data.Label[i] == 'barely-true':
      data_y[i] = 0;
    elif data.Label[i] == 'half-true':
      data_y[i] = 1;
    elif data.Label[i] == 'mostly-true':
      data_y[i] = 1;
    else:
      data_y[i] = 1;
  return data_y

In [7]:
train_y = convert_labels(train_pd) 
train_pd['Label'] = train_y
print(train_pd.Label.value_counts())

val_y = convert_labels(val_pd) 
val_pd['Label'] = val_y
print(val_pd.Label.value_counts())

test_y = convert_labels(test_pd) 
test_pd['Label'] = test_y
print(test_pd.Label.value_counts())

(10237,)
1.0    5750
0.0    4487
Name: Label, dtype: int64
(1284,)
1.0    668
0.0    616
Name: Label, dtype: int64
(1267,)
1.0    714
0.0    553
Name: Label, dtype: int64


## Get the count history

In [0]:
train_metadata = train_pd.values[:,9:14]
for i in range(train_metadata.shape[0]):
  for j in range(train_metadata.shape[1]):
    if train_metadata[i][j] == 'None':
      train_metadata[i][j] = 0

val_metadata = val_pd.values[:,9:14]
for i in range(val_metadata.shape[0]):
  for j in range(val_metadata.shape[1]):
    if val_metadata[i][j] == 'None':
      val_metadata[i][j] = 0

test_metadata = test_pd.values[:,9:14]
for i in range(test_metadata.shape[0]):
  for j in range(test_metadata.shape[1]):
    if test_metadata[i][j] == 'None':
      test_metadata[i][j] = 0

## Get embeddings

In [0]:
def get_embeddings(data, max_length):
  t = Tokenizer()
  t.fit_on_texts(data)
  vocab_size = len(t.word_index) + 1
  encoded_docs = t.texts_to_sequences(data)
  #print(encoded_docs)
  padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
  print(padded_docs.shape)
  return padded_docs, t, vocab_size

In [10]:
val_stmt_x, t, vocab_size = get_embeddings(val_pd['Statement'], 50)
val_sub_x, t, vocab_size = get_embeddings(val_pd['Subject'], 10)
val_cxt_x, t, vocab_size = get_embeddings(val_pd['Context'], 25)
val_spkr_x, t, vocab_size = get_embeddings(val_pd['Speaker'], 5)
val_prty_x, t, vocab_size = get_embeddings(val_pd['Party'], 5)
val_job_x, t, vocab_size = get_embeddings(val_pd['Job Title'], 20)
val_state_x, t, vocab_size = get_embeddings(val_pd['State'], 5)
val_just_x, t, vocab_size = get_embeddings(val_pd['Justification'], 150)

test_stmt_x, t, vocab_size = get_embeddings(test_pd['Statement'], 50)
test_sub_x, t, vocab_size = get_embeddings(test_pd['Subject'], 10)
test_cxt_x, t, vocab_size = get_embeddings(test_pd['Context'], 25)
test_spkr_x, t, vocab_size = get_embeddings(test_pd['Speaker'], 5)
test_prty_x, t, vocab_size = get_embeddings(test_pd['Party'], 5)
test_job_x, t, vocab_size = get_embeddings(test_pd['Job Title'], 20)
test_state_x, t, vocab_size = get_embeddings(test_pd['State'], 5)
test_just_x, t, vocab_size = get_embeddings(test_pd['Justification'], 150)

train_stmt_x, t, vocab_size = get_embeddings(train_pd['Statement'], 50)
train_sub_x, t, vocab_size = get_embeddings(train_pd['Subject'], 10)
train_cxt_x, t, vocab_size = get_embeddings(train_pd['Context'], 25)
train_spkr_x, t, vocab_size = get_embeddings(train_pd['Speaker'], 5)
train_prty_x, t, vocab_size = get_embeddings(train_pd['Party'], 5)
train_job_x, t, vocab_size = get_embeddings(train_pd['Job Title'], 20)
train_state_x, t, vocab_size = get_embeddings(train_pd['State'], 5)
train_just_x, t, vocab_size = get_embeddings(train_pd['Justification'], 150)

print(vocab_size)

(1284, 50)
(1284, 10)
(1284, 25)
(1284, 5)
(1284, 5)
(1284, 20)
(1284, 5)
(1284, 150)
(1267, 50)
(1267, 10)
(1267, 25)
(1267, 5)
(1267, 5)
(1267, 20)
(1267, 5)
(1267, 150)
(10237, 50)
(10237, 10)
(10237, 25)
(10237, 5)
(10237, 5)
(10237, 20)
(10237, 5)
(10237, 150)
24217


In [11]:
# load the whole embedding into memory
embeddings_index = dict()
f = open(pathf + 'glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


## Define Model

In [0]:
def get_features(input_length):
  input_tensor = Input((input_length,))
  X = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=input_length, trainable=False)(input_tensor)
  X = Bidirectional(LSTM(32, return_sequences=True))(X)
  X = Bidirectional(LSTM(16, return_sequences=True))(X)
  X = Flatten()(X)
  X = Dense(1024, activation='relu', kernel_regularizer=l2(0.0))(X)
  X = Dropout(0.3)(X)
  X = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(X)
  X = Reshape((8,16))(X)
  X = Conv1D(128,3, padding='same', activation='relu', kernel_regularizer=l2(0.0))(X)
  X = MaxPooling1D(2)(X)
  X = BatchNormalization()(X)
  X = Flatten()(X)
  output_tensor = Dense(128, activation='relu', kernel_regularizer=l2(0.0))(X)
  output_tensor = Dropout(0.3)(output_tensor)
  return input_tensor, output_tensor

In [0]:
stmt_input, stmt_ftrs = get_features(train_stmt_x.shape[1])
sub_input, sub_ftrs = get_features(train_sub_x.shape[1])
cxt_input, cxt_ftrs = get_features(train_cxt_x.shape[1])
spkr_input, spkr_ftrs = get_features(train_spkr_x.shape[1])
prty_input, prty_ftrs = get_features(train_prty_x.shape[1])
job_input, job_ftrs = get_features(train_job_x.shape[1])
state_input, state_ftrs = get_features(train_state_x.shape[1])
just_input, just_ftrs = get_features(train_just_x.shape[1])

combo_1 = Concatenate()([stmt_ftrs, sub_ftrs])
combo_2 = Concatenate()([stmt_ftrs, cxt_ftrs])
combo_3 = Concatenate()([spkr_ftrs, prty_ftrs])
combo_4 = Concatenate()([prty_ftrs, job_ftrs])
combo_5 = Concatenate()([sub_ftrs, cxt_ftrs])
combo_6 = Concatenate()([stmt_ftrs, state_ftrs])
combo_7 = Concatenate()([stmt_ftrs, prty_ftrs])
combo_8 = Concatenate()([state_ftrs, prty_ftrs])
combo_9 = Concatenate()([cxt_ftrs, prty_ftrs])
combo_10 = Concatenate()([cxt_ftrs, spkr_ftrs])
combo_11 = Concatenate()([stmt_ftrs, just_ftrs])
combo_12 = Concatenate()([cxt_ftrs, just_ftrs])

input_tensor_3 = Input((5,))
metadata = Dense(256, activation='relu')(input_tensor_3)

out_1 = Concatenate()([combo_1, combo_2, combo_3, combo_4, combo_5, combo_6, combo_7, combo_8, combo_9, combo_10, combo_11, combo_12])
out = Dense(1028, activation='relu', kernel_regularizer=l2(0.0))(out_1)
out = Concatenate()([out, metadata])
out = Dropout(0.3)(out)
out = Dense(256, activation='relu', kernel_regularizer=l2(0.0))(out)
out = Dropout(0.3)(out)
#out = Dense(6, activation='softmax')(out)
out = Dense(2, activation='sigmoid')(out)
model = Model(inputs = [stmt_input, sub_input, cxt_input, spkr_input, prty_input, job_input, state_input, just_input, input_tensor_3], outputs = out)
model.summary()

## Training

In [14]:
# define labels
train_y = to_categorical(train_pd['Label'], 2)
val_y = to_categorical(val_pd['Label'], 2)
test_y = to_categorical(test_pd['Label'], 2)
print(train_y.shape)
print(train_y)
print(val_y.shape)
print(test_y.shape)

(10237, 2)
[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]
(1284, 2)
(1267, 2)


In [0]:
#model.load_weights(pathf+'model/main_binary.h5')
stop = EarlyStopping(monitor="val_acc", patience=20, mode="max")
reduce_lr = ReduceLROnPlateau(monitor="val_acc", factor=0.1, patience=10, min_lr=1e-6, verbose=1, mode="max")
optimizer = Adam(lr = 0.001)
model.compile(optimizer, loss='binary_crossentropy', metrics=['acc'])
CB1 = keras.callbacks.ModelCheckpoint(pathf+'model/main_binary.h5', monitor = "val_acc", save_best_only = True, save_weights_only = True,
                                      verbose = 1, mode="max", period=1)

In [0]:
model.fit([train_stmt_x, train_sub_x, train_cxt_x, train_spkr_x, train_prty_x, train_job_x, train_state_x, train_just_x, train_metadata], train_y,
          epochs=200, verbose=1, validation_data=([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x, val_just_x,
                                                      val_metadata], val_y), callbacks=[CB1, reduce_lr, stop, TensorBoardColabCallback(tbc)])

## Evaluate the Model

In [0]:
WEIGHTS_FILE = pathf+"model/main_binary.h5"
model.load_weights(WEIGHTS_FILE)
val_loss, val_acc = model.evaluate([val_stmt_x, val_sub_x, val_cxt_x, val_spkr_x, val_prty_x, val_job_x, val_state_x, val_just_x, val_metadata], val_y,
                                   verbose=1)
print('Validation Accuracy: %f' % (val_acc))
test_loss, test_acc = model.evaluate([test_stmt_x, test_sub_x, test_cxt_x, test_spkr_x, test_prty_x, test_job_x, test_state_x, test_just_x, test_metadata],
                                     test_y, verbose=1)
print('Test Accuracy: %f' % (test_acc))

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict([test_stmt_x, test_sub_x, test_cxt_x, test_spkr_x, test_prty_x, test_job_x, test_state_x, test_just_x, test_metadata],
                       batch_size=32, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(test_pd['Label'], y_pred_bool))

              precision    recall  f1-score   support

         0.0       0.67      0.72      0.69       553
         1.0       0.77      0.72      0.75       714

    accuracy                           0.72      1267
   macro avg       0.72      0.72      0.72      1267
weighted avg       0.73      0.72      0.72      1267

