In [None]:
!pip install transformers -q
!pip install keras-tcn -q
from tcn import TCN
import pandas as pd 
import numpy as np 
import tensorflow as tf 
import re 
from sklearn.model_selection import train_test_split

In [None]:
# convert class in text to numbers
def get_fnn_class(row):
    """Map an FNN row to a numeric class.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'label_fnn' entry.

    Returns:
        0 when the label is exactly 'fake', otherwise 1.
    """
    return 0 if row['label_fnn'] == 'fake' else 1
    
def get_liar_poli_class(row, label):
    """Map a six-way truthfulness verdict to a binary class.

    Args:
        row: mapping-like object (e.g. a DataFrame row).
        label: key of the entry in `row` that holds the verdict string.

    Returns:
        0 for 'false', 'pants-fire' or 'barely-true'; 1 for anything else
        (which includes 'true', 'half-true' and 'mostly-true').
    """
    verdict = row[label]
    return 0 if verdict in ('false', 'pants-fire', 'barely-true') else 1

In [None]:
# Load every raw CSV used by the preparation cell below.
# All paths are relative to the notebook's ../input directory.

# Tweets from the CNN file (labelled as real news, class 1, further down).
cnn_news = pd.read_csv('../input/breaking-news-from-twitter-20102021/tweets_cnn.csv')

# Scraped sources that are later merged into `scraped_fake`.
snopes = pd.read_csv('../input/scraped-fake/snopes-scrape-full.csv')
aff = pd.read_csv('../input/scraped-fake/aff-scrape-full-processed.csv')

# COVID-related data: an IFCN scrape plus a "trueNews" file.
covid_ifcn = pd.read_csv('../input/scraped-fake/ifcn-scrape-full.csv')
covid_real = pd.read_csv('../input/covidnews/trueNews.csv')

# "fake-news" dataset; only its titles are kept later and all are labelled 0.
fn_get_real = pd.read_csv('../input/fake-news/fake.csv')

# FNN train/test/dev files; the first CSV column becomes the index.
fnnn_train = pd.read_csv('../input/fnnndata/fnn_train.csv', index_col=0)
fnnn_test = pd.read_csv('../input/fnnndata/fnn_test.csv', index_col=0)
fnnn_dev = pd.read_csv('../input/fnnndata/fnn_dev.csv', index_col=0)

# LIAR train/test/dev files; the first CSV column becomes the index.
liar_data_train = pd.read_csv('../input/liardata/liar_train.csv', index_col=0)
liar_data_test = pd.read_csv('../input/liardata/liar_test.csv', index_col=0)
liar_data_dev = pd.read_csv('../input/liardata/liar_dev.csv', index_col=0)

# Two separate PolitiFact exports (verdict column: 'target' and 'fact' respectively).
politifact = pd.read_csv('../input/politifact/politifact-scrape-full.csv')
poli_alt = pd.read_csv('../input/politifact-factcheck-data/politifact.csv')

# BuzzFeed fake / real news content files.
fnn_buzzfake = pd.read_csv('../input/fakenewsnet/BuzzFeed_fake_news_content.csv')
fnn_buzzreal = pd.read_csv('../input/fakenewsnet/BuzzFeed_real_news_content.csv')

# "Fake and real news" dataset and BBC tweets; both are only used for testing later.
fr_fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
fr_real = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')
bbc_news = pd.read_csv('../input/breaking-news-from-twitter-20102021/tweets_bbc.csv')

In [None]:
# Prepare data: reduce every source to the two columns ['title', 'class'],
# where class 0 = fake and class 1 = real, then group the sources into
# `poli_liar_fnn`, `scraped_fake`, `cnn_news`, `combined_covid_data`,
# `fr_data` and `bbc_news`.
# NOTE: this cell rebinds the frames loaded above, so it is not idempotent;
# re-run the loading cell before re-running this one.
# Both accumulators start as None; pd.concat drops None entries silently as
# long as at least one real frame is in the list.
poli_liar_fnn = None
scraped_fake = None

# buzzfeed data
# NOTE(review): assigning a column right after a column-subset selection may
# trigger pandas' SettingWithCopyWarning — confirm and add .copy() if so.
fnn_buzzfake = fnn_buzzfake[['title']]
fnn_buzzreal = fnn_buzzreal[['title']]
fnn_buzzfake['class'] = 0 
fnn_buzzreal['class'] = 1
poli_liar_fnn = pd.concat([fnn_buzzfake, fnn_buzzreal], ignore_index=True, sort=False)

# get data needed from FNN dataset
# 'label_fnn' == 'fake' -> 0, anything else -> 1 (see get_fnn_class).
fnnn_train['class'] = fnnn_train.apply(lambda row: get_fnn_class(row), axis=1)
fnnn_test['class'] = fnnn_test.apply(lambda row: get_fnn_class(row), axis=1)
fnnn_dev['class'] = fnnn_dev.apply(lambda row: get_fnn_class(row), axis=1)
fnnn_train = fnnn_train.rename({'statement' : 'title'}, axis=1)
fnnn_test = fnnn_test.rename({'statement' : 'title'}, axis=1)
fnnn_dev = fnnn_dev.rename({'statement' : 'title'}, axis=1)

fnnn_train = fnnn_train[['title', 'class']]
fnnn_test = fnnn_test[['title', 'class']]
fnnn_dev = fnnn_dev[['title', 'class']]
# The original train/test/dev partition is discarded: all three are pooled.
combine_fnnn_list = [fnnn_train, fnnn_test, fnnn_dev]
combined_fnnn_data = pd.concat(combine_fnnn_list, ignore_index=True, sort=False)
poli_liar_fnn = pd.concat([poli_liar_fnn, combined_fnnn_data], ignore_index=True, sort=False)

# get the rest of politifact data
# Flip-flop verdicts are dropped first because they are not truthfulness ratings
# that get_liar_poli_class knows about (it would map them to class 1).
politifact = politifact.drop(politifact[(politifact.target == 'full-flop') | (politifact.target == 'half-flip') | (politifact.target == 'no-flip')].index)
politifact['class'] = politifact.apply(lambda row: get_liar_poli_class(row, 'target'), axis=1)
politifact = politifact.rename({'statement' : 'title'}, axis=1)
politifact = politifact[['title', 'class']]
poli_liar_fnn = pd.concat([poli_liar_fnn, politifact], ignore_index=True, sort=False)

# get more politifact data
# Same treatment as above; here the verdict column is 'fact' and the text
# column is 'sources_quote'.
poli_alt = poli_alt.drop(poli_alt[(poli_alt.fact == 'full-flop') | (poli_alt.fact == 'half-flip') | (poli_alt.fact == 'no-flip')].index)
poli_alt['class'] = poli_alt.apply(lambda row: get_liar_poli_class(row, 'fact'), axis=1)
poli_alt = poli_alt.rename({'sources_quote' : 'title'}, axis=1)
poli_alt = poli_alt[['title', 'class']]
poli_liar_fnn = pd.concat([poli_liar_fnn, poli_alt], ignore_index=True, sort=False)

# get data needed from LIAR dataset
# The verdict column is 'label-liar'; train/test/dev are pooled like FNN above.
liar_data_train['class'] = liar_data_train.apply(lambda row: get_liar_poli_class(row, 'label-liar'), axis=1)
liar_data_test['class'] = liar_data_test.apply(lambda row: get_liar_poli_class(row, 'label-liar'), axis=1)
liar_data_dev['class'] = liar_data_dev.apply(lambda row: get_liar_poli_class(row, 'label-liar'), axis=1)
liar_data_train = liar_data_train.rename({'statement' : 'title'}, axis=1)
liar_data_test = liar_data_test.rename({'statement' : 'title'}, axis=1)
liar_data_dev = liar_data_dev.rename({'statement' : 'title'}, axis=1)
liar_data_train = liar_data_train[['title', 'class']]
liar_data_test = liar_data_test[['title', 'class']]
liar_data_dev = liar_data_dev[['title', 'class']]
combine_liar_list = [liar_data_train, liar_data_test, liar_data_dev]
combined_liar_data = pd.concat(combine_liar_list, ignore_index=True, sort=False)
poli_liar_fnn = pd.concat([poli_liar_fnn, combined_liar_data], ignore_index=True, sort=False)
# De-duplicate on the text, then keep only titles containing more than five spaces.
poli_liar_fnn = poli_liar_fnn.drop_duplicates(subset='title', ignore_index=True)
poli_liar_fnn = poli_liar_fnn[poli_liar_fnn.title.str.count(' ').gt(5)]

# get data from getting real dataset
# Every title from this source is labelled fake (0).
fn_get_real = fn_get_real[['title']]
fn_get_real['class'] = 0 
scraped_fake = pd.concat([scraped_fake, fn_get_real], ignore_index=True, sort=False)

# add all the scraped fake news
# NOTE(review): `aff` and `snopes` are concatenated exactly as loaded, so this
# assumes both CSVs already provide 'title' and 'class' columns — TODO confirm.
scraped_fake = pd.concat([scraped_fake, aff], ignore_index=True, sort=False)
scraped_fake = pd.concat([scraped_fake, snopes], ignore_index=True, sort=False)
scraped_fake = scraped_fake.drop_duplicates(subset='title', ignore_index=True)
scraped_fake = scraped_fake[scraped_fake.title.str.count(' ').gt(5)]

# add real news from cnn
# Every CNN tweet is labelled real (1).
cnn_news = cnn_news.rename({'tweet' : 'title'}, axis=1)
cnn_news = cnn_news[['title']]
cnn_news['class'] = 1
cnn_news = cnn_news.drop_duplicates(subset='title', ignore_index=True)
cnn_news = cnn_news[cnn_news.title.str.count(' ').gt(5)]

# combine the covid data (the train/test split itself happens in the next cell)
# NOTE(review): `covid_ifcn` is used as loaded, so it is assumed to already
# have 'title' and 'class' columns; 'Label' in `covid_real` is assumed to
# already use the 0/1 encoding — TODO confirm both.
covid_real = covid_real.rename({'Text' : 'title'}, axis=1)
covid_real = covid_real.rename({'Label' : 'class'}, axis=1)
covid_real = covid_real[['title', 'class']]
combined_covid_data = pd.concat([covid_ifcn, covid_real], ignore_index=True, sort=False)
combined_covid_data = combined_covid_data.drop_duplicates(subset='title', ignore_index=True)
combined_covid_data = combined_covid_data[combined_covid_data.title.str.count(' ').gt(5)]

# get data needed from fr dataset
# Unlike the sources above, no minimum-length filter is applied here.
fr_fake = fr_fake[['title']]
fr_real = fr_real[['title']]
fr_fake['class'] = 0 
fr_real['class'] = 1
fr_data = pd.concat([fr_fake, fr_real], ignore_index=True, sort=False)
fr_data = fr_data.drop_duplicates(subset='title', ignore_index=True)

# get data from BBC
# Every BBC tweet is labelled real (1); no minimum-length filter is applied.
bbc_news = bbc_news.rename({'tweet' : 'title'}, axis=1)
bbc_news = bbc_news[['title']]
bbc_news['class'] = 1 
bbc_news = bbc_news.drop_duplicates(subset='title', ignore_index=True)

In [None]:
# Split each mixed-label source 90/10 (stratified on class, fixed seed 42) and
# assemble the train / validation / test sets from the pieces.
features_covid_data = combined_covid_data['title'].astype(str)
targets_covid_data = combined_covid_data['class']
covid_feature_train, covid_feature_test, covid_target_train, covid_target_test = train_test_split(features_covid_data, targets_covid_data, test_size=0.10, stratify=targets_covid_data, random_state=42)

features_poli_liar_fnn = poli_liar_fnn['title'].astype(str)
targets_poli_liar_fnn = poli_liar_fnn['class']
pln_feature_train, pln_feature_test, pln_target_train, pln_target_test = train_test_split(features_poli_liar_fnn, targets_poli_liar_fnn, test_size=0.10, stratify=targets_poli_liar_fnn, random_state=42)

features_scraped_fake = scraped_fake['title'].astype(str)
targets_scraped_fake = scraped_fake['class']
fake_feature_train, fake_feature_test, fake_target_train, fake_target_test = train_test_split(features_scraped_fake, targets_scraped_fake, test_size=0.10, stratify=targets_scraped_fake, random_state=42)

# CNN tweets are not split: all of them go into the training set below.
feature_cnn = cnn_news['title'].astype(str)
target_cnn = cnn_news['class']

# The fr and BBC data are not split either: they are only added to the test set.
features_fr_data = fr_data['title'].astype(str)
targets_fr_data = fr_data['class']

feature_bbc = bbc_news['title'].astype(str)
target_bbc = bbc_news['class']

# Training set: the 90% parts of the three split sources plus all CNN tweets.
feature_train = pd.concat([covid_feature_train, pln_feature_train, fake_feature_train, feature_cnn], ignore_index=True, sort=False)
target_train = pd.concat([covid_target_train, pln_target_train, fake_target_train, target_cnn], ignore_index=True, sort=False)

# Validation set: the held-out 10% parts of the three split sources.
feature_val = pd.concat([covid_feature_test, pln_feature_test, fake_feature_test], ignore_index=True, sort=False)
target_val = pd.concat([covid_target_test, pln_target_test, fake_target_test], ignore_index=True, sort=False)

# Test set: the same held-out 10% parts plus the fr and BBC data.
# NOTE(review): every validation row is therefore also a test row, so test
# metrics are not independent of early stopping on the validation set.
feature_test = pd.concat([covid_feature_test, pln_feature_test, fake_feature_test, features_fr_data, feature_bbc], ignore_index=True, sort=False)
target_test = pd.concat([covid_target_test, pln_target_test, fake_target_test, targets_fr_data, target_bbc], ignore_index=True, sort=False)

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

# English stop words, with "not" taken out so negation survives stop-word removal.
# set.remove raises KeyError if "not" is ever missing from the NLTK list.
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
# Negation forms that lemmatize_nots rewrites to the single token "not".
# Both the apostrophe forms and the bare stems ("didn", "wasn", ...) are listed.
keep_list = ["aren't", "don't", "didn't", "hasn't", "wouldn't", "weren't", "not", "isn't", "couldn't", 
             "shouldn't", "won't", "mustn't", "wasn't", "haven't", "doesn't", "hadn't", "needn't",
             "didn", "wasn", "isn", "hasn", "needn", "shouldn", "couldn", "wouldn", "weren", "haven", "aren", "doesn", "mustn", "mightn"]
# reduce_len=True is passed so the tokenizer shortens long character repetitions.
tweet_tokenizer = TweetTokenizer(reduce_len=True)
wordnet_lemmatizer = WordNetLemmatizer()
# First letter of a POS tag -> WordNet part-of-speech constant (used by get_pos_tag).
tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}

def get_pos_tag(word):
    """Return the WordNet part of speech for a single word.

    The word is tagged on its own (no sentence context) with nltk.pos_tag;
    the first letter of the resulting tag is looked up in `tag_dict`, and
    wordnet.NOUN is returned when that letter is not a key there.
    """
    tagged = nltk.pos_tag([word])
    treebank_tag = tagged[0][1]
    initial = treebank_tag[0].upper()
    return tag_dict.get(initial, wordnet.NOUN)

def seperate_texts(text):
    """Split `text` into tokens with the module-level `tweet_tokenizer`."""
    tokens = tweet_tokenizer.tokenize(text)
    return tokens

def lemmatize_nots(text, negations=None):
    """Replace every negation word in `text` with the single token "not".

    Only whole words are replaced. The previous implementation used
    str.replace, which also rewrote substrings of unrelated words (for
    example "parent" -> "pnott" and "arena" -> "nota", because "aren" is in
    `keep_list`).

    NOTE: this changes the preprocessed text for inputs containing such
    words, so a model trained with the old behaviour should be retrained.

    Args:
        text: input string (callers in this notebook pass lower-cased text).
        negations: optional iterable of words to rewrite; defaults to the
            module-level `keep_list`.

    Returns:
        The text with each whole-word occurrence of a negation replaced by "not".
    """
    words = keep_list if negations is None else list(negations)
    if not words:
        # An empty alternation would match at every word boundary.
        return text
    # Longest alternatives first so "aren't" wins over its prefix "aren".
    ordered = sorted(words, key=len, reverse=True)
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in ordered) + r")\b"
    return re.sub(pattern, "not", text)

def lemmatize(word_list):
    """Lemmatize each word using the part of speech reported by get_pos_tag.

    Returns a new list with one lemma per input word, in the same order.
    """
    lemmas = []
    for token in word_list:
        part_of_speech = get_pos_tag(token)
        lemmas.append(wordnet_lemmatizer.lemmatize(token, part_of_speech))
    return lemmas

def remove_stopwords_and_others(word_list):
    """Filter a token list.

    A token is dropped when it is in the module-level `stop_words` set, or
    when it contains the substring "http" or the character "@".
    The remaining tokens are returned in their original order.
    """
    kept = []
    for token in word_list:
        if token in stop_words:
            continue
        if "http" in token or "@" in token:
            continue
        kept.append(token)
    return kept

def process(text):
    """Run the NLTK part of the text clean-up and return a single string.

    Steps, in order: rewrite negations to "not" (lemmatize_nots), tokenize
    (seperate_texts), filter tokens (remove_stopwords_and_others), lemmatize
    (lemmatize), then join the lemmas with single spaces.
    """
    tokens = seperate_texts(lemmatize_nots(text))
    lemmas = lemmatize(remove_stopwords_and_others(tokens))
    return " ".join(lemmas)

In [None]:
from transformers import BertTokenizer, AutoTokenizer
# Tokenizer for the "vinai/bertweet-base" checkpoint; use_fast=False requests
# the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
# Padding tokens are appended after the real tokens.
tokenizer.padding_side='right'

def pre_process(data):
    """Normalise an iterable of raw strings and return the cleaned strings as a list.

    Per string: lower-case, replace apostrophes with spaces, run `process`,
    replace every non-word character with a space, collapse runs of spaces,
    and drop one leading and one trailing space.

    NOTE(review): apostrophes are removed before `process` runs, so the
    apostrophe forms in `keep_list` (e.g. "don't") cannot match there; only
    the bare stems can.
    """
    def _clean(text):
        text = text.lower()
        text = re.sub("'", ' ', text)
        text = process(text)
        text = re.sub("\\W", ' ', text)
        # get rid of extra spaces
        text = re.sub(' +', ' ', text)
        text = re.sub('^ ', '', text)
        return re.sub(' $', '', text)

    return [_clean(text) for text in data]

def tokenise(features, max_length=64):
    """Clean the texts and encode each one as a fixed-length list of token ids.

    Args:
        features: iterable of raw strings (passed through pre_process first).
        max_length: length every encoded sequence is padded or truncated to.
            The default of 64 matches the model's Input(shape=(64,)).

    Returns:
        A list with one list of token ids per input text.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    return [
        tokenizer.encode(text, max_length=max_length, padding='max_length', truncation=True)
        for text in pre_process(features)
    ]

In [None]:
# Encode the training and validation texts; both names are rebound from a
# Series of strings to a 2-D array of token ids (train first, then validation).
feature_train, feature_val = [
    np.array(tokenise(texts)) for texts in (feature_train, feature_val)
]

In [None]:
# Early stopping on validation accuracy: stop after 10 epochs without
# improvement and roll the model back to the best weights seen.
callbacks = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='auto',
    min_delta=0,
    patience=10,
    restore_best_weights=True,
    verbose=2,
)
# build model
def build_tcn():
    """Assemble the (uncompiled) TCN binary classifier.

    Layer stack: 64 token ids -> Embedding(100000, 300) -> SpatialDropout1D(0.2)
    -> TCN (128 filters, dilations 1/2/4, batch norm, sequences returned)
    -> concatenation of global average and global max pooling
    -> Dense(16, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Returns:
        A tf.keras Model whose single output is a sigmoid activation.
    """
    layers = tf.keras.layers

    token_ids = layers.Input(shape=(64,))
    embedded = layers.Embedding(100000, 300)(token_ids)
    embedded = layers.SpatialDropout1D(0.2)(embedded)
    sequence = TCN(nb_filters=128, dilations = [1, 2, 4], return_sequences=True, use_batch_norm=True)(embedded)

    # Summarise the sequence two ways and feed both summaries to the head.
    avg_pooled = layers.GlobalAveragePooling1D()(sequence)
    max_pooled = layers.GlobalMaxPooling1D()(sequence)
    pooled = layers.concatenate([avg_pooled, max_pooled])

    hidden = layers.Dense(16, activation="relu")(pooled)
    hidden = layers.Dropout(0.2)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    return tf.keras.models.Model(inputs=token_ids, outputs=probability)

# model training
# build_tcn() ends in a sigmoid layer, so the model emits probabilities, not
# logits; the loss must therefore be configured with from_logits=False.
tcn_model = build_tcn()
tcn_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                  optimizer=tf.keras.optimizers.Adam(1e-4), 
                  metrics=['accuracy'])
# Up to 50 epochs; the EarlyStopping callback may end training sooner and
# restores the best weights.
tcn_model.fit(feature_train, target_train, epochs=50, validation_data=(feature_val, target_val), 
              batch_size=32, callbacks=[callbacks], shuffle=True)
# NOTE(review): later cells load '../input/models-tcn/tcn_model.h5', not this
# file — confirm how that input is produced from 'tcn.h5'.
tcn_model.save('tcn.h5')

In [None]:
# # bidirectional LSTM
# biLSTM = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(64,)),
#     tf.keras.layers.Embedding(50000, 300),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(1, activation="sigmoid")
# ])
# biLSTM.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

# bl = biLSTM.fit(feature_train, target_train, epochs=50, validation_data=(feature_val, target_val), batch_size=32, callbacks=[callbacks], shuffle=True)

# # LSTM
# LSTM = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(64,)),
#     tf.keras.layers.Embedding(50000, 300),
#     tf.keras.layers.LSTM(64, return_sequences=True),
#     tf.keras.layers.LSTM(16),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(1, activation="sigmoid")
# ])
# LSTM.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

# LSTM.fit(feature_train, target_train, epochs=50, validation_data=(feature_val, target_val), batch_size=32, callbacks=[callbacks], shuffle=True)

# # GRU
# GRU = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(64,)),
#     tf.keras.layers.Embedding(50000, 300),
#     tf.keras.layers.GRU(64, return_sequences=True),
#     tf.keras.layers.GRU(16),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(1, activation="sigmoid")
# ])
# GRU.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               optimizer=tf.keras.optimizers.Adam(1e-4),
#               metrics=['accuracy'])

# GRU.fit(feature_train, target_train, epochs=50, validation_data=(feature_val, target_val), batch_size=32, callbacks=[callbacks], shuffle=True)

In [None]:
# Encode every evaluation set. Each name is rebound from a Series of strings
# to an array of token ids, so re-running this cell would feed the
# already-encoded arrays back into tokenise.
features_fr_data = np.array(tokenise(features_fr_data))
covid_feature_test = np.array(tokenise(covid_feature_test))
pln_feature_test = np.array(tokenise(pln_feature_test))
fake_feature_test = np.array(tokenise(fake_feature_test))
feature_bbc = np.array(tokenise(feature_bbc))
# `feature_test` is also read as a global by analyse_pheme_data further down.
feature_test = np.array(tokenise(feature_test))

# Evaluate per source first, then on the combined test set.
tcn_model.evaluate(features_fr_data, targets_fr_data)
tcn_model.evaluate(covid_feature_test, covid_target_test)
tcn_model.evaluate(pln_feature_test, pln_target_test)
tcn_model.evaluate(fake_feature_test, fake_target_test)
tcn_model.evaluate(feature_bbc, target_bbc)
tcn_model.evaluate(feature_test, target_test)

In [None]:
def incremental_train(model, features, targets, epochs, lr):
    """Re-compile `model` with a fresh Adam optimizer and continue training it.

    Args:
        model: Keras model to fine-tune; it is modified in place.
        features: array of encoded inputs.
        targets: binary labels aligned with `features`.
        epochs: number of epochs to train for.
        lr: learning rate for the new Adam optimizer.

    Returns:
        The same `model` object after fitting.
    """
    # from_logits=False: the model is expected to end in a sigmoid (as
    # build_tcn does) and so to output probabilities — confirm for models
    # loaded from disk.
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                  optimizer=tf.keras.optimizers.Adam(lr), 
                  metrics=['accuracy'])
    model.fit(features, targets, epochs=epochs, batch_size=32, shuffle=True)
    return model


def analyse_pheme_data(event, model, propotion_to_train, epochs, lr):
    """Fine-tune `model` on the head of one PHEME event and measure the effect.

    The first `propotion_to_train` fraction of the rows of each of the two
    event files (rumours and non-rumours) is used for training; the remaining
    rows form the event test set. The model is evaluated on that test set
    before training, trained via incremental_train (which fits the passed
    model in place), then evaluated on the event test set and on the
    module-level `feature_test` / `target_test`.

    Args:
        event: event name used to build the two CSV paths.
        model: Keras model to evaluate and fine-tune.
        propotion_to_train: fraction (0..1) of each file used for training.
        epochs: number of fine-tuning epochs.
        lr: fine-tuning learning rate.

    Returns:
        (accuracy on the event test set, accuracy on the global test set),
        both measured after fine-tuning.
    """
    base_path = "../input/pheme-data/" + event
    rumours = pd.read_csv(base_path + "-rumours.csv")
    non_rumours = pd.read_csv(base_path + "-non-rumours.csv")

    # Number of leading rows of each file that go into the training part.
    n_rumours_train = int(rumours.shape[0]*propotion_to_train)
    n_non_rumours_train = int(non_rumours.shape[0]*propotion_to_train)

    rumours = rumours.rename({'text' : 'title'}, axis=1)[['title', 'class']]
    non_rumours = non_rumours.rename({'text' : 'title'}, axis=1)[['title', 'class']]

    train_frame = pd.concat(
        [rumours[:n_rumours_train], non_rumours[:n_non_rumours_train]],
        ignore_index=True, sort=False)
    test_frame = pd.concat(
        [rumours[n_rumours_train:], non_rumours[n_non_rumours_train:]],
        ignore_index=True, sort=False)

    features_train = np.array(tokenise(train_frame['title'].astype(str)))
    targets_train = train_frame['class']

    features_test = np.array(tokenise(test_frame['title'].astype(str)))
    targets_test = test_frame['class']

    print(features_train.shape)
    print(features_test.shape)

    print("without training")
    model.evaluate(features_test, targets_test)
    trained_model = incremental_train(model, features_train, targets_train, epochs, lr)
    print("with training")
    _, accuracy = trained_model.evaluate(features_test, targets_test)
    # Accuracy on the global test set after fine-tuning on this event.
    _, t_accuracy = trained_model.evaluate(feature_test, target_test)
    return accuracy, t_accuracy

In [None]:
# Grid search over fine-tuning epochs and learning rates. For every
# (epochs, lr) pair a fresh copy of the saved model is fine-tuned on 20% of
# each event, and the accuracies are averaged over all events.
events = ["germanwings-crash", "ferguson", "charliehebdo", "putinmissing", "ottawashooting", "sydneysiege"]
epochs_list = [5, 10, 15, 20, 25, 30]
lr_list = [1e-3, 1e-4, 5e-5, 1e-5, 5e-6, 1e-6]

# One (epochs, lr, mean event accuracy, mean global-test accuracy) tuple per pair.
avg_accs = []
# Not filled in this cell; kept in case other cells refer to it.
accs_for_combined = []
for epochs in epochs_list:
    for lr in lr_list:
        sum_acc = 0
        t_sum_acc = 0
        for event in events:
            # Reload from disk so fine-tuning on one event never leaks into the next.
            model = tf.keras.models.load_model('../input/models-tcn/tcn_model.h5', custom_objects={'TCN': TCN})
            print("analysing: " + event)
            accuracy, t_accuracy = analyse_pheme_data(event, model, 0.2, epochs, lr)
            sum_acc += accuracy
            t_sum_acc += t_accuracy
        # Divide by the actual number of events instead of a hard-coded 6.
        avg_accs.append((epochs, lr, sum_acc/len(events), t_sum_acc/len(events)))

for acc in avg_accs:
    print(acc)

In [None]:
def analyse_pheme_data_inc(event, model, trained_model, iteration, epochs, lr):
    """Simulate incremental learning on one PHEME event.

    The event data is consumed in `iteration` chunks. In every iteration 20%
    of the current chunk is used to fine-tune `trained_model` and the other
    80% to evaluate both `model` (never trained here) and `trained_model`
    (after this iteration's training). The last iteration uses all the data
    that is still left as its chunk.

    Args:
        event: event name used to build the two CSV paths.
        model: baseline Keras model; only evaluated.
        trained_model: Keras model fine-tuned cumulatively across iterations.
        iteration: number of chunks (>= 1).
        epochs: fine-tuning epochs per iteration.
        lr: learning rate; a new Adam optimizer is created every iteration.

    Returns:
        (size_list, acc_list_no_train, acc_list), one entry per iteration.
        size_list entries are (chunk size, remaining size, train size, test size),
        except for the last iteration, where the first two values are both the
        size of the remaining data.
    """
    fake = pd.read_csv("../input/pheme-data/" + event + "-rumours.csv")
    real = pd.read_csv("../input/pheme-data/" + event + "-non-rumours.csv")

    fake = fake.rename({'text' : 'title'}, axis=1)
    fake = fake[['title', 'class']]
    real = real.rename({'text' : 'title'}, axis=1)
    real = real[['title', 'class']]
    data = pd.concat([fake, real], ignore_index=True, sort=False)
    features = data['title'].astype(str)
    targets = data['class']
    features = np.array(tokenise(features))

    size_list = []
    acc_list = []
    acc_list_no_train = []
    # Nominal chunk size, based on the full data set.
    data_per_iter = int(data.shape[0]/iteration)

    for i in range(iteration):
        print("iteration: " + str(i))
        if i == iteration-1:
            # Last chunk: everything that is left.
            X_train, X_test, y_train, y_test = train_test_split(features, targets, train_size=0.2, random_state=42, stratify=targets)
            size_list.append((features.shape[0], targets.shape[0], X_train.shape[0], X_test.shape[0]))
        else:
            # Carve the next chunk out of the remaining data, then split it 20/80.
            cur_size = data_per_iter/features.shape[0]
            X_cur, X_left, y_cur, y_left = train_test_split(features, targets, train_size=cur_size, random_state=42, stratify=targets)
            X_train, X_test, y_train, y_test = train_test_split(X_cur, y_cur, train_size=0.2, random_state=42, stratify=y_cur)
            size_list.append((X_cur.shape[0], X_left.shape[0], X_train.shape[0], X_test.shape[0]))
            # Carry the leftover forward only when there is one; doing this
            # unconditionally raised a NameError when iteration == 1.
            features = X_left
            targets = y_left

        print("without training")
        loss_no_train, accuracy_no_train = model.evaluate(X_test, y_test)
        acc_list_no_train.append(accuracy_no_train)

        # from_logits=False: the model is expected to end in a sigmoid (as
        # build_tcn does) and so to output probabilities — confirm for models
        # loaded from disk.
        trained_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                  optimizer=tf.keras.optimizers.Adam(lr), 
                  metrics=['accuracy'])
        trained_model.fit(X_train, y_train, epochs=epochs, batch_size=32, shuffle=True)
        print("with training")
        loss, accuracy = trained_model.evaluate(X_test, y_test) 
        acc_list.append(accuracy)

    return (size_list, acc_list_no_train, acc_list)

In [None]:
# Incremental-learning experiment: each entry is (event name, number of chunks).
# NOTE: this rebinds `events`, which the grid-search cell defines as a list of
# plain strings.
events = [("germanwings-crash", 10), ("ferguson", 19), ("charliehebdo", 33), ("putinmissing", 4), ("ottawashooting", 20), ("sydneysiege", 25)]
no_train_avg = []
train_avg = []
accuracy_data = []

for event in events:
    # Two independent copies of the saved model: a frozen baseline and the one
    # that gets fine-tuned chunk by chunk.
    model = tf.keras.models.load_model('../input/models-tcn/tcn_model.h5', custom_objects={'TCN': TCN})
    trained_model = tf.keras.models.load_model('../input/models-tcn/tcn_model.h5', custom_objects={'TCN': TCN})
    print("analysing: " + event[0])
    accuracy_list = analyse_pheme_data_inc(event[0], model, trained_model, event[1], 15, 0.0001)
    accuracy_data.append(accuracy_list)
    no_train_avg.append(sum(accuracy_list[1])/len(accuracy_list[1]))
    train_avg.append(sum(accuracy_list[2])/len(accuracy_list[2]))
    
# Report per event: mean baseline accuracy, mean fine-tuned accuracy, then the
# raw size / baseline / fine-tuned lists. Iterate over the actual number of
# events instead of a hard-coded 6.
for i in range(len(events)):
    print(events[i])
    print(no_train_avg[i])
    print(train_avg[i])
    print(accuracy_data[i][0])
    print(accuracy_data[i][1])
    print(accuracy_data[i][2])