In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import utils,preprocessing,feature_extraction,feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from keras import models,layers
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import nltk
import re
import transformers
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import multiprocessing

In [2]:
# Name of the DataFrame column that every model function below reads as its input text.
fcol='title'
# Alternative input column (disabled):
#fcol='text'
def cleanText(text):
    """Normalise one raw document string before tokenisation/vectorisation.

    Steps, in order: strip HTML markup with BeautifulSoup (lxml parser),
    replace every ``|||`` separator with a space, replace each run of
    non-whitespace characters starting with ``http`` by the ``<URL>``
    placeholder, and lower-case the result (so the placeholder ends up as
    ``<url>``).

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        that pandas uses for a missing CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    if not isinstance(text, str):
        # BeautifulSoup cannot parse non-string input; map it to an empty
        # document so a single missing cell does not abort a whole .apply().
        return ''
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    # Every letter is kept: deleting all 'x' characters would corrupt ordinary
    # words ("taxes" -> "taes", "texas" -> "teas") before they reach the models.
    return text.lower()

def vec_for_learning(model, tagged_docs):
    """Infer one vector per tagged document and pair it with the document's first tag.

    ``tagged_docs`` must expose ``.values`` yielding objects with ``words`` and
    ``tags`` attributes; ``model`` must provide ``infer_vector(words)``.

    Returns a ``(targets, regressors)`` pair of equally long tuples: the first
    tag of each document and the vector inferred for it.
    """
    pairs = []
    for document in tagged_docs.values:
        label = document.tags[0]
        vector = model.infer_vector(document.words)
        pairs.append((label, vector))
    # Transpose the list of (label, vector) pairs into two parallel tuples.
    targets, regressors = zip(*pairs)
    return targets, regressors

def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens using NLTK.

    The text is first split into sentences, then each sentence into words.
    Tokens shorter than two characters are discarded.

    Returns a flat list of tokens in document order.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

def _train_doc2vec(model, documents, epochs):
    """Build ``model``'s vocabulary from ``documents`` and train it for ``epochs`` passes."""
    model.build_vocab(documents)
    if epochs > 0:
        # A single train() call lets gensim decay the learning rate from
        # ``alpha`` towards ``min_alpha`` across all epochs. Calling train()
        # once per epoch while subtracting 0.002 from ``alpha`` by hand pushes
        # the rate below zero long before 100 epochs (0.065 - 100 * 0.002 < 0).
        model.train(documents, total_examples=len(documents), epochs=epochs)
    return model


def _score_doc2vec(model, train_docs, test_docs):
    """Fit scaled logistic regression on vectors inferred by ``model``; return ``(y_test, y_pred)``."""
    y_train, X_train = vec_for_learning(model, train_docs)
    y_test, X_test = vec_for_learning(model, test_docs)
    classifier = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=1, C=1e5))
    classifier.fit(X_train, y_train)
    return y_test, classifier.predict(X_test)


def runDoc2Vec(train,test,epochs):
    """Evaluate DBOW, DM and concatenated Doc2Vec embeddings with logistic regression.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the text column named by the module-level ``fcol`` and a
        ``Label`` column. Each row becomes a ``TaggedDocument`` whose words are
        ``tokenize_text(row[fcol])`` and whose single tag is the label.
    epochs : int
        Number of training passes for each Doc2Vec model. With ``0`` the
        models only get a vocabulary and are not trained.

    Returns
    -------
    dict
        Accuracy and weighted F1 on ``test`` for the DBOW model, the DM model
        and the concatenation of both.
    """
    def to_tagged(row):
        return TaggedDocument(words=tokenize_text(row[fcol]), tags=[row.Label])

    gtraintagged = train.apply(to_tagged, axis=1)
    gtesttagged = test.apply(to_tagged, axis=1)
    train_docs = list(gtraintagged.values)
    cores = multiprocessing.cpu_count()

    # Distributed bag of words (dm=0).
    model_dbow = _train_doc2vec(
        Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample=0, workers=cores),
        train_docs, epochs)
    y_test_dbow, y_pred_dbow = _score_doc2vec(model_dbow, gtraintagged, gtesttagged)

    # Distributed memory with mean of context vectors (dm=1, dm_mean=1).
    model_dmm = _train_doc2vec(
        Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1,
                workers=cores, alpha=0.065),
        train_docs, epochs)
    # The DM scores must come from vectors inferred by the DM model itself,
    # not by the DBOW model.
    y_test_dm, y_pred_dm = _score_doc2vec(model_dmm, gtraintagged, gtesttagged)

    # Concatenation of the DBOW and DM vectors.
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    y_test_combined, y_pred_combined = _score_doc2vec(new_model, gtraintagged, gtesttagged)

    return {
        'Accuracy Doc2Vec(DBOW)': accuracy_score(y_test_dbow, y_pred_dbow),
        'F1 Doc2Vec(DBOW)': f1_score(y_test_dbow, y_pred_dbow, average='weighted'),
        'Accuracy Doc2Vec(DM)': accuracy_score(y_test_dm, y_pred_dm),
        'F1 Doc2Vec(DM)': f1_score(y_test_dm, y_pred_dm, average='weighted'),
        'Accuracy Doc2Vec(Combined)': accuracy_score(y_test_combined, y_pred_combined),
        'F1 Doc2Vec(Combined)':f1_score(y_test_combined, y_pred_combined, average='weighted')}

def _run_vectorizer_baseline(vectorizer, train, test, name):
    """Fit ``vectorizer`` + logistic regression on ``train`` and score it on ``test``.

    Both frames must contain the text column named by the module-level
    ``fcol`` and a ``Label`` column. ``name`` is the suffix used in the keys of
    the returned dict (``'Accuracy <name>'`` and ``'F1 <name>'``). F1 is
    computed with scikit-learn's default averaging.
    """
    model = pipeline.Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', LogisticRegression(n_jobs=1, C=1e5)),
    ])
    # Fitting the pipeline fits the vectorizer on the training corpus and the
    # classifier on the resulting document-term matrix in one step. No
    # chi-squared feature ranking is computed: its result would not feed into
    # the model, and the calls it needs (get_feature_names, DataFrame.append)
    # are gone from current scikit-learn / pandas releases.
    model.fit(train[fcol], train['Label'].values)
    y_test = test['Label'].values
    pred = model.predict(test[fcol].values)
    return {'Accuracy ' + name: accuracy_score(y_test, pred),
            'F1 ' + name: f1_score(y_test, pred)}


def runtfidf(train,test):
    """TF-IDF baseline: up to 10000 uni-/bigram TF-IDF features + logistic regression.

    Returns ``{'Accuracy Tf-Idf': ..., 'F1 Tf-Idf': ...}`` measured on ``test``.
    """
    vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000,ngram_range=(1,2))
    return _run_vectorizer_baseline(vectorizer, train, test, 'Tf-Idf')


def runbow(train,test):
    """Bag-of-words baseline: up to 10000 uni-/bigram count features + logistic regression.

    Returns ``{'Accuracy BOW': ..., 'F1 BOW': ...}`` measured on ``test``.
    """
    vectorizer = feature_extraction.text.CountVectorizer(max_features=10000,ngram_range=(1,2))
    return _run_vectorizer_baseline(vectorizer, train, test, 'BOW')

def runBERT2(data):
    """Train a sigmoid head on the pooled output of a TF-Hub BERT encoder and score it.

    ``data`` must contain the text column named by the module-level ``fcol``
    and a ``Label`` column. The frame is split 80/20 (stratified on the label,
    ``random_state=14``) inside this function, the model is fitted for 30
    epochs and predictions above 0.5 are counted as class 1.

    Returns ``{'Accuracy BERT': ..., 'F1 BERT': ...}`` measured on the held-out part.
    """
    labels = data['Label']
    X_train, X_test, y_train, y_test = train_test_split(
        data[fcol],
        labels,
        test_size=0.2,
        random_state=14,
        stratify=labels.values,
    )

    # TF-Hub layers: text preprocessing followed by the BERT encoder.
    preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

    # Raw strings in -> pooled BERT output -> dropout -> single sigmoid unit.
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoded = encoder(preprocessor(raw_text))
    pooled = tf.keras.layers.Dropout(0.1, name="dropout")(encoded['pooled_output'])
    probability = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(pooled)
    model = tf.keras.Model(inputs=[raw_text], outputs=[probability])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    )
    model.fit(X_train, y_train, epochs=30)

    scores = model.predict(X_test).flatten()
    pred = np.where(scores > 0.5, 1, 0)
    return {'Accuracy BERT':accuracy_score(y_test,pred),'F1 BERT':f1_score(y_test,pred)}

def _load_glove_vectors(path):
    """Read a GloVe text file into a ``{word: float32 vector}`` dict."""
    vectors = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding="utf8") as handle:
        for line in handle:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


def runGloVe(data, glove_path='./dataset/glove.6B.200d.txt', embedding_dim=200, maxlen=1000, epochs=50):
    """Train a frozen-GloVe-embedding classifier and score it on a held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the text column named by the module-level ``fcol`` and a
        ``Label`` column. The frame is not modified.
    glove_path : str
        Path of the GloVe vectors in text format (one word followed by its
        coefficients per line).
    embedding_dim : int
        Length of each vector stored in ``glove_path``.
    maxlen : int
        Length every token sequence is padded ('post') or cut to.
    epochs : int
        Number of training epochs.

    Returns
    -------
    dict
        ``{'Accuracy GloVe': ..., 'F1 GloVe': ...}`` on a stratified 20% test
        split (``random_state=14``). Labels are one-hot encoded, the network
        has one sigmoid unit per label column and every unit is thresholded at
        0.5 independently; F1 uses weighted averaging.
    """
    # Lower-case into a local array so the caller's DataFrame stays untouched.
    texts = data[fcol].str.lower().values

    # Tokenise and pad.
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    vocab_size = len(tokenizer.word_index) + 1
    X = tokenizer.texts_to_sequences(texts)
    X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=maxlen, padding='post')

    # Create train and test sets.
    y = pd.get_dummies(data['Label']).values
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=14,stratify=y)
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    # Row i of the matrix holds the GloVe vector of the word with tokenizer
    # index i; words missing from GloVe keep an all-zero row.
    embeddings_index = _load_glove_vectors(glove_path)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # All layers come from tf.keras so they match the tf.keras Sequential model.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                                        input_length=maxlen, trainable=False))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(2, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=epochs, verbose=0)

    y_pred = model.predict(X_test)
    pred = np.where(y_pred > 0.5, True, False)
    return {'Accuracy GloVe':accuracy_score(y_test,pred),'F1 GloVe':f1_score(y_test,pred,pos_label=True,average='weighted')}

In [3]:
# Load the gossipcop and politifact CSV files. Rows from the *_fake files get
# Label=False, rows from the *_real files get Label=True.

# One-off NLTK resource download (left disabled):
#nltk.download("popular")
file_cols = ['id', 'title', 'text']

gossipli = [
    pd.read_csv('./dataset/gossipcop_fake.csv', index_col=None, usecols=file_cols).assign(Label=False),
    pd.read_csv('./dataset/gossipcop_real.csv', index_col=None, usecols=file_cols).assign(Label=True),
]
gossip = pd.concat(gossipli, axis=0, ignore_index=True)

politili = [
    pd.read_csv('./dataset/politifact_fake.csv', index_col=None, usecols=file_cols).assign(Label=False),
    pd.read_csv('./dataset/politifact_real.csv', index_col=None, usecols=file_cols).assign(Label=True),
]
politi = pd.concat(politili, axis=0, ignore_index=True)

# NOTE: DataFrame.size is rows * columns, so each number printed here is the
# cell count of that class's sub-frame, not its number of rows.
print(gossip.loc[gossip["Label"] == False].size)
print(gossip.loc[gossip["Label"] == True].size)
print(politi.loc[politi["Label"] == False].size)
print(politi.loc[politi["Label"] == True].size)

18948
59484
1488
1820


In [4]:
# Clean the selected text column of both datasets in place, then split each
# one 80/20 with a fixed seed, stratified on the label.
gossip[fcol] = gossip[fcol].apply(cleanText)
politi[fcol] = politi[fcol].apply(cleanText)
gossip_train, gossip_test = train_test_split(
    gossip,
    test_size=0.2,
    random_state=14,
    stratify=gossip['Label'].values,
)
politi_train, politi_test = train_test_split(
    politi,
    test_size=0.2,
    random_state=14,
    stratify=politi['Label'].values,
)

In [5]:
# Run every model on the gossipcop data. The Tf-Idf, BOW and Doc2Vec runners
# take the pre-split train/test frames (Doc2Vec is trained for 100 epochs);
# runGloVe and runBERT2 take the full frame and make their own 80/20 split.

gtfidfres=runtfidf(gossip_train,gossip_test)
gbowres=runbow(gossip_train,gossip_test)
gdoc2vecres=runDoc2Vec(gossip_train,gossip_test,100)
ggloveres=runGloVe(gossip)
gBERTres=runBERT2(gossip)

KeyboardInterrupt: 

In [None]:
# Run every model on the politifact data. The Tf-Idf, BOW and Doc2Vec runners
# take the pre-split train/test frames (Doc2Vec is trained for 100 epochs);
# runGloVe and runBERT2 take the full frame and make their own 80/20 split.

ptfidfres=runtfidf(politi_train,politi_test)
pbowres=runbow(politi_train,politi_test)
pdoc2vecres=runDoc2Vec(politi_train,politi_test,100)
pgloveres=runGloVe(politi)
pBERTres=runBERT2(politi)


In [None]:
# Print the result dict of every model for the gossipcop dataset.
print('----------=================Gossip Results==================----------------')
print('Results for Tf-Idf ',gtfidfres)
print('Results for BOW ',gbowres)
print('Results for Doc2Vec ',gdoc2vecres)
print('Results for GloVe',ggloveres)
print('Results for BERT ',gBERTres)



In [None]:
# Print the result dict of every model for the politifact dataset.
print('----------=================Politi Results==================----------------')
print('Results for Tf-Idf ',ptfidfres)
print('Results for BOW ',pbowres)
print('Results for Doc2Vec ',pdoc2vecres)
print('Results for Glove ',pgloveres)
print('Results for BERT ',pBERTres)