In [8]:
# Mount Google Drive at /content/drive; later cells read dataset CSVs from
# paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# Place dataset paths here. Each path is defined exactly once so that the value
# actually used for loading is the one a reader sees.
FNNpath = "../Datasets/FNN Titles/"
CoAIDpath = "/content/drive/My Drive/CML - News Classifier/Datasets/CoAID/"

# GossipCop titles: label 0 marks the "fake" file, label 1 the "real" file.
gossipCopFake = pd.read_csv(FNNpath + "gossipcop_fake.csv", usecols=['title'])
gossipCopFake['label'] = 0
# Balance the classes by reading only as many real rows as there are fake rows.
# Note: nrows keeps the FIRST rows of the file; it is not a random sample.
gossipCopTrue = pd.read_csv(
    FNNpath + "gossipcop_real.csv", usecols=['title'], nrows=len(gossipCopFake))
gossipCopTrue['label'] = 1

# CoAID titles, balanced the same way.
CoAIDFalse = pd.read_csv(CoAIDpath + "NewsFakeCOVID-19.csv", usecols=['title'])
CoAIDFalse['label'] = 0
CoAIDTrue = pd.read_csv(
    CoAIDpath + "NewsRealCOVID-19.csv", usecols=['title'], nrows=len(CoAIDFalse))
CoAIDTrue['label'] = 1

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every source frame contributes its own 0..k labels and dfTotal ends up with
# duplicate index values. X and y (plain arrays) are unaffected either way.
dfTotal = pd.concat(
    [gossipCopTrue, gossipCopFake, CoAIDTrue, CoAIDFalse], ignore_index=True)
X = dfTotal['title'].values
y = dfTotal['label'].values

# Fixed random_state keeps the 75/25 train/test split reproducible.
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
  X, y, test_size=0.25, random_state=450)



# Folder holding the external validation set; replace with the appropriate path.
path_external = "/content/drive/MyDrive/CoVerifi&MedVerifi/"
# Only the headline text and its truth label are needed from the combined file.
validationNewsDF = pd.read_csv(
    path_external + "Combined_News.csv",
    usecols=['title', 'label'],
)


def binaryLabel(label):
  """Map a raw truth label to a binary class id.

  Args:
    label: The raw label value. It is converted with str() first, so strings,
      booleans and missing values are all accepted.

  Returns:
    1 if the label spells "TRUE" (ignoring case and surrounding whitespace),
    otherwise 0.
  """
  # pandas parses a column containing only TRUE/FALSE as booleans, and
  # str(True) is "True", not "TRUE". Normalising case keeps such rows from
  # silently falling through to the 0 class.
  normalized = str(label).strip().upper()
  return 1 if normalized == "TRUE" else 0

# Replace the raw label column with 0/1 class ids; each value is stringified
# before being handed to binaryLabel.
validationNewsDF['label'] = validationNewsDF['label'].apply(
    lambda rawLabel: binaryLabel(str(rawLabel))
)

# Hash-encode every title with a vocabulary size of 10000, then pad each
# sequence (padding='post') to a fixed length of 500.
validationX = pad_sequences(
    [one_hot(str(title), 10000) for title in validationNewsDF['title'].values],
    padding='post',
    maxlen=500,
)
validationY = validationNewsDF['label'].values

In [10]:
# For SVM, LR, and NB, we used the default settings provided by scikit-learn.
# For the CNN, we use the standard implementation with default settings: https://github.com/dennybritz/cnn-text-classification-tf
# Alternatively, an out-of-the-box CNN implementation from a TF tutorial could be used.
# The dennybritz implementation looks suitable, but integrating it is left for later.
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
# Bernoulli was best of above
import math
def _encodeTitles(titles, vocabSize=10000, maxLen=500):
  """Hash-encode each title with one_hot and pad it to a fixed length.

  Args:
    titles: Iterable of raw titles; each element is passed through str().
    vocabSize: Size of the hashing space handed to one_hot.
    maxLen: Length every sequence is padded to (padding='post').

  Returns:
    The padded array produced by pad_sequences.
  """
  # NOTE(review): one_hot presumably relies on Python's built-in hash(), which
  # is salted per process for strings, so encodings may differ between kernel
  # sessions -- confirm, and consider a stable hash if runs must be comparable.
  encoded = [one_hot(str(title), vocabSize) for title in titles]
  return pad_sequences(encoded, padding='post', maxlen=maxLen)


def _roundToLabels(predictions):
  """Round each prediction to the nearest integer label (x.5 rounds up)."""
  return [math.floor(0.5 + pred) for pred in predictions]


def _fitAndReport(classifier, displayName, trainX, trainY, testX, testY,
                  externalX, externalY):
  """Fit a classifier and print its internal and external classification reports.

  Args:
    classifier: Estimator exposing fit() and predict().
    displayName: Model name inserted into the printed report headers.
    trainX, trainY: Training features and labels.
    testX, testY: Held-out split used for the "internal validation" report.
    externalX, externalY: Separate dataset used for the "external validation"
      report.

  Returns:
    A tuple (internalPrediction, externalPrediction) of rounded label lists.
  """
  classifier.fit(trainX, trainY)

  internalPrediction = _roundToLabels(classifier.predict(testX))
  print("CoAID_and_GossipCop_only: Results of " + displayName + " internal validation:")
  print(classification_report(testY, internalPrediction))

  externalPrediction = _roundToLabels(classifier.predict(externalX))
  print("CoAID_and_GossipCop_only: Results of " + displayName + " external validation, on our new dataset:")
  print(classification_report(externalY, externalPrediction))

  return internalPrediction, externalPrediction


# Encode the train/test titles once; all three models consume the same
# features, so re-encoding before each model was redundant work.
X_train = _encodeTitles(X_train_base)
X_test = _encodeTitles(X_test_base)

# NOTE(review): the features are hashed word indices used as raw numeric
# values, which may explain the near-chance scores -- confirm whether a
# bag-of-words / TF-IDF representation was intended for these models.

svmClassifier = svm.SVC()
svmPrediction, svmValidationPrediction = _fitAndReport(
    svmClassifier, "SVM",
    X_train, y_train_base, X_test, y_test_base, validationX, validationY)

lrClassifier = LogisticRegression(random_state=0, max_iter=1000)
lrPrediction, lrValidationPrediction = _fitAndReport(
    lrClassifier, "LR",
    X_train, y_train_base, X_test, y_test_base, validationX, validationY)

bnbClassifier = BernoulliNB()
bnbPrediction, bnbValidationPrediction = _fitAndReport(
    bnbClassifier, "Bernoulli NB",
    X_train, y_train_base, X_test, y_test_base, validationX, validationY)



CoAID_and_GossipCop_only: Results of SVM internal validation:
              precision    recall  f1-score   support

           0       0.53      0.47      0.50      1476
           1       0.52      0.59      0.55      1472

    accuracy                           0.53      2948
   macro avg       0.53      0.53      0.53      2948
weighted avg       0.53      0.53      0.53      2948

CoAID_and_GossipCop_only: Results of SVM external validation, on our new dataset:
              precision    recall  f1-score   support

           0       0.65      0.53      0.59      3883
           1       0.53      0.65      0.58      3175

    accuracy                           0.59      7058
   macro avg       0.59      0.59      0.59      7058
weighted avg       0.60      0.59      0.59      7058

CoAID_and_GossipCop_only: Results of LR internal validation:
              precision    recall  f1-score   support

           0       0.54      0.49      0.51      1476
           1       0.53      0.5