#Environment Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Put Data in DataFrame (Articles)

In [2]:
import pandas as pd

# Place dataset path here
FNNpath = "../Datasets/FNN Titles/"
CoAIDpath = "../Datasets/CoAID/"

politiFactFake = pd.read_csv(FNNpath+"politifact_fake.csv", usecols=['title'])
politiFactFake['label']=0
politiFactTrue = pd.read_csv(FNNpath+"politifact_real.csv", usecols=['title'], nrows=len(politiFactFake.values))
politiFactTrue['label']=1

CoAIDFalse = pd.read_csv(CoAIDpath+"NewsFakeCOVID-19.csv", usecols=['title'])
CoAIDFalse['label']=0
CoAIDTrue = pd.read_csv(CoAIDpath+"NewsRealCOVID-19.csv", usecols=['title'], nrows=len(CoAIDFalse.values))
CoAIDTrue['label']=1


dfTotal = pd.concat([politiFactTrue, politiFactFake, CoAIDTrue, CoAIDFalse])


X = dfTotal['title'].values
y = dfTotal['label'].values
print("len(X)", len(X))
print("len(y)", len(y))

len(X) 2008
len(y) 2008


# Finetune Fake News Detection Model on CoAID&PolitiFact only

In [3]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 4.3 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 71.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.1 MB/s 
[?25hCollecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37

In [4]:
from sklearn.utils import shuffle
dfTotal = shuffle(dfTotal)
from datasets import Dataset
train_dataset = Dataset.from_pandas(dfTotal)

In [5]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("jy46604790/Fake-News-Bert-Detect")

model = AutoModelForSequenceClassification.from_pretrained("jy46604790/Fake-News-Bert-Detect")

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/735 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [6]:


def tokenize_data(example):
    return tokenizer(example['title'], padding='max_length')


train_dataset = train_dataset.map(tokenize_data, batched=True)


from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", num_train_epochs=3)


from transformers import Trainer

trainer_train = Trainer(
    model=model, args=training_args, train_dataset=train_dataset
)


  0%|          | 0/3 [00:00<?, ?ba/s]

In [7]:
trainer_train.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, title. If __index_level_0__, title are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2008
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 753


Step,Training Loss
500,0.2763


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=753, training_loss=0.20803616689654145, metrics={'train_runtime': 343.1626, 'train_samples_per_second': 17.554, 'train_steps_per_second': 2.194, 'total_flos': 1584980997488640.0, 'train_loss': 0.20803616689654145, 'epoch': 3.0})

In [8]:
import torch
model_path = '../models/Fake-News-Bert-Detect_finetuned_on_CoAID&PolitiFact.pt'
torch.save(model.state_dict(),model_path)

In [9]:
path_to_external_validation_dataset = ".../Datasets/External_Validation_Datasets//" # Replace with appropriate path
dfTotal_external = pd.read_csv(path_to_external_validation_dataset+"External_Validation1.csv", usecols=['title', 'label'])

def binaryLabel(label):
  if label == "TRUE":
    return 1
  return 0
dfTotal_external = dfTotal_external.dropna()
dfTotal_external['label'] = dfTotal_external['label'].apply(lambda label: binaryLabel(str(label)))
dataset_external = Dataset.from_pandas(dfTotal_external)

In [10]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
dataset_external = dataset_external.map(tokenize_data, batched=True)
trainer_eval = Trainer(
    model=model,
    args=training_args,
    eval_dataset=dataset_external,
    compute_metrics=compute_metrics,
)
trainer_eval.evaluate()

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, title. If __index_level_0__, title are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7057
  Batch size = 8


{'eval_accuracy': 0.8961315006376648,
 'eval_loss': 0.5453846454620361,
 'eval_runtime': 130.0709,
 'eval_samples_per_second': 54.255,
 'eval_steps_per_second': 6.789}