## Environment Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Put Data in DataFrame and Clean Data

In [2]:
import pandas as pd
import glob, os

# Place dataset path here
FNNpath = "../Datasets/FNN Titles/"
CoAIDpath = "../Datasets/CoAID/"

politiFactFake = pd.read_csv(FNNpath+"politifact_fake.csv", usecols=['title'])
politiFactFake['label']=0
gossipCopFake = pd.read_csv(FNNpath+"gossipcop_fake.csv", usecols=['title'])
gossipCopFake['label']=0
politiFactTrue = pd.read_csv(FNNpath+"politifact_real.csv", usecols=['title'], nrows=len(politiFactFake.values))
politiFactTrue['label']=1
gossipCopTrue = pd.read_csv(FNNpath+"gossipcop_real.csv", usecols=['title'],  nrows=len(gossipCopFake.values))
gossipCopTrue['label']=1

CoAIDFalse = pd.read_csv(CoAIDpath+"NewsFakeCOVID-19.csv", usecols=['title'])
CoAIDFalse['label']=0
CoAIDTrue = pd.read_csv(CoAIDpath+"NewsRealCOVID-19.csv", usecols=['title'], nrows=len(CoAIDFalse.values))
CoAIDTrue['label']=1


dfTotal = pd.concat([politiFactTrue, gossipCopTrue, politiFactFake, gossipCopFake, CoAIDTrue, CoAIDFalse])


X = dfTotal['title'].values
y = dfTotal['label'].values
print("len(X)", len(X))
print("len(y)", len(y))

len(X) 12654
len(y) 12654


## Finetune roberta-fake-news

In [3]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 4.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 58.8 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 66.9 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 68.9 MB/s

In [4]:
from datasets import Dataset
from sklearn.utils import shuffle
dfTotal = shuffle(dfTotal)
train_dataset = Dataset.from_pandas(dfTotal)

In [5]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ghanashyamvtatti/roberta-fake-news")

model = AutoModelForSequenceClassification.from_pretrained("ghanashyamvtatti/roberta-fake-news")

def tokenize_data(example):
    return tokenizer(example['title'], padding='max_length')

train_dataset = train_dataset.map(tokenize_data, batched=True)


from transformers import TrainingArguments

training_args = TrainingArguments("train_trainer", num_train_epochs=3)


from transformers import Trainer

trainer_train = Trainer(
    model=model, args=training_args, train_dataset=train_dataset
)

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [6]:
trainer_train.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: title, __index_level_0__. If title, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12654
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4746


Step,Training Loss


In [7]:
import torch
model_path = '../models/roberta-fake-news_on_CoAID_FNN.pt'
torch.save(model.state_dict(),model_path)

In [8]:
path_to_external_validation_dataset = ".../Datasets/External_Validation_Datasets//" # Replace with appropriate path
dfTotal_external = pd.read_csv(path_to_external_validation_dataset+"External_Validation1.csv", usecols=['title', 'label'])

def binaryLabel(label):
  if label == "TRUE":
    return 1
  return 0
dfTotal_external = dfTotal_external.dropna()
dfTotal_external['label'] = dfTotal_external['label'].apply(lambda label: binaryLabel(str(label)))
dataset_external = Dataset.from_pandas(dfTotal_external)

In [10]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

dataset_external = dataset_external.map(tokenize_data, batched=True)

trainer_eval = Trainer(
    model=model,
    args=training_args,
    eval_dataset=dataset_external,
    compute_metrics=compute_metrics,
)
trainer_eval.evaluate()

  0%|          | 0/8 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: title, __index_level_0__. If title, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 7057
  Batch size = 8


{'eval_accuracy': 0.8153606348306646,
 'eval_loss': 0.5768745541572571,
 'eval_runtime': 129.1716,
 'eval_samples_per_second': 54.633,
 'eval_steps_per_second': 6.836}