# Finetuning FakeNewsAAAI
FakeNewsAAAI is a Fake News dataset with 2 possible labels: `real` and `fake`

In [1]:
import os, sys

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from utils.forward_fn import forward_sequence_classification
from utils.metrics import classification_metrics_fn
from utils.data_utils import FakeNewsDataset, FakeNewsDataLoader

In [2]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.4f}'.format(key, value))
    return ' '.join(string_list)

In [3]:
# Set random seed
set_seed(26092020)

# Load Model

In [4]:
# Load Tokenizer and Config
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
config = AutoConfig.from_pretrained('roberta-base')
config.num_labels = FakeNewsDataset.NUM_LABELS

# Instantiate model
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [5]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [6]:
count_param(model)

125237762

# Prepare Dataset

In [7]:
train_dataset_path = './data/train.tsv'
valid_dataset_path = './data/valid.tsv'
# test_dataset_path = './dataset/test.tsv'

In [8]:
train_dataset = FakeNewsDataset(train_dataset_path, tokenizer, lowercase=False)
valid_dataset = FakeNewsDataset(valid_dataset_path, tokenizer, lowercase=False)
# test_dataset = FakeNewsDataset(test_dataset_path, tokenizer, lowercase=False)

train_loader = FakeNewsDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=True)  
valid_loader = FakeNewsDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)  
# test_loader = FakeNewsDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)

In [9]:
w2i, i2w = FakeNewsDataset.LABEL2INDEX, FakeNewsDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'fake': 0, 'real': 1}
{0: 'real', 1: 'fake'}


# Fine Tuning & Evaluation

In [10]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [11]:
# Train
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = classification_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = classification_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = classification_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.5405 LR:0.00000300:  27%|██▋       | 214/788 [00:22<00:57, 10.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1139 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.4217 LR:0.00000300:  42%|████▏     | 333/788 [00:35<00:46,  9.82it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1633 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.3440 LR:0.00000300:  63%|██████▎   | 500/788 [00:52<00:31,  9.01it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4566 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.3106 LR:0.00000300:  76%|███████▌  | 598/788 [01:02<00:18, 10.02it/s]Token indices sequence length is longer than the specified maximum sequence length 

(Epoch 1) TRAIN LOSS:0.2644 ACC:0.8784 F1:0.8782 REC:0.8785 PRE:0.8780 LR:0.00000300


VALID LOSS:0.1159 ACC:0.9569 F1:0.9565 REC:0.9552 PRE:0.9591:  74%|███████▍  | 198/268 [00:07<00:02, 25.22it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.1237 ACC:0.9537 F1:0.9534 REC:0.9523 PRE:0.9562: 100%|██████████| 268/268 [00:10<00:00, 26.24it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:0.1237 ACC:0.9537 F1:0.9534 REC:0.9523 PRE:0.9562


(Epoch 2) TRAIN LOSS:0.0802 LR:0.00000300:  46%|████▌     | 359/788 [00:37<00:44,  9.63it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4566 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0791 LR:0.00000300:  52%|█████▏    | 406/788 [00:42<00:36, 10.55it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1139 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0796 LR:0.00000300:  53%|█████▎    | 419/788 [00:43<00:42,  8.76it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1923 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0820 LR:0.00000300:  67%|██████▋   | 529/788 [00:55<00:25, 10.28it/s]Token indices sequence length is longer than the specified maximum sequence length 

(Epoch 2) TRAIN LOSS:0.0856 ACC:0.9719 F1:0.9718 REC:0.9717 PRE:0.9719 LR:0.00000300


VALID LOSS:0.0937 ACC:0.9637 F1:0.9635 REC:0.9625 PRE:0.9651:  74%|███████▍  | 198/268 [00:07<00:02, 24.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.1010 ACC:0.9607 F1:0.9606 REC:0.9597 PRE:0.9621: 100%|██████████| 268/268 [00:10<00:00, 26.57it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:0.1010 ACC:0.9607 F1:0.9606 REC:0.9597 PRE:0.9621


(Epoch 3) TRAIN LOSS:0.0599 LR:0.00000300:  22%|██▏       | 170/788 [00:17<01:01, 10.00it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1923 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0525 LR:0.00000300:  29%|██▊       | 226/788 [00:23<00:53, 10.47it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4566 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0524 LR:0.00000300:  33%|███▎      | 263/788 [00:27<00:51, 10.28it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1633 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0478 LR:0.00000300:  63%|██████▎   | 500/788 [00:52<00:32,  8.92it/s]Token indices sequence length is longer than the specified maximum sequence length 

(Epoch 3) TRAIN LOSS:0.0535 ACC:0.9822 F1:0.9822 REC:0.9821 PRE:0.9822 LR:0.00000300


VALID LOSS:0.1067 ACC:0.9700 F1:0.9698 REC:0.9685 PRE:0.9719:  75%|███████▍  | 200/268 [00:07<00:02, 24.98it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.1179 ACC:0.9645 F1:0.9643 REC:0.9632 PRE:0.9667: 100%|██████████| 268/268 [00:10<00:00, 26.43it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 3) VALID LOSS:0.1179 ACC:0.9645 F1:0.9643 REC:0.9632 PRE:0.9667


(Epoch 4) TRAIN LOSS:0.0274 LR:0.00000300:   3%|▎         | 23/788 [00:03<02:04,  6.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1923 > 512). Running this sequence through the model will result in indexing errors
(Epoch 4) TRAIN LOSS:0.0386 LR:0.00000300:  58%|█████▊    | 455/788 [00:47<00:32, 10.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4566 > 512). Running this sequence through the model will result in indexing errors
(Epoch 4) TRAIN LOSS:0.0365 LR:0.00000300:  62%|██████▏   | 485/788 [00:50<00:32,  9.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1139 > 512). Running this sequence through the model will result in indexing errors
(Epoch 4) TRAIN LOSS:0.0378 LR:0.00000300:  68%|██████▊   | 532/788 [00:55<00:26,  9.74it/s]Token indices sequence length is longer than the specified maximum sequence length f

(Epoch 4) TRAIN LOSS:0.0365 ACC:0.9889 F1:0.9889 REC:0.9888 PRE:0.9890 LR:0.00000300


VALID LOSS:0.0932 ACC:0.9688 F1:0.9686 REC:0.9679 PRE:0.9695:  75%|███████▍  | 200/268 [00:07<00:02, 25.46it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.0966 ACC:0.9673 F1:0.9671 REC:0.9665 PRE:0.9682: 100%|██████████| 268/268 [00:10<00:00, 26.74it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 4) VALID LOSS:0.0966 ACC:0.9673 F1:0.9671 REC:0.9665 PRE:0.9682


(Epoch 5) TRAIN LOSS:0.0245 LR:0.00000300:  14%|█▍        | 110/788 [00:12<01:18,  8.68it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1923 > 512). Running this sequence through the model will result in indexing errors
(Epoch 5) TRAIN LOSS:0.0242 LR:0.00000300:  50%|████▉     | 392/788 [00:41<00:41,  9.45it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4566 > 512). Running this sequence through the model will result in indexing errors
(Epoch 5) TRAIN LOSS:0.0238 LR:0.00000300:  51%|█████     | 398/788 [00:41<00:39,  9.97it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1633 > 512). Running this sequence through the model will result in indexing errors
(Epoch 5) TRAIN LOSS:0.0230 LR:0.00000300:  84%|████████▎ | 658/788 [01:08<00:13,  9.30it/s]Token indices sequence length is longer than the specified maximum sequence length 

(Epoch 5) TRAIN LOSS:0.0235 ACC:0.9924 F1:0.9924 REC:0.9923 PRE:0.9924 LR:0.00000300


VALID LOSS:0.1473 ACC:0.9656 F1:0.9653 REC:0.9637 PRE:0.9685:  74%|███████▍  | 199/268 [00:06<00:02, 25.81it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.1569 ACC:0.9612 F1:0.9610 REC:0.9597 PRE:0.9641: 100%|██████████| 268/268 [00:09<00:00, 27.01it/s]


(Epoch 5) VALID LOSS:0.1569 ACC:0.9612 F1:0.9610 REC:0.9597 PRE:0.9641


In [12]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.index = df.index + 1
df.to_csv('prediction.csv')

print(df)

NameError: name 'test_loader' is not defined

In [13]:
train_dataset[9]

(array([    0, 27814, 22870,   578, 44537, 46963, 39220, 32689,   833,
           50, 36846, 32689,   833,    19,    50,   396,    10, 13418,
         9396,  1949,    13,  1416,     9,  6247, 43814,    12,  1646,
           35,    10, 17043, 22976,  1966,   111,    20, 39239,  1205,
          640,    90,     4,   876,    73,   574,   245,   846,   176,
         1178,   401,   534,   466,   368,     2]),
 array(0),
 'Retraction—Hydroxychloroquine or chloroquine with or without a macrolide for treatment of COVID-19: a multinational registry analysis - The Lancet https://t.co/L5V2x6G9or')

# Test fine-tuned model on sample sentences

In [14]:
text = 'The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today. | Label : fake (99.921%)


In [15]:
text = 'Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS | Label : fake (99.958%)


In [16]:
text = 'Retraction—Hydroxychloroquine or chloroquine with or without a macrolide for treatment of COVID-19: a multinational registry analysis - The Lancet https://t.co/L5V2x6G9or'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Retraction—Hydroxychloroquine or chloroquine with or without a macrolide for treatment of COVID-19: a multinational registry analysis - The Lancet https://t.co/L5V2x6G9or | Label : real (97.656%)


In [18]:
text = 'Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 in the country'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 in the country | Label : real (99.788%)
