# Finetuning FakeNewsAAAI
FakeNewsAAAI is a fake-news detection dataset with two possible labels: `real` and `fake`.

In [1]:
import os, sys
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from utils.forward_fn import forward_mask_sequence_classification
from utils.metrics import classification_metrics_fn
from utils.data_utils import FakeNewsDataset, FakeNewsDataLoader

In [2]:
###
# common functions
###
def set_seed(seed):
    """Apply the same seed to every random number generator used here.

    Seeds, in order: Python's `random` module, NumPy's global generator,
    PyTorch's default generator and PyTorch's generator for the current
    CUDA device.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by `module`.

    When `trainable` is truthy only parameters with `requires_grad` set are
    counted; otherwise every parameter is counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Render a metric dict as space-separated `name:value` pairs.

    Each value is formatted with four decimal places and pairs follow the
    dict's iteration order.
    """
    return ' '.join(
        '{}:{:.4f}'.format(name, score) for name, score in metric_dict.items()
    )

In [3]:
# Set random seed: set_seed seeds Python's `random`, NumPy and PyTorch (CPU and CUDA)
# with the same value before the model and data loaders are created.
set_seed(26092020)

# Load Model

In [4]:
# Load Tokenizer and Config
# Both come from the pretrained 'roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
config = AutoConfig.from_pretrained('roberta-base')
# Set the number of output labels to the dataset's label count.
config.num_labels = FakeNewsDataset.NUM_LABELS

# Instantiate model
# Weights are loaded from the same 'roberta-base' checkpoint, using the config adjusted above.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [5]:
# Total number of parameters in the model (trainable=False, so every parameter is counted)
count_param(model)

124647170

# Prepare Dataset

In [6]:
# Paths to the train / validation .tsv files, relative to the working directory
train_dataset_path = './data/train.tsv'
valid_dataset_path = './data/valid.tsv'
# NOTE(review): the disabled test path uses './dataset/' while train/valid use './data/' — confirm before enabling
# test_dataset_path = './dataset/test.tsv'

In [7]:
# Wrap each .tsv split in a FakeNewsDataset that shares the tokenizer; lowercasing is switched off
train_dataset = FakeNewsDataset(dataset_path=train_dataset_path, tokenizer=tokenizer, lowercase=False)
valid_dataset = FakeNewsDataset(dataset_path=valid_dataset_path, tokenizer=tokenizer, lowercase=False)
# test_dataset = FakeNewsDataset(dataset_path=test_dataset_path, tokenizer=tokenizer, lowercase=False)

# Batches of 8 with max_seq_len=512; only the training loader shuffles
train_loader = FakeNewsDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=True)  
valid_loader = FakeNewsDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)  
# test_loader = FakeNewsDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=8, num_workers=8, shuffle=False)

In [8]:
# Label <-> index lookup tables defined on the dataset class; `i2w` is passed to the
# forward function and used to print predicted labels later in the notebook.
w2i, i2w = FakeNewsDataset.LABEL2INDEX, FakeNewsDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'fake': 0, 'real': 1}
{0: 'fake', 1: 'real'}


# Fine Tuning & Evaluation

In [9]:
# Move the model to the GPU first, then build the optimizer over its parameters.
# PyTorch's optimizer documentation has advised constructing optimizers only after
# a model has been moved with .cuda(), so that the optimizer is guaranteed to hold
# the on-device parameters.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [10]:
# Train
# Each epoch runs one full pass over train_loader followed by one full pass over valid_loader.
# NOTE(review): this cell leaves notebook-global state behind — gradient tracking stays
# disabled after the last epoch (torch.set_grad_enabled(False) below), and `batch_data` / `i`
# keep the values from the final validation batch.
n_epochs = 3
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)

    # Running loss plus the accumulated predictions / labels for this epoch
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of the batch is not passed to the forward function)
        outputs = forward_mask_sequence_classification(model, batch_data[:-1], i2w=i2w, apply_mask=True, device='cuda')
        loss, batch_hyp, batch_label, logits, label_batch = outputs

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = classification_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)

    # NOTE(review): total_correct and total_labels are initialised but never read in this cell
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]  # NOTE(review): assigned but not used in this loop
        outputs = forward_mask_sequence_classification(model, batch_data[:-1], i2w=i2w, apply_mask=True, device='cuda')
        loss, batch_hyp, batch_label, logits, label_batch = outputs

        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        # (recomputed over everything accumulated so far on every batch, for the progress bar)
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = classification_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    # Final validation metrics for this epoch
    metrics = classification_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.6581 LR:0.00000300:  25%|██▌       | 199/788 [00:21<01:13,  7.98it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1635 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.5224 LR:0.00000300:  44%|████▍     | 345/788 [00:37<00:47,  9.33it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4568 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.4702 LR:0.00000300:  53%|█████▎    | 415/788 [00:44<00:37, 10.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1141 > 512). Running this sequence through the model will result in indexing errors
(Epoch 1) TRAIN LOSS:0.3601 LR:0.00000300:  82%|████████▏ | 645/788 [01:08<00:14,  9.93it/s]Token indices sequence length is longer than the specified maximum sequence length 

(Epoch 1) TRAIN LOSS:0.3179 ACC:0.8447 F1:0.8430 REC:0.8417 PRE:0.8500 LR:0.00000300


VALID LOSS:0.1472 ACC:0.9431 F1:0.9424 REC:0.9400 PRE:0.9492:  74%|███████▍  | 199/268 [00:07<00:02, 24.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.1544 ACC:0.9392 F1:0.9387 REC:0.9368 PRE:0.9451: 100%|██████████| 268/268 [00:10<00:00, 26.05it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:0.1544 ACC:0.9392 F1:0.9387 REC:0.9368 PRE:0.9451


(Epoch 2) TRAIN LOSS:0.0758 LR:0.00000300:  36%|███▌      | 282/788 [00:29<00:50,  9.94it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1141 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0845 LR:0.00000300:  44%|████▎     | 343/788 [00:36<00:47,  9.44it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1635 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0906 LR:0.00000300:  85%|████████▌ | 673/788 [01:11<00:12,  9.05it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1925 > 512). Running this sequence through the model will result in indexing errors
(Epoch 2) TRAIN LOSS:0.0919 LR:0.00000300: 100%|██████████| 788/788 [01:23<00:00,  9.42it/s]
  0%|          | 0/268 [00:00<?, ?it/s]

(Epoch 2) TRAIN LOSS:0.0919 ACC:0.9695 F1:0.9694 REC:0.9693 PRE:0.9696 LR:0.00000300


VALID LOSS:0.0828 ACC:0.9744 F1:0.9743 REC:0.9738 PRE:0.9748:  74%|███████▍  | 199/268 [00:07<00:02, 25.08it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.0887 ACC:0.9710 F1:0.9709 REC:0.9705 PRE:0.9715: 100%|██████████| 268/268 [00:10<00:00, 26.35it/s]
  0%|          | 0/788 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:0.0887 ACC:0.9710 F1:0.9709 REC:0.9705 PRE:0.9715


(Epoch 3) TRAIN LOSS:0.0214 LR:0.00000300:  36%|███▌      | 283/788 [00:30<00:51,  9.90it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1141 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0263 LR:0.00000300:  44%|████▎     | 343/788 [00:36<00:47,  9.42it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1635 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0253 LR:0.00000300:  85%|████████▌ | 673/788 [01:11<00:12,  9.03it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1925 > 512). Running this sequence through the model will result in indexing errors
(Epoch 3) TRAIN LOSS:0.0252 LR:0.00000300: 100%|██████████| 788/788 [01:24<00:00,  9.37it/s]
  0%|          | 0/268 [00:00<?, ?it/s]

(Epoch 3) TRAIN LOSS:0.0252 ACC:0.9936 F1:0.9936 REC:0.9936 PRE:0.9937 LR:0.00000300


VALID LOSS:0.0923 ACC:0.9750 F1:0.9749 REC:0.9743 PRE:0.9757:  74%|███████▍  | 198/268 [00:07<00:02, 25.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
VALID LOSS:0.0987 ACC:0.9715 F1:0.9714 REC:0.9708 PRE:0.9723: 100%|██████████| 268/268 [00:10<00:00, 26.14it/s]


(Epoch 3) VALID LOSS:0.0987 ACC:0.9715 F1:0.9714 REC:0.9708 PRE:0.9723


In [12]:
# # Evaluate on test
# model.eval()
# torch.set_grad_enabled(False)

# total_loss, total_correct, total_labels = 0, 0, 0
# list_hyp, list_label = [], []

# pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# for i, batch_data in enumerate(pbar):
#     _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
#     list_hyp += batch_hyp

# # Save prediction
# df = pd.DataFrame({'label':list_hyp}).reset_index()
# df.index = df.index + 1
# df.to_csv('prediction.csv')

# print(df)

# Calculate Influence

In [47]:
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification, RobertaForSequenceClassification
from utils.utils import generate_random_mask

def influence_score(model, id, subword, mask, label, device='cpu'):
    """Compare the per-example loss under a mask with the loss under its flip.

    The final classification layer's weight is multiplied by a mask obtained
    from `generate_random_mask([id], ...)`, and the cross-entropy loss of each
    example in the batch is computed twice: once with the masked weight and
    once with `masked_weight.max() - masked_weight`.

    Args:
        model: a `BertForSequenceClassification` or
            `RobertaForSequenceClassification` instance.
        id: identifier forwarded (as a one-element list) to
            `generate_random_mask`; presumably the id of the training example
            whose mask is being reproduced — TODO confirm against
            `utils.utils.generate_random_mask`.
        subword: batch of token ids, converted to a `LongTensor`.
        mask: batch attention mask, converted to a `FloatTensor`.
        label: batch labels, converted to a `LongTensor`.
        device: when equal to "cuda" the inputs are moved to the GPU; any
            other value leaves them on the CPU.

    Returns:
        A tensor with one value per example: `flipped_mask_loss - mask_loss`.

    Raises:
        ValueError: if `model` is neither of the two supported classes.
    """
    loss_fct = CrossEntropyLoss(reduction='none')
    with torch.no_grad():
        # Prepare input & label
        subword = torch.LongTensor(subword)
        mask = torch.FloatTensor(mask)
        label = torch.LongTensor(label)

        if device == "cuda":
            subword = subword.cuda()
            mask = mask.cuda()
            label = label.cuda()

        if isinstance(model, BertForSequenceClassification):
            # Apply mask to the final linear classifier, repeated once per example in the batch
            weight, bias = model.classifier.weight, model.classifier.bias
            dropout_mask = generate_random_mask([id], weight.shape[0], weight.shape[1], device=device).repeat(subword.shape[0],1,1)
            masked_weight = weight.expand_as(dropout_mask) * dropout_mask

            # Calculate latents (second element of the BERT encoder output, then the model's dropout)
            latents = model.bert(subword, attention_mask=mask)[1]
            latents = model.dropout(latents)
        elif isinstance(model, RobertaForSequenceClassification):
            # Apply mask to the head's output projection, repeated once per example in the batch
            weight, bias = model.classifier.out_proj.weight, model.classifier.out_proj.bias
            dropout_mask = generate_random_mask([id], weight.shape[0], weight.shape[1], device=device).repeat(subword.shape[0],1,1)
            masked_weight = weight.expand_as(dropout_mask) * dropout_mask

            # Calculate latents: first-token hidden state -> head dense layer -> head dropout
            latents = model.roberta(subword, attention_mask=mask)[0][:,0,:]
            latents = model.classifier.dense(latents)
            latents = model.classifier.dropout(latents)
        else:
            # The exception used to be constructed without being raised, so unsupported
            # models failed further down with an unrelated unbound-name error.
            raise ValueError(f'Model class `{type(model)}` is not implemented yet')

        # Compute loss with mask
        logits = torch.einsum('bd,bcd->bc', latents, masked_weight) + bias
        mask_loss = loss_fct(logits.view(-1, model.num_labels), label.view(-1))

        # Compute loss with flipped mask
        # NOTE(review): the flip subtracts the masked weights from their global maximum rather
        # than applying (1 - dropout_mask) to the weights — confirm this is the intended complement.
        logits = torch.einsum('bd,bcd->bc', latents, (masked_weight.max() - masked_weight)) + bias
        flipped_mask_loss = loss_fct(logits.view(-1, model.num_labels), label.view(-1))

        return flipped_mask_loss - mask_loss
                              
def build_influence_matrix(model, data_loader, train_size, device='cpu'):
    """Build the (evaluated example x training example) influence matrix.

    For every batch of `data_loader` and every training index in
    `range(train_size)`, `influence_score` is called with the 1-based
    training id (`train_idx + 1`) and its per-example scores are written into
    column `train_idx`.

    Args:
        model: model forwarded to `influence_score`.
        data_loader: loader yielding `(ids, subword_batch, mask_batch,
            label_batch, seq_list)` tuples; `len(data_loader.dataset)` gives
            the number of rows.
        train_size: number of training examples, i.e. the number of columns.
        device: device for the matrix; also forwarded to `influence_score`.

    Returns:
        `(influence_mat, id2idx)`, where `influence_mat` has shape
        `(len(data_loader.dataset), train_size)` and `id2idx` maps each
        example id from the loader to its row in `influence_mat`.
    """
    test_size = len(data_loader.dataset)
    influence_mat = torch.zeros(test_size, train_size, device=device)

    id2idx = {}
    # Row of the matrix at which the current batch starts. Rows used to be indexed by
    # the position inside the batch, so every batch overwrote rows 0..batch_size-1
    # and ids from different batches collided in id2idx.
    row_offset = 0
    for batch_idx, batch_data in enumerate(data_loader):
        print(f'Processing batch {batch_idx}/{len(data_loader)}')
        (ids, subword_batch, mask_batch, label_batch, seq_list) = batch_data

        # The id -> row mapping depends only on the batch, not on the training index
        batch_len = len(ids)
        for pos, example_id in enumerate(ids):
            id2idx[example_id] = row_offset + pos

        for train_idx in tqdm(range(train_size)):
            train_id = train_idx + 1
            scores = influence_score(model, train_id, subword_batch, mask_batch, label_batch, device=device)
            # One score per example in the batch: fill this batch's rows of the column at once
            influence_mat[row_offset:row_offset + batch_len, train_idx] = scores

        row_offset += batch_len
    return influence_mat, id2idx

def get_inference_result(model, data_loader, device='cpu'):
    """Run the model over `data_loader` and record which examples it gets right.

    Relies on the notebook-level `i2w` index-to-label mapping.

    Args:
        model: model passed to `forward_mask_sequence_classification`.
        data_loader: loader whose batches hold the example ids first; the last
            element of each batch is not passed to the forward function.
        device: device forwarded to the forward function.

    Returns:
        Dict mapping each example id to whether its predicted label equals
        its gold label.
    """
    results = {}
    with torch.no_grad():
        pbar = tqdm(data_loader, leave=True, total=len(data_loader))
        for batch_data in pbar:
            batch_id = batch_data[0]
            # Honour the caller's `device`; it used to be ignored in favour of a hard-coded 'cuda'
            outputs = forward_mask_sequence_classification(model, batch_data[:-1], i2w=i2w, apply_mask=True, device=device)
            loss, batch_hyp, batch_label, logits, label_batch = outputs

            for pos, example_id in enumerate(batch_id):
                results[example_id] = batch_hyp[pos] == batch_label[pos]
    return results

def get_filtered_dataloader(data_loader, id_list, batch_size=8, shuffle=False, max_seq_len=512, num_workers=8):
    """Build a new loader containing only the rows whose `id` is in `id_list`.

    Relies on the notebook-level `tokenizer`.

    Args:
        data_loader: loader whose `dataset.data` is a DataFrame with an `id` column.
        id_list: ids of the examples to keep.
        batch_size: batch size of the new loader.
        shuffle: whether the new loader shuffles.
        max_seq_len: maximum sequence length passed to the new loader
            (previously hard-coded to 512).
        num_workers: worker count passed to the new loader
            (previously hard-coded to 8).

    Returns:
        A `FakeNewsDataLoader` over the filtered rows, with the index reset.
    """
    source_df = data_loader.dataset.data
    filt_df = source_df[source_df['id'].isin(id_list)].reset_index(drop=True)
    filt_dataset = FakeNewsDataset(dataset_path=None, dataset=filt_df, tokenizer=tokenizer, lowercase=False)
    return FakeNewsDataLoader(dataset=filt_dataset, max_seq_len=max_seq_len, batch_size=batch_size,
                              num_workers=num_workers, shuffle=shuffle)


In [36]:
%%time
# NOTE(review): `batch_data` is not defined in this cell — it is whatever batch an earlier
# loop (e.g. the train/validation loops) left bound in the kernel, so the result depends on
# execution history and the cell fails on a fresh kernel.
(ids, subword_batch, mask_batch, label_batch, seq_list) = batch_data
# Score every example in that batch, using the batch's first id as the mask id
influence_score(model, ids[0], subword_batch, mask_batch, label_batch, device='cuda')

CPU times: user 20.9 ms, sys: 23.7 ms, total: 44.5 ms
Wall time: 42.8 ms


tensor([5.7643, 0.0693, 8.5639], device='cuda:0')

In [27]:
%%time
# Per validation example id: did the predicted label match the gold label?
results = get_inference_result(model, valid_loader, device='cuda')
# Partition the example ids by prediction correctness
correct_list = [example_id for example_id, is_correct in results.items() if is_correct]
incorrect_list = [example_id for example_id, is_correct in results.items() if not is_correct]


  0%|          | 0/268 [00:00<?, ?it/s][A
  0%|          | 1/268 [00:00<01:53,  2.35it/s][A
  2%|▏         | 5/268 [00:00<01:20,  3.27it/s][A
  3%|▎         | 8/268 [00:00<00:58,  4.46it/s][A
  4%|▍         | 12/268 [00:00<00:42,  6.04it/s][A
  6%|▌         | 16/268 [00:00<00:31,  8.06it/s][A
  7%|▋         | 20/268 [00:00<00:23, 10.58it/s][A
  9%|▉         | 24/268 [00:01<00:18, 13.49it/s][A
 10%|█         | 28/268 [00:01<00:14, 16.69it/s][A
 12%|█▏        | 32/268 [00:01<00:11, 19.99it/s][A
 13%|█▎        | 36/268 [00:01<00:09, 23.26it/s][A
 15%|█▌        | 41/268 [00:01<00:08, 26.86it/s][A
 17%|█▋        | 46/268 [00:01<00:07, 30.13it/s][A
 19%|█▉        | 51/268 [00:01<00:06, 33.24it/s][A
 21%|██        | 56/268 [00:01<00:05, 35.54it/s][A
 23%|██▎       | 61/268 [00:01<00:05, 36.90it/s][A
 25%|██▍       | 66/268 [00:02<00:05, 38.29it/s][A
 26%|██▋       | 71/268 [00:02<00:05, 38.40it/s][A
 28%|██▊       | 76/268 [00:02<00:05, 38.39it/s][A
 30%|███       | 81/268

CPU times: user 6.35 s, sys: 999 ms, total: 7.35 s
Wall time: 7.38 s





In [48]:
# Rebuild a validation loader restricted to the misclassified example ids, in batches of 16
filt_valid_loader = get_filtered_dataloader(valid_loader, incorrect_list, batch_size=16)
# Number of batches in the full loader vs. the filtered loader
len(valid_loader), len(filt_valid_loader)

(268, 4)

In [None]:
%%time
# One row per example in filt_valid_loader's dataset, one column per training example.
# influence_score is called once per (batch, training index) pair.
influence_matrix, id2idx = build_influence_matrix(model, filt_valid_loader, len(train_loader.dataset), device='cuda')








  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A






 25%|██▌       | 1/4 [04:27<13:21, 267.29s/it][A[A[A[A[A[A[A






 50%|█████     | 2/4 [09:31<09:16, 278.40s/it][A[A[A[A[A[A[A






 75%|███████▌  | 3/4 [14:40<04:47, 287.63s/it][A[A[A[A[A[A[A

In [30]:
# Number of examples in the validation dataset
len(valid_loader.dataset)

2139

In [49]:
# DataFrame backing the validation dataset. The original line had a space inside the
# variable name (`valid_ df`), which is a syntax error.
valid_df = valid_loader.dataset.data
# Bare last expression so the notebook renders the (truncated) frame
valid_df

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,0
1,2,11 out of 13 people (from the Diamond Princess...,0
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",0
3,4,Mike Pence in RNC speech praises Donald Trump’...,0
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,1
...,...,...,...
2134,2136,Donald Trump wrongly claimed that New Zealand ...,0
2135,2137,Current understanding is #COVID19 spreads most...,1
2136,2138,Nothing screams “I am sat around doing fuck al...,0
2137,2139,Birx says COVID-19 outbreak not under control ...,0


# Test fine-tuned model on sample sentences

In [14]:
# Sample 1: classify a single sentence with the fine-tuned model
text = 'The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# The first element of the model output is used as the logits; keep the index of the top-scoring class
logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label name with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today. | Label : fake (99.921%)


In [15]:
# Sample 2: same single-sentence inference steps as the previous cell, with a different text
text = 'Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# The first element of the model output is used as the logits; keep the index of the top-scoring class
logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label name with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS | Label : fake (99.958%)


In [16]:
# Sample 3: same single-sentence inference steps, with a different text
text = 'Retraction—Hydroxychloroquine or chloroquine with or without a macrolide for treatment of COVID-19: a multinational registry analysis - The Lancet https://t.co/L5V2x6G9or'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# The first element of the model output is used as the logits; keep the index of the top-scoring class
logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label name with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Retraction—Hydroxychloroquine or chloroquine with or without a macrolide for treatment of COVID-19: a multinational registry analysis - The Lancet https://t.co/L5V2x6G9or | Label : real (97.656%)


In [18]:
# Sample 4: same single-sentence inference steps, with a different text
# NOTE(review): this appears to be the first row of the validation data, where it carries
# label 0 ('fake'), while the recorded output below predicts 'real' — confirm.
text = 'Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 in the country'
# Encode the sentence and shape it as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# The first element of the model output is used as the logits; keep the index of the top-scoring class
logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Print the predicted label name with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 in the country | Label : real (99.788%)
