In [1]:
import torch
import transformers
import datasets

print(f"Running on torch {torch.__version__}v, transformers {transformers.__version__}v, datasets {datasets.__version__}")

Running on torch 1.9.0v, transformers 4.8.1v, datasets 1.8.0


In [2]:
import numpy as np

from transformers import (AdamW, get_linear_schedule_with_warmup, logging, 
                          BertConfig, BertTokenizer, BertForSequenceClassification)
from datasets import load_dataset

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, f1_score

import os
import sys
import random
import warnings
from tqdm.notebook import tqdm
from IPython.display import clear_output

logging.set_verbosity_error()
warnings.filterwarnings('ignore')
SEED = 1618
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def set_seed(seed = 0):
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state

random_state = set_seed(SEED)

In [7]:
ds = load_dataset("health_fact", "regular")
clear_output()
np.unique(ds['train']['label'])

array([-1,  0,  1,  2,  3])

In [8]:
ds = (ds
      .map(lambda x : {'label_updated': x['label'] + 1}, remove_columns=['label'])
      .rename_column('label_updated', 'label'))
clear_output()
np.unique(ds['train']['label'])

array([0, 1, 2, 3, 4])

In [9]:
cp = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = BertTokenizer.from_pretrained(cp)
config = BertConfig.from_pretrained(cp)
config.update({'num_labels': 5})
model = BertForSequenceClassification.from_pretrained(cp, config=config)
model.to(DEVICE)
clear_output()

In [10]:
lb = LabelBinarizer()
lb = lb.fit(ds['train']['label'])
ds = (ds
      .map(lambda x : {'label_list': lb.transform([x['label']])[0]}, remove_columns=['label'])
      .rename_column('label_list', 'label'))
clear_output()
ds

DatasetDict({
    train: Dataset({
        features: ['claim', 'claim_id', 'date_published', 'explanation', 'fact_checkers', 'label', 'main_text', 'sources', 'subjects'],
        num_rows: 9832
    })
    test: Dataset({
        features: ['claim', 'claim_id', 'date_published', 'explanation', 'fact_checkers', 'label', 'main_text', 'sources', 'subjects'],
        num_rows: 1235
    })
    validation: Dataset({
        features: ['claim', 'claim_id', 'date_published', 'explanation', 'fact_checkers', 'label', 'main_text', 'sources', 'subjects'],
        num_rows: 1225
    })
})

In [12]:
MAX_LENGTH = 120
def tokenize_and_encode(examples):
    return tokenizer.batch_encode_plus(examples["claim"], truncation=True, padding='max_length', max_length=MAX_LENGTH)

cols = ds["train"].column_names
cols.remove("label")
ds_enc = ds.map(tokenize_and_encode, batched=True, remove_columns=cols, num_proc=14)
clear_output()
ds_enc

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],
        num_rows: 9832
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],
        num_rows: 1235
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],
        num_rows: 1225
    })
})

In [13]:
ds_enc.set_format("torch")
ds_enc = (ds_enc
          .map(lambda x : {"float_label": x["label"].to(torch.float)}, remove_columns=["label"])
          .rename_column("float_label", "label"))
clear_output()
ds_enc['train'][0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'label': tensor([0., 1., 0., 0., 0.]),
 'input_ids': tensor([    2,     6,  1920, 16719,  1920,  2442,  5405,  6896,  8659,  2037,
          2037, 12400, 24662,  2679, 13953,  2141,  2442,  5405,  1982,  8571,
          2141,  1927,  3360,     6,     6,  1977,  5436, 28313,    18,   244,
          1920, 24325, 12471,  1036,  6253,  2112,   234,    62,  6132,  2052,
          1955, 15937,    18,     6,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     

In [14]:
LR = 2e-5
EPS = 1e-8
EPOCHS = 3

def evaluate(model, val_dataloader):
    model.eval()
    loss_val_total = 0
    for batch in val_dataloader:
        inputs = {
                'attention_mask': batch['attention_mask'].to(DEVICE),
                'input_ids': batch['input_ids'].to(DEVICE),
                'token_type_ids': batch['token_type_ids'].to(DEVICE),
                'labels': batch['label'].to(DEVICE),
            }
        with torch.no_grad():
            output = model(**inputs)
            loss = output.loss
        loss_val_total += loss.item()
    loss_val_avg = loss_val_total/len(val_dataloader) 
    return loss_val_avg

def train(model, train_dataloader, val_dataloader):
    optimizer = AdamW(model.parameters(), lr = LR, eps = EPS)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * EPOCHS)
    best_val_loss = 1
    model.train()
    for epoch in range(EPOCHS):
        loss_train_total = 0
        for batch in tqdm(train_dataloader):
            model.zero_grad()
            inputs = {
                'attention_mask': batch['attention_mask'].to(DEVICE),
                'input_ids': batch['input_ids'].to(DEVICE),
                'token_type_ids': batch['token_type_ids'].to(DEVICE),
                'labels': batch['label'].to(DEVICE),
            }
            output = model(**inputs)
            loss = output.loss
            loss_train_total += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        loss_train_avg = loss_train_total / len(train_dataloader)
        loss_val_avg = evaluate(model, val_dataloader)
        print(f'epoch:{epoch+1}/{EPOCHS} train loss={loss_train_avg}  val loss={loss_val_avg}')
        
        if loss_val_avg < best_val_loss:
            best_val_loss = loss_val_avg    
    return best_val_loss

In [15]:
train_dataloader = torch.utils.data.DataLoader(ds_enc['train'], batch_size=32)
val_dataloader = torch.utils.data.DataLoader(ds_enc['validation'], batch_size=32)
train(model, train_dataloader, val_dataloader)

HBox(children=(FloatProgress(value=0.0, max=308.0), HTML(value='')))


epoch:1/3 train loss=0.3198231649766495  val loss=0.26926768169953275


HBox(children=(FloatProgress(value=0.0, max=308.0), HTML(value='')))


epoch:2/3 train loss=0.261448670565695  val loss=0.25971671900688076


HBox(children=(FloatProgress(value=0.0, max=308.0), HTML(value='')))


epoch:3/3 train loss=0.23341736807064575  val loss=0.26568962977482724
[0.25971671900688076]


In [16]:
model.eval()
predictions = []
test_dataloader = torch.utils.data.DataLoader(ds_enc['test'], batch_size=32)
for batch in test_dataloader:
    inputs = {
            'attention_mask': batch['attention_mask'].to(DEVICE),
            'input_ids': batch['input_ids'].to(DEVICE),
            'token_type_ids': batch['token_type_ids'].to(DEVICE),
        }
    with torch.no_grad():
        output = model(**inputs)
        batch_predictions = torch.argmax(output.logits, dim=1)
        predictions.extend(batch_predictions.cpu().detach().numpy().ravel().tolist())

labels = lb.inverse_transform(ds_enc['test']['label'])
print(f"Accuracy of base model is {accuracy_score(y_true=labels, y_pred=predictions):.4f} and f-score is {f1_score(y_true=labels, y_pred=predictions, average='weighted'):.4f}")

Accuracy of base model is 0.6518 and f-score is 0.6138
