data (datasets): https://github.com/dmis-lab/biobert

based on: https://www.youtube.com/watch?v=r6XY80Z9eSA&t=793s

#### 0. Install, download

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git@main 
!pip install -q datasets SentencePiece onnx peft pytorch-lightning

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import time
import random
import pandas as pd
import numpy as np

from datasets import load_dataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, GPT2Tokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

import re
from tqdm.notebook import tqdm
import textwrap
from termcolor import colored

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

import json
from operator import itemgetter
from distutils.util import strtobool

import argparse
import glob
import os
import logging
from itertools import chain
from string import punctuation

from pathlib import Path
from termcolor import colored
import textwrap

In [2]:
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


42

#### Data

In [3]:
cd /content/drive/MyDrive/ds/t5-ft/boolq

/content/drive/MyDrive/ds/t5-ft/boolq


In [4]:
path_data = "../../boolq/"

srcs = [
    {
        "stream": lambda: open(path_data+ "train.jsonl", "r"),
        "keys": ["question", "answer", "passage"],
    },
    {
        "stream": lambda: open(path_data+ "dev.jsonl", "r"),
        "keys": ["question", "answer", "passage"],
    },
]

In [5]:
def create_dataset(src):
  df = pd.DataFrame(columns = ["question", "answer", "passage"])
  with src["stream"]() as s:
    for d in tqdm(s):
      q, a, p = itemgetter(src["keys"][0], src["keys"][1], src["keys"][2])(
        json.loads(d)
    )
      df.loc[len(df), :] = [q, str(a), p]

    return df

In [6]:
train_ds = create_dataset(srcs[0])
test_ds = create_dataset(srcs[1])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [7]:
train_df = train_ds[['question', 'passage', 'answer']].head(1700)
val_df = train_ds[['question', 'passage', 'answer']].tail(500)
test_df = test_ds[['question', 'passage', 'answer']].head(500)

In [9]:
len(max(train_df.passage.values))

556

### Tokenization

In [8]:
MODEL_NAME = "t5-base"

In [9]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
train_df.answer.unique()

array(['True', 'False'], dtype=object)

In [11]:
class BoolQADataset(Dataset):

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      source_max_token_len: int = 396,
      target_max_token_len: int = 8
  ):

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index:int):
    data_row = self.data.iloc[index]

    source_encoding = tokenizer(
      data_row['question'],
      data_row['passage'],
      max_length = self.source_max_token_len,
      padding = 'max_length',
      truncation = 'only_second',
      add_special_tokens = True,
      return_tensors = 'pt'
    )

    target_encoding = tokenizer(
      data_row['answer'],
      max_length = self.target_max_token_len,
      padding = 'max_length',
      truncation = True,
      add_special_tokens = True,
      return_tensors = 'pt'
    )

    labels = target_encoding['input_ids']
    labels[labels == 0] = - 100

    return dict(
        question = data_row['question'],
        context = data_row['passage'],
        answer_text = data_row['answer'],
        input_ids = source_encoding['input_ids'].flatten(),
        attention_mask = source_encoding['attention_mask'].flatten(),
        labels = labels.flatten()
    )

In [12]:
sample_dataset = BoolQADataset(train_df, tokenizer)

In [13]:
for sample in sample_dataset:
  print(sample)
  break

{'question': 'do iran and afghanistan speak the same language', 'context': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.', 'answer_text': 'True', 'input_ids': tensor([  103,     3,    23,  2002,    11,     3,     9,    89, 22637,   343,
          152,  2516,     8,   337,  1612,     1, 25518,    41,    87,     2,
          102,     2,    52,     2,    29,     6,     3,    18,     2,    29,
           87,   201,    92,   801,    57,   165,   414, 19140,  

In [14]:
class BoolQADataModule(pl.LightningDataModule):
  
  def __init__(
      self,
      train_df: pd.DataFrame,
      val_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 8
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def setup(self, stage=None):
    self.train_dataset = BoolQADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )
    self.val_dataset = BoolQADataset(
        self.val_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )
    self.test_dataset = BoolQADataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers = 4
    )
    
  def val_dataloader(self):
    return DataLoader(
        self.val_dataset,
        batch_size = 1,
        num_workers = 4
    )

  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = 4
    )

In [15]:
BATCH_SIZE = 12
N_EPOCHS = 6

data_module = BoolQADataModule(train_df, val_df, test_df, tokenizer, batch_size = BATCH_SIZE)

In [16]:
data_module.setup()

In [17]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

### Modeling

In [18]:
class BoolQAModel(pl.LightningModule):

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(self, input_ids, attention_mask, labels = None):
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels
    )
    return output.loss, output.logits

  def training_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('train_loss', loss, prog_bar = True, logger = True)
    return loss

  def validation_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('val_loss', loss, prog_bar = True, logger = True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('test_loss', loss, prog_bar = True, logger = True)
    return loss
  
  def configure_optimizers(self):
    return AdamW(self.parameters(), lr = 0.0001)

In [19]:
model = BoolQAModel()

In [20]:
!ls

checkpoints  res.csv  training-logs


In [21]:
checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor = "val_loss",
    mode = "min"
)

In [22]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("training-logs", name = "bool-qa")

In [23]:
trainer = pl.Trainer(
    logger = logger,
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    devices=1, 
    accelerator="gpu",
    log_every_n_steps=30
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(model, data_module)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 142: 'val_loss' reached 0.23328 (best 0.23328), saving model to '/content/drive/MyDrive/ds/t5-ft/boolq/checkpoints/best-checkpoint-v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 284: 'val_loss' reached 0.20853 (best 0.20853), saving model to '/content/drive/MyDrive/ds/t5-ft/boolq/checkpoints/best-checkpoint-v2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 426: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 568: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 4, global step 710: 'val_loss' was not in top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 5, global step 852: 'val_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


In [None]:
trainer.test()

### Predictions

In [33]:
trained_model = BoolQAModel.load_from_checkpoint('checkpoints/best-checkpoint.ckpt')
trained_model.freeze()

In [None]:
trained_model.cuda()

In [35]:
def generate_answer(question):
  source_encoding = tokenizer(
      question["question"],
      question["passage"],
      max_length = 396,
      padding = "max_length",
      truncation = "only_second",
      return_attention_mask = True,
      add_special_tokens = True,
      return_tensors = "pt"
  )

  generated_ids = trained_model.model.generate(
      input_ids = source_encoding["input_ids"].cuda(),
      attention_mask = source_encoding["attention_mask"].cuda(),
      num_beams = 1,
      max_length = 80,
      repetition_penalty = 1.0,
      early_stopping = True,
      use_cache = True
  )

  preds = [
      tokenizer.decode(generated_id, skip_special_tokens = True, clean_up_tokenization_spaces = True)
      for generated_id in generated_ids
  ]

  return "".join(preds)

In [87]:
sample_question = val_df.iloc[3]

In [88]:
sample_question["question"]

'has a team won a superbowl at home'

In [89]:
sample_question["answer"]

'False'

In [90]:
generate_answer(sample_question)

'False'

In [None]:
val_df

In [36]:
def evaluation(df):
  res = []

  for index, question in tqdm(df.iterrows()):
    predicted = generate_answer(question)
    actual = question['answer']
    correct = predicted == actual

    res.append({
            'index': index,
            'question': question["question"],
            'context': question["passage"],
            'actual': actual,
            'predicted': predicted,
            'correct': correct
        })
    
  return pd.DataFrame(res)

In [37]:
ev = evaluation(val_df)

0it [00:00, ?it/s]

In [95]:
ev

Unnamed: 0,index,question,context,actual,predicted,correct
0,8927,was the byzantine empire a continuation of the...,"The Byzantine Empire, also referred to as the ...",True,True,True
1,8928,is cabo the same as cabo san lucas,Cabo San Lucas (Spanish pronunciation: (ˈkaβo ...,True,True,True
2,8929,is john wayne airport the same as santa ana,"John Wayne Airport (IATA: SNA, ICAO: KSNA, FAA...",True,True,True
3,8930,has a team won a superbowl at home,So far no team has yet managed to reach the ch...,False,False,True
4,8931,is metro pcs part of t-mobile,MetroPCS (stylized as metroPCS) is a prepaid w...,True,True,True
...,...,...,...,...,...,...
495,9422,is a us district court a federal court,The United States district courts are the gene...,True,True,True
496,9423,can a tenant get a restraining order against a...,"If a landlord is found to be retaliating, he o...",True,True,True
497,9424,is the golden state warriors in the playoffs,The Warriors went into the 2018 playoffs as th...,True,True,True
498,9425,downton abbey will there be a season 7,Downton Abbey is a British period drama televi...,False,True,False


In [96]:
ev.correct.unique()

array([ True, False])

In [38]:
acc = sum(list(ev.correct.values))/len(ev)
acc #

0.672

In [46]:
COLUMNS = ['model', 'dataset', 'n_epochs', 'batch_size', 'time_1e', 'acc']

In [51]:
res = pd.read_csv('res.csv')

In [52]:
res = res[COLUMNS]

In [53]:
res.loc[len(res), :] =  [MODEL_NAME, '1700/500/500', 3, BATCH_SIZE, 171, acc]

In [54]:
res

Unnamed: 0,model,dataset,n_epochs,batch_size,time_1e,acc
0,t5-base,1000/500/500,3.0,12.0,124.0,0.672
1,t5-base,1700/500/500,3.0,12.0,171.0,0.672


In [55]:
res.to_csv('res.csv', index = False)