data (datasets): https://github.com/dmis-lab/biobert

based on: https://www.youtube.com/watch?v=r6XY80Z9eSA&t=793s

#### 0. Install, download

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q git+https://github.com/huggingface/transformers.git@main 
!pip install -q datasets SentencePiece onnx peft pytorch-lightning

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m124.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import time
import random
import pandas as pd
import numpy as np

from datasets import load_dataset

from transformers import T5Tokenizer, T5ForConditionalGeneration, GPT2Tokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

import re
from tqdm.notebook import tqdm
import textwrap
from termcolor import colored

from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

import json
from operator import itemgetter
from distutils.util import strtobool

import argparse
import glob
import os
import logging
from itertools import chain
from string import punctuation

from pathlib import Path
from termcolor import colored
import textwrap

In [2]:
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


42

#### Data

In [3]:
cd /content/drive/MyDrive/ds/t5-ft

/content/drive/MyDrive/ds/t5-ft


In [4]:
with Path('BioASQ/BioASQ-train-factoid-5b.json').open() as json_file:
  data = json.load(json_file)

In [8]:
data.keys()

dict_keys(['data', 'version'])

In [9]:
data['data'][0].keys()

dict_keys(['paragraphs', 'title'])

In [10]:
questions = data['data'][0]['paragraphs']

In [11]:
questions[3]

{'qas': [{'id': '56bc751eac7ad10019000013_005',
   'question': 'Name synonym of Acrokeratosis paraneoplastica.',
   'answers': [{'text': 'Bazex syndrome', 'answer_start': 30}]}],
 'context': 'Acrokeratosis paraneoplastica Bazex syndrome associated with esophageal squamocellular carcinoma. BACKGROUND: Acrokeratosis paraneoplastica Bazex (APB) is a very rare disease in the group of obligate paraneoplastic dermatoses, associated mostly with squamous cell carcinoma of the upper aerodigestive tract and metastatic cervical lymphadenopathy. The disease is characterized by violaceous erythemosquamous changes on the acral regions. This entity was first reported by Bazex in 1965. About 160 cases have been presented so far. CASE REPORT: We presented a patient with a three-month history of violaceous erythema, edema, erosions and scaling on the acral regions, elbows and knees and severe nail dystrophy. When the diagnosis was established, he did not have any symptom of internal malignancy. Esophago

In [5]:
def extract_questions_and_answers(factoid_path: Path):
  with Path(factoid_path).open() as json_file:
    data = json.load(json_file)

  questions = data['data'][0]['paragraphs']

  data_rows = []

  for question in questions:
    context = question['context']
    for question_and_answers in question['qas']:
      question = question_and_answers['question']
      answers = question_and_answers['answers']

      for answer in answers:
        answer_text = answer['text']
        answer_start = answer['answer_start']
        answer_end = answer_start + len(answer_text)

        data_rows.append({
            'question': question,
            'context': context,
            'answer_text': answer_text,
            'answer_start': answer_start,
            'answer_end': answer_end
        })
  return pd.DataFrame(data_rows)

In [13]:
extract_questions_and_answers('BioASQ/BioASQ-train-factoid-4b.json').head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [6]:
factoid_paths = sorted(list(Path('BioASQ/').glob('BioASQ-train-factoid-*')))
factoid_paths

[PosixPath('BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-6b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-7b.json')]

In [7]:
dfs = []

for factoid_path in factoid_paths[:3]:
  dfs.append(extract_questions_and_answers(factoid_path))

df = pd.concat(dfs)

In [16]:
df.shape

(12988, 5)

In [17]:
df.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [18]:
df.question.nunique(), df.context.nunique(), df.answer_text.nunique()

(443, 2582, 661)

In [19]:
sample_question = df.iloc[240]
sample_question

question                What is the synonym of the lubag disease?
context         Adductor laryngeal breathing dystonia in a pat...
answer_text                        X-linked dystonia-Parkinsonism
answer_start                                                   63
answer_end                                                     93
Name: 240, dtype: object

In [20]:
def color_answer(question):
  answer_start, answer_end = question['answer_start'], question['answer_end']
  context = question['context']

  return colored(context[:answer_start], "white") + \
    colored(context[answer_start:answer_end + 1], "green") + \
    colored(context[answer_end + 1:], "white")

In [21]:
print(sample_question['question'])
print()
print('Answer:')

for wrap in textwrap.wrap(color_answer(sample_question), width = 120):
  print(wrap)

What is the synonym of the lubag disease?

Answer:
Adductor laryngeal breathing dystonia in a patient with lubag (X-linked dystonia-Parkinsonism syndrome). We report a
patient with Lubag (X-linked dystonia-parkinsonism) who presented with severe respiratory stridor from adductor
laryngeal breathing dystonia. Emergency tracheostomy was necessary, and subsequent laryngeal injection with botulinum
toxin led to worsening aspiration. Botulinum toxin injection for severe lingual dystonia was successful.


### Tokenization

In [8]:
MODEL_NAME = "t5-base"

In [9]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [24]:
sample_encoding = tokenizer(
    "Love or to be loved?", 
    "Both, I want to love and to be loved"
)

In [25]:
sample_encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [26]:
print(sample_encoding['input_ids']), print(sample_encoding['attention_mask'])

[2129, 42, 12, 36, 1858, 58, 1, 2867, 6, 27, 241, 12, 333, 11, 12, 36, 1858, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


(None, None)

In [27]:
preds = [
  tokenizer.decode(input_id, skip_special_tokens = True, clean_up_tokenization_spaces=True)
  for input_id in sample_encoding["input_ids"]
]

In [28]:
" ".join(preds)

'Love or to be loved ? </s> Both , I want to love and to be loved </s>'

In [29]:
encoding = tokenizer(
    sample_question["question"],
    sample_question["context"],
    max_length = 396,
    padding = "max_length",
    truncation = "only_second",
    return_attention_mask = True,
    add_special_tokens = True,
    return_tensors = "pt"
)

In [30]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [31]:
print(tokenizer.special_tokens_map)

{'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53

In [32]:
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

In [33]:
tokenizer.decode(encoding["input_ids"].squeeze())

'What is the synonym of the lubag disease?</s> Adductor laryngeal breathing dystonia in a patient with lubag (X-linked dystonia-Parkinsonism syndrome). We report a patient with Lubag (X-linked dystonia-parkinsonism) who presented with severe respiratory stridor from adductor laryngeal breathing dystonia. Emergency tracheostomy was necessary, and subsequent laryngeal injection with botulinum toxin led to worsening aspiration. Botulinum toxin injection for severe lingual dystonia was successful.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [34]:
answer_encoding = tokenizer(
    sample_question["answer_text"],
    max_length = 32,
    padding = "max_length",
    truncation = True,
    return_attention_mask = True,
    add_special_tokens = True,
    return_tensors = 'pt'
)

In [35]:
tokenizer.decode(answer_encoding["input_ids"].squeeze())

'X-linked dystonia-Parkinsonism</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [36]:
labels = answer_encoding["input_ids"]
labels

tensor([[    3,     4,    18, 29000, 16633,    17,  8008,    18, 13212,  7815,
            32, 14378,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

In [37]:
labels[labels == 0] = - 100
labels

tensor([[    3,     4,    18, 29000, 16633,    17,  8008,    18, 13212,  7815,
            32, 14378,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100]])

In [10]:
class BioQADataset(Dataset):

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
  ):

    self.tokenizer = tokenizer
    self.data = data
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index:int):
    data_row = self.data.iloc[index]

    source_encoding = tokenizer(
      data_row['question'],
      data_row['context'],
      max_length = self.source_max_token_len,
      padding = 'max_length',
      truncation = 'only_second',
      add_special_tokens = True,
      return_tensors = 'pt'
    )

    target_encoding = tokenizer(
      data_row['answer_text'],
      max_length = self.target_max_token_len,
      padding = 'max_length',
      truncation = True,
      add_special_tokens = True,
      return_tensors = 'pt'
    )

    labels = target_encoding['input_ids']
    labels[labels == 0] = - 100

    return dict(
        question = data_row['question'],
        context = data_row['context'],
        answer_text = data_row['answer_text'],
        input_ids = source_encoding['input_ids'].flatten(),
        attention_mask = source_encoding['attention_mask'].flatten(),
        labels = labels.flatten()
    )

In [39]:
sample_dataset = BioQADataset(df, tokenizer)

In [40]:
for data in sample_dataset:
  print(data['question'])
  print(data['answer_text'])
  print(data['input_ids'][:20])
  print(data['labels'][:20])
  break

What is the inheritance pattern of Li–Fraumeni syndrome?
autosomal dominant
tensor([  363,    19,     8, 28915,  3275,    13,  1414,   104,   371,  6340,
           35,    23, 12398,    58,     1, 17904,    26,     3,    17,   599])
tensor([ 1510, 10348,   138, 12613,     1,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])


In [11]:
train_df, val_df = train_test_split(df, test_size = 0.05)

In [12]:
train_df.shape, val_df.shape

((12338, 5), (650, 5))

In [13]:
class BioQADataModule(pl.LightningDataModule):
  
  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      source_max_token_len: int = 396,
      target_max_token_len: int = 32
  ):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.source_max_token_len = source_max_token_len
    self.target_max_token_len = target_max_token_len

  def setup(self, stage=None):
    self.train_dataset = BioQADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )
    self.test_dataset = BioQADataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers = 4
    )
    
  def val_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = 4
    )

  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = 4
    )

In [14]:
BATCH_SIZE = 12
N_EPOCHS = 3  

data_module = BioQADataModule(train_df, val_df, tokenizer, batch_size = BATCH_SIZE)

In [62]:
#train_dl = BioQADataModule(train_df, val_df, tokenizer, batch_size = 4).train_dataloader()

In [15]:
data_module.setup()

In [16]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

In [17]:
input_ids = tokenizer(
    "translate English to French: I talk a lot, what to do?",
    return_tensors = "pt"
).input_ids

In [49]:
generated_ids = model.generate(input_ids = input_ids)
generated_ids



tensor([[    0,  1022, 15701,  3933,     6,   546,    31,   222,    18,   565,
           546,    31,   173,  3189,  1143,    58,     1]])

In [50]:
preds = [
  tokenizer.decode(gen_id, skip_special_tokens = True, clean_up_tokenization_spaces=True)
  for gen_id in generated_ids
]

" ".join(preds)

"Je parle beaucoup, qu'est-ce qu'il faut faire?"

In [79]:
text = """
summaraize: what to do? I don't understand what to do? what should i do? Do you know what to do?
"""

In [80]:
input_ids = tokenizer(
    text,
    return_tensors = "pt"
).input_ids

generated_ids = model.generate(input_ids = input_ids)

preds = [
  tokenizer.decode(gen_id, skip_special_tokens = True, clean_up_tokenization_spaces=True)
  for gen_id in generated_ids
]

" ".join(preds)

': what to do?: what to do? what should i do?'

In [81]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pre

In [82]:
encoding

{'input_ids': tensor([[  363,    19,     8, 29443,    13,     8,     3, 11158,     9,   122,
          1994,    58,     1,  2334,    76,  5317,    50,   651,    29,   397,
           138, 10882, 16633,    17,  8008,    16,     3,     9,  1868,    28,
             3, 11158,     9,   122,    41,     4,    18, 29000, 16633,    17,
          8008,    18, 13212,  7815,    32, 14378, 12398,   137,   101,   934,
             3,     9,  1868,    28,  2318,  7893,    41,     4,    18, 29000,
         16633,    17,  8008,    18,  1893,  7815,    32, 14378,    61,   113,
          2569,    28,  5274, 19944,     3,     7,  1788,    26,   127,    45,
           617,    76,  5317,    50,   651,    29,   397,   138, 10882, 16633,
            17,  8008,     5, 15118,     3,  6471,    88,  3481,    32,  2258,
            47,  1316,     6,    11,  8697,    50,   651,    29,   397,   138,
         10672,    28, 14761,    83,    77,   440,    12,   226,    77,  2237,
            12,  4131,    29,    53,  

In [83]:
output = model(
    input_ids = encoding["input_ids"],
    attention_mask = encoding["attention_mask"],
    labels = labels
)

In [84]:
output.keys()

odict_keys(['loss', 'logits', 'past_key_values', 'encoder_last_hidden_state'])

In [85]:
output.logits.shape

torch.Size([1, 32, 32128])

In [86]:
output.loss

tensor(2.2242, grad_fn=<NllLossBackward0>)

### Modeling

In [18]:
class BioQAModel(pl.LightningModule):

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(self, input_ids, attention_mask, labels = None):
    output = self.model(
        input_ids = input_ids,
        attention_mask = attention_mask,
        labels = labels
    )
    return output.loss, output.logits

  def training_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('train_loss', loss, prog_bar = True, logger = True)
    return loss

  def validation_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('val_loss', loss, prog_bar = True, logger = True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log('test_loss', loss, prog_bar = True, logger = True)
    return loss
  
  def configure_optimizers(self):
    return AdamW(self.parameters(), lr = 0.0001)

In [19]:
model = BioQAModel()

In [20]:
checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor = "val_loss",
    mode = "min"
)

In [24]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("training-logs", name = "bio-qa")

In [26]:
trainer = pl.Trainer(
    logger = logger,
    callbacks=[checkpoint_callback],
    max_epochs = N_EPOCHS,
    devices=1, 
    accelerator="gpu",
    log_every_n_steps=30
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [27]:
%load_ext tensorboard

In [28]:
!rm -rf lightning_logs

In [33]:
%tensorboard --logdir ./training_logs

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
/usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /usr/local/lib/python3.10/dist-packages/tensorboard_data_server/bin/server)
Address already in use
Port 6006 is in use by another program. Either identify and stop that program, or start the server with a different port.

In [30]:
trainer.fit(model, data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 1029: 'val_loss' reached 0.09975 (best 0.09975), saving model to '/content/drive/MyDrive/ds/t5-ft/checkpoints/best-checkpoint.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 2058: 'val_loss' reached 0.08391 (best 0.08391), saving model to '/content/drive/MyDrive/ds/t5-ft/checkpoints/best-checkpoint.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 3087: 'val_loss' reached 0.07721 (best 0.07721), saving model to '/content/drive/MyDrive/ds/t5-ft/checkpoints/best-checkpoint.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
trainer.test()

### Predictions

In [34]:
trained_model = BioQAModel.load_from_checkpoint('checkpoints/best-checkpoint.ckpt')
trained_model.freeze()

In [None]:
trained_model.cuda()

In [46]:
def generate_answer(question):
  source_encoding = tokenizer(
      question["question"],
      question["context"],
      max_length = 396,
      padding = "max_length",
      truncation = "only_second",
      return_attention_mask = True,
      add_special_tokens = True,
      return_tensors = "pt"
  )

  generated_ids = trained_model.model.generate(
      input_ids = source_encoding["input_ids"].cuda(),
      attention_mask = source_encoding["attention_mask"].cuda(),
      num_beams = 1,
      max_length = 80,
      repetition_penalty = 1.0,
      early_stopping = True,
      use_cache = True
  )

  preds = [
      tokenizer.decode(generated_id, skip_special_tokens = True, clean_up_tokenization_spaces = True)
      for generated_id in generated_ids
  ]

  return "".join(preds)

In [48]:
sample_question = val_df.iloc[10]

In [49]:
sample_question["question"]

'What is the main component of the Lewy bodies?'

In [50]:
sample_question["answer_text"]

'alpha-Synuclein'

In [51]:
generate_answer(sample_question)

'alpha-Synuclein'