In [1]:
!nvidia-smi

Sun Jan  8 12:44:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers
!pip install pytorch_lightning
!pip install sentencepiece datasets seqeval
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, http

In [90]:
from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset, load_metric
from datasets import DatasetDict
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import wandb

import nltk
nltk.download('punkt')
wandb.login()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

### Model


In [92]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparam):
        super(T5FineTuner, self).__init__()
        self.hparam = hparam
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparam.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            hparam.model_name_or_path
        )
        wandb.init(config=self.hparam)  # initialize W&B run
        self.save_hyperparameters()


    def is_logger(self):
        return True

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        wandb.log({"train_loss": loss})
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        wandb.log({"avg_train_loss": avg_train_loss})

    def validation_step(self, batch, batch_idx):
        #val_loss = compute_val_loss(batch)
        #log the val_loss metric for the ModelCheckpoint callback to monitor
        val_loss = self._step(batch)
        wandb.log({"val_loss": val_loss})
        self.log('val_loss', val_loss, prog_bar=True)
        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        wandb.log({"avg_val_loss": avg_loss}) 

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):

        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        dataloader = DataLoader(train_dataset, batch_size=self.hparam.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=2)
        t_total = (
            (len(dataloader.dataset) //
             (self.hparam.train_batch_size * max(1, self.hparam.n_gpu)))
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="validation", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=2)


In [93]:
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics

            # Log and save results to file
            output_test_results_file = os.path.join(
                pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(
                            key, str(metrics[key])))


In [94]:
args_dict = dict(
    data_dir="jnlpba",  # path for data files
    output_dir="checkpoints",  # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=256,  # todo figure out
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8, # 4/2/1 if t5-small not working
    eval_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=True, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)


### Dataset


In [97]:
from datasets import DatasetDict, Dataset

jnlpba = load_dataset('jnlpba', split=['train[:3000]', "validation[:100]"])
jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})

class JnlpbDataset(Dataset):

    def __init__(self, tokenizer, dataset, type_path, max_len=512):
        self.dataset = dataset[type_path]
        self.max_len = max_len
        self.tokenizer = tokenizer
        # todo make sure i dont need this
        self.tokenizer.max_length = max_len
        self.tokenizer.model_max_length = max_len
        self.inputs = []
        self.targets = []
        self.merge()
        self.convert()
        self._build()
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        tokens = self.dataset["tokens"]
        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask, "tokens": tokens}

    def map_tags(self, row):
        mapping = {
            0: "O",
            1: "B-DNA",
            2: "I-DNA",
            3: "B-RNA",
            4: "I-RNA",
            5: "B-cell_line",
            6: "I-cell_line",
            7: "B-cell_type",
            8: "I-cell_type",
            9: "B-protein",
            10: "I-protein"
        }
        row['ner_tags'] = [[mapping[tag] for tag in row['ner_tags']]][0]
        return row

    def convert(self):
        df_train = pd.DataFrame(self.dataset)
        #df_train = df_train.apply(self.map_tags, axis=1)
        l = []
        l_temp = []
        for i in range(len(df_train)):
            for j in range(len(df_train['ner_tags'][i])):
              if df_train['ner_tags'][i][j] != "O":
                l_temp.append(df_train['ner_tags'][i][j] + ': ' + df_train['tokens'][i][j])
            l.append(l_temp)
            l_temp = []
        d = {'spans': l}
        df_train = df_train.assign(spans=l)
        train = Dataset.from_pandas(df_train)
        self.dataset = train
        return train
    
    def merge_tags(self, tags, tokens):
      #todo test if this works also in the scenario of having two B- tags side by side
      merged_tags = []
      merged_tokens = []
      i = 0
      while i < len(tags):
          if tags[i].startswith('B-'):
              merged_tag = tags[i][2:]
              merged_token = tokens[i]
              i += 1
              while i < len(tags) and tags[i].startswith('I-'):
                  merged_tag += ' ' + tags[i][2:]
                  merged_token += ' ' + tokens[i]
                  i += 1
              merged_tags.append(merged_tag)
              merged_tokens.append(merged_token)
          else:
              merged_tags.append(tags[i])
              merged_tokens.append(tokens[i])
              i += 1
      for i in range(len(merged_tags)):
        s = merged_tags[i].split()[0]
        s = s[0].upper() + s[1:]
        merged_tags[i] = s
      return merged_tags, merged_tokens

    def merge(self):
      df_train = pd.DataFrame(self.dataset)
      df_train = df_train.apply(self.map_tags, axis=1)
      df_train[['ner_tags', 'tokens']] = df_train.apply(lambda x: self.merge_tags(x['ner_tags'], x['tokens']), axis=1, result_type='expand')
      self.dataset = Dataset.from_pandas(df_train)

    def _build(self):
      for idx in range(len(self.dataset)):
          input_, target = " ".join(self.dataset[idx]["tokens"]), "; ".join(
              self.dataset[idx]["spans"])
          input_ = input_.lower() + ' </s>'
          target = target.lower() + " </s>"

          tokenized_inputs = self.tokenizer.batch_encode_plus(
              [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
          )

          tokenized_targets = self.tokenizer.batch_encode_plus(
              [target], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
          )
          self.inputs.append(tokenized_inputs)
          self.targets.append(tokenized_targets)

tokenizer = AutoTokenizer.from_pretrained('t5-small')

input_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='train')



  0%|          | 0/2 [00:00<?, ?it/s]

In [98]:
args = argparse.Namespace(**args_dict)
model = T5FineTuner(args)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [99]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filename=args.output_dir+"/checkpoint.pth", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    accelerator='gpu',
    #gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision=32,
    #amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    #checkpoint_callback=checkpoint_callback,
    callbacks=[checkpoint_callback, LoggingCallback()],
)

In [100]:
def get_dataset(tokenizer, type_path, args):
    tokenizer.max_length = args.max_seq_length
    tokenizer.model_max_length = args.max_seq_length
    jnlpba = load_dataset('jnlpba', split=['train[:3000]', "validation[:100]"])
    jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})
    dataset = jnlpba
    return JnlpbDataset(tokenizer=tokenizer, dataset=dataset, type_path=type_path)

In [101]:
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model)

  rank_zero_warn(
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


### Evaluation

In [102]:
#todo path needs to be adjusted depending on where model was saved 
wandb.save('/content/lightning_logs/version_4/checkpoints/checkpoints/checkpoint.pth.ckpt')
api = wandb.Api()
#todo project path needs to be adjusted
run = api.run("htw/uncategorized/7w324geo")
for file in run.files():
    file.download(replace=True)
#depends on where i save the model
model = model.load_from_checkpoint("/content/checkpoint.pth.ckpt")
wandb.finish()



VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.070149…

VBox(children=(Label(value='0.001 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.075583…

In [103]:
import textwrap
#todo adjust to full val data
jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})
input_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='train')

dataloader = DataLoader(input_dataset, batch_size=32, num_workers=2, shuffle=True)
model.model.eval()
model = model.to("cpu")
outputs = []
targets = []
texts = []

for batch in dataloader:
    outs = model.model.generate(input_ids=batch['source_ids'],
                                attention_mask=batch['source_mask'])
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    text = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    texts.extend(text)
    outputs.extend(dec)
    targets.extend(target)
    break

for i in range(10):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual Entities: %s" % target[i])
    print("Predicted Entities: %s" % outputs[i])
    print("=====================================================================\n")



  0%|          | 0/2 [00:00<?, ?it/s]



text: delineation of the cd28 signaling cascade was found to involve protein tyrosine kinase
activity , followed by the activation of phospholipase a2 and 5-lipoxygenase .

Actual Entities: protein: cd28; protein: protein tyrosine kinase; protein: phospholipase a2; protein: 5-lipoxygenase
Predicted Entities: protein: cd28 signaling cascade; protein: protein tyrosine

text: hiv-1 and hiv-2 display significant differences in nucleic acid sequence and in the natural
history of clinical disease .

Actual Entities: 
Predicted Entities: protein: hiv-1; protein: hiv-2; protein: nucleic acid sequence

text: in primary t lymphocytes we show that cd28 ligation leads to the rapid intracellular formation
of reactive oxygen intermediates ( rois ) which are required for cd28 -mediated activation of the
nf-kappa b / cd28-responsive complex and il-2 expression .

Actual Entities: cell_type: primary t lymphocytes; protein: cd28; protein: cd28; protein: nf-kappa b; protein: cd28-responsive complex; prot

In [104]:
def label_pred(input, actual):
  predictions = []

  for s in input:
    pairs = s.split(";")
    dict_list = []
    for p in pairs:
      key_value = p.split(":")
      if len(key_value) == 2:
        key, value = key_value
        dict_list.append({key.strip(): value.strip()})
      else:
        dict_list.append({"key": "null"})
    predictions.append(dict_list)

  tokens = []
  t = []
  for token_tuple in actual:
    for i in token_tuple:
      t.append(i[0])
    tokens.append(t)
    t = []

  result = []

  for i, (dicts, t) in enumerate(zip(predictions, tokens)):
    lst = ["O"] * len(t)
    for d in dicts:
      for k, v in d.items():
        if v in t:
          lst[t.index(v)] = k
    result.append(lst)

  return result

In [105]:
from tqdm import tqdm

test_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='validation')
#todo check if my removing shuffle=True of has any dramatic impact 
test_loader = DataLoader(test_dataset, batch_size=32,
                             num_workers=2)
model.model.eval()
model = model.to("cuda")
outputs = []
targets = []
all_text = []
true_labels = []
pred_labels = []
predictions = []
predictions_temp = []
counter = 0
for batch in tqdm(test_loader):
    counter += 1
    input_ids = batch['source_ids'].to("cuda")
    attention_mask = batch['source_mask'].to("cuda")
    outs = model.model.generate(input_ids=input_ids,
                                attention_mask=attention_mask)

    dec = [tokenizer.decode(ids, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False).strip() for ids in outs]

    target = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    texts = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    
    predicted_label = label_pred(dec, batch["tokens"])
    true_label = label_pred(target, batch["tokens"])

    outputs.extend(dec)
    targets.extend(target)
    true_labels.extend(true_label)
    pred_labels.extend(predicted_label)
    all_text.extend(texts)

100%|██████████| 1/1 [00:00<00:00,  1.82it/s]


In [106]:
from datasets import load_metric

metric = load_metric("seqeval")

for i in range(10):
    print(f"Text:  {all_text[i]}")
    print(f"targets:  {outputs[i]}")
    print(f"Predicted Token Class:  {pred_labels[i]}")
    print(f"True Token Class:  {true_labels[i]}")
    print("=====================================================================\n")

print(metric.compute(predictions=pred_labels, references=true_labels))

Text:  number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action .
targets:  protein: glucocorticoid receptors; protein: lymphocytes;
Predicted Token Class:  ['O', 'O', 'protein', 'O', 'protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True Token Class:  ['O', 'O', 'protein', 'O', 'cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Text:  the study demonstrated a decreased level of glucocorticoid receptors ( gr ) in peripheral blood lymphocytes from hypercholesterolemic subjects , and an elevated level in patients with acute myocardial infarction .
targets:  protein: glucocorticoid receptors; protein: g; protein
Predicted Token Class:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True Token Class:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'protein', 'O', 'O', 'O', 'O', 'cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Text:  in 

