In [None]:
!pip install transformers
!pip install pytorch_lightning
!pip install sentencepiece datasets seqeval

In [1]:
from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset, load_metric
from datasets import DatasetDict
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/maxhager/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device = torch.device("cpu")

In [3]:
print(device)

cpu


In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

### Model


In [5]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparam):
        super(T5FineTuner, self).__init__()
        self.hparam = hparam
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparam.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(
            hparam.model_name_or_path
        )
        self.save_hyperparameters()

    def is_logger(self):
        return True

    def forward(
        self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}

    def validation_step(self, batch, batch_idx):
        #val_loss = compute_val_loss(batch)
        #log the val_loss metric for the ModelCheckpoint callback to monitor
        val_loss = self._step(batch)
        self.log('val_loss', val_loss, prog_bar=True)
        return {"val_loss": val_loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):

        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        dataloader = DataLoader(train_dataset, batch_size=self.hparam.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=2)
        t_total = (
            (len(dataloader.dataset) //
             (self.hparam.train_batch_size * max(1, self.hparam.n_gpu)))
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(
            tokenizer=self.tokenizer, type_path="validation", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=2)


In [6]:
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics

            # Log and save results to file
            output_test_results_file = os.path.join(
                pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(
                            key, str(metrics[key])))


In [7]:
args_dict = dict(
    data_dir="jnlpba",  # path for data files
    output_dir="checkpoints",  # path to save the checkpoints
    model_name_or_path='t5-small',
    tokenizer_name_or_path='t5-small',
    max_seq_length=256,  # todo figure out
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8, # 4/2/1 if t5-small not working
    eval_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=16,
    #n_gpu=1,
    early_stop_callback=False,
    fp_16=True, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)


### Dataset


In [8]:
from datasets import DatasetDict, Dataset

jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})

class JnlpbDataset(Dataset):

    def __init__(self, tokenizer, dataset, type_path, max_len=512):
        self.dataset = dataset[type_path]
        self.max_len = max_len
        self.tokenizer = tokenizer
        # todo make sure i dont need this
        self.tokenizer.max_length = max_len
        self.tokenizer.model_max_length = max_len
        self.inputs = []
        self.targets = []
        self.merge()
        self.convert()
        self._build()
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        #tokens = self.datatset.tokens[index]  # add this line
        #tokens = self.tokenizer.convert_ids_to_tokens(source_ids)
        tokens = self.dataset["tokens"]
        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask, "tokens": tokens}

    def map_tags(self, row):
        mapping = {
            0: "O",
            1: "B-DNA",
            2: "I-DNA",
            3: "B-RNA",
            4: "I-RNA",
            5: "B-cell_line",
            6: "I-cell_line",
            7: "B-cell_type",
            8: "I-cell_type",
            9: "B-protein",
            10: "I-protein"
        }
        row['ner_tags'] = [[mapping[tag] for tag in row['ner_tags']]][0]
        return row

    def convert(self):
        df_train = pd.DataFrame(self.dataset)
        #df_train = df_train.apply(self.map_tags, axis=1)
        l = []
        l_temp = []
        for i in range(len(df_train)):
            for j in range(len(df_train['ner_tags'][i])):
              if df_train['ner_tags'][i][j] != "O":
                l_temp.append(df_train['ner_tags'][i][j] + ': ' + df_train['tokens'][i][j])
            l.append(l_temp)
            l_temp = []
        d = {'spans': l}
        df_train = df_train.assign(spans=l)
        for i in df_train["spans"]:
          print(i)
        train = Dataset.from_pandas(df_train)
        self.dataset = train
        return train
    
    def merge_tags(self, tags, tokens):
      #todo test if this works also in the scenario of having two B- tags side by side
      merged_tags = []
      merged_tokens = []
      i = 0
      while i < len(tags):
          if tags[i].startswith('B-'):
              merged_tag = tags[i][2:]
              merged_token = tokens[i]
              i += 1
              while i < len(tags) and tags[i].startswith('I-'):
                  merged_tag += ' ' + tags[i][2:]
                  merged_token += ' ' + tokens[i]
                  i += 1
              merged_tags.append(merged_tag)
              merged_tokens.append(merged_token)
          else:
              merged_tags.append(tags[i])
              merged_tokens.append(tokens[i])
              i += 1
      for i in range(len(merged_tags)):
        s = merged_tags[i].split()[0]
        #merged_tags[i].split()[0] = 
        s = s[0].upper() + s[1:]
        merged_tags[i] = s
      return merged_tags, merged_tokens

    def merge(self):
      df_train = pd.DataFrame(self.dataset)
      df_train = df_train.apply(self.map_tags, axis=1)
      df_train[['ner_tags', 'tokens']] = df_train.apply(lambda x: self.merge_tags(x['ner_tags'], x['tokens']), axis=1, result_type='expand')
      self.dataset = Dataset.from_pandas(df_train)

    def _build(self):
      for idx in range(len(self.dataset)):
          input_, target = " ".join(self.dataset[idx]["tokens"]), "; ".join(
              self.dataset[idx]["spans"])
          input_ = input_.lower() + ' </s>'
          target = target.lower() + " </s>"

          tokenized_inputs = self.tokenizer.batch_encode_plus(
              [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
          )

          tokenized_targets = self.tokenizer.batch_encode_plus(
              [target], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
          )
          self.inputs.append(tokenized_inputs)
          self.targets.append(tokenized_targets)

tokenizer = AutoTokenizer.from_pretrained('t5-small')

input_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='train')

Found cached dataset jnlpba (/Users/maxhager/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4)


  0%|          | 0/2 [00:00<?, ?it/s]

['DNA: IL-2 gene', 'Protein: NF-kappa B', 'Protein: CD28', 'Protein: 5-lipoxygenase']
['Protein: CD28 surface receptor', 'Protein: interleukin-2', 'Protein: IL-2']
['Cell_type: primary T lymphocytes', 'Protein: CD28', 'Protein: CD28', 'Protein: NF-kappa B', 'Protein: CD28-responsive complex', 'Protein: IL-2']
['Protein: CD28', 'Protein: protein tyrosine kinase', 'Protein: phospholipase A2', 'Protein: 5-lipoxygenase']
['Protein: lipoxygenase metabolites', 'Protein: IL-2', 'Protein: NF-kappa B']
['Protein: CD28']
['DNA: peri-kappa B site', 'DNA: human immunodeficiency virus type 2 enhancer', 'Cell_type: monocytes', 'Cell_type: T cells']
[]
[]
['DNA: enhancer/promoter region']


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
'''for i in range(len(input_dataset)):
    _ = input_dataset[i]'''

'for i in range(len(input_dataset)):\n    _ = input_dataset[i]'

In [10]:
'''data = input_dataset[0]

print(tokenizer.decode(data["source_ids"], skip_special_tokens=False))
print(tokenizer.decode(data["target_ids"], skip_special_tokens=False))'''

'data = input_dataset[0]\n\nprint(tokenizer.decode(data["source_ids"], skip_special_tokens=False))\nprint(tokenizer.decode(data["target_ids"], skip_special_tokens=False))'

In [11]:
args = argparse.Namespace(**args_dict)
model = T5FineTuner(args)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filename=args.output_dir+"/checkpoint.pth", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    #accelerator='gpu',
    #gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision=32,
    #amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    #checkpoint_callback=checkpoint_callback,
    callbacks=[checkpoint_callback, LoggingCallback()],
)

In [13]:
def get_dataset(tokenizer, type_path, args):
    tokenizer.max_length = args.max_seq_length
    tokenizer.model_max_length = args.max_seq_length
    jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
    jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})
    dataset = jnlpba
    return JnlpbDataset(tokenizer=tokenizer, dataset=dataset, type_path=type_path)

In [14]:
trainer = pl.Trainer(**train_params)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


In [15]:
trainer.fit(model)

  rank_zero_warn(

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Found cached dataset jnlpba (/Users/maxhager/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4)


  0%|          | 0/2 [00:00<?, ?it/s]

['Protein: glucocorticoid receptors', 'Cell_type: lymphocytes']
['Protein: glucocorticoid receptors', 'Protein: GR', 'Cell_type: peripheral blood lymphocytes']
['Cell_type: lymphocytes', 'Protein: GR', 'Cell_type: control cells']
['Protein: GR']
['Cell_type: lymphocytes', 'Protein: GR']
['Protein: 1 , 25-Dihydroxyvitamin D3 receptors', 'Cell_type: lymphocytes', 'Cell_type: T- and B-lymphocyte']
['Cell_type: lymphocytes']
[]
['Cell_type: T lymphocytes']
[]


  rank_zero_warn(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/homebrew/Cellar/python@3.10/3.10.9/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'JnlpbDataset' on <module '__main__' (built-in)>


In [None]:
# add wandb logger
# push to hf hub
# metrics 
#next steps
#!rm -r "/content/lightning_logs"

### Evaluation

In [None]:
model = model.load_from_checkpoint("/content/lightning_logs/version_0/checkpoints/checkpoints/checkpoint.pth.ckpt")

In [None]:
import textwrap
jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})
input_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='train')

dataloader = DataLoader(input_dataset, batch_size=32, num_workers=2, shuffle=True)
model.model.eval()
model = model.to("cpu")
outputs = []
targets = []
texts = []

#catched in 
#i want to check if the text and the predicted entities are alright
#i need to change the representation of the entities. 
for batch in dataloader:
    outs = model.model.generate(input_ids=batch['source_ids'],
                                attention_mask=batch['source_mask'])
    dec = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip() for ids in outs]
    target = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    text = [tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]
    texts.extend(text)
    outputs.extend(dec)
    targets.extend(target)
    break

for i in range(10):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual Entities: %s" % target[i])
    print("Predicted Entities: %s" % outputs[i])
    print("=====================================================================\n")



  0%|          | 0/2 [00:00<?, ?it/s]



text: consistent with these differences , we have previously demonstrated that the enhancer/promoter
region of hiv-2 functions quite differently from that of hiv-1 .

Actual Entities: b-dna: enhancer/promoter; i-dna: region
Predicted Entities: Wir haben bereits gezeigt, dass die enhancing/promoter region hiv-2 sehr unterschiedlich

text: our data suggest that lipoxygenase metabolites activate roi formation which then induce il-2
expression via nf-kappa b activation .

Actual Entities: b-protein: lipoxygenase; i-protein: metabolites; b-protein: il-2; b-protein: nf-kappa; i-protein: b
Predicted Entities: data suggest that lipoxygenase metabolites activate roi formation

text: hiv-1 and hiv-2 display significant differences in nucleic acid sequence and in the natural
history of clinical disease .

Actual Entities: 
Predicted Entities: hiv-1 et hiv-2 présentent signifikante Unterschiede in

text: these findings should be useful for therapeutic strategies and the development of
immunosuppre

In [None]:
def find_sub_list(sl, l):
    results = []
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind+sll] == sl:
            results.append((ind, ind+sll-1))
    return results

def generate_label(input: str, target: str):
    mapper = {
        "O": 0,
        "B-DNA": 1,
        "I-DNA": 2,
        "B-RNA": 3,
        "I-RNA": 4,
        "B-cell_line": 5,
        "I-cell_line": 6,
        "B-cell_type": 7,
        "I-cell_type": 8,
        "B-protein": 9,
        "I-protein": 10
    }

    mapper_2 = {k.lower(): k for k in mapper.keys()}

    input = input.split(" ")

    target = target.split("; ")

    init_target_label = [mapper["O"]] * len(input)

    for ent in target:
        ent = ent.split(": ")
        try:
            sent_end = ent[1].split(" ")
            index = find_sub_list(sent_end, input)
        except:
            continue
        try:
            init_target_label[index[0][0]] = mapper_2[ent[0]] #mapper[f"B-{ent[0]}"]
            for i in range(index[0][0]+1, index[0][1]+1):
                init_target_label[i] = mapper_2[ent[0]] #mapper[f"I-{ent[0]}"]
        except:
            continue

    return init_target_label

In [32]:
l = []
strings = ["B-DNA: gen2, I-DNA: gene", "B-protein: CD28, B-protein: protein"]

for string in strings:
    sublist = []
    for item in string.split(", "):
        sublist.append(item.split(": ")[1])
    l.append(sublist)

#need to catch index errors

print(l)


[['gen2', 'gene'], ['CD28', 'protein']]


In [33]:
from tqdm import tqdm

test_dataset = JnlpbDataset(tokenizer=tokenizer, dataset=jnlpba, type_path='validation')

test_loader = DataLoader(test_dataset, batch_size=32,
                             num_workers=2, shuffle=True)
model.model.eval()
model = model.to("cuda")
outputs = []
targets = []
all_text = []
true_labels = []
pred_labels = []
predictions = []
predictions_temp = []
counter = 0
for batch in tqdm(test_loader):
    tokens = []
    t = []
    for token_tuple in batch["tokens"]:
      for i in token_tuple:
        t.append(i[0])
      tokens.append(t)
      t = []
    print(tokens)
    #okay now finally i have the desired state 
    #next step is to extract all the values
    #format is [B-DNA: IL-2, I-DNA: gene, B-protein: NF-kappa]
    #how can I get [IL-2, gene..]?

    counter += 1
    input_ids = batch['source_ids'].to("cuda")
    attention_mask = batch['source_mask'].to("cuda")
    outs = model.model.generate(input_ids=input_ids,
                                attention_mask=attention_mask)

    #thet
    dec = [tokenizer.decode(ids, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False).strip() for ids in outs]

    l = []

    for s in dec:
        sublist = []
        for item in s.split(", "):
          try:
            sublist.append(item.split(": ")[1])
          except IndexError:
            pass
        l.append(sublist)
    print(l)

    #i have a list
    #iter over list and extract 
    target = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["target_ids"]]
    texts = [tokenizer.decode(ids, skip_special_tokens=True,  clean_up_tokenization_spaces=False).strip()
                for ids in batch["source_ids"]]

    true_label = [generate_label(texts[i].strip(), target[i].strip()) if target[i].strip() != 'none' else [
        "O"]*len(texts[i].strip().split()) for i in range(len(texts))]
    
    pred_label = [generate_label(texts[i].strip(), dec[i].strip()) if dec[i].strip() != 'none' else [
        "O"]*len(texts[i].strip().split()) for i in range(len(texts))]

    outputs.extend(dec)
    targets.extend(target)
    true_labels.extend(true_label)
    pred_labels.extend(pred_label)
    all_text.extend(texts)

['Protein: glucocorticoid receptors', 'Cell_type: lymphocytes']
['Protein: glucocorticoid receptors', 'Protein: GR', 'Cell_type: peripheral blood lymphocytes']
['Cell_type: lymphocytes', 'Protein: GR', 'Cell_type: control cells']
['Protein: GR']
['Cell_type: lymphocytes', 'Protein: GR']
['Protein: 1 , 25-Dihydroxyvitamin D3 receptors', 'Cell_type: lymphocytes', 'Cell_type: T- and B-lymphocyte']
['Cell_type: lymphocytes']
[]
['Cell_type: T lymphocytes']
[]




  0%|          | 0/1 [00:00<?, ?it/s][A[A

[['Number', 'of', 'glucocorticoid receptors', 'in', 'lymphocytes', 'and', 'their', 'sensitivity', 'to', 'hormone', 'action', '.'], ['The', 'study', 'demonstrated', 'a', 'decreased', 'level', 'of', 'glucocorticoid receptors', '(', 'GR', ')', 'in', 'peripheral blood lymphocytes', 'from', 'hypercholesterolemic', 'subjects', ',', 'and', 'an', 'elevated', 'level', 'in', 'patients', 'with', 'acute', 'myocardial', 'infarction', '.'], ['In', 'the', 'lymphocytes', 'with', 'a', 'high', 'GR', 'number', ',', 'dexamethasone', 'inhibited', '[', '3H', ']', '-thymidine', 'and', '[', '3H', ']', '-acetate', 'incorporation', 'into', 'DNA', 'and', 'cholesterol', ',', 'respectively', ',', 'in', 'the', 'same', 'manner', 'as', 'in', 'the', 'control cells', '.'], ['On', 'the', 'other', 'hand', ',', 'a', 'decreased', 'GR', 'number', 'resulted', 'in', 'a', 'less', 'efficient', 'dexamethasone', 'inhibition', 'of', 'the', 'incorporation', 'of', 'labeled', 'compounds', '.'], ['These', 'data', 'showed', 'that', 'th

  0%|          | 0/1 [00:00<?, ?it/s]


IndexError: ignored

In [None]:
all_text[4]

'on the other hand , a decreased gr number resulted in a less efficient dexamethasone inhibition of the incorporation of labeled compounds .'

In [None]:
from datasets import load_metric

metric = load_metric("seqeval")

for i in range(10):
    print(f"Text:  {all_text[i]}")
    print(f"targets:  {outputs[i]}")
    print(f"Predicted Token Class:  {pred_labels[i]}")
    print(f"True Token Class:  {true_labels[i]}")
    print("=====================================================================\n")

    #Protein: Berlin
    #l = [0,0,Berlin,0]

print(metric.compute(predictions=pred_labels, references=true_labels))

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Text:  [ 1 , 25-dihydroxyvitamin d3 receptors in lymphocytes and t- and b-lymphocyte count in patients with glomerulonephritis ]
targets:  glomerulonephritis ]
Predicted Token Class:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
True Token Class:  [0, 'B-protein', 'I-protein', 'I-protein', 'I-protein', 'I-protein', 0, 'B-cell_type', 'I-cell_type', 'B-cell_type', 0, 'I-cell_type', 0, 0, 0, 0, 0, 0]

Text:  content of receptors to hormonal form of vitamin d3 , 1.25 ( oh ) 2d3 , constituted 27.3 fmole/mg of protein in lymphocytes of peripheric blood of children with glomerulonephritis .
targets:  , d3 , 1,25 ( oh ) 2d3
Predicted Token Class:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
True Token Class:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'B-cell_type', 0, 0, 0, 0, 0, 0, 0, 0]

Text:  the study demonstrated a decreased level of glucocorticoid receptors ( gr ) in peripheral blood lymphocytes from hy

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
