# tool NER with different gold standards
Fine-tune existing bioNER models for biomedical tool identification using two different gold standards, and compare their performance on the same test set.

In [1]:
from datasets import Dataset, Features, Sequence, Value, ClassLabel
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import pandas as pd
import torch
import random
from iob_functions import *

# Seed every random number generator this notebook imports, not only Python's
# `random`: numpy and torch each keep their own generator state, so seeding
# `random` alone leaves them unseeded.
# NOTE(review): presumably the freshly initialised classifier head reported in the
# training logs draws from torch's generator — confirm.
SEED = 602
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)  # kept last: it returns None, so the cell shows no stray output

In [2]:
# Load the full-text gold standard, one IOB file per split (train / validation / test).
# process_tab_delim_iob comes from iob_functions; later cells index its result with
# the 'sentences' and 'tags' keys.
# NOTE(review): the files are presumably tab-delimited, going by the helper's name — confirm.
BASE_DIR = "../data/"

training = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tools_train.iob')
validation = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tools_validation.iob')
test = process_tab_delim_iob(BASE_DIR + 'tags/fulltext_iob/fulltext_tools_test.iob')

In [14]:
# Second gold standard: the softcite annotations, read from a single IOB file.
# NOTE(review): combine_tags, remove_tag and split_training are opaque helpers from
# iob_functions. Judging by the calls, 'software' tags are relabelled as TOOL and
# 'version' tags are dropped so the data fits this notebook's tag set — confirm
# against the helper source.
softcite_train = process_tab_delim_iob(BASE_DIR + 'tags/softcite.iob')
softcite_train = combine_tags(softcite_train, 'software', 'TOOL')
softcite_train = remove_tag(softcite_train, 'version')

# softcite ships as one file, so a validation split is carved out of it here.
# NOTE(review): whether the split is random (and so depends on the seed) is not visible here.
softcite_train, softc_validation = split_training(softcite_train, .1) # take 10% of total for validation

In [None]:
# Tag distribution of the softcite training split (tag_stats comes from iob_functions).
tag_stats(softcite_train)

In [3]:
# Tag distribution of the full-text training split; the recorded output is a tag -> count dict.
tag_stats(training)

{'O': 285730,
 'B-TOOL': 2165,
 'B-UNS_METHOD': 372,
 'I-UNS_METHOD': 363,
 'I-TOOL': 764}

In [4]:
# Dataset schema shared by every split: a variable-length sequence of word strings
# and a sequence of class-encoded IOB tags. The order of the ClassLabel names fixes
# the integer id of each tag.
token_column = Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
tag_column = Sequence(feature=ClassLabel(names=["O", "B-UNS_METHOD", "I-UNS_METHOD", "B-TOOL", "I-TOOL"]))
features = Features({"tokens": token_column, "tags": tag_column})

In [5]:
# Wrap each full-text split in a Dataset that follows the shared `features` schema.
training_ds, validation_ds, test_ds = [
    Dataset.from_dict({"tokens": split['sentences'], "tags": split['tags']}, features=features)
    for split in (training, validation, test)
]

In [15]:
# Same wrapping for the softcite train / validation splits, using the same schema.
training_sc_ds, validation_sc_ds = [
    Dataset.from_dict({"tokens": split['sentences'], "tags": split['tags']}, features=features)
    for split in (softcite_train, softc_validation)
]

In [6]:
# Derive the tag vocabulary and both id <-> tag lookup tables from the dataset schema.
all_tags = training_ds.features["tags"].feature
tag_list = all_tags.names
id2tag = dict(enumerate(tag_list))
tag2id = {tag: idx for idx, tag in id2tag.items()}

In [7]:
# get the list of models to run (Hugging Face model ids; each one is fine-tuned in turn)
m_names = ['microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract']
# Upper bound on the tokenized sequence length: passed to the tokenizer as max_length
# with truncation enabled, so longer sentences are cut off.
MAX_LENGTH = 256

In [8]:
# from https://huggingface.co/docs/transformers/tasks/token_classification
def tokenize_and_align_labels(data, tknzr, max_length=50):
    """Tokenize pre-split sentences and project word-level tags onto sub-word tokens.

    Args:
        data: batch mapping with 'tokens' (one list of words per sentence) and
            'tags' (one list of tag ids per sentence, indexed by word position).
        tknzr: tokenizer callable. It is invoked with truncation and
            ``is_split_into_words=True``, and its result must expose
            ``word_ids(batch_index=...)`` and support item assignment.
        max_length: maximum tokenized length forwarded to the tokenizer.

    Returns:
        The tokenizer output with an added "labels" entry. Only the first
        sub-token of each word carries that word's tag; special tokens and
        continuation sub-tokens are set to -100.
    """
    encoded = tknzr(data['tokens'], truncation=True, is_split_into_words=True, max_length=max_length)

    masked = -100  # value given to every position that must not carry a label
    aligned = []
    for sentence_idx, word_tags in enumerate(data['tags']):
        # word id behind each token of this sentence (None for special tokens)
        current_ids = encoded.word_ids(batch_index=sentence_idx)
        # the same sequence shifted right by one, so each token can see its predecessor
        preceding_ids = [None] + list(current_ids[:-1])
        aligned.append([
            word_tags[cur] if cur is not None and cur != prev else masked
            for prev, cur in zip(preceding_ids, current_ids)
        ])

    encoded["labels"] = aligned
    return encoded

In [9]:
from sklearn.metrics import precision_recall_fscore_support

# Load the seqeval metric through the `evaluate` library; compute_metrics calls its
# .compute() to produce the per-entity-type and overall scores returned to the Trainer.
seqeval = evaluate.load("seqeval")

def flatten(l):
    """Concatenate the items of every sub-sequence in ``l`` into one flat list (one level deep)."""
    merged = []
    for chunk in l:
        merged.extend(chunk)
    return merged

def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are arg-maxed over
    axis 2 to get one tag id per position, and positions whose label is -100 are
    dropped before scoring.

    Side effect: prints a per-tag table of F1, precision and recall computed with
    scikit-learn over the flattened sequences.

    Returns:
        The result of ``seqeval.compute`` on the tag-name sequences.
    """
    raw_scores, gold_ids = p
    pred_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # keep only the positions that carry a real label
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([tag_list[pred] for pred, _ in kept])
        true_labels.append([tag_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # scikit-learn returns (precision, recall, f-score, support), one entry per tag in `labels`
    precision, recall, f1, _support = precision_recall_fscore_support(
        flatten(true_labels), flatten(true_predictions), labels=all_tags.names
    )
    per_tag = pd.DataFrame(
        list(zip(all_tags.names, f1, precision, recall)),
        columns = ['Level', 'F1-Score', 'Precision', 'Recall'],
    )
    print(per_tag)

    return results

In [10]:
# Fine-tune every model in m_names on the full-text gold standard and record its
# metrics on the held-out full-text test set, keyed by model name.
test_metrics = dict()

for m in m_names:
    print(m)
    tokenizer = AutoTokenizer.from_pretrained(m)
    # dynamically pad sentences to longest length in batch for efficiency
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Sub-word tokenization depends on the model's tokenizer, so the splits are re-encoded per model.
    tokenize_kwargs = {'tknzr': tokenizer, 'max_length': MAX_LENGTH}
    train_tokenized = training_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)
    val_tokenized = validation_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)
    test_tokenized = test_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)

    # Size the classification head from the tag list rather than a hard-coded 5, so it
    # cannot drift out of sync with the ClassLabel names behind id2tag / tag2id.
    model = AutoModelForTokenClassification.from_pretrained(
        m, num_labels=len(tag_list), id2label=id2tag, label2id=tag2id
    )

    # No metric_for_best_model is set, so load_best_model_at_end restores the
    # checkpoint with the lowest validation loss (the transformers default), which
    # is not necessarily the epoch with the best F1.
    training_args = TrainingArguments(
        output_dir="model/" + m,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Final score on the test split, which is never used during training or checkpoint selection.
    test_preds = trainer.predict(test_tokenized)
    test_metrics[m] = test_preds.metrics


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract


Map:   0%|          | 0/10916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1948 [00:00<?, ? examples/s]

Map:   0%|          | 0/2953 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Tool,Uns Method,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,0.0175,0.006187,"{'precision': 0.691358024691358, 'recall': 0.8615384615384616, 'f1': 0.7671232876712328, 'number': 65}","{'precision': 0.7543859649122807, 'recall': 0.7413793103448276, 'f1': 0.7478260869565219, 'number': 58}",0.717391,0.804878,0.758621,0.998299
2,0.0055,0.004765,"{'precision': 0.7777777777777778, 'recall': 0.8615384615384616, 'f1': 0.8175182481751826, 'number': 65}","{'precision': 0.8035714285714286, 'recall': 0.7758620689655172, 'f1': 0.7894736842105263, 'number': 58}",0.789062,0.821138,0.804781,0.998774
3,0.0036,0.00464,"{'precision': 0.8243243243243243, 'recall': 0.9384615384615385, 'f1': 0.8776978417266187, 'number': 65}","{'precision': 0.7931034482758621, 'recall': 0.7931034482758621, 'f1': 0.7931034482758621, 'number': 58}",0.810606,0.869919,0.839216,0.998833


          Level  F1-Score  Precision    Recall
0             O  0.999205   0.999424  0.998987
1  B-UNS_METHOD  0.796460   0.818182  0.775862
2  I-UNS_METHOD  0.813559   0.750000  0.888889
3        B-TOOL  0.777778   0.708861  0.861538
4        I-TOOL  0.835165   0.826087  0.844444




          Level  F1-Score  Precision    Recall
0             O  0.999484   0.999444  0.999523
1  B-UNS_METHOD  0.814159   0.836364  0.793103
2  I-UNS_METHOD  0.821429   0.793103  0.851852
3        B-TOOL  0.852941   0.816901  0.892308
4        I-TOOL  0.860759   1.000000  0.755556




          Level  F1-Score  Precision    Recall
0             O  0.999503   0.999642  0.999364
1  B-UNS_METHOD  0.807018   0.821429  0.793103
2  I-UNS_METHOD  0.809917   0.731343  0.907407
3        B-TOOL  0.877698   0.824324  0.938462
4        I-TOOL  0.928571   1.000000  0.866667




          Level  F1-Score  Precision    Recall
0             O  0.998397   0.998939  0.997855
1  B-UNS_METHOD  0.729064   0.691589  0.770833
2  I-UNS_METHOD  0.705882   0.613636  0.830769
3        B-TOOL  0.833333   0.785714  0.887097
4        I-TOOL  0.751724   0.721854  0.784173


In [11]:
# Test-set metrics per model for the run trained on the full-text gold standard.
test_metrics

{'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract': {'test_loss': 0.018545329570770264,
  'test_TOOL': {'precision': 0.7790368271954674,
   'recall': 0.8870967741935484,
   'f1': 0.8295625942684767,
   'number': 310},
  'test_UNS_METHOD': {'precision': 0.6548672566371682,
   'recall': 0.7708333333333334,
   'f1': 0.7081339712918661,
   'number': 96},
  'test_overall_precision': 0.7489270386266095,
  'test_overall_recall': 0.8596059113300493,
  'test_overall_f1': 0.8004587155963303,
  'test_overall_accuracy': 0.9966448834472571,
  'test_runtime': 11.9102,
  'test_samples_per_second': 247.938,
  'test_steps_per_second': 31.066}}

In [16]:
# softcite run
# Same fine-tuning recipe, but trained and validated on the softcite gold standard
# and then scored on the *full-text* test set so both runs are compared on the same data.
# NOTE(review): the recorded validation table for this run only has a Tool column,
# while the full-text test set also contains UNS_METHOD entities. The recorded
# test_UNS_METHOD scores are all 0.0, which pulls down the overall test scores —
# keep this in mind when comparing the two runs.
test_sc_metrics = dict()

for m in m_names:
    print(m)
    tokenizer = AutoTokenizer.from_pretrained(m)
    # dynamically pad sentences to longest length in batch for efficiency
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Sub-word tokenization depends on the model's tokenizer, so the splits are re-encoded per model.
    tokenize_kwargs = {'tknzr': tokenizer, 'max_length': MAX_LENGTH}
    train_sc_tokenized = training_sc_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)
    val_sc_tokenized = validation_sc_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)
    test_tokenized = test_ds.map(tokenize_and_align_labels, batched=True, fn_kwargs=tokenize_kwargs)

    # Size the classification head from the tag list rather than a hard-coded 5, so it
    # cannot drift out of sync with the ClassLabel names behind id2tag / tag2id.
    model = AutoModelForTokenClassification.from_pretrained(
        m, num_labels=len(tag_list), id2label=id2tag, label2id=tag2id
    )

    # Write checkpoints to a softcite-specific directory: the full-text run already
    # uses "model/" + m, and sharing it would mix checkpoints from both experiments.
    # No metric_for_best_model is set, so load_best_model_at_end restores the
    # checkpoint with the lowest validation loss (the transformers default).
    training_args = TrainingArguments(
        output_dir="model/softcite/" + m,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_sc_tokenized,
        eval_dataset=val_sc_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score on the full-text test split (not a softcite one) for a like-for-like comparison.
    test_preds_sc = trainer.predict(test_tokenized)
    test_sc_metrics[m] = test_preds_sc.metrics

microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract


Map:   0%|          | 0/1696 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Map:   0%|          | 0/2953 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Tool,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,No log,0.021887,"{'precision': 0.670995670995671, 'recall': 0.748792270531401, 'f1': 0.7077625570776256, 'number': 207}",0.670996,0.748792,0.707763,0.993733
2,No log,0.025857,"{'precision': 0.7549019607843137, 'recall': 0.7439613526570048, 'f1': 0.7493917274939172, 'number': 207}",0.754902,0.743961,0.749392,0.994142
3,0.036100,0.027507,"{'precision': 0.7450980392156863, 'recall': 0.7342995169082126, 'f1': 0.7396593673965937, 'number': 207}",0.745098,0.7343,0.739659,0.994096


          Level  F1-Score  Precision    Recall
0             O  0.996963   0.996734  0.997193
1  B-UNS_METHOD  0.000000   0.000000  0.000000
2  I-UNS_METHOD  0.000000   0.000000  0.000000
3        B-TOOL  0.815851   0.788288  0.845411
4        I-TOOL  0.551724   0.666667  0.470588


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


          Level  F1-Score  Precision    Recall
0             O  0.997081   0.996050  0.998113
1  B-UNS_METHOD  0.000000   0.000000  0.000000
2  I-UNS_METHOD  0.000000   0.000000  0.000000
3        B-TOOL  0.826406   0.836634  0.816425
4        I-TOOL  0.538462   0.777778  0.411765


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


          Level  F1-Score  Precision    Recall
0             O  0.997080   0.996141  0.998021
1  B-UNS_METHOD  0.000000   0.000000  0.000000
2  I-UNS_METHOD  0.000000   0.000000  0.000000
3        B-TOOL  0.826406   0.836634  0.816425
4        I-TOOL  0.537313   0.734694  0.423529


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


          Level  F1-Score  Precision    Recall
0             O  0.995815   0.995062  0.996569
1  B-UNS_METHOD  0.000000   0.000000  0.000000
2  I-UNS_METHOD  0.000000   0.000000  0.000000
3        B-TOOL  0.513630   0.462532  0.577419
4        I-TOOL  0.314050   0.368932  0.273381


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Test-set metrics per model for the run trained on the softcite gold standard
# (scored on the same full-text test set as test_metrics).
test_sc_metrics

{'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract': {'test_loss': 0.036947280168533325,
  'test_TOOL': {'precision': 0.4146341463414634,
   'recall': 0.5483870967741935,
   'f1': 0.4722222222222222,
   'number': 310},
  'test_UNS_METHOD': {'precision': 0.0,
   'recall': 0.0,
   'f1': 0.0,
   'number': 96},
  'test_overall_precision': 0.4146341463414634,
  'test_overall_recall': 0.4187192118226601,
  'test_overall_f1': 0.41666666666666663,
  'test_overall_accuracy': 0.9916748040762162,
  'test_runtime': 11.7916,
  'test_samples_per_second': 250.433,
  'test_steps_per_second': 31.378}}