https://huggingface.co/d4data/biomedical-ner-all
https://huggingface.co/datasets/singh-aditya/MACCROBAT_biomedical_ner

https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/
https://medium.com/@minhle_0210/pos-tagging-medical-ner-ffcdaef7a7b3
https://github.com/dreji18/Bio-Epidemiology-NER

https://huggingface.co/distilbert/distilbert-base-uncased
https://huggingface.co/google-bert/bert-base-uncased
https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
https://huggingface.co/Charangan/MedBERT

https://wandb.ai/jack-morris/david-vs-goliath/reports/Does-Model-Size-Matter-A-Comparison-of-BERT-and-DistilBERT--VmlldzoxMDUxNzU#:~:text=The%20BERT%20authors%20recommend%20fine,5e%2D5%2C%203e%2D5
https://datascience.stackexchange.com/questions/64583/what-are-the-good-parameter-ranges-for-bert-hyperparameters-while-finetuning-it

In [3]:
# Load the published biomedical NER checkpoint together with its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

# Use the first GPU when CUDA is available; -1 selects the CPU so this cell
# also runs on machines without a GPU instead of failing on device=0.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    aggregation_strategy="max",
)

text = """The patient reported no recurrence of palpitations at follow-up 6 months after the ablation."""

out = pipe(text)
print(out)

# Turn every detected entity into a (start, end, label) tuple for the renderer.
spans = [(row["start"], row["end"], row["entity_group"]) for row in out]

# Alternative plain-text renderer: show_span_ascii_markup(text, spans)
show_span_box_markup(text, spans)

[{'entity_group': 'Sign_symptom', 'score': 0.9999311, 'word': 'palpitations', 'start': 38, 'end': 50}, {'entity_group': 'Clinical_event', 'score': 0.99975544, 'word': 'follow', 'start': 54, 'end': 60}, {'entity_group': 'Date', 'score': 0.999867, 'word': '6 months after', 'start': 64, 'end': 78}]


In [1]:
import warnings

import torch
from torch.utils.data import random_split

from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers.utils.logging import set_verbosity_error

from ipymarkup import show_span_ascii_markup, show_span_box_markup

import mrb_ner_config
from mrb_ner_dataset import MRBNERDataset
from mrb_ner_evaluator import MRBNEREvaluator

# Suppress every Python warning from this point on (presumably to keep the
# notebook output readable -- note this also hides deprecation warnings).
warnings.filterwarnings("ignore")
# Restrict transformers' own logging to error-level messages.
set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub identifier of the dataset used for fine-tuning and evaluation.
DATA_PATH = "singh-aditya/MACCROBAT_biomedical_ner"
data = load_dataset(path=DATA_PATH)
# Uncomment to inspect the first training example:
# print(data["train"]["tokens"][0])
# print(data["train"]["ner_labels"][0])

In [3]:
# Label strings declared on the "ner_labels" feature of the training split.
label_names = data["train"].features["ner_labels"].feature.names

# Two-way lookup between integer class ids and label strings.
id2label = dict(enumerate(label_names))
label2id = {name: class_id for class_id, name in id2label.items()}

# Sanity check (both maps should have the same size):
# print(len(label2id))
# print(len(id2label))

In [4]:
# Partition data["train"] into 70% train / 15% validation / 15% test.
# The generator is seeded so the partition is identical on every run;
# without it each notebook run compares models on different subsets.
SPLIT_SEED = 42
data_splits = {}
data_splits["train"], data_splits["val"], data_splits["test"] = random_split(
    data["train"],
    [0.7, 0.15, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [5]:
# Metrics helper shared by all runs; its compute_metrics method is passed to
# each Trainer in the training loop.
evaluator = MRBNEREvaluator(
    metric="seqeval",
    id2label=id2label,
)

In [6]:
# Runtime and optimisation settings shared by every training run.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
BATCH_SIZE = 8
EPOCHS = 1
WDECAY = 0.01
# Learning rates tried for each checkpoint.
LRATES = [5e-5, 4e-5, 3e-5, 2e-5]

# One record per candidate checkpoint. "save_trainer", "test_metrics" and
# "param" start as None and are filled in by the training loop with the best
# run found for that checkpoint.
MODELS = [
    {
        "name": name,
        "path": path,
        "save_trainer": None,
        "test_metrics": None,
        "param": None,
    }
    for name, path in (
        ("distilbert-base-uncased", "distilbert/distilbert-base-uncased"),
        ("biobert", "dmis-lab/biobert-v1.1"),
        ("bio-clinical-bert", "emilyalsentzer/Bio_ClinicalBERT"),
        ("medbert", "Charangan/MedBERT"),
    )
]

In [7]:
# Fine-tune every candidate checkpoint once per learning rate, remember the
# run with the highest overall test accuracy for each checkpoint, then print
# that run's metrics and save its model.
# NOTE(review): the best learning rate is picked using test-split accuracy, so
# the reported test metrics are optimistic. Consider selecting on the
# validation split and evaluating on the test split only for the chosen run.
for m in MODELS:
    # The tokenizer, collator and dataset wrappers depend only on the
    # checkpoint, so build them once per model instead of once per learning rate.
    tokenizer = AutoTokenizer.from_pretrained(m["path"])
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    datasets = {
        split: MRBNERDataset(
            data=data_splits[split],
            tokenizer=tokenizer,
            id2label=id2label,
            label2id=label2id,
            max_len=512,
        )
        for split in ("train", "val", "test")
    }

    for lr in LRATES:
        print(f"\n---------------------- Training Model (Model:{m['name']}, Lr:{lr}) ----------------------")

        # Reload the checkpoint for every learning rate so each run starts
        # from the same pretrained weights rather than the previous run's.
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=m["path"],
            label2id=label2id,
            id2label=id2label,
            ignore_mismatched_sizes=True,
            num_labels=len(label2id)
        )

        if DEVICE == "cuda":
            torch.cuda.empty_cache()

        training_args = TrainingArguments(
            output_dir=f"../output/{m['name']}/{lr}/cp",
            overwrite_output_dir=True,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=EPOCHS,
            learning_rate=lr,
            weight_decay=WDECAY,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=datasets["train"],
            eval_dataset=datasets["val"],
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=evaluator.compute_metrics
        )

        trainer.train()
        predictions, _, metrics = trainer.predict(datasets["test"])

        # Keep this run if it is the first one or beats the best so far.
        if m["test_metrics"] is None or m["test_metrics"]["test_overall_accuracy"] < metrics["test_overall_accuracy"]:
            best_accuracy = m['test_metrics']['test_overall_accuracy'] if m['test_metrics'] else None
            print(f"Updating best accuracy: {best_accuracy} -> {metrics['test_overall_accuracy']}")
            m["save_trainer"] = trainer
            m["test_metrics"] = metrics
            m["param"] = lr

    # Report and persist the best run found for this checkpoint.
    print("Saved Model Metrics:")
    for key, value in m["test_metrics"].items():
        print(f"{key}: {value}")
    m["save_trainer"].save_model(f"../output/{m['name']}/save_model")

---------------------- Training Model (Model:distilbert-base-uncased, Lr:5e-05) ----------------------
{'eval_loss': 2.4831795692443848, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.0, 'eval_runtime': 0.536, 'eval_samples_per_second': 55.97, 'eval_steps_per_second': 7.463, 'epoch': 1.0}
{'train_runtime': 7.416, 'train_samples_per_second': 18.878, 'train_steps_per_second': 2.427, 'train_loss': 2.9876526726616754, 'epoch': 1.0}
Updating best accuracy: None -> 0.0
---------------------- Training Model (Model:distilbert-base-uncased, Lr:4e-05) ----------------------
{'eval_loss': 2.4635941982269287, 'eval_overall_precision': 0.0, 'eval_overall_recall': 0.0, 'eval_overall_f1': 0.0, 'eval_overall_accuracy': 0.0, 'eval_runtime': 0.541, 'eval_samples_per_second': 55.453, 'eval_steps_per_second': 7.394, 'epoch': 1.0}
{'train_runtime': 6.71, 'train_samples_per_second': 20.864, 'train_steps_per_second': 2.683, 'train_loss': 2.904073

: 

In [31]:
# Run the current `model` / `tokenizer` (presumably the ones left over from
# the training loop -- confirm) on a sample sentence and render the entities.
# DEVICE decides the placement: GPU 0 under CUDA, otherwise -1 (CPU), so the
# cell no longer fails on machines without a GPU.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=0 if DEVICE == "cuda" else -1,
    aggregation_strategy="max",
)
text = """The patient reported no recurrence of palpitations at follow-up 6 months after the ablation."""

out = pipe(text)
print(out)

# Turn every detected entity into a (start, end, label) tuple for the renderer.
spans = [(row["start"], row["end"], row["entity_group"]) for row in out]

show_span_box_markup(text, spans)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'SIGN_SYMPTOM', 'score': 0.49153978, 'word': 'recurrence', 'start': 24, 'end': 34}, {'entity_group': 'SIGN_SYMPTOM', 'score': 0.702181, 'word': 'palpitations', 'start': 38, 'end': 50}, {'entity_group': 'DATE', 'score': 0.46011654, 'word': '6 months after', 'start': 64, 'end': 78}]
