In [1]:
# [Cell 1]
import ipywidgets as widgets
from IPython.display import display, HTML
from spacy import displacy

# Import your custom modules
from config import COLORS
from ner_model import TransformerNERWithCRF
from main import (
    predict_with_custom_model,
    predict_with_pipeline,
    merge_dosages,
    # Import your newly refactored evaluation function
    run_evaluation
)
import sys
# Record which interpreter the kernel is running so the environment is
# visible in the saved cell output.
print(sys.executable)
print(sys.version)

# The notebook requires this exact major.minor Python release.
required = (3, 11)
# sys.version_info is a 5-field tuple (major, minor, micro, releaselevel,
# serial). Comparing it whole against a 2-tuple is never equal, which made
# the check fail even on Python 3.11, so compare only major.minor.
if sys.version_info[:2] != required:
    raise RuntimeError(
        f"Python {required[0]}.{required[1]} required. "
        f"You are running {sys.version}"
    )

C:\Users\Denden\AppData\Local\Programs\Python\Python312\python.exe
3.12.3 (tags/v3.12.3:f6650f9, Apr  9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)]


RuntimeError: Python 3.11 required. You are running 3.12.3 (tags/v3.12.3:f6650f9, Apr  9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)]

# Biomedical Named Entity Recognition (BioNER): Baseline Comparison & Fine Tuning

In this notebook, we explore the performance of various models on the BC5CDR dataset, a benchmark for biomedical named entity recognition (BioNER). We compare several BERT-family models, with a particular focus on the `tner/roberta-large-bc5cdr` model as our gold standard. Finally, we present the results of our own fine-tuned model and discuss its performance relative to the baselines. We wanted to see how close we could get to the state of the art (SOTA), and which other baseline models could be competitive with the `tner` model, a large, fine-tuned transformer specifically optimized for this task.

## Project Overview & Methodology

### 1. The Baselines
Named entity recognition in the biomedical domain is notoriously tricky due to nested entities, multi-word structures, and heavy use of domain-specific jargon. To understand our model's true performance, we evaluate it against the following baselines:
* **RoBERTa**: https://huggingface.co/FacebookAI/roberta-base
* **BioBERT:** https://huggingface.co/dmis-lab/biobert-base-cased-v1.2
* **SciBERT:** https://huggingface.co/allenai/scibert_scivocab_uncased
* **PubMedBERT:** https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract
* **BiomedRoBERTa:** https://huggingface.co/allenai/biomed_roberta_base

### 2. The Gold Standard: `tner/roberta-large-bc5cdr`
This model serves as our primary ceiling. Fine-tuned extensively on the BC5CDR dataset by the T-NER library, it represents a highly optimized, SOTA approach for this specific task.

Self-reported scores:
* **F1 Score (micro):** ~0.884
* **Chemical Entity F1:** ~0.925
* **Disease Entity F1:** ~0.833

Scores we actually measured:
* **F1 Score (micro):** ~0.922
* **Chemical Entity F1:** ~0.930
* **Disease Entity F1:** ~0.913

### 3. Our Fine-Tuned Model
*So, how close did we get?* We took `roberta-base` and fine-tuned it on the same BC5CDR dataset. Our goal was to see if we could close the gap with the much larger `tner` model (at least 4 times larger). This is more of an exercise in understanding the fine-tuning process and the impact of domain-specific pre-training than in chasing the highest score. From the outset we knew the `tner` model would be hard to beat, but we wanted to give it our best try.

### 4. The Infrastructure
We used the Hugging Face Transformers library, but had to build our own infrastructure so that we could easily swap in different models and compare their performance on the same dataset. We also implemented a custom evaluation pipeline to ensure that performance is measured consistently across all models. Additionally, we implemented a config file that lets us turn extra layers, such as a CRF, on or off, and an automatic report generator that takes the results of the current run and produces an HTML report that can be shared with others and easily compared with previous runs. Each report contains a random sample of test sentences as well as specific custom sentences. Later in this notebook you can generate your own report and test custom sentences!


In [None]:
# [Cell 2]
import inspect

# Echo the model class's source code into the cell output so readers can
# review the implementation without opening the module that defines it.
print(
    "Our custom backbone-agnostic Transformer + CRF implementation:",
    60 * "-",
    sep="\n",
)
print(inspect.getsource(TransformerNERWithCRF))

In [None]:
# [Cell 3]
# Dropdown label -> model identifier for each selectable backbone.
model_options = {
    "RoBERTa Base": "roberta-base",
    "BioBERT Base": "dmis-lab/biobert-base-cased-v1.2",
    "SciBERT": "allenai/scibert_scivocab_uncased",
    "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    "Biomed RoBERTa": "allenai/biomed_roberta_base",
}

# Reference checkpoint handed to run_evaluation alongside the selected model.
PRETRAINED_BASELINE = "tner/roberta-large-bc5cdr"

# Model picker; the initial selection is the Biomed RoBERTa entry.
dropdown = widgets.Dropdown(
    options=model_options,
    value=model_options["Biomed RoBERTa"],
    description="Select Model:",
)

# Area that collects everything printed while a report is generated, and the
# button that starts the run.
output = widgets.Output()
button = widgets.Button(button_style="success", description="Generate Report")

def on_button_clicked(b):
    """Click handler for the "Generate Report" button.

    Clears the `output` widget, reads the model currently selected in
    `dropdown`, and calls `run_evaluation` with that model and
    `PRETRAINED_BASELINE`. Progress messages are printed inside the `output`
    widget's context.

    Args:
        b: Argument supplied by the click callback; not used here.
    """
    with output:
        output.clear_output()
        model_id = dropdown.value
        loading_msg = f"Loading pipeline for {model_id}... This may take a moment."
        print(loading_msg)

        run_evaluation(model_id, PRETRAINED_BASELINE)

        done_msg = f"Report successfully generated for {model_id}!"
        print(done_msg)


# Register the handler, then show the controls together with the output area.
button.on_click(on_button_clicked)
display(dropdown, button, output)

In [None]:
# [Cell 4]
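# NOTE: model_ours, tokenizer_ours, and pipe_pre must already exist in the kernel
# before the button below is clicked. None of the cells shown above define them,
# so load them in an earlier cell; otherwise the click handler fails with a NameError.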

# Free-text input, pre-filled with an example sentence.
text_area = widgets.Textarea(
    description="Text:",
    placeholder="Enter clinical text here...",
    value="The patient was prescribed 50mg of Aspirin for the severe headache.",
    layout=widgets.Layout(height="80px", width="80%"),
)

# Area that receives the rendered entities, and the button that triggers
# extraction.
eval_output = widgets.Output()
eval_button = widgets.Button(button_style="info", description="Extract Entities")

def on_eval_clicked(b):
    """Click handler for the "Extract Entities" button.

    Takes the current contents of `text_area`, gets predictions from the
    custom model (`model_ours` / `tokenizer_ours`) and from the pretrained
    pipeline (`pipe_pre`), and renders each set with displaCy inside the
    `eval_output` widget. Predictions go through `merge_dosages` before
    rendering.

    Args:
        b: Argument supplied by the click callback; not used here.
    """

    def show_entities(heading, predictions, source_text, render_options):
        # Print the section heading, then draw the entities in manual mode
        # (a plain dict of text + entity spans rather than a spaCy Doc).
        print(heading)
        rendered_doc = {
            "text": source_text,
            "ents": merge_dosages(predictions, source_text),
        }
        displacy.render(
            rendered_doc,
            style="ent",
            manual=True,
            options=render_options,
            jupyter=True,
        )

    with eval_output:
        eval_output.clear_output()
        text = text_area.value
        opts = {"colors": COLORS}

        # 1. Custom CRF model
        show_entities(
            "Our Custom CRF Model:",
            predict_with_custom_model(text, model_ours, tokenizer_ours),
            text,
            opts,
        )

        # 2. Pretrained baseline
        show_entities(
            "\nPretrained Baseline:",
            predict_with_pipeline(text, pipe_pre),
            text,
            opts,
        )


# Register the handler, then show the input, the button, and the output area.
eval_button.on_click(on_eval_clicked)
display(text_area, eval_button, eval_output)