# Named Entity Recognition (NER) – English Stroke Report Prototype

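This notebook is a minimal working example of an NER pipeline on one synthetic, hand-labelled English sentence relevant to stroke MRI reports. It uses `transformers`, `datasets`, and `evaluate` (the `seqeval` metric) with the `bert-base-cased` model: the sentence is tokenized, its BIO tags are aligned to sub-word tokens, and a single forward pass is run. No fine-tuning is performed, so the token-classification head is randomly initialized and the printed predictions only demonstrate the plumbing, not model quality.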

In [None]:
# Install dependencies (run once)
!pip install transformers datasets evaluate torch --quiet

In [None]:
import evaluate
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# Name of the pretrained checkpoint; the same value is passed to the tokenizer here
# and to AutoModelForTokenClassification.from_pretrained in the model-loading cell.
model_checkpoint = "bert-base-cased"
# Tokenizer for that checkpoint. The alignment code later calls `.word_ids()` on its output,
# which presumably requires the fast (Rust-backed) tokenizer that AutoTokenizer returns by default.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Synthetic sample: one sentence written as (word, BIO tag) pairs so each word sits next to its label.
_annotated_sentence = [
    ("The", "O"),
    ("patient", "O"),
    ("received", "O"),
    ("10", "B-DOSE"),
    ("mg", "I-DOSE"),
    ("rtPA", "B-DRUG"),
    ("under", "O"),
    ("general", "B-ANESTH"),
    ("anesthesia", "I-ANESTH"),
    (".", "O"),
]

# Split the pairs back into the parallel "tokens" / "ner_tags" lists used downstream.
examples = [
    {
        "tokens": [word for word, _ in _annotated_sentence],
        "ner_tags": [tag for _, tag in _annotated_sentence],
    }
]

# Wrap the list of example dicts in a `datasets.Dataset` so it can be processed with `.map` in a later cell.
ner_dataset = Dataset.from_list(examples)

In [None]:
# Tag vocabulary plus the two lookup tables between tag names and integer class ids.
# A tag's id is its position in `label_list`.
label_list = ["O", "B-DOSE", "I-DOSE", "B-DRUG", "B-ANESTH", "I-ANESTH"]
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

In [None]:
# Tokenize and align labels
def tokenize_and_align_labels(example, label_all_tokens=False):
    """Tokenize one pre-split example and align its word-level tags to sub-word tokens.

    Args:
        example: Mapping with "tokens" (list of words) and "ner_tags" (list of tag
            strings, one per word, each a key of ``label2id``).
        label_all_tokens: If False (default), only the first sub-token of each word
            carries the word's label and continuation sub-tokens get -100. If True,
            every sub-token repeats the word's label (the previous behaviour of this
            function). Repeating a ``B-*`` tag on continuation pieces makes each piece
            look like the start of a new entity, which is why it is off by default.

    Returns:
        The tokenizer output with an added "labels" entry: one integer per token,
        where -100 marks positions to be ignored (the default ``ignore_index`` of
        PyTorch's cross-entropy loss).

    Raises:
        KeyError: If a tag in ``example["ner_tags"]`` is not present in ``label2id``.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens ([CLS], [SEP], ...) do not belong to any word.
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # First sub-token of a word (or every sub-token when explicitly requested).
            label_ids.append(label2id[example["ner_tags"][word_idx]])
        else:
            # Continuation sub-token of the same word: mask it out.
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Run the alignment function over the dataset. `map` is called without `batched=True`, so the function
# receives one example dict at a time, which is the input shape `tokenize_and_align_labels` is written for.
tokenized_dataset = ner_dataset.map(tokenize_and_align_labels)

In [None]:
# Load the pretrained encoder with a token-classification head sized to our tag set.
# The id/label maps are passed through so they are stored on the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Evaluate (dummy run on one example)
# NOTE: the classification head has not been trained in this notebook, so the predictions
# below only exercise the pipeline end to end.
metric = evaluate.load("seqeval")

sample = tokenized_dataset[0]
# Dataset rows come back as plain Python lists; wrap each in an outer list and convert
# to a tensor so the model receives a (batch=1, seq_len) input.
model_inputs = {k: torch.tensor([sample[k]]) for k in ["input_ids", "attention_mask"]}

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**model_inputs)
logits = outputs.logits
predictions = logits.argmax(dim=-1)
labels = sample["labels"]

# Map back to labels.
# The -100 mask lives in `labels` (argmax output is never -100), so both lists are filtered
# by the gold label at each position to keep them the same length and aligned.
predicted_labels = [
    id2label[int(pred_id)]
    for pred_id, gold_id in zip(predictions[0], labels)
    if gold_id != -100
]
true_labels = [id2label[int(gold_id)] for gold_id in labels if gold_id != -100]

print("PRED:", predicted_labels)
print("TRUE:", true_labels)

# seqeval expects a list of sequences; zero_division=0 avoids warnings when no entity is predicted.
results = metric.compute(predictions=[predicted_labels], references=[true_labels], zero_division=0)
print(
    "SEQEVAL:",
    {k: results[k] for k in ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")},
)