In [5]:
import pandas as pd
import re

# Load your MIMIC-IV discharge notes dataset
df = pd.read_csv('data/discharge_journal_df.csv')

In [6]:
df.text.iloc[0]

'Subject ID: 10000032, HAdm ID: 22595853, Chart Time: 2180-05-07 00:00:00\n\n \nName:  ___                     Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   F\n \nService: MEDICINE\n \nAllergies: \nNo Known Allergies / Adverse Drug Reactions\n \nAttending: ___\n \nChief Complaint:\nWorsening ABD distension and pain \n \nMajor Surgical or Invasive Procedure:\nParacentesis\n\n \nHistory of Present Illness:\n___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, \nbioplar, PTSD, presented from OSH ED with worsening abd \ndistension over past week.  \nPt reports self-discontinuing lasix and spirnolactone ___ weeks \nago, because she feels like "they don\'t do anything" and that \nshe "doesn\'t want to put more chemicals in her." She does not \nfollow Na-restricted diets. In the past week, she notes that she \nhas been having worsening abd distension and discomfort. She \ndenies ___ edema, or SOB, or orthopnea. She 

In [7]:

# Basic text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['cleaned_notes'] = df['text'].apply(clean_text)

# Pre-trained NER Model (SpaCy)

In [8]:
import spacy

# Load pre-trained NER model
nlp = spacy.load("en_core_web_sm")

# Apply the model to your cleaned discharge notes
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['entities'] = df['cleaned_notes'].apply(extract_entities)


In [15]:
# Extract all labels from the annotated sentences
all_labels = [label for sentence in df['entities'] for _, label in sentence]

# Get unique labels
unique_labels = set(all_labels)

# Count the number of unique labels
NUMBER_OF_LABELS = len(unique_labels)

print(f"Number of labels: {NUMBER_OF_LABELS}")
print(f"Unique labels: {unique_labels}")

Number of labels: 18
Unique labels: {'LANGUAGE', 'PRODUCT', 'LAW', 'EVENT', 'ORDINAL', 'MONEY', 'PERSON', 'CARDINAL', 'NORP', 'LOC', 'WORK_OF_ART', 'TIME', 'PERCENT', 'FAC', 'DATE', 'ORG', 'QUANTITY', 'GPE'}


In [18]:
from pprint import pprint

pprint(df.iloc[0]['cleaned_notes'])

('subject id 10000032 hadm id 22595853 chart time 21800507 000000 name ___ '
 'unit no ___ admission date ___ discharge date ___ date of birth ___ sex f '
 'service medicine allergies no known allergies  adverse drug reactions '
 'attending ___ chief complaint worsening abd distension and pain major '
 'surgical or invasive procedure paracentesis history of present illness ___ '
 'hcv cirrhosis cb ascites hiv on art ho ivdu copd bioplar ptsd presented from '
 'osh ed with worsening abd distension over past week pt reports '
 'selfdiscontinuing lasix and spirnolactone ___ weeks ago because she feels '
 'like they dont do anything and that she doesnt want to put more chemicals in '
 'her she does not follow narestricted diets in the past week she notes that '
 'she has been having worsening abd distension and discomfort she denies ___ '
 'edema or sob or orthopnea she denies fcnv dc dysuria she had food poisoning '
 'a week ago from eating stale cake nv 20 min after food ingestion which 

In [19]:
pprint(df.iloc[0]['entities'])

[('22595853', 'DATE'),
 ('21800507 000000', 'DATE'),
 ('sex f service', 'PERSON'),
 ('cb ascites hiv', 'ORG'),
 ('ho ivdu', 'PERSON'),
 ('past week', 'DATE'),
 ('weeks ago', 'DATE'),
 ('the past week', 'DATE'),
 ('a week ago', 'DATE'),
 ('20', 'CARDINAL'),
 ('recent weeks', 'DATE'),
 ('melena', 'ORG'),
 ('hematuria', 'GPE'),
 ('984', 'CARDINAL'),
 ('70 10663 16', 'DATE'),
 ('tbili16', 'PERSON'),
 ('5k', 'CARDINAL'),
 ('77', 'CARDINAL'),
 ('16', 'CARDINAL'),
 ('1', 'CARDINAL'),
 ('2', 'CARDINAL'),
 ('3', 'CARDINAL'),
 ('4', 'CARDINAL'),
 ('dr ___', 'PERSON'),
 ('5', 'CARDINAL'),
 ('6', 'CARDINAL'),
 ('this year 8', 'DATE'),
 ('9', 'CARDINAL'),
 ('15 mm', 'QUANTITY'),
 ('10', 'CARDINAL'),
 ('11', 'CARDINAL'),
 ('12', 'CARDINAL'),
 ('five', 'CARDINAL'),
 ('one', 'CARDINAL'),
 ('two months ago', 'DATE'),
 ('years ago', 'DATE'),
 ('a couple of years ago', 'DATE'),
 ('981 10761 78 18', 'DATE'),
 ('ctab', 'PERSON'),
 ('neuro aao3', 'PERSON'),
 ('3', 'CARDINAL'),
 ('5 minutes', 'TIME'),
 ('98 

In [45]:
df.text.iloc[0]

'Subject ID: 10000032, HAdm ID: 22595853, Chart Time: 2180-05-07 00:00:00\n\n \nName:  ___                     Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   F\n \nService: MEDICINE\n \nAllergies: \nNo Known Allergies / Adverse Drug Reactions\n \nAttending: ___\n \nChief Complaint:\nWorsening ABD distension and pain \n \nMajor Surgical or Invasive Procedure:\nParacentesis\n\n \nHistory of Present Illness:\n___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, \nbioplar, PTSD, presented from OSH ED with worsening abd \ndistension over past week.  \nPt reports self-discontinuing lasix and spirnolactone ___ weeks \nago, because she feels like "they don\'t do anything" and that \nshe "doesn\'t want to put more chemicals in her." She does not \nfollow Na-restricted diets. In the past week, she notes that she \nhas been having worsening abd distension and discomfort. She \ndenies ___ edema, or SOB, or orthopnea. She 

SpaCy is not very good.

## BioMedical NER

In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the BioBERT NER tokenizer and model
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Check the number of labels
num_labels = model.config.num_labels
print(f"Number of labels: {num_labels}")

# Load the model's config to get the labels
label_dict = model.config.id2label
print(f"Labels: {label_dict}")


Number of labels: 84
Labels: {0: 'O', 1: 'B-Activity', 2: 'B-Administration', 3: 'B-Age', 4: 'B-Area', 5: 'B-Biological_attribute', 6: 'B-Biological_structure', 7: 'B-Clinical_event', 8: 'B-Color', 9: 'B-Coreference', 10: 'B-Date', 11: 'B-Detailed_description', 12: 'B-Diagnostic_procedure', 13: 'B-Disease_disorder', 14: 'B-Distance', 15: 'B-Dosage', 16: 'B-Duration', 17: 'B-Family_history', 18: 'B-Frequency', 19: 'B-Height', 20: 'B-History', 21: 'B-Lab_value', 22: 'B-Mass', 23: 'B-Medication', 24: 'B-Non[biological](Detailed_description', 25: 'B-Nonbiological_location', 26: 'B-Occupation', 27: 'B-Other_entity', 28: 'B-Other_event', 29: 'B-Outcome', 30: 'B-Personal_[back](Biological_structure', 31: 'B-Personal_background', 32: 'B-Qualitative_concept', 33: 'B-Quantitative_concept', 34: 'B-Severity', 35: 'B-Sex', 36: 'B-Shape', 37: 'B-Sign_symptom', 38: 'B-Subject', 39: 'B-Texture', 40: 'B-Therapeutic_procedure', 41: 'B-Time', 42: 'B-Volume', 43: 'B-Weight', 44: 'I-Activity', 45: 'I-Admin

In [41]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[{'entity_group': 'Sign_symptom',
  'score': 0.9999311,
  'word': 'pal',
  'start': 38,
  'end': 41},
 {'entity_group': 'Sign_symptom',
  'score': 0.90633225,
  'word': '##pitations',
  'start': 41,
  'end': 50},
 {'entity_group': 'Clinical_event',
  'score': 0.99975544,
  'word': 'follow',
  'start': 54,
  'end': 60},
 {'entity_group': 'Date',
  'score': 0.999867,
  'word': '6 months after',
  'start': 64,
  'end': 78}]

In [50]:
df['ner_entities'] = df['text'].apply(pipe)

[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [None]:
df.to_csv('ner_df.csv', index=False)

# BERT

In [10]:
from datasets import Dataset

# Convert your DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df[['cleaned_notes']])

# Example function to tokenize and align labels
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["cleaned_notes"], truncation=True, padding="max_length", max_length=128)
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/15228 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=NUMBER_OF_LABELS)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

trainer.train()


In [None]:
results = trainer.evaluate()
print(results)


In [None]:
def extract_entities_with_model(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    entities = [(tokens[i], predictions[0][i].item()) for i in range(len(tokens))]
    return entities

df['entities_finetuned'] = df['cleaned_notes'].apply(extract_entities_with_model)
