In [1]:
import pandas as pd
import re

# Load the MIMIC-IV discharge notes dataset into `df`.
# Later cells read the note body from the 'text' column of this frame.
df = pd.read_csv('data/discharge_journal_df.csv')

In [3]:
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch

# Use the MPS device when torch reports it as available; otherwise fall back to CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print('device', device)

# Load the pretrained tokenizer and token-classification model from the
# "d4data/biomedical-ner-all" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
# Move the model to the selected device. As the last expression of the cell,
# the returned model is echoed as the cell output.
model.to(device)


device mps


DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [5]:
# Create the NER pipeline from the already-loaded model and tokenizer, running on `device`.
# NOTE(review): with aggregation_strategy="simple" the results are expected to be grouped
# entities carrying 'entity_group', 'start' and 'end' keys, which is what
# format_entities_to_json reads -- confirm against the installed transformers version.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# Define a function to process text in batches
def process_in_batches(text_list, batch_size=32):
    """Run the module-level ``ner_pipeline`` over ``text_list`` chunk by chunk.

    The list is sliced into consecutive chunks of ``batch_size`` texts and a
    tqdm progress bar advances once per chunk.

    Args:
        text_list: List of input strings to annotate.
        batch_size: Number of texts per chunk. It is also forwarded to the
            pipeline call as its inference batch size; without that the
            pipeline falls back to its own default and processes the texts
            of each chunk one at a time.

    Returns:
        A flat list with one pipeline result per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return [] (which
    # later fails as a length mismatch on column assignment); fail early instead.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    ner_results = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Processing Batches"):
        batch = text_list[i:i + batch_size]
        # Pass batch_size through so the chunk is actually batched on the device.
        ner_results.extend(ner_pipeline(batch, batch_size=batch_size))
    return ner_results

# Apply the batch processing function to every note in the 'text' column and
# store the per-note results in a new 'ner_entities' column (mutates `df` in place).
texts = df['text'].tolist()
df['ner_entities'] = process_in_batches(texts)

# Save the processed DataFrame (without the index) to the working directory.
# NOTE(review): the 'ner_entities' cells are Python objects; CSV will presumably store
# them as their string representation, so they will not round-trip as lists of dicts
# when read back -- confirm, and consider a structured format if they must be reloaded.
df.to_csv('processed_data.csv', index=False)


Processing Batches:   0%|          | 0/476 [00:00<?, ?it/s]

In [13]:
# Load the training text CSV and show the first entry of its 'training_text' column
# (the last expression of the cell is echoed as the cell output).
train_df  = pd.read_csv('data/training_text.csv') 
train_df['training_text'].iloc[0]

'Subject ID: 10000032, Hospital Admission ID: 22595853, Admission Time: 2180-05-06 22:23:00\nAdmission Type: URGENT, Admission Location: TRANSFER FROM HOSPITAL, Insurance: Other, Language: ENGLISH, Marital Status: WIDOWED, Race: WHITE, Gender: F, Age: 52 Worsening ABD distension and pain Paracentesis ___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, \nbioplar, PTSD, presented from OSH ED with worsening abd \ndistension over past week.  \nPt reports self-discontinuing lasix and spirnolactone ___ weeks \nago, because she feels like "they don\'t do anything" and that \nshe "doesn\'t want to put more chemicals in her." She does not \nfollow Na-restricted diets. In the past week, she notes that she \nhas been having worsening abd distension and discomfort. She \ndenies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, \ndysuria. She had food poisoning a week ago from eating stale \ncake (n/v 20 min after food ingestion), which resolved the same \nday. She denies other recen

In [8]:
import json

# Function to convert pipeline output into JSON-serialisable labelled text segments
def format_entities_to_json(text, entities):
    """Split ``text`` into consecutive labelled segments based on entity spans.

    Despite the name, this returns a list of dicts (ready for ``json.dumps``),
    not a JSON string.

    Args:
        text: The original string the entities refer to.
        entities: Iterable of dicts with integer ``'start'``/``'end'`` character
            offsets into ``text`` and an ``'entity_group'`` label. ``None`` is
            treated as "no entities". Spans may be given in any order and may
            overlap.

    Returns:
        A list of ``{'text': ..., 'label': ...}`` dicts in text order. Text not
        covered by any entity is labelled ``'O'``. No character of ``text`` is
        emitted twice: a span that starts inside already-emitted text is
        clipped, and a span lying entirely inside already-emitted text is
        skipped.
    """
    if entities is None:
        entities = []

    annotated_text = []
    cursor = 0  # end offset of the text emitted so far; never moves backwards

    # Stable sort by start offset so out-of-order spans cannot rewind the cursor;
    # already-sorted input keeps its original order.
    for entity in sorted(entities, key=lambda ent: ent['start']):
        start, end = entity['start'], entity['end']

        # Entirely inside text that has already been emitted: nothing new to add.
        if start < cursor and end <= cursor:
            continue

        # Clip a partially overlapping span so its leading part is not duplicated.
        start = max(start, cursor)

        # Append text before the entity
        if cursor < start:
            annotated_text.append({
                'text': text[cursor:start],
                'label': 'O'
            })

        # Append the entity
        annotated_text.append({
            'text': text[start:end],
            'label': entity['entity_group']
        })

        cursor = max(cursor, end)

    # Append remaining text if any
    if cursor < len(text):
        annotated_text.append({
            'text': text[cursor:],
            'label': 'O'
        })

    return annotated_text

# Convert the entities of one note to labelled segments and print them as JSON.
# `i` is the positional row index of the note to inspect.
i=0
formatted_entities = format_entities_to_json(df['text'].iloc[i], df['ner_entities'].iloc[i])
# Pretty-print with a 2-space indent for readability.
json_output = json.dumps(formatted_entities, indent=2)
print(json_output)


[
  {
    "text": "Subject ID: 10000032, ",
    "label": "O"
  },
  {
    "text": "HAd",
    "label": "Disease_disorder"
  },
  {
    "text": "m ID: 22595853, Chart Time: 2180",
    "label": "O"
  },
  {
    "text": "-05-07",
    "label": "Time"
  },
  {
    "text": " ",
    "label": "O"
  },
  {
    "text": "00:",
    "label": "Time"
  },
  {
    "text": "00",
    "label": "O"
  },
  {
    "text": ":00",
    "label": "Time"
  },
  {
    "text": "\n\n \nName:  ___                     Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   F\n \nService: MEDICINE\n \nAllergies: \nNo Known Allergies / ",
    "label": "O"
  },
  {
    "text": "Adverse Drug Reactions",
    "label": "Sign_symptom"
  },
  {
    "text": "\n \nAttending: ___\n \nChief Complaint:\nWorsening ABD ",
    "label": "O"
  },
  {
    "text": "di",
    "label": "Sign_symptom"
  },
  {
    "text": "stension",
    "label": "Sign_symptom"
  },
  {
    "text": "

In [10]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated and emits a warning.
from IPython.display import display, HTML

# Function to visualize annotated text
def visualize_annotated_text(annotated_text):
    """Render labelled text segments inline as colour-highlighted HTML.

    Args:
        annotated_text: Iterable of dicts with a ``'text'`` string and a
            ``'label'`` string, e.g. the output of ``format_entities_to_json``.

    Returns:
        None. The rendered HTML is shown through IPython's ``display``.

    Each segment becomes a ``<span>`` whose background colour is looked up by
    label; labels without an entry in the colour table (including ``'O'``)
    fall back to white. Segment text is HTML-escaped so that characters such
    as ``<``, ``>`` and ``&`` in the note are shown literally instead of being
    interpreted as markup.
    """
    # Local import keeps this block self-contained without touching other cells.
    from html import escape

    colors = {
        "Age": "#e6f7ff",
        "Sex": "#ffcccc",
        "Clinical_event": "#d9f2e6",
        "Sign_symptom": "#ffffcc",
        "Lab_value": "#e6e6ff",
        "Diagnostic_procedure": "#ffe6cc",
        "Detailed_description": "#ffccf2",
        "Disease_disorder": "#cce6ff",
        "Therapeutic_procedure": "#ffb3b3",
        "Biological_structure": "#e6ffe6",
        "Coreference": "#ffebcc",
        "Date": "#ffcccc",
        "Medication": "#cce6ff"
    }

    # Collect the spans in a list and join once instead of repeated string +=.
    spans = []
    for segment in annotated_text:
        color = colors.get(segment['label'], "#ffffff")
        spans.append(f"<span style='background-color:{color}'>{escape(segment['text'])}</span>")

    display(HTML("".join(spans)))

# Visualize the annotated text produced by format_entities_to_json for the selected note
visualize_annotated_text(formatted_entities)


  from IPython.core.display import display, HTML
