In [None]:
import polars as pl
import requests
import io

DATA_URL = "https://huggingface.co/datasets/Ahmad0067/MedSynth/resolve/main/MedSynth_huggingface_final.csv"


print(df.shape)
print(df.head())

(10240, 4)
shape: (5, 4)
┌────────────────────┬─────────────────────────────────┬────────┬───────────────────┐
│  Note              ┆ Dialogue                        ┆ ICD10  ┆ ICD10_desc        │
│ ---                ┆ ---                             ┆ ---    ┆ ---               │
│ str                ┆ str                             ┆ str    ┆ str               │
╞════════════════════╪═════════════════════════════════╪════════╪═══════════════════╡
│ **1. Subjective:** ┆ [doctor]: Hello! It’s good to … ┆ M25562 ┆ PAIN IN LEFT KNEE │
│                    ┆                                 ┆        ┆                   │
│    **Chief…        ┆                                 ┆        ┆                   │
│ **1. Subjective:** ┆ [doctor] Hi there, how are you… ┆ M25562 ┆ PAIN IN LEFT KNEE │
│                    ┆                                 ┆        ┆                   │
│    - **Chi…        ┆                                 ┆        ┆                   │
│ **1. Subjective:** ┆ [docto

In [52]:
import pandas as pd
from icd_codes.utils.body_part_extraction import extract_body_parts_dataframe

from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env file

True

In [30]:
df = pd.read_csv("data/medsynth.csv")
df = df.rename(columns={" Note": "Note"})
df.columns

Index(['Note', 'Dialogue', 'ICD10', 'ICD10_desc'], dtype='str')

In [31]:
import pandas as pd
import re

# We define the columns and their specific stopping points (lookaheads)
# The (?i) makes it case-insensitive, and re.DOTALL (via the flags) lets '.' match newlines
sections = {
    'chief_complaint': r'(?i)Chief Complaint \(CC\):\s*(.*?)(?=\n\s*\*\*|\n\s*History|\n\s*Review|$)',
    'history': r'(?i)History of Present Illness \(HPI\):\s*(.*?)(?=\n\s*\*\*|\n\s*Review|$)',
    'ros': r'(?i)Review of Systems \(ROS\):\s*(.*?)(?=\n\s*\*\*|\n\s*2\. Objective|\n\s*Patient|$)',
    'general_appearance': r'(?i)General:\s*(.*?)(?=\n\s*Musculoskeletal|\n\s*Vital Signs|\n\s*Physical Exam|$)',
    'vital_signs': r'(?i)Vital Signs:\s*(.*?)(?=\n\s*\*\*|\n\s*Physical Exam|\n\s*3\. Assessment|$)',
    'physical_examination': r'(?i)Physical Exam(?:ination)?:\s*(.*?)(?=\n\s*\*\*|\n\s*3\. Assessment|$)',
    'assessment': r'(?i)3\. Assessment:\s*(.*?)(?=\n\s*\*\*|\n\s*4\. Plan|$)'
}

# Apply the extraction
for col, pattern in sections.items():
    # expand=False returns a Series, expand=True (default) returns a DataFrame
    df[col] = df['Note'].str.extract(pattern, flags=re.DOTALL)[0]

# Cleanup: remove excess whitespace and leading dashes/bullets
cols_to_clean = list(sections.keys())
df[cols_to_clean] = df[cols_to_clean].apply(lambda x: x.str.strip().str.lstrip('-').str.strip())

In [None]:
print(df[df['chief_complaint'].isnull()].iloc[:, 0])

6        #### EMERGENCY VISIT MEDICAL NOTE\n\n**1. Subj...
47       **1. Subjective:**\n\n**CHIEF COMPLAINT:**\nRo...
48       **1. Subjective:**\n\n**CC:**\nElevated blood ...
51       #####\n**1. Subjective:**\n\n**CHIEF COMPLAINT...
69       #####\n**1. Subjective**\n\n**CHIEF COMPLAINT*...
                               ...                        
10213    **1. Subjective:**\nMrs. Sarah Johnson is a 76...
10221    #####\n**1. Subjective**\n\n**Chief Complaint ...
10223    #####\n**1. Subjective:**\n\n**Chief Complaint...
10226    **1. Subjective:**\nLily Chen is a 52-year-old...
10227    ### Medical Note\n\n**1. Subjective:**\n\n**CH...
Name: Note, Length: 1303, dtype: str


In [None]:
import pandas as pd
import re

# We define the "start" of each section.
# We don't use strict lookaheads; we just grab content until we see a common "next" section indicator.
patterns = {
    # Matches CC, CC:, **CC**, etc., then grabs everything until the next header or double newline
    'chief_complaint': r'(?i)(?:Chief Complaint|CC|CHIEF COMPLAINT)[\s\*:]+([\s\S]*?)(?=\n\s*(?:History|HPI|ROS|####|2\.)|$)',
    'history': r'(?i)(?:History of Present Illness|HPI)[\s\*:]+([\s\S]*?)(?=\n\s*(?:Review|ROS|####|2\.)|$)',
    'ros': r'(?i)(?:Review of Systems|ROS)[\s\*:]+([\s\S]*?)(?=\n\s*(?:####|2\.|Objective)|$)',
    'general_appearance': r'(?i)General[\s\*:]+([\s\S]*?)(?=\n\s*(?:Vital|Physical|####|Cardiovascular)|$)',
    'vital_signs': r'(?i)Vital Signs[\s\*:]+([\s\S]*?)(?=\n\s*(?:Physical|####|3\.|Assessment)|$)',
    'physical_examination': r'(?i)Physical Exam(?:ination)?[\s\*:]+([\s\S]*?)(?=\n\s*(?:####|3\.|Assessment|Plan)|$)',
    'assessment': r'(?i)(?:3\.\s*)?Assessment[\s\*:]+([\s\S]*?)(?=\n\s*(?:####|4\.|Plan)|$)'
}

def robust_extract(text, pattern):
    if pd.isna(text):
        return None
    match = re.search(pattern, text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

# Apply the extraction
for col, pattern in patterns.items():
    df[col] = df['Note'].apply(lambda x: robust_extract(x, pattern))

# Clean up leftover markdown and artifacts
def clean_text(text):
    if not text: return text
    # Remove bolding stars, hashes, and leading dashes
    text = re.sub(r'\*\*|#', '', text)
    text = re.sub(r'^[:\-\s]+', '', text)
    return text.strip()

for col in patterns.keys():
    df[col] = df[col].apply(clean_text)

TypeError: expected string or bytes-like object, got 'float'

In [48]:
df[cols_to_clean].isnull().sum()

chief_complaint         9787
history                 9845
ros                     9847
general_appearance      2914
vital_signs             2705
physical_examination    1479
assessment              1755
dtype: int64

In [3]:
import asyncio
import polars as pl
import os
from icd_codes.utils.summary_extraction import extract_summary_dataframe
from dotenv import load_dotenv
load_dotenv() 

df = pl.read_csv("data/medsynth.csv")
df = df.rename({" Note": "Note"})
print(df.columns)

# df = df.slice(offset=0, length=50)
df.shape

['Note', 'Dialogue', 'ICD10', 'ICD10_desc']


(10240, 4)

In [5]:
import pandas as pd
df = pd.read_csv(r"data/medsynth.csv")
icd_df = (
    df[["ICD10", "ICD10_desc"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(icd_df))

2037


In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import faiss
import pickle
from tqdm import tqdm

device = "mps" if torch.backends.mps.is_available() else "cpu"
print(device)

  from .autonotebook import tqdm as notebook_tqdm


mps


In [2]:
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 504.17it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: cambridgeltl/SapBERT-from-PubMedBERT-fulltext
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [3]:
# --------------------------------------------------
# Mean-pool embedding
# --------------------------------------------------
def embed(texts, batch_size=32, max_length=64):
    all_emb = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]

        tokens = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            out = model(**tokens)

        emb = out.last_hidden_state.mean(dim=1)
        emb = F.normalize(emb, p=2, dim=1)

        all_emb.append(emb.cpu())

    return torch.cat(all_emb)

In [6]:
print("Embedding ICD descriptions...")
icd_texts = icd_df["ICD10_desc"].tolist()
icd_embeddings = embed(icd_texts)

Embedding ICD descriptions...


100%|██████████| 64/64 [00:06<00:00,  9.38it/s]


In [7]:
print("Embeddings shape:", icd_embeddings.shape)

Embeddings shape: torch.Size([2037, 768])


In [8]:
# Convert safely for FAISS
emb_np = icd_embeddings.detach().to("cpu").numpy().astype("float32")

# --------------------------------------------------
# Build FAISS index (cosine similarity via inner product)
# --------------------------------------------------
print("Building FAISS index...")
dim = emb_np.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb_np)

faiss.write_index(index, "icd_index.faiss")
print("Saved: icd_index.faiss")

# --------------------------------------------------
# Save mapping (index → code, desc)
# --------------------------------------------------
code_map = {
    i: (row.ICD10, row.ICD10_desc)
    for i, row in icd_df.iterrows()
}

with open("code_map.pkl", "wb") as f:
    pickle.dump(code_map, f)

print("Saved: code_map.pkl")

Building FAISS index...
Saved: icd_index.faiss
Saved: code_map.pkl


In [9]:
# --------------------------------------------------
# Sanity check retrieval
# --------------------------------------------------
def retrieve(text, top_k=5):
    q = embed([text], max_length=128)
    q_np = q.detach().to("cpu").numpy().astype("float32")

    D, I = index.search(q_np, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        code, desc = code_map[idx]
        results.append((code, desc, float(score)))

    return results


In [10]:
print("\nTest query:")
test_query = "Other specified injuries of the head initial encounter"
for r in retrieve(test_query):
    print(r)


Test query:


100%|██████████| 1/1 [00:00<00:00,  1.79it/s]

: 

In [1]:
import mlflow
print(mlflow.__version__)

  from .autonotebook import tqdm as notebook_tqdm


3.9.0


In [2]:
import pandas as pd
print(pd.__version__)

2.3.3


In [None]:
'''
  - run_id (text) - the mlflow run ID
  - row_id (int) - the original row ID from the dataset
  - note (text) - the original note text
  - summarized_note (text) - the summarized note text
  - dialogue (text) - the dialogue from the data
  - icd_ground_truth (text) - icd_code_ground_truth from the dataset(text)
  - icd_desc_ground_truth (text) - icd_desc_ground_truth from the dataset (text)
  - predicted_icd_code (bool) - predicted icd code from llm
  - is_correct - if predicted is same as icd_ground_truth
  - is_present_in_topk - if the ground truth code is present in the top-k retrieved codes
  - created_at (timestamptz default now)

'''

In [3]:
from openai import OpenAI
client = OpenAI()

x = client.models.list()


In [8]:
x.__dict__

{'data': [Model(id='gpt-4-0613', created=1686588896, object='model', owned_by='openai'),
  Model(id='gpt-4', created=1687882411, object='model', owned_by='openai'),
  Model(id='gpt-3.5-turbo', created=1677610602, object='model', owned_by='openai'),
  Model(id='gpt-5.2-codex', created=1766164985, object='model', owned_by='system'),
  Model(id='gpt-4o-mini-tts-2025-12-15', created=1765610837, object='model', owned_by='system'),
  Model(id='gpt-realtime-mini-2025-12-15', created=1765612007, object='model', owned_by='system'),
  Model(id='gpt-audio-mini-2025-12-15', created=1765760008, object='model', owned_by='system'),
  Model(id='chatgpt-image-latest', created=1765925279, object='model', owned_by='system'),
  Model(id='davinci-002', created=1692634301, object='model', owned_by='system'),
  Model(id='babbage-002', created=1692634615, object='model', owned_by='system'),
  Model(id='gpt-3.5-turbo-instruct', created=1692901427, object='model', owned_by='system'),
  Model(id='gpt-3.5-turbo-i