# eICU benchmarking with Claude

In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

from openai import OpenAI
from loguru import logger
import sys
import json
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))


# Set up dataset

In [2]:
# Location of the eICU benchmark test split, relative to this notebook.
test_data_dir = "../model_evaluation/dataset/test"
test_data_fp = os.path.join(test_data_dir, "test_ehrsql_eicu_data_benchmark.json")

# Load the file once up front as a sanity check that the path resolves and
# the JSON parses before any model calls are made.
with open(test_data_fp, "r") as test_file:
    test_set = json.load(test_file)

# Report how many items there are and show the first one in full.
print("Size of test set:", len(test_set))
print("Example of a test set item:", json.dumps(test_set[0], indent=4), sep="\n")


Size of test set: 1792
Example of a test set item:
{
    "index": 1,
    "system": "Based on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- 

# Set up client

In [3]:
import asyncio
from openai import AsyncOpenAI
import json
from loguru import logger

# Initialize the AsyncOpenAI client used by every prediction request below.
# Credentials and endpoint are read from the environment (populated by
# load_dotenv() in the first cell); a missing variable raises KeyError here.
# NOTE(review): the variable names suggest an OpenAI-compatible gateway in
# front of Amazon Bedrock - confirm against the deployment.
async_client = AsyncOpenAI(
    api_key=os.environ['BEDROCK_OPENAI_API_KEY'],
    base_url=os.environ['BEDROCK_OPENAI_BASE_URL']
)

In [4]:
from utils import postprocess_sql_query_from_markdown

async def get_prediction(
    record: dict,
    model: str,
    reasoning_effort: "str | None",
) -> dict:
    """
    Generate a SQL query for one benchmark record asynchronously.

    Args:
        record: a test-set item with the keys "index", "system" (system
            prompt), "user" (the question) and "real" (ground-truth SQL).
        model: the model identifier passed to the chat-completions endpoint.
        reasoning_effort: None to send the request without a
            ``reasoning_effort`` argument, or "low", "medium" or "high".
    Returns:
        A dictionary with the keys "index", "input", "predict" and "real".
        If the request or the post-processing fails, the same dictionary is
        returned with "predict" set to ``"Error: <message>"`` so that callers
        can always index the result by key.
    Raises:
        KeyError: if ``record`` lacks one of the required keys.
    """
    index = record['index']
    system = record['system']
    user = record['user']
    real = record['real']

    request_kwargs = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": f"{system}",
            },
            {
                "role": "user",
                "content": f"{user}"
            }
        ],
    }
    if reasoning_effort is None:
        # if no reasoning, keep at 512 same as Mistral
        request_kwargs["max_completion_tokens"] = 512
    else:
        # Per the original note, reasoning requests need a budget of at least
        # 1024 tokens - TODO confirm the exact constraint for the endpoint.
        request_kwargs["max_completion_tokens"] = 1024 * 2
        request_kwargs["reasoning_effort"] = reasoning_effort

    try:
        response = await async_client.chat.completions.create(**request_kwargs)
        generated_sql = response.choices[0].message.content
        prediction = postprocess_sql_query_from_markdown(generated_sql)
        logger.info(f"Generated SQL for index {index}: {prediction}")
    except Exception as e:
        # Best effort: one failed request must not abort the whole benchmark
        # run, so the failure is recorded in place of the prediction.
        logger.error(f"An error occurred for index {index}: {e}")
        prediction = f"Error: {e}"

    return {
        "index": index,
        "input": user,  # user query
        "predict": prediction,  # model predicted SQL, or "Error: ..." on failure
        "real": real  # ground truth SQL
    }

In [5]:
async def run_predictions_async(test_fp: str, output_fp: str, model: str, reasoning_effort: "str | None", chunk_size: int = 5):
    """
    Load a test set, run predictions asynchronously and save them as JSON.

    Args:
        test_fp: path to the JSON test set (a list of records).
        output_fp: path of the JSON file the predictions are written to; its
            parent directory is created if it does not exist.
        model: the model identifier forwarded to ``get_prediction``.
        reasoning_effort: None, or "low", "medium" or "high"; forwarded to
            ``get_prediction``.
        chunk_size: number of requests sent concurrently per batch. Batches
            are processed one after another to avoid hitting the rate limit.
    Returns:
        None. The predictions are written to ``output_fp``.
    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    with open(test_fp, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    results = []

    for start in range(0, len(test_data), chunk_size):
        chunk = test_data[start:start + chunk_size]
        # Use the real chunk length so the last (possibly shorter) chunk is
        # reported with its true end index.
        logger.info(f"Processing chunk {start} to {start + len(chunk) - 1}")
        tasks = [get_prediction(record=record, reasoning_effort=reasoning_effort, model=model) for record in chunk]

        logger.info(f"Sending {len(tasks)} requests to the model endpoint...")
        # gather() returns results in the same order as the tasks, so each
        # prediction can be paired with the record it was generated from.
        predictions = await asyncio.gather(*tasks)
        logger.success("All predictions done.")

        for record, pred in zip(chunk, predictions):
            if isinstance(pred, dict):
                results.append({
                    "index": pred['index'],
                    "input": pred['input'],
                    "predict": pred['predict'],
                    "real": pred['real']
                })
            else:
                # A non-dict result (e.g. an error string) must not abort the
                # run and throw away every prediction collected so far; keep
                # the row and store the failure text as the prediction.
                logger.warning(f"No valid prediction for index {record.get('index', 'N/A')}: {pred}")
                results.append({
                    "index": record.get("index"),
                    "input": record.get("user"),
                    "predict": str(pred),
                    "real": record.get("real")
                })

    # open(..., "w") creates the file but not its folder, so create it first
    output_dir = os.path.dirname(output_fp)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_fp, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    logger.success(f"Predictions saved to {output_fp}")

# Generate Queries

In [12]:
from utils import preprare_directory
from time import time

# ======= CHANGE THIS =======
# Label for this run. It is only used to build the output folder
# (../model_predictions/eICU/<model_name>/trial_<trial_number>) and in the
# final log message; it is not sent to the API.
model_name = "claude_4_sonnet_thinking"

# Trial number; selects the trial_<n> sub-folder of the run's output folder.
trial_number = 1    

# Reasoning effort forwarded to get_prediction. None sends the request without
# a reasoning_effort argument; otherwise use "low", "medium" or "high".
reasoning_effort = "medium"

# Model identifier sent to the chat-completions endpoint.
model = "us.anthropic.claude-sonnet-4-20250514-v1:0"
# model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"

# ======= CHANGE THIS END =======

In [13]:

# Create the output directory for this model/trial.
# Per the original note, exist_ok=False deletes old results and re-creates a
# fresh folder - NOTE(review): this is the behavior of utils.preprare_directory,
# which is not visible here; confirm before re-running over results you need.
output_dir = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"    
preprare_directory(output_dir, exist_ok=False)

# Run predictions over the full test set and time the whole run (wall clock).
start_time = time()
# Top-level await: this works in a notebook cell but not in a plain script.
await run_predictions_async(
    test_fp = test_data_fp,
    output_fp = os.path.join(output_dir, "predictions.json"), 
    reasoning_effort = reasoning_effort, 
    model = model
)
end_time = time()
logger.success(f"Model {model_name}. Trial {trial_number} completed. Time taken: {end_time - start_time:.2f} seconds")

[32m2025-07-28 20:52:12.835[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m14[0m - [1mProcessing chunk 0 to 4[0m
[32m2025-07-28 20:52:12.836[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m18[0m - [1mSending 5 requests to the vLLM server...[0m


[32m2025-07-28 20:52:16.845[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 2: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname LIKE '%clonidine%';[0m
[32m2025-07-28 20:52:17.152[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 5: SELECT DISTINCT routeadmin 
FROM medication 
WHERE drugname = 'zolpidem tartrate 5 mg po tabs';[0m
[32m2025-07-28 20:52:17.891[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 4: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname = 'propofol 1000 mg/100 ml (pmx)';[0m
[32m2025-07-28 20:52:19.731[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 3: SELECT DISTINCT routeadmin
FROM medication 
WHERE drugname LIKE '%morphine%' 
AND (dosage LIKE '%2 mg%' OR dosage LIKE '%1 ml%' OR dosage LIKE '%syr%');[0m


Record the total run time reported in the final log line above (`Time taken: ... seconds`) so it can be compared across models and trials.

# Evaluate

In [14]:
import os
from utils import preprare_directory

# Folder holding this trial's predictions (same location the prediction step wrote to).
pred_directory = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"

# Evaluation results go into a dedicated "evaluation" sub-folder. It is
# prepared fresh because the evaluation overwrites existing files in it.
eval_directory = os.path.join(pred_directory, "evaluation")
preprare_directory(eval_directory, exist_ok=False)

# Inputs of the evaluation script: the predictions file from the previous
# step and the eICU SQLite database.
pred_file = os.path.join(pred_directory, "predictions.json")
db_path = "../model_evaluation/databases/eicu.sqlite"

print("Using predictions from: ", pred_file)

Using predictions from:  ../model_predictions/eICU/claude_4_sonnet_thinking/trial_1/predictions.json


In [15]:
# Run the EHRSQL evaluation script on the predictions file against the eICU
# SQLite database. {pred_file}, {db_path} and {eval_directory} are interpolated
# by IPython from the Python variables set in the previous cell; note that
# --out_file receives the evaluation *directory*, not a file path.
# NOTE(review): the interpolated paths are not quoted, so the command breaks
# if any of them contains spaces.
# NOTE(review): --num_workers -1 presumably means "use all available workers"
# and --timeout is presumably a per-query limit in seconds - confirm in
# ehrsql_eval.py.
!python ../model_evaluation/ehrsql_eval.py \
    --pred_file {pred_file} \
    --db_path {db_path} \
    --num_workers -1 \
    --timeout 60 \
    --out_file {eval_directory} \
    --ndigits 2

Exception ignored in thread started by <bound method Thread._bootstrap of <StoppableThread(Thread-12 (funcwrap), started daemon 140536364836416)>>:
Traceback (most recent call last):
  File "/root/.local/share/uv/python/cpython-3.13.5-linux-x86_64-gnu/lib/python3.13/threading.py", line 1014, in _bootstrap
    self._bootstrap_inner()
  File "/root/.local/share/uv/python/cpython-3.13.5-linux-x86_64-gnu/lib/python3.13/threading.py", line 1047, in _bootstrap_inner
    self._delete()
  File "/root/.local/share/uv/python/cpython-3.13.5-linux-x86_64-gnu/lib/python3.13/threading.py", line 1049, in _delete
    def _delete(self):
func_timeout.dafunc.FunctionTimedOut-9131368032737615173: Function execute (args=("select observationtime \nfrom vitalperiodic v\njoin patient p on v.patientunitstayid = p.patientunitstayid\nwhere p.uniquepid = '027-85328'\n  and v.observationtime >= '2104-11-09 00:00:00'\n  and v.heartrate is not null\n  and v.heartrate = (\n    select min(v2.heartrate)\n    from vital

# Interpret Results

In [17]:
# Metrics file written by the evaluation step for this model/trial.
fp = f"../model_predictions/eICU/{model_name}/trial_{trial_number}/evaluation/predictions_metrics.json"
print("Reading from file: ", fp)

# Load the metrics and pretty-print them for inspection.
with open(fp, "r") as metrics_file:
    metrics = json.load(metrics_file)

print(json.dumps(metrics, indent=4))

Reading from file:  ../model_predictions/eICU/claude_4_sonnet_thinking/trial_1/evaluation/predictions_metrics.json
{
    "precision_ans": 100.0,
    "recall_ans": 78.91,
    "f1_ans": 88.21,
    "precision_exec": 40.66,
    "recall_exec": 32.09,
    "f1_exec": 35.87,
    "acc": 32.09
}
