# eICU benchmarking with Claude

In [10]:
from dotenv import load_dotenv
import os
load_dotenv()

from openai import OpenAI
from loguru import logger
import sys
import json
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))


# Set up dataset

In [11]:
# Location of the eICU benchmark test split, relative to this notebook.
test_data_dir = "../model_evaluation/dataset/test"
test_data_fp = os.path.join(test_data_dir, "test_ehrsql_eicu_data_benchmark.json")

# Load the test set once up front so a bad path fails here rather than
# midway through generation. The encoding is pinned to UTF-8 (the JSON
# default) so the read does not depend on the platform's locale encoding.
with open(test_data_fp, "r", encoding="utf-8") as f:
    test_set = json.load(f)

print("Size of test set:", len(test_set))

# Show one record so the expected keys are visible to the reader.
print("Example of a test set item:")
print(json.dumps(test_set[0], indent=4))


Size of test set: 1792
Example of a test set item:
{
    "index": 1,
    "system": "Based on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- 

# Set up client

In [12]:
import asyncio
from openai import AsyncOpenAI
import json
from loguru import logger

# Initialize the AsyncOpenAI client used by every generation request below.
# Both the key and the endpoint are read from the environment; a missing
# variable raises KeyError right here instead of failing on the first request.
# NOTE(review): the BEDROCK_OPENAI_* names suggest an OpenAI-compatible
# gateway in front of Bedrock — confirm against the deployment's .env.
async_client = AsyncOpenAI(
    api_key=os.environ['BEDROCK_OPENAI_API_KEY'],
    base_url=os.environ['BEDROCK_OPENAI_BASE_URL']
)

In [None]:
from utils import postprocess_sql_query_from_markdown

async def get_prediction(
    record: dict,
    reasoning_effort: "str | None" = None,
    model="us.anthropic.claude-3-7-sonnet-20250219-v1:0"
    ) -> dict:
    """
    Generate a SQL query for a single benchmark record asynchronously.

    Args:
        record: one test-set item; must provide the keys 'index', 'system',
            'user' and 'real'.
        reasoning_effort: None to send a plain request, or one of "low",
            "medium", "high" to forward as the `reasoning_effort` request
            argument.
        model: model identifier passed to the chat-completions endpoint.

    Returns:
        A dictionary with the keys 'index', 'input' (the user question),
        'predict' (the post-processed model output) and 'real' (the ground
        truth SQL). If the request or the post-processing fails, the same
        dictionary shape is returned with 'predict' set to "Error: <message>",
        so callers can always index the result and one failed request does
        not abort the whole run.

    Raises:
        KeyError: if `record` is missing one of the required keys.
    """
    index = record['index']
    system = record['system']
    user = record['user']
    real = record['real']

    prompt_chat_template = [
        {
            "role": "system",
            "content": f"{system}",
        },
        {
            "role": "user",
            "content": f"{user}"
        }
    ]

    # Build the request once; only the token budget and the optional
    # reasoning argument differ between the two modes.
    request_kwargs = {
        "model": model,
        "messages": prompt_chat_template,
    }
    if reasoning_effort is None:
        # if no reasoning, keep at 512 same as Mistral
        request_kwargs["max_completion_tokens"] = 512
    else:
        # Author's note: with reasoning the minimum is 1024, and the limit
        # must be larger than the reasoning budget (the original note said
        # "prompt") — TODO confirm against the provider documentation.
        request_kwargs["max_completion_tokens"] = 1024 * 2
        request_kwargs["reasoning_effort"] = reasoning_effort

    try:
        response = await async_client.chat.completions.create(**request_kwargs)
        generated_sql = response.choices[0].message.content
        prediction = postprocess_sql_query_from_markdown(generated_sql)
        logger.info(f"Generated SQL for index {index}: {prediction}")
    except Exception as e:
        # Deliberately best-effort: record the failure as this item's
        # prediction instead of raising, so the remaining items still run.
        logger.error(f"An error occurred for index {index}: {e}")
        prediction = f"Error: {e}"

    return {
        "index": index,
        "input": user, # user query
        "predict": prediction,  # model predicted SQL (or the error text)
        "real": real # ground truth SQL
    }

In [17]:
async def run_predictions_async(
    test_fp: str,
    output_fp: str,
    reasoning_effort: "str | None" = None,
    chunk_size: int = 5,
):
    """
    Load a test set, generate predictions chunk by chunk, and save them as JSON.

    Args:
        test_fp: path to a JSON file holding a list of test records.
        output_fp: path of the JSON file to write; its parent directory is
            created if it does not exist yet.
        reasoning_effort: None, or "low" / "medium" / "high"; forwarded
            unchanged to `get_prediction`.
        chunk_size: number of requests sent concurrently per batch. The
            default of 5 was chosen by the author to stay under the rate limit.

    Raises:
        ValueError: if `chunk_size` is smaller than 1.
    """
    import os  # local import so this cell does not rely on another cell's imports

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    with open(test_fp, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    results = []

    for start in range(0, len(test_data), chunk_size):
        chunk = test_data[start:start + chunk_size]
        # Report the real last position; the final chunk may be shorter.
        logger.info(f"Processing chunk {start} to {start + len(chunk) - 1}")
        tasks = [get_prediction(record = record, reasoning_effort=reasoning_effort) for record in chunk]

        # NOTE(review): the message mentions vLLM, but the client configured in
        # this notebook reads BEDROCK_OPENAI_* settings — wording kept as-is.
        logger.info(f"Sending {len(tasks)} requests to the vLLM server...")
        # gather() returns results in the order the awaitables were passed,
        # so predictions line up one-to-one with the records in `chunk`.
        predictions = await asyncio.gather(*tasks)
        logger.success("All predictions done.")

        for record, pred in zip(chunk, predictions):
            if not isinstance(pred, dict):
                # A failed request may come back as a plain error string.
                # Keep the item in the output instead of crashing on pred['index'].
                logger.warning(
                    f"Prediction for index {record.get('index', 'N/A')} is not a result dict; "
                    f"recording it as a failed prediction."
                )
                pred = {
                    "index": record.get("index"),
                    "input": record.get("user"),
                    "predict": str(pred),
                    "real": record.get("real"),
                }

            results.append({
                "index": pred['index'],
                "input": pred['input'],
                "predict": pred['predict'],
                "real": pred['real']
            })

    # open(..., "w") creates the file but not missing parent folders.
    parent_dir = os.path.dirname(output_fp)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_fp, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    logger.success(f"Predictions saved to {output_fp}")

# Generate Queries

In [18]:
from utils import preprare_directory
from time import time

# Label for this run; it only names the output folder and the final log line.
model_name = "claude_3_7_sonnet_thinking"
# Trial number; each trial gets its own output sub-folder.
trial_number = 0    
# Reasoning effort: None sends plain requests; otherwise "low", "medium" or "high".
reasoning_effort = "medium"

# Create the output directory for this trial.
# Per the original note, exist_ok=False deletes old results and re-creates a
# fresh folder (behaviour lives in utils.preprare_directory — confirm before
# re-running a trial you want to keep).
output_dir = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"    
preprare_directory(output_dir, exist_ok=False)


# Run predictions over the full test set and time the whole run.
start_time = time()
# Top-level await: this works in a notebook cell, not in a plain script.
await run_predictions_async(
    test_fp = test_data_fp,
    output_fp = os.path.join(output_dir, "predictions.json"), 
    reasoning_effort = reasoning_effort
)
end_time = time()
logger.success(f"Model {model_name}. Trial {trial_number} completed. Time taken: {end_time - start_time:.2f} seconds")

[32m2025-07-27 22:45:01.042[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m14[0m - [1mProcessing chunk 0 to 4[0m
[32m2025-07-27 22:45:01.043[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m18[0m - [1mSending 5 requests to the vLLM server...[0m


[32m2025-07-27 22:45:06.944[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m49[0m - [1mGenerated SQL for index 2: SELECT DISTINCT routeadmin
FROM medication
WHERE LOWER(drugname) = 'clonidine'
ORDER BY routeadmin;[0m
[32m2025-07-27 22:45:07.644[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m49[0m - [1mGenerated SQL for index 4: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname LIKE '%propofol%'
AND dosage LIKE '%1000 mg/100 ml (pmx)%';[0m
[32m2025-07-27 22:45:09.664[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m49[0m - [1mGenerated SQL for index 1: None[0m
[32m2025-07-27 22:45:10.047[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m49[0m - [1mGenerated SQL for index 5: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname = 'zolpidem tartrate'
AND dosage LIKE '5 mg%'
AND LOWER(routeadmin) LIKE 'po%';[0m
[32m2025-07-27 22:45:11.838[0m | [1mINFO    [0m | [36m

**Trial 0**: 
- Time taken:  4368.33 seconds

# Evaluate

In [19]:
import os
from utils import preprare_directory

# Path to the eICU SQLite database that is handed to the evaluation script.
db_path = "../model_evaluation/databases/eicu.sqlite"

# Folder and file produced by the generation step for this model / trial.
pred_directory = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"
pred_file = os.path.join(pred_directory, "predictions.json")

# Evaluation results go into their own sub-folder of the trial directory.
# The evaluation needs a clean folder because it overwrites existing files,
# hence exist_ok=False.
eval_directory = os.path.join(pred_directory, "evaluation")
preprare_directory(eval_directory, exist_ok=False)

print("Using predictions from: ", pred_file)

Using predictions from:  ../model_predictions/eICU/claude_3_7_sonnet_thinking/trial_0/predictions.json


In [20]:
# Run the evaluation script in a subprocess via the IPython `!` shell escape.
# {pred_file}, {db_path} and {eval_directory} are interpolated from the
# variables set in the previous cell, so that cell must have been run first.
# NOTE(review): --out_file receives a directory path here, and the meaning of
# --num_workers -1 is defined by ehrsql_eval.py — confirm both against the script.
!python ../model_evaluation/ehrsql_eval.py \
    --pred_file {pred_file} \
    --db_path {db_path} \
    --num_workers -1 \
    --timeout 60 \
    --out_file {eval_directory} \
    --ndigits 2

# Interpret Results

In [21]:
# Metrics file for this model / trial; presumably written by the evaluation
# step above — confirm the file name against ehrsql_eval.py.
fp = f"../model_predictions/eICU/{model_name}/trial_{trial_number}/evaluation/predictions_metrics.json"

# Read the file's text and parse it, then pretty-print the parsed metrics.
with open(fp, "r") as metrics_file:
    metrics = json.loads(metrics_file.read())

print(json.dumps(metrics, indent=4))

{
    "precision_ans": 100.0,
    "recall_ans": 63.11,
    "f1_ans": 77.39,
    "precision_exec": 32.98,
    "recall_exec": 20.81,
    "f1_exec": 25.52,
    "acc": 20.81
}


In [7]:
from openai import OpenAI
# Ad-hoc sanity check: one synchronous request with reasoning enabled, to
# confirm the endpoint accepts `reasoning_effort` and returns a final answer.
# NOTE: this cell's recorded execution count (7) is lower than the cells
# above, so it was run out of order; it needs `os` and the environment
# variables loaded by the first cell.
client = OpenAI(
    api_key=os.environ['BEDROCK_OPENAI_API_KEY'],
    base_url=os.environ['BEDROCK_OPENAI_BASE_URL']
)

messages = [{"role": "user", "content": "which one is bigger, 3.9 or 3.11?"}]
response = client.chat.completions.create(
    model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    messages=messages,
    reasoning_effort="medium",
    max_completion_tokens=4096,

)

# reasoning trace: `reasoning_content` is not a declared field of the SDK's
# message model, so it is only present when the provider sends it. Fall back
# to None instead of raising AttributeError when it is missing.
reasoning_content = getattr(response.choices[0].message, "reasoning_content", None)

# final answer
content = response.choices[0].message.content

content

'3.9 is bigger than 3.11.\n\nWhen comparing decimal numbers, we need to compare the digits from left to right. Both numbers have the same whole number (3), so we look at the tenths place (first digit after the decimal point):\n\n- 3.9 has 9 in the tenths place\n- 3.11 has 1 in the tenths place\n\nSince 9 is greater than 1, 3.9 is greater than 3.11.\n\nAnother way to understand this is that 3.9 equals 3.90 when written with the same number of decimal places, and 3.90 is clearly greater than 3.11.'