# eICU Benchmark with OpenAI models

In [31]:
import json
import os
import sys
from typing import Literal, Optional, Union

from dotenv import load_dotenv
from loguru import logger
from openai import OpenAI

load_dotenv()

sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))

# Set up dataset

In [32]:
# Location of the benchmark test split (directory first, then the JSON file inside it)
test_data_dir = "../model_evaluation/dataset/test"
test_data_fp = os.path.join(test_data_dir, "test_ehrsql_eicu_data_benchmark.json")

# Sanity check: make sure the file parses before any requests are sent
with open(test_data_fp, "r") as test_file:
    test_set = json.load(test_file)

# Report how many items were loaded
print("Size of test set:", len(test_set))

# Show the first item so the record layout is visible
print("Example of a test set item:")
print(json.dumps(test_set[0], indent=4))


Size of test set: 1792
Example of a test set item:
{
    "index": 1,
    "system": "Based on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- 

# Set up client

In [33]:
import asyncio
from openai import AsyncAzureOpenAI
import json
from loguru import logger


# Gateway endpoint and the API version sent with every request
azure_endpoint = "https://prod.api.nvidia.com/llm/v1/azure"
api_version = "2025-04-01-preview"

# Single async client shared by all prediction requests in this notebook.
# The key is read from the LLM_GATEWAY_API environment variable.
async_client = AsyncAzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    api_key=os.getenv("LLM_GATEWAY_API"),
)

In [34]:
from utils import postprocess_sql_query_from_markdown

async def get_prediction(
    record: dict,
    model: str,
    reasoning_effort: Optional[Literal["low", "medium", "high"]],
) -> Union[dict, str]:
    """
    Generates a SQL query for a given record asynchronously.

    Args:
        record: a benchmark item; the keys 'index', 'system', 'user' and 'real' are read
        model: the model to use
        reasoning_effort: None sends the request without a reasoning_effort argument
            (512 completion tokens); "low", "medium" or "high" is forwarded to the API
            (2048 completion tokens)
    Returns:
        On success, a dictionary with the keys 'index', 'input', 'predict' and 'real'.
        On any failure (malformed record, API error such as a content-filter rejection,
        post-processing error), the string "Error: <exception message>". Callers must
        check the type of the result before indexing into it.
    """
    try:
        # Read the record inside the try block so that a malformed record yields an
        # error string like any other failure instead of raising through
        # asyncio.gather and aborting the whole run.
        index = record['index']
        system = record['system']
        user = record['user']
        real = record['real']

        prompt_chat_template = [
            {
                "role": "system",
                "content": f"{system}",
            },
            {
                "role": "user",
                "content": f"{user}"
            }
        ]

        request_kwargs = {
            "model": model,
            "messages": prompt_chat_template,
        }
        if reasoning_effort is None:
            # if no reasoning, keep at 512 same as Mistral
            request_kwargs["max_completion_tokens"] = 512
        else:
            # max_completion_tokens is the limit to use with o-series models (the
            # deprecated max_tokens parameter is the one that is not compatible with
            # them). It also has to cover the reasoning tokens, hence the larger budget.
            # see https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_completion_tokens
            request_kwargs["max_completion_tokens"] = 1024*2
            request_kwargs["reasoning_effort"] = reasoning_effort

        response = await async_client.chat.completions.create(**request_kwargs)

        # NOTE(review): the API types message.content as optional; confirm that
        # postprocess_sql_query_from_markdown copes with None (any exception it raises
        # is turned into an error string below).
        generated_sql = response.choices[0].message.content
        prediction = postprocess_sql_query_from_markdown(generated_sql)
        logger.info(f"Generated SQL for index {index}: {prediction}")

        return {
            "index": index,
            "input": user, # user query
            "predict": prediction,  # model predicted SQL
            "real": real # ground truth SQL
        }

    except Exception as e:
        logger.error(f"An error occurred for index {record.get('index', 'N/A')}: {e}")
        return f"Error: {e}"

In [35]:
async def run_predictions_async(
    test_fp: str,
    output_fp: str,
    model: str,
    reasoning_effort: Optional[Literal["low", "medium", "high"]],
    chunk_size: int = 5,
):
    """
    Loads a test set, runs predictions asynchronously and saves them as JSON.

    Args:
        test_fp: path to the JSON test set (a list of records)
        output_fp: path of the JSON file to write; missing parent directories are created
        model: the model to use, forwarded to get_prediction
        reasoning_effort: None, "low", "medium" or "high"; forwarded to get_prediction
        chunk_size: number of requests sent concurrently per batch, kept small to avoid
            the rate limit
    Raises:
        ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    with open(test_fp, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    # keys every usable prediction must carry, in the order they are written out
    required_keys = ("index", "input", "predict", "real")
    results = []

    for i in range(0, len(test_data), chunk_size):
        chunk = test_data[i:i+chunk_size]
        # use the real chunk length so the last (possibly shorter) chunk is logged correctly
        logger.info(f"Processing chunk {i} to {i+len(chunk)-1}")
        tasks = [get_prediction(record = record, reasoning_effort=reasoning_effort, model=model) for record in chunk]

        logger.info(f"Sending {len(tasks)} requests to the Azure OpenAI endpoint...")
        predictions = await asyncio.gather(*tasks)
        logger.success("All predictions done.")

        # get_prediction returns an error string instead of a dict when a request fails
        # (e.g. Azure OpenAI's safety filter rejects the prompt). Check each prediction
        # on its own so one failure does not discard the rest of the chunk.
        for offset, pred in enumerate(predictions):
            if not isinstance(pred, dict) or any(key not in pred for key in required_keys):
                logger.error(f"Skipping prediction at position {i+offset} (no usable result): {pred}")
                continue
            results.append({key: pred[key] for key in required_keys})

    # make sure the destination directory exists before writing
    os.makedirs(os.path.dirname(output_fp) or ".", exist_ok=True)
    with open(output_fp, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    logger.success(f"Predictions saved to {output_fp}")

# Generate Queries

In [36]:
from utils import preprare_directory
from time import time

# ======= CHANGE THIS =======
# Model name. It is used in the output folder path, and `model` is the value
# passed to the prediction run.
model_name = "o4-mini"
model = model_name

# Reasoning effort forwarded to the prediction run: "low", "medium" or "high".
# None means the request is sent without a reasoning effort.
reasoning_effort = "medium"

# Trial number; it is part of the output folder name (trial_<n>)
trial_number = 1
# ======= CHANGE THIS END =======

In [37]:

# create output directory. set exist_ok=False to delete the old results and re-create a fresh folder
output_dir = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"    
preprare_directory(output_dir, exist_ok=False)
#

# run predictions
start_time = time()
# run predictions
await run_predictions_async(
    test_fp = test_data_fp,
    output_fp = os.path.join(output_dir, "predictions.json"), 
    reasoning_effort = reasoning_effort, 
    model = model
)
end_time = time()
logger.success(f"Model {model_name}. Trial {trial_number} completed. Time taken: {end_time - start_time:.2f} seconds")

[32m2025-07-29 23:11:59.606[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m14[0m - [1mProcessing chunk 0 to 4[0m
[32m2025-07-29 23:11:59.606[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m18[0m - [1mSending 5 requests to the vLLM server...[0m


[32m2025-07-29 23:12:03.056[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 2: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname = 'clonidine';[0m
[32m2025-07-29 23:12:03.720[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 1: None[0m
[32m2025-07-29 23:12:05.349[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 4: SELECT DISTINCT routeadmin
FROM medication
WHERE drugname = 'propofol 1000 mg/100 ml (pmx)';[0m
[32m2025-07-29 23:12:06.739[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m51[0m - [1mGenerated SQL for index 3: SELECT DISTINCT routeadmin
FROM medication
WHERE LOWER(drugname) = 'morphine'
  AND LOWER(dosage) LIKE '%2 mg/1 ml%'
  AND LOWER(dosage) LIKE '%1 ml syr%';[0m
[32m2025-07-29 23:12:08.604[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[

For some queries, Azure OpenAI's safety filter may be triggered (for example under the `self_harm` category), producing error messages like this:

```bash
2025-07-29 14:06:00.528 | ERROR    | __main__:get_prediction:61 - An error occurred for index 1386: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'detected': False, 'filtered': False}, 'self_harm': {'filtered': True, 'severity': 'medium'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}}}
```

Queries that fail this way have no prediction and are therefore left out of the evaluation.

Note the time taken to run the predictions, as reported in the logs above.

# Evaluate

In [38]:
import os
from utils import preprare_directory

# Folder holding the predictions of this model/trial (paths are relative to the
# model_evaluation directory layout)
pred_directory = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"

# Evaluation results go into a dedicated sub-folder. It must be a clean new folder,
# because the evaluation overwrites any existing files in it.
eval_directory = os.path.join(pred_directory, "evaluation")
preprare_directory(eval_directory, exist_ok=False)

# Predictions produced by the previous step
pred_file = os.path.join(pred_directory, "predictions.json")
print("Using predictions from: ", pred_file)

# SQLite file of the eICU database the queries are executed against
db_path = "../model_evaluation/databases/eicu.sqlite"

Using predictions from:  ../model_predictions/eICU/o4-mini/trial_1/predictions.json


In [39]:
# Run the evaluation script as a shell command. IPython substitutes {pred_file},
# {db_path} and {eval_directory} with the values of the notebook variables set in the
# previous cell. --out_file receives the evaluation folder, not a file name.
# NOTE(review): the meaning of --num_workers -1, --timeout 60 and --ndigits 2 is defined
# by ehrsql_eval.py -- confirm there (presumably: all workers, 60 s per query, 2 decimals).
!python ../model_evaluation/ehrsql_eval.py \
    --pred_file {pred_file} \
    --db_path {db_path} \
    --num_workers -1 \
    --timeout 60 \
    --out_file {eval_directory} \
    --ndigits 2

Exception ignored in thread started by: <bound method Thread._bootstrap of <StoppableThread(Thread-10 (funcwrap), started daemon 140131235669568)>>
Traceback (most recent call last):
  File "/root/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/threading.py", line 1032, in _bootstrap
    self._bootstrap_inner()
  File "/root/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/threading.py", line 1079, in _bootstrap_inner
    self._delete()
  File "/root/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/threading.py", line 1111, in _delete
    del _active[get_ident()]
                ^^^^^^^^^^^
func_timeout.dafunc.FunctionTimedOut3372939899011937204: Function execute (args=("select min(vp.observationtime) as first_time_of_max_sao2\nfrom vitalperiodic vp\njoin patient p \n  on vp.patientunitstayid = p.patientunitstayid\nwhere p.uniquepid = '018-81471'\n  and vp.observationtime <= '2102-03-18'\n  and vp.sao2 = (\n    select m

# Interpret results

In [40]:
# Metrics file for this model/trial (presumably written by the evaluation step above)
fp = f"../model_predictions/eICU/{model_name}/trial_{trial_number}/evaluation/predictions_metrics.json"
print("Reading from file: ", fp)

# Load the metrics
with open(fp, "r") as metrics_file:
    metrics = json.load(metrics_file)

# Pretty-print them for inspection
print(json.dumps(metrics, indent=4))

Reading from file:  ../model_predictions/eICU/o4-mini/trial_1/evaluation/predictions_metrics.json
{
    "precision_ans": 100.0,
    "recall_ans": 70.51,
    "f1_ans": 82.7,
    "precision_exec": 43.89,
    "recall_exec": 30.95,
    "f1_exec": 36.3,
    "acc": 30.95
}
