# Benchmark eICU using finetuned mistral nemo minitron 8B

In [1]:
import os
from openai import OpenAI
from loguru import logger
import sys
# Add the sibling ../model_evaluation directory (resolved to an absolute path) to the
# module search path. Presumably this is where the `utils` module imported by later
# cells lives — confirm against the repository layout.
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))


# Set up client

In [4]:
import asyncio
from openai import AsyncOpenAI
import json
from loguru import logger

import os

# vLLM server connection details. The defaults match the local setup used for this
# benchmark; each value can be overridden with an environment variable so the notebook
# runs unchanged on another machine.
IP = os.environ.get("VLLM_HOST", "localhost")
PORT = int(os.environ.get("VLLM_PORT", "8000"))
BASE_URL = f"http://{IP}:{PORT}/v1"
# Model identifier sent as the `model` field of every chat-completion request.
MODEL_PATH = os.environ.get(
    "VLLM_MODEL_PATH",
    "/home/ubuntu/workspace/mistral-nemo-minitron-8b-instruct-healthcare-text2sql_vV2.8",
)

# Initialize the AsyncOpenAI client pointed at the server above.
# The API key is a dummy value; presumably the local server does not check it.
async_client = AsyncOpenAI(
    api_key="not-needed",
    base_url=BASE_URL,
)

In [6]:
from utils import postprocess_sql_query_from_markdown

async def get_prediction(record: dict) -> dict:
    """
    Generate a SQL query for one test record asynchronously.

    Args:
        record: Test-set entry. The keys read here are ``index``, ``ddl``,
            ``instructions``, ``user_query`` and ``output`` (ground-truth SQL).

    Returns:
        A dict with the keys ``index``, ``input`` (the user question),
        ``predict`` (the post-processed model output) and ``real`` (the
        ground-truth SQL). If the request or the post-processing raises, the
        same four keys are returned with ``predict`` set to
        ``"Error: <message>"``, so callers that index into the result
        (e.g. ``pred['index']``) keep working instead of failing on a bare string.

    Raises:
        KeyError: If ``record`` lacks one of the keys listed above.
    """
    index = record['index']
    ddl = record['ddl']
    instruction = record['instructions']
    question = record['user_query']
    output = record['output']  # ground truth SQL statement

    # System message carries the schema and instructions; the user message is the question.
    prompt_chat_template = [
        {
            "role": "system",
            "content": f"Based on DDL statements, instructions, generate a SQLite query to answer the user's question.\n\nDDL statements:\n{ddl}\nInstructions:\n{instruction}",
        },
        {
            "role": "user",
            "content": f"{question}"
        }
    ]

    try:
        response = await async_client.chat.completions.create(
            model=MODEL_PATH,
            messages=prompt_chat_template,
            temperature=0.0,
            max_tokens=512,
            stop=["<extra_id_1>"]
        )
        generated_sql = response.choices[0].message.content
        prediction = postprocess_sql_query_from_markdown(generated_sql)
        logger.info(f"Generated SQL for index {index}: {prediction}")
    except Exception as e:
        # Best effort: one failed request must not abort the whole benchmark run.
        # The error text is stored as the prediction so the failure stays visible
        # in the saved results.
        logger.error(f"An error occurred for index {index}: {e}")
        prediction = f"Error: {e}"

    return {
        "index": index,
        "input": question,  # user query
        "predict": prediction,  # model predicted SQL (or error text)
        "real": output  # ground truth SQL
    }

In [7]:
async def run_predictions_async(test_fp: str, output_fp: str):
    """
    Load a test set, predict SQL for every record concurrently, and save the results.

    Args:
        test_fp: Path to a JSON file containing a list of test records; each
            record is passed unchanged to ``get_prediction``.
        output_fp: Path of the JSON file to write. Missing parent directories
            are created.

    The output file is a JSON list with one object per test record, in the
    order of the test file, each holding the keys ``index``, ``input``,
    ``predict`` and ``real``. If ``get_prediction`` yields something other than
    a dict for a record (e.g. an error string), the entry is rebuilt from the
    source record and the stringified result is stored under ``predict``, so a
    single failure does not discard the whole run.
    """
    import os  # local import keeps this cell independent of earlier cells

    with open(test_fp, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    tasks = [get_prediction(record) for record in test_data]

    logger.info(f"Sending {len(tasks)} requests to the vLLM server...")
    # gather() returns results in the order of `tasks`, i.e. aligned with `test_data`.
    predictions = await asyncio.gather(*tasks)
    logger.success("All predictions done.")

    results = []
    num_failed = 0
    for record, pred in zip(test_data, predictions):
        if isinstance(pred, dict):
            results.append({
                "index": pred['index'],
                "input": pred['input'],
                "predict": pred['predict'],
                "real": pred['real']
            })
        else:
            # Failed request: keep a placeholder row so the output stays aligned
            # with the test set and the failure is visible in the saved file.
            num_failed += 1
            results.append({
                "index": record.get('index'),
                "input": record.get('user_query'),
                "predict": str(pred),
                "real": record.get('output')
            })

    if num_failed:
        logger.warning(f"{num_failed} of {len(results)} predictions failed.")

    # Make sure the destination folder exists before writing.
    output_dir = os.path.dirname(output_fp)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_fp, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    logger.success(f"Predictions saved to {output_fp}")

# Generate queries

In [12]:
from utils import preprare_directory
from time import time

# model name 
model_name = "mistral_nemo_minitron_8B_finetuned"

# trial number 
trial_number = 1    

# create output directory. set exist_ok=False to delete the old results and re-create a fresh folder
output_dir = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"    
preprare_directory(output_dir, exist_ok=False)

# input test set directory
test_data_dir = "../model_evaluation/dataset/test"
test_data_fp = os.path.join(test_data_dir, "test_ehrsql_eicu_data.json")


# run predictions
start_time = time()
# run predictions
await run_predictions_async(
    test_fp = test_data_fp,
    output_fp = os.path.join(output_dir, "predictions.json")
)
end_time = time()
logger.success(f"Model {model_name}. Trial {trial_number} completed. Time taken: {end_time - start_time:.2f} seconds")

[32m2025-07-27 04:55:28.285[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_predictions_async[0m:[36m11[0m - [1mSending 1792 requests to the vLLM server...[0m


[32m2025-07-27 04:56:36.148[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m34[0m - [1mGenerated SQL for index 2: select distinct medication.routeadmin from medication where medication.drugname = 'clonidine';[0m
[32m2025-07-27 04:56:36.386[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m34[0m - [1mGenerated SQL for index 5: select distinct medication.routeadmin from medication where medication.drugname = 'zolpidem tartrate 5 mg po tabs';[0m
[32m2025-07-27 04:56:36.861[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m34[0m - [1mGenerated SQL for index 6: select distinct treatment.treatmentname from treatment where treatment.treatmentname = 'vancomycin inj 1,000 mg vial';[0m
[32m2025-07-27 04:56:36.862[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_prediction[0m:[36m34[0m - [1mGenerated SQL for index 3: select distinct medication.routeadmin from medication where medication.drugname = 'morphine 2 

**Trial 0**: 
- NVIDIA L40S (48GiB), 1 GPU x 8 CPUs | 147GiB
- vLLM setting: `--max-num-seqs 4`
- 1xL40S GPU utilization: 97%
- Time taken: 1343.81 seconds

**Trial 1**: 
- Settings same as above
- Time taken: not recorded (TODO: fill in)


**Average**
- Avg Time Taken: 1334.81 seconds


# Evaluate SQL query

To evaluate the generated SQL, we execute both the ground-truth SQL statement and the predicted SQL statement against the SQLite database and compare their results.

In [15]:
import os
from utils import preprare_directory

# Folder holding this trial's predictions. It is built from the same `model_name` and
# `trial_number` used by the generation cell, so evaluation always targets that run
# instead of a separately hard-coded model folder.
pred_directory = f"../model_predictions/eICU/{model_name}/trial_{trial_number}"

# create output directory for evaluation results, relative to the path of model_evaluation directory
# note that the evaluate results need a clean new folder, because it will overwrite any existing files in the folder
eval_directory = os.path.join(pred_directory, "evaluation")
preprare_directory(eval_directory, exist_ok=False)

# the predicted file from previous step
pred_file = os.path.join(pred_directory, "predictions.json")

print("Using predictions from: ", pred_file)

# path to the eICU database
db_path = "../model_evaluation/databases/eicu.sqlite"


Using predictions from:  ../model_predictions/eICU/mistral_nemo_minitron_8B_finetuned/trial_1/predictions.json


In [16]:
# Run the evaluation script as a shell command. IPython substitutes {pred_file},
# {db_path} and {eval_directory} with the values of the Python variables of the same name.
# NOTE(review): `--num_workers -1` presumably means "use all available workers" and
# `--timeout 60` a per-query limit in seconds — confirm against ehrsql_eval.py.
!python ../model_evaluation/ehrsql_eval.py \
    --pred_file {pred_file} \
    --db_path {db_path} \
    --num_workers -1 \
    --timeout 60 \
    --out_file {eval_directory} \
    --ndigits 2

# Interpret results

In [17]:
# Location of the metrics file for the current trial.
fp = f"../model_predictions/eICU/mistral_nemo_minitron_8B_finetuned/trial_{trial_number}/evaluation/predictions_metrics.json"

# Parse the metrics JSON, then pretty-print it with a 4-space indent.
with open(fp, "r") as metrics_file:
    metrics = json.load(metrics_file)

pretty_metrics = json.dumps(metrics, indent=4)
print(pretty_metrics)

{
    "precision_ans": 100.0,
    "recall_ans": 96.65,
    "f1_ans": 98.3,
    "precision_exec": 36.26,
    "recall_exec": 35.04,
    "f1_exec": 35.64,
    "acc": 35.04
}


**Results**: 

**Trial 0**: 
```json
{
    "precision_ans": 100.0,
    "recall_ans": 96.71,
    "f1_ans": 98.33,
    "precision_exec": 36.81,
    "recall_exec": 35.6,
    "f1_exec": 36.2,
    "acc": 35.6
}
```

**Trial 1**: 

```json
{
    "precision_ans": 100.0,
    "recall_ans": 96.65,
    "f1_ans": 98.3,
    "precision_exec": 36.26,
    "recall_exec": 35.04,
    "f1_exec": 35.64,
    "acc": 35.04
}
```

In [19]:
# Mean of the `acc` metric over the two trials reported above.
import numpy as np
trial_accuracies = [35.6, 35.04]  # trial 0, trial 1
average_accuracy = np.mean(trial_accuracies)
print("Average accuracy: ", average_accuracy, "%")

Average accuracy:  35.32 %


# Notes

- When the DDL statements do not include comments, the average accuracy over the two trials is 35.32%:
    ```python
    print("Average accuracy: ", np.mean([35.6, 35.04]), "%")
    ```