In [1]:
import json
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))
from utils import postprocess_sql_query_from_markdown


# Curating the test_ehrsql_eicu_data_benchmark dataset

In [2]:
fp = "/home/ubuntu/workspace/vrdc_text2sql/model_evaluation/dataset/test/test_mistral-nemo-minitron-8b-instruct-dv8-pv2-24x4-ehrsql-eicu_result.jsonl"

# The file is JSON Lines (one JSON object per line), so it is parsed line by
# line rather than with a single json.load call. The encoding is given
# explicitly so the result does not depend on the machine's locale default.
with open(fp, 'r', encoding='utf-8') as f:
    # Blank lines are skipped; every other line must be a complete JSON value.
    data = [json.loads(line) for line in f if line.strip()]

# Count number of records
print(f"Number of records: {len(data)}")

# Display first record as example
if data:
    print("\nFirst record:")
    print(json.dumps(data[0], indent=4))


Number of records: 1792

First record:
{
    "input": "<extra_id_0>System\nBased on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- Ethnicity

In [3]:
import re

def extract_system_message(input_text):
    """
    Pull the system prompt out of a serialized conversation string.

    The system prompt is whatever sits between the ``<extra_id_0>System\\n``
    marker and the first following ``<extra_id_1>User\\n`` marker.

    Args:
        input_text (str): Serialized conversation containing the markers

    Returns:
        str: The system prompt with surrounding whitespace removed, or an
            empty string when the markers are not found
    """
    # DOTALL lets the non-greedy capture run across newlines in the prompt.
    found = re.search(
        r'<extra_id_0>System\n(.*?)<extra_id_1>User\n',
        input_text,
        flags=re.DOTALL,
    )
    if found is None:
        return ""
    return found.group(1).strip()

In [4]:
def extract_user_message(input_text):
    """
    Pull the user turn out of a serialized conversation string.

    The user turn is whatever sits between the ``<extra_id_1>User\\n`` marker
    and the first following ``<extra_id_1>Assistant\\n`` marker.

    Args:
        input_text (str): Serialized conversation containing the markers

    Returns:
        str: The user turn with surrounding whitespace removed, or an empty
            string when the markers are not found
    """
    # DOTALL lets the non-greedy capture run across newlines in the question.
    found = re.search(
        r'<extra_id_1>User\n(.*?)<extra_id_1>Assistant\n',
        input_text,
        flags=re.DOTALL,
    )
    if found is None:
        return ""
    return found.group(1).strip()


In [5]:
def extract_messages(input_text):
    """
    Split a serialized conversation into its system and user parts.

    Args:
        input_text (str): Serialized conversation containing system and user messages

    Returns:
        dict: ``{'system': ..., 'user': ...}`` holding the results of
            extract_system_message and extract_user_message respectively
    """
    system_part = extract_system_message(input_text)
    user_part = extract_user_message(input_text)
    return dict(system=system_part, user=user_part)


In [6]:
data_cleaned = []
# Split each raw record into its system prompt and user question, and pair
# them with the post-processed 'output' field. Records are numbered from 1.
for index, record in enumerate(data, start=1):
    # The prompt is read straight from the record instead of being bound to a
    # variable named `input`, which would shadow the built-in for the rest of
    # the kernel session.
    messages = extract_messages(record['input'])

    data_cleaned.append({
        'index': index,
        'system': messages['system'],
        'user': messages['user'],
        'real': postprocess_sql_query_from_markdown(record['output']),  # ground truth
    })


In [7]:
# Sanity check: show the first curated record as indented JSON.
print(json.dumps(data_cleaned[0], indent=4))

{
    "index": 1,
    "system": "Based on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- Ethnicity of the patient (e.g: \"caucasian\", \"nat

In [8]:
# Sanity check: show the second curated record as indented JSON.
print(json.dumps(data_cleaned[1], indent=4))

{
    "index": 2,
    "system": "Based on DDL statements, instructions, and the current date, generate a SQL query in the following sqlite to answer the question.\n If the question cannot be answered using the available tables and columns in the DDL (i.e., it is out of scope), return only: None.\nToday is 2105-12-31 23:59:00\nDDL statements:\nDROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- unique identifier for a single ICU stay of a patient.\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient (\"female\" or \"male\") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- Ethnicity of the patient (e.g: \"caucasian\", \"nat

In [9]:
# Dump the curated records into the test folder.

output_dir = "/home/ubuntu/workspace/vrdc_text2sql/model_evaluation/dataset/test"

# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes, instead of leaving an anonymous open handle behind.
with open(os.path.join(output_dir, "test_ehrsql_eicu_data_benchmark.json"), "w") as out_file:
    json.dump(data_cleaned, out_file)

# DDL chunking

In [7]:
import json
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))
from dotenv import load_dotenv
load_dotenv()
from utils.experimental import FAISSRetriever, split_sql_blocks

# Path of the SQL file holding the DDL to be chunked (relative to the
# notebook's working directory).
sql_file = "../model_evaluation/dataset/metadata/eicu_instruct_benchmark_rag.sql"

# Split the SQL file into blocks
# NOTE(review): the sample output suggests one block per table (DROP + CREATE)
# — confirm against split_sql_blocks.
blocks = split_sql_blocks(sql_file)

# Print information about the blocks
print(f"Found {len(blocks)} SQL blocks:\n")

# Inspect a single block; index 2 requires at least three blocks to exist.
print("example block:")
print(blocks[2])

Found 10 SQL blocks:

example block:
DROP TABLE IF EXISTS treatment;
CREATE TABLE treatment  -- store treatments administered during ICU stay
(
    treatmentid INT NOT NULL PRIMARY KEY, -- Unique treatment record ID
    patientunitstayid INT NOT NULL, -- ICU stay ID (FK to patient)
    treatmentname VARCHAR(200) NOT NULL, -- Name of the treatment administered (lowercase)
    treatmenttime TIMESTAMP(0) NOT NULL, -- Time the treatment was given
    FOREIGN KEY(patientunitstayid) REFERENCES patient(patientunitstayid)
);


In [14]:
# Create a retriever. The API key is read from the LLM_GATEWAY_API environment
# variable; os.getenv returns None when it is unset.
# NOTE(review): `ddl_retreiver` is a misspelling of "retriever"; later cells
# use this exact name, so any rename must be applied to all of them together.
ddl_retreiver = FAISSRetriever(
    api_key=os.getenv("LLM_GATEWAY_API"),
    api_version="2025-04-01-preview",
    azure_endpoint = "https://prod.api.nvidia.com/llm/v1/azure",
    model = "text-embedding-3-large", 
)

# Specify the cache file for the DDL vector database. Use None to re-generate the embeddings without saving.
# NOTE(review): this absolute path is under /root while the other absolute
# paths in this notebook are under /home/ubuntu — confirm it is intended.
ddl_embedding_cache_file = "/root/workspace/vrdc_text2sql/model_evaluation/dataset/train_eval/eicu/ddl_database_openai_text-embedding-3-large.pkl"
# Embed the DDL blocks produced by split_sql_blocks, passing the cache file above.
ddl_retreiver.embed_blocks(
    text_blocks=blocks,
    cache_file=ddl_embedding_cache_file,
)

[32m2025-07-31 03:23:38.594[0m | [1mINFO    [0m | [36mutils.experimental[0m:[36membed_blocks[0m:[36m110[0m - [1mLoading cached embeddings from /root/workspace/vrdc_text2sql/model_evaluation/dataset/train_eval/eicu/ddl_database_openai_text-embedding-3-large.pkl[0m


FAISS index created with 10 entries


In [15]:
# Try retrieving relevant blocks from the DDL vector database: ask for the
# top 5 matches for a sample natural-language question and print them.
retrieved_blocks = ddl_retreiver.retrieve(
    query = "What is the age of patient Camole Yu",
    top_k=5
)
print(retrieved_blocks)

Generating query embedding...
[{'block_id': 0, 'content': 'DROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- Unique ID for patient\'s entire hospital stay\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient\'s ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient ("female" or "male") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- Ethnicity of the patient (e.g: "caucasian", "native american", "hispanic", "african american", "other/unknown", "asian" or null) (lowercase)\n    hospitalid INT NOT NULL, -- ID of the hospital\n    wardid INT NOT NULL, -- ID of the hospital ward/unit\n    admissionheight NUMERIC(10,2), -- Patient\'s height on admission (in cm)\n    admissionweigh

In [16]:
# Stitch the 'content' of every retrieved block back together, newline-separated,
# in the order the retriever returned them.
reconstructed_ddl = "\n".join(entry['content'] for entry in retrieved_blocks)
print(reconstructed_ddl)

DROP TABLE IF EXISTS patient;
CREATE TABLE patient    -- store patient demographics and admission information
(
    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system
    patienthealthsystemstayid INT NOT NULL, -- Unique ID for patient's entire hospital stay
    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient's ICU stay
    gender VARCHAR(25) NOT NULL, -- Gender of the patient ("female" or "male") (lowercase)
    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)
    ethnicity VARCHAR(50), -- Ethnicity of the patient (e.g: "caucasian", "native american", "hispanic", "african american", "other/unknown", "asian" or null) (lowercase)
    hospitalid INT NOT NULL, -- ID of the hospital
    wardid INT NOT NULL, -- ID of the hospital ward/unit
    admissionheight NUMERIC(10,2), -- Patient's height on admission (in cm)
    admissionweight NUMERIC(10,2), -- Weight on admission (in kg)
    dischargeweight NUMERI

In [26]:
from dotenv import load_dotenv
from openai import AsyncOpenAI, OpenAI
import os
# Load environment variables via python-dotenv before reading them below.
load_dotenv()

# Display the BEDROCK_OPENAI_BASE_URL value; raises KeyError if it is unset.
# NOTE(review): this echoes the endpoint URL into the saved notebook output —
# consider clearing that output before sharing or committing the notebook.
os.environ['BEDROCK_OPENAI_BASE_URL']

'http://xiyu-B-Proxy-UFJeMXebNeDR-1665117153.us-east-1.elb.amazonaws.com/api/v1'

In [30]:
from dotenv import load_dotenv
from openai import AsyncOpenAI, OpenAI
import os
load_dotenv()

# Client for the endpoint named by BEDROCK_OPENAI_BASE_URL. Both the key and
# the base URL are read with os.environ[...], so a missing variable raises
# KeyError here.
client = OpenAI(
    api_key=os.environ['BEDROCK_OPENAI_API_KEY'],
    base_url=os.environ['BEDROCK_OPENAI_BASE_URL']
)

def test_claude(
    prompt="What is the age of patient Camole Yu",
    model="us.anthropic.claude-sonnet-4-20250514-v1:0",
    max_completion_tokens=1024*2,
):
    """
    Send a single-turn chat request through the module-level `client` as a smoke test.

    Prints the text of the first choice followed by the total token count,
    and returns the response so that callers assigning the result
    (``response = test_claude()``) receive it instead of None.

    Args:
        prompt (str): User message to send; defaults to the original sample question
        model (str): Model identifier passed to the endpoint
        max_completion_tokens (int): Upper bound on generated tokens

    Returns:
        The response object returned by ``client.chat.completions.create``
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_completion_tokens=max_completion_tokens,
    )
    print(response.choices[0].message.content)
    print(response.usage.total_tokens)
    return response

In [31]:
# Run the smoke test; test_claude prints the reply text and the total token count.
response = test_claude()

I don't have access to any patient records or medical information for someone named Camole Yu. To obtain a patient's age or any other medical information, you would need to:

1. Check the patient's medical records or chart if you're an authorized healthcare provider
2. Ask the patient directly
3. Access the information through proper medical databases if you have appropriate authorization

If you're a healthcare professional needing this information for patient care, please consult your facility's medical records system or electronic health records (EHR). Patient information should only be accessed by authorized personnel for legitimate medical purposes in accordance with HIPAA and other privacy regulations.
154
