In [45]:
import json
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'model_evaluation')))
from utils import postprocess_sql_query_from_markdown


In [46]:
fp = "/home/ubuntu/workspace/vrdc_text2sql/model_evaluation/dataset/test/test.jsonl"

# JSONL stores one JSON document per line, so the file is parsed line by line.
# The encoding is pinned to UTF-8 so the read does not depend on the machine's
# locale default; whitespace-only lines are skipped.
with open(fp, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Count number of records
print(f"Number of records: {len(data)}")

# Display first record as example (guarded so an empty file does not raise IndexError)
if data:
    print("\nFirst record:")
    print(json.dumps(data[0], indent=4))


Number of records: 1792

First record:
{
    "input": "tell me the method of dextrose 5% in water (d5w) iv : 1000 ml bag intake?",
    "output": "```sql\nselect distinct medication.routeadmin from medication where medication.drugname = 'dextrose 5% in water (d5w) iv : 1000 ml bag'\n```\n<extra_id_1></s>",
    "predict": "```sql\nselect distinct treatment.treatmentname from treatment where treatment.treatmentname = 'dextrose 5% in water (d5w) iv : 1000 ml bag'\n```",
    "real_result": "[\"('iv',)\"]",
    "pred_result": "[]",
    "sample_scores": {
        "precision_ans": 1,
        "precision_exec": 0,
        "recall_ans": 1,
        "recall_exec": 0,
        "accuracy": 0
    }
}


In [47]:
# Path to the SQL schema (DDL) file whose text is attached to every record
original = "/home/ubuntu/workspace/vrdc_text2sql/model_evaluation/dataset/metadata/eicu_instruct.sql"

# Read the DDL file (encoding pinned so the read does not depend on the locale default)
with open(original, 'r', encoding='utf-8') as f:
    ddl_content = f.read().strip()

# Fields carried over from the scored test file that are dropped from each record
scoring_fields = ('real_result', 'pred_result', 'sample_scores')

# Enrich and clean each record in place.
# NOTE: this mutates `data`, so re-running this cell feeds the already
# post-processed 'output' back through the helper; reload `data` first if
# the cell has to be run again.
for record in data:
    record['ddl'] = ddl_content
    # Replace the markdown-wrapped reference answer with the helper's post-processed SQL
    record['output'] = postprocess_sql_query_from_markdown(record['output'])
    for field in scoring_fields:
        # pop with a default so records that lack a field do not raise KeyError
        record.pop(field, None)

# Verify it worked - report record count and a preview of the DDL text
print(f"Successfully added DDL to {len(data)} records")
print(f"\nDDL field length: {len(ddl_content)} characters")
print("\nFirst 200 characters of DDL:")
print(ddl_content[:200] + "...")

# Show the keys of the first record to confirm DDL field is added
if data:
    print(f"\nKeys in first record: {list(data[0].keys())}")



Successfully added DDL to 1792 records

DDL field length: 7272 characters

First 200 characters of DDL:
DROP TABLE IF EXISTS patient;
CREATE TABLE patient    -- store patient demographics and admission information
(
    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system
    p...

Keys in first record: ['input', 'output', 'predict', 'ddl']


In [48]:
# Inspect the first record to confirm the 'ddl' field is present and the scoring fields are gone
data[0]

{'input': 'tell me the method of dextrose 5% in water (d5w) iv : 1000 ml bag intake?',
 'output': "select distinct medication.routeadmin from medication where medication.drugname = 'dextrose 5% in water (d5w) iv : 1000 ml bag';",
 'predict': "```sql\nselect distinct treatment.treatmentname from treatment where treatment.treatmentname = 'dextrose 5% in water (d5w) iv : 1000 ml bag'\n```",
 'ddl': 'DROP TABLE IF EXISTS patient;\nCREATE TABLE patient    -- store patient demographics and admission information\n(\n    uniquepid VARCHAR(10) NOT NULL, -- Unique patient identifier across the system\n    patienthealthsystemstayid INT NOT NULL, -- Unique ID for patient\'s entire hospital stay\n    patientunitstayid INT NOT NULL PRIMARY KEY, -- Unique ID for the patient\'s ICU stay\n    gender VARCHAR(25) NOT NULL, -- Gender of the patient ("female" or "male") (lowercase)\n    age VARCHAR(10) NOT NULL, -- Age at admission (can be in years or an age category)\n    ethnicity VARCHAR(50), -- Ethnici

In [49]:
# Sanity check: the number of records should be unchanged by the enrichment step
len(data)

1792

In [50]:
# Dump the enriched records into the test folder as one JSON array

output_dir = "/home/ubuntu/workspace/vrdc_text2sql/model_evaluation/dataset/test"
output_path = os.path.join(output_dir, "test_ehrsql_eicu_data_with_ddl_comments.json")

# The context manager guarantees the file is flushed and closed when the cell
# finishes, instead of relying on garbage collection of an anonymous handle.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f)