In [2]:
import json

def open_json(file_path):
    """Load the UTF-8 JSON document at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
    
def open_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Lines that are empty
    or contain only whitespace (for example a trailing blank line at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
    
def open_sql(file_path):
    """Return the entire contents of ``file_path`` as one UTF-8 decoded string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON with a 4-space indent.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). The file
    is opened in write mode, so any existing content is replaced.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, **dump_options)
        
def save_jsonl(data, file_path):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line.

    Non-ASCII characters are written as-is (``ensure_ascii=False``). The file
    is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )


In [9]:
# Load the prediction records (one JSON object per line) from the result file.
eicu_data = open_jsonl(
    "/localhome/local-hndo/hndo_eval/model_evaluation/model_predictions/hvnguyen/mistral-text2sql-v6/test_mistral-text2sql-instruct-benchmark-data-v6-ehrsql-eicu-test_result.jsonl"
)

# Text to look for; a record matches when this is a substring of its "question".
question = "what was patient 031-3355's last blood, venipuncture microbiology test time in this month?"

# Print the "pred" field of every matching record.
for sample in eicu_data:
    if question not in sample["question"]:
        continue
    print(sample["pred"])

```sql
select microlab.culturetakentime from microlab where microlab.patientunitstayid in ( select patient.patientunitstayid from patient where patient.uniquepid = '031-3355' ) and microlab.culturesite = 'blood, venipuncture' and datetime(microlab.culturetakentime,'start of month') = datetime('2123-12-31','start of month') order by microlab.culturetakentime desc limit 1
```


## error summary

In [8]:
# Evaluation-result records for each benchmark, keyed by dataset name.
data_logs = {
    dataset: open_jsonl(log_path)
    for dataset, log_path in (
        ("ehrsql-eicu", "outputs/hvnguyen/mistral-text2sql-v6/test_mistral-text2sql-instruct-benchmark-data-v6-ehrsql-eicu-test_result_evaluation_results.jsonl"),
        ("ehrsql-mimic-iii", "outputs/hvnguyen/mistral-text2sql-v6/test_mistral-text2sql-instruct-benchmark-data-v6-ehrsql-mimiciii-test_result_evaluation_results.jsonl"),
        ("mimicsql", "outputs/hvnguyen/mistral-text2sql-v6/test_mistral-text2sql-instruct-benchmark-data-v6-mimicsql-test_result_evaluation_results.jsonl"),
    )
}

# Number of records loaded for each dataset, one count per line.
print(
    *(len(data_logs[dataset]) for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")),
    sep="\n",
)

def count_error_sample(data):
    """Return how many samples in ``data`` have ``sample_scores["accuracy"] == 0``."""
    return sum(1 for record in data if record["sample_scores"]["accuracy"] == 0)

# Number of samples with accuracy 0 in each dataset, one count per line.
print(
    *(
        count_error_sample(data_logs[dataset])
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

1792
1786
1000
1059
1075
233


In [10]:
def count_range_error_sample(data):
    """Return how many samples have a non-None ``real`` value containing ``"_lower"``."""
    return sum(
        1
        for record in data
        if record["real"] is not None and "_lower" in record["real"]
    )

# Number of samples whose "real" value contains "_lower", one count per dataset.
print(
    *(
        count_range_error_sample(data_logs[dataset])
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

0
0
0


In [16]:
def count_timeout_sample(data):
    """Count accuracy-0 samples whose ``pred_result`` equals the timeout marker string."""
    # Exact string that a sample's "pred_result" is compared against.
    timeout_marker = "[\"('timeout_pred',)\"]"
    return sum(
        1
        for record in data
        if record["sample_scores"]["accuracy"] == 0
        and record["pred_result"] == timeout_marker
    )

# Timeout count per dataset, printed as "<dataset>:  <count>" on separate lines.
print(
    *(
        f"{dataset}:  {count_timeout_sample(data_logs[dataset])}"
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

ehrsql-eicu:  12
ehrsql-mimic-iii:  1
mimicsql:  0


In [17]:
def count_error_syntax_sample(data):
    """Count accuracy-0 samples whose ``pred_result`` equals the error marker string."""
    # Exact string that a sample's "pred_result" is compared against.
    error_marker = "[\"('error_pred',)\"]"
    matching = [
        record
        for record in data
        if record["sample_scores"]["accuracy"] == 0
        and record["pred_result"] == error_marker
    ]
    return len(matching)

# Error-marker count per dataset, printed as "<dataset>:  <count>" on separate lines.
print(
    *(
        f"{dataset}:  {count_error_syntax_sample(data_logs[dataset])}"
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

ehrsql-eicu:  200
ehrsql-mimic-iii:  338
mimicsql:  109


In [18]:
def count_empty_sample(data):
    """Count accuracy-0 samples whose ``pred_result`` is exactly the string ``"[]"``."""
    return sum(
        1
        for record in data
        if record["sample_scores"]["accuracy"] == 0 and record["pred_result"] == "[]"
    )

# Number of accuracy-0 samples with pred_result "[]", one count per dataset.
print(
    *(
        count_empty_sample(data_logs[dataset])
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

276
221
20


In [19]:
def count_none_sample(data):
    """Count accuracy-0 samples whose ``pred_result`` is exactly ``"['(None,)']"``."""
    # Exact string that a sample's "pred_result" is compared against.
    none_marker = "['(None,)']"
    tally = 0
    for record in data:
        if record["sample_scores"]["accuracy"] != 0:
            continue
        if record["pred_result"] == none_marker:
            tally += 1
    return tally

# Number of accuracy-0 samples with pred_result "['(None,)']", one count per dataset.
print(
    *(
        count_none_sample(data_logs[dataset])
        for dataset in ("ehrsql-eicu", "ehrsql-mimic-iii", "mimicsql")
    ),
    sep="\n",
)

50
47
4


## Find the differences between the old and new v2 evaluation code (they give different results)

In [3]:
# Evaluation-result records from the two result files being compared.
data_eicu_v2 = {
    version: open_jsonl(log_path)
    for version, log_path in (
        ("v2-old", "/localhome/local-hndo/hndo_eval/model_evaluation/outputs/hvnguyen/mistral-text2sql-v24/test_mistral-text2sql-instruct-benchmark-data-genval-512-16x4-ehrsql-eicu_result_evaluation_results.jsonl"),
        ("v2-new", "/localhome/local-hndo/hndo_eval/model_evaluation/outputs/hvnguyen/mistral-text2sql-v24/eval_v2/test_mistral-text2sql-instruct-benchmark-data-genval-512-16x4-ehrsql-eicu_result_evaluation_results.jsonl"),
    )
}

# Number of records in each file, one count per line.
print(*(len(data_eicu_v2[version]) for version in ("v2-old", "v2-new")), sep="\n")

def count_error_sample(data):
    """Return how many samples in ``data`` have ``sample_scores["accuracy"] == 0``."""
    failed = [record for record in data if record["sample_scores"]["accuracy"] == 0]
    return len(failed)

# Number of accuracy-0 samples in the old and the new result file, one per line.
print(
    *(count_error_sample(data_eicu_v2[version]) for version in ("v2-old", "v2-new")),
    sep="\n",
)


1792
1792
714
815


In [8]:
# Walk both result lists in parallel (records are paired by position) and
# print every pair that has the same "input" but a different accuracy score.
count = 0
for old, new in zip(data_eicu_v2["v2-old"], data_eicu_v2["v2-new"]):
    # Skip pairs whose inputs differ or whose accuracy scores agree.
    if old["input"] != new["input"] or old["sample_scores"]["accuracy"] == new["sample_scores"]["accuracy"]:
        continue

    count += 1

    # Record from the "v2-old" file.
    print("OLD - ACC:")
    print("REAL: ", old["real_result"])
    print("PRED: ", old["pred_result"])
    print(old["sample_scores"]["accuracy"])
    print("-" * 100)

    # Record from the "v2-new" file.
    print("NEW - Accuracy:")
    print("REAL: ", new["real_result"])
    print("PRED: ", new["pred_result"])
    print(new["sample_scores"]["accuracy"])
    print("=" * 100)

# Total number of pairs printed above.
print(count)

OLD - ACC:
REAL:  []
PRED:  []
1
----------------------------------------------------------------------------------------------------
NEW - Accuracy:
REAL:  ['(77.1,)']
PRED:  []
0
OLD - ACC:
REAL:  []
PRED:  []
1
----------------------------------------------------------------------------------------------------
NEW - Accuracy:
REAL:  ["('surgical drains - jackson-pratt',)"]
PRED:  []
0
OLD - ACC:
REAL:  []
PRED:  []
1
----------------------------------------------------------------------------------------------------
NEW - Accuracy:
REAL:  ["('lasix',)", "('oxycodone hcl 5 mg po tabs',)", "('sennosides-docusate sodium 8.6-50 mg po tabs',)"]
PRED:  []
0
OLD - ACC:
REAL:  []
PRED:  []
1
----------------------------------------------------------------------------------------------------
NEW - Accuracy:
REAL:  ["('potassium chloride',)"]
PRED:  []
0
OLD - ACC:
REAL:  []
PRED:  []
1
----------------------------------------------------------------------------------------------------
NEW - 

In [9]:
# Evaluation-result records from the two result files being compared.
data_mimic = {
    version: open_jsonl(log_path)
    for version, log_path in (
        ("v2", "/localhome/local-hndo/hndo_eval/model_evaluation/outputs/model_base/llama31-nemotron-nano-8b-v1/llama31-nemotron-nano-8b-v1_mimicsql_results_evaluation_results.jsonl"),
        ("v3", "/localhome/local-hndo/hndo_eval/model_evaluation/outputs/new-ddl-inst/llama31-nemotron-nano-8b-v1/Llama-3.1-Nemotron-Nano-8B-v1_mimicsql_result_thinking_on_evaluation_results.jsonl"),
    )
}

# Number of records in each file, one count per line.
print(*(len(data_mimic[version]) for version in ("v2", "v3")), sep="\n")

def count_error_sample(data):
    """Return how many samples in ``data`` have ``sample_scores["accuracy"] == 0``."""
    tally = 0
    for record in data:
        tally += 1 if record["sample_scores"]["accuracy"] == 0 else 0
    return tally

# Number of accuracy-0 samples in the v2 and the v3 result file, one per line.
print(
    *(count_error_sample(data_mimic[version]) for version in ("v2", "v3")),
    sep="\n",
)


1000
1000
406
698


In [15]:
# Walk both result lists in parallel (records are paired by position) and
# print every pair where the v3 "user_query" appears inside the v2 "input"
# but the two accuracy scores differ.
count = 0
for v2, v3 in zip(data_mimic["v2"], data_mimic["v3"]):
    # Skip pairs that do not refer to the same query or whose scores agree.
    if v3["user_query"] not in v2["input"] or v2["sample_scores"]["accuracy"] == v3["sample_scores"]["accuracy"]:
        continue

    count += 1

    # Record from the "v2" file.
    print("V2 - ACC:")
    print("SQL REAL: ", v2["output"])
    print("SQL PRED: ", v2["predict"])
    print("REAL: ", v2["real_result"])
    print("PRED: ", v2["pred_result"])
    print(v2["sample_scores"]["accuracy"])
    print("-" * 100)

    # Record from the "v3" file.
    print("V3 - Accuracy:")
    print("SQL REAL: ", v3["output"])
    print("SQL PRED: ", v3["predict"])
    print("REAL: ", v3["real_result"])
    print("PRED: ", v3["pred_result"])
    print(v3["sample_scores"]["accuracy"])
    print("=" * 100)
    print("=" * 100)

# Total number of pairs printed above.
print(count)

V2 - ACC:
SQL REAL:  ```sql
SELECT COUNT ( DISTINCT DEMOGRAPHIC."SUBJECT_ID" ) FROM DEMOGRAPHIC INNER JOIN DIAGNOSES on DEMOGRAPHIC.HADM_ID = DIAGNOSES.HADM_ID WHERE DIAGNOSES."ICD9_CODE" = "53190"
``` 
SQL PRED:  SELECT COUNT(*)
FROM DIAGNOSES
JOIN DEMOGRAPHIC ON DIAGNOSES.SUBJECT_ID = DEMOGRAPHIC.SUBJECT_ID
WHERE DIAGNOSES.ICD9_CODE = '53190';
REAL:  ['(1,)']
PRED:  ['(2,)']
0
----------------------------------------------------------------------------------------------------
V3 - Accuracy:
SQL REAL:  ```sql
SELECT COUNT ( DISTINCT DEMOGRAPHIC."SUBJECT_ID" ) FROM DEMOGRAPHIC INNER JOIN DIAGNOSES on DEMOGRAPHIC.HADM_ID = DIAGNOSES.HADM_ID WHERE DIAGNOSES."ICD9_CODE" = "53190"
``` 
SQL PRED:  SELECT COUNT(*)
FROM DIAGNOSES
WHERE ICD9_CODE = '53190';
REAL:  ['(1,)']
PRED:  ['(1,)']
1
V2 - ACC:
SQL REAL:  ```sql
SELECT COUNT ( DISTINCT DEMOGRAPHIC."SUBJECT_ID" ) FROM DEMOGRAPHIC INNER JOIN DIAGNOSES on DEMOGRAPHIC.HADM_ID = DIAGNOSES.HADM_ID WHERE DIAGNOSES."SHORT_TITLE" = "Chordae tendi

## Convert the mimic_all database into a new database whose string values are lower-cased

In [1]:
import sqlite3
import os

# Paths to the source database and to the lower-cased copy that will be created.
src_db = '/localhome/local-hndo/hndo_eval/model_evaluation/databases/mimic_all.db'
dst_db = '/localhome/local-hndo/hndo_eval/model_evaluation/databases/mimic_all_lower.db'


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'


def lowercase_copy_database(src_path, dst_path, batch_size=10000):
    """Copy every user table of a SQLite database, lower-casing all string values.

    Each table is re-created in ``dst_path`` from the ``CREATE TABLE`` statement
    stored in the source schema, then its rows are copied in batches. Every value
    that SQLite returns as a Python ``str`` is lower-cased, whatever the declared
    column type is (so ``VARCHAR``/``CHAR`` columns are handled as well as
    ``TEXT`` ones). All other values, including ``None``, are copied unchanged.

    Only tables are copied; indexes, views and triggers are not re-created.
    Internal tables whose names start with ``sqlite_`` are skipped because they
    cannot be created by user SQL.

    Args:
        src_path: Path of the existing source database.
        dst_path: Path of the database to create. An existing file is replaced.
        batch_size: Number of rows fetched and inserted per round trip.

    Raises:
        FileNotFoundError: If ``src_path`` does not exist. Without this check
            ``sqlite3.connect`` would silently create an empty source database.
        ValueError: If both paths point to the same file, which would otherwise
            delete the source before it is read.
    """
    if not os.path.isfile(src_path):
        raise FileNotFoundError(f"Source database not found: {src_path}")
    if os.path.abspath(src_path) == os.path.abspath(dst_path):
        raise ValueError("Source and destination databases must be different files")

    # Start from a clean destination file.
    if os.path.exists(dst_path):
        os.remove(dst_path)

    src_conn = sqlite3.connect(src_path)
    try:
        dst_conn = sqlite3.connect(dst_path)
        try:
            src_cur = src_conn.cursor()
            dst_cur = dst_conn.cursor()

            # Table names together with the statements that created them.
            src_cur.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
            tables = [
                (name, create_sql)
                for name, create_sql in src_cur.fetchall()
                if create_sql is not None and not name.startswith('sqlite_')
            ]

            for table, create_table_sql in tables:
                dst_cur.execute(create_table_sql)

                quoted_table = _quote_identifier(table)
                src_cur.execute(f"PRAGMA table_info({quoted_table});")
                # Index 1 of each table_info row is the column name.
                column_list = ', '.join(
                    _quote_identifier(col[1]) for col in src_cur.fetchall()
                )
                placeholders = ','.join('?' * (column_list.count(', ') + 1)) if False else None

                # Select the columns explicitly so that the values line up
                # with the column list used by the INSERT statement.
                src_cur.execute(f"SELECT {column_list} FROM {quoted_table};")
                placeholders = ','.join(['?'] * len(src_cur.description))
                insert_sql = (
                    f"INSERT INTO {quoted_table} ({column_list}) VALUES ({placeholders})"
                )

                # Copy in batches so a large table is never fully held in memory.
                while True:
                    rows = src_cur.fetchmany(batch_size)
                    if not rows:
                        break
                    dst_cur.executemany(
                        insert_sql,
                        [
                            tuple(
                                value.lower() if isinstance(value, str) else value
                                for value in row
                            )
                            for row in rows
                        ],
                    )
                dst_conn.commit()
        finally:
            dst_conn.close()
    finally:
        src_conn.close()


lowercase_copy_database(src_db, dst_db)

print("All string values have been converted to lowercase and saved to mimic_all_lower.db")

All string values have been converted to lowercase and saved to mimic_all_lower.db
