# Text2SQL Combined with RAG

In [1]:
import os
import sys

# Make the sibling `model_evaluation` directory importable so that its
# `utils` module can be found. The path is resolved against the current
# working directory at the time this cell runs.
sys.path.append(
    os.path.abspath(
        os.path.join('..', 'model_evaluation')
    )
)

from utils import preprare_directory
from dotenv import load_dotenv

# load_dotenv() comes from python-dotenv (presumably it loads variables from a
# .env file -- confirm against that package). It is kept as the cell's last
# expression, so its return value is what the notebook displays.
load_dotenv()


True

# Workflow Diagram

**Features**:
- By default, the whole DDL is inserted directly into the prompt, though it can also be chunked
- RAG is performed only on Q&A pairs from the training dataset
- Retrieves the top 3 results by default
- No documentation RAG support
- Async execution, an advantage over vanna, which doesn't support async. See [this discussion](https://github.com/vanna-ai/vanna/discussions/394)
- Using Faiss-CPU for now
- Embedding models can be swapped for different ones

# Q&A Pair database

The train and validation splits of eICU are used as the vector database.

In [25]:
import pandas as pd

# CSV with the eICU train + validation Q&A pairs used as the vector database.
fp = "/root/workspace/vrdc_text2sql/model_evaluation/dataset/train_eval/eicu/train_val.csv"
df = pd.read_csv(fp)

# The row count of the frame is reported as the number of Q&A pairs.
print("Number of Q&A pairs in the vector database: ", df.shape[0])

Number of Q&A pairs in the vector database:  10387


# Steps 

# Evaluation

In [8]:


# Directory holding this run's predictions. This is an absolute path, so it does
# not depend on the notebook's working directory.
pred_directory = "/root/workspace/vrdc_text2sql/model_predictions/eICU/rag/mistral_finetuned_nv-embedqa_ddl5_qa6"

# Evaluation results go into an "evaluation" sub-folder of the prediction directory.
# Note that the evaluation needs a clean new folder, because it will overwrite any
# existing files in the folder. exist_ok=False is passed to the helper, presumably
# so that an already-existing folder is rejected rather than reused -- confirm
# against utils.preprare_directory.
eval_directory = os.path.join(pred_directory, "evaluation")
preprare_directory(eval_directory, exist_ok=False)

# the predicted file from the previous step (built with os.path.join, like eval_directory)
pred_file = os.path.join(pred_directory, "test_rag_vllm_ehrsql_eicu_result_mis_embedd.jsonl")

print("Using predictions from: ", pred_file)

# path to the eICU database
db_path = "/root/workspace/vrdc_text2sql/model_evaluation/databases/eicu.sqlite"

Using predictions from:  /root/workspace/vrdc_text2sql/model_predictions/eICU/rag/mistral_finetuned_nv-embedqa_ddl5_qa6/test_rag_vllm_ehrsql_eicu_result_mis_embedd.jsonl


In [9]:
# Run the evaluation script as a shell command. IPython substitutes the Python
# variables {pred_file}, {db_path} and {eval_directory} into the command line
# before executing it, so the cell that defines them must have been run first.
# NOTE(review): the flag meanings below are inferred from their names -- confirm
# in ehrsql_eval.py:
#   --num_workers -1  presumably "use all available workers"
#   --timeout 60      presumably a time limit in seconds
#   --ndigits 2       presumably the number of decimals in the reported metrics
#   --out_file        receives eval_directory, i.e. a directory path, not a file path
!python ../model_evaluation/ehrsql_eval.py \
    --pred_file {pred_file} \
    --db_path {db_path} \
    --num_workers -1 \
    --timeout 60 \
    --out_file {eval_directory} \
    --ndigits 2

In [10]:
import json

# File path to the evaluation result file (presumably written by the evaluation
# script -- confirm its naming scheme). It lives in eval_directory, so the path is
# built from that variable instead of re-spelling "<pred_directory>/evaluation".
fp = os.path.join(eval_directory, "test_rag_vllm_ehrsql_eicu_result_mis_embedd_metrics.json")
print("Reading from file: ", fp)

# Pass the encoding explicitly so reading the JSON file does not depend on the
# platform's locale default.
with open(fp, "r", encoding="utf-8") as f:
    metrics = json.load(f)

# Pretty-print the metrics dictionary.
print(json.dumps(metrics, indent=4))

Reading from file:  /root/workspace/vrdc_text2sql/model_predictions/eICU/rag/mistral_finetuned_nv-embedqa_ddl5_qa6/evaluation/test_rag_vllm_ehrsql_eicu_result_mis_embedd_metrics.json
{
    "precision_ans": 100.0,
    "recall_ans": 100.0,
    "f1_ans": 100.0,
    "precision_exec": 82.42,
    "recall_exec": 82.42,
    "f1_exec": 82.42,
    "acc": 82.42
}
