## Download evaluation dataset

In [None]:
!pip install wget

In [None]:
import wget

# Archive of the evaluation dataset to download.
url = "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip"
# Download into the current working directory; the following cell expects the
# archive to be available there as ./dev.zip.
# NOTE(review): wget.download presumably returns the local file name - confirm.
downloaded_file = wget.download(url)

In [None]:
!cp dev.zip ./data/tabular/dev.zip
!rm -r dev.zip

In [None]:
import zipfile

# Unpack the downloaded archive next to it, inside ./data/tabular/.
with zipfile.ZipFile(file="./data/tabular/dev.zip", mode="r") as archive:
    archive.extractall(path="./data/tabular/")

In [None]:
# The archive has been extracted in the previous cell, so the zip itself is no longer needed.
!rm -r ./data/tabular/dev.zip

In [None]:
# Unpack the nested databases archive into ./data/tabular/.
with zipfile.ZipFile(file="./data/tabular/dev_20240627/dev_databases.zip", mode="r") as archive:
    archive.extractall(path="./data/tabular/")

In [None]:
# dev_databases.zip has been extracted above; remove the dev_20240627 folder recursively.
!rm -r ./data/tabular/dev_20240627/

In [None]:
import os
import sqlite3
from contextlib import closing

import pandas as pd

# Path to the SQLite database file extracted from dev_databases.zip
db_path = "./data/tabular/dev_databases/financial/financial.sqlite"
# Directory where one CSV file per table is written
output_dir = "./data/tabular/csv_tables"
os.makedirs(output_dir, exist_ok=True)

# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would make the export "succeed" with zero tables. Fail loudly instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# closing() guarantees the connection is released even if an export fails midway
with closing(sqlite3.connect(db_path)) as conn:
    cursor = conn.cursor()
    # Fetch all table names
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    # Export each table to CSV
    for table_name_tuple in tables:
        table_name = table_name_tuple[0]
        print(f"Exporting table: {table_name}")
        # Quote the table name to handle reserved keywords; embedded double
        # quotes are doubled so the identifier stays well-formed.
        quoted_name = table_name.replace('"', '""')
        df = pd.read_sql_query(f'SELECT * FROM "{quoted_name}"', conn)
        csv_path = os.path.join(output_dir, f"{table_name}.csv")
        df.to_csv(csv_path, index=False)
print("All tables exported successfully.")

In [None]:
# Keep a copy of the financial database directly under ./data/tabular/;
# the ground-truth cell further down opens it from this location.
!cp ./data/tabular/dev_databases/financial/financial.sqlite ./data/tabular/financial.sqlite

In [None]:
!rm -r ./data/tabular/dev_databases
!rm -r ./data/tabular/__MACOSX/*
!rm -r ./data/tabular/__MACOSX

## Test Tabular route within Redbox

Now that the CSV files have been exported, upload them into Redbox via the UI and run the questions from financial_dataset_original.json. Save the generated SQL statements into financial_dataset_results.json.
Do not upload the trans table, as it is too big (3 million rows). We will only test questions that do not involve querying this table.

Once finished, delete the csv files

In [None]:
# Delete the exported CSV files (the whole csv_tables folder) once they have been uploaded.
!rm -r ./data/tabular/csv_tables

## Compare results against ground truth

Read evaluation dataset

In [None]:
import json

# Load the evaluation records. The encoding is set explicitly because JSON
# files are UTF-8, while open() otherwise falls back to the platform locale.
with open("./data/tabular/financial_dataset_results.json", encoding="utf-8") as results_file:
    eval_data = json.load(results_file)

Select a record from the dataset

In [None]:
# Zero-based position of the record to inspect; change it to check another question.
RECORD_INDEX = 29
row = eval_data[RECORD_INDEX]

In [None]:
# Display the selected record in full.
row

In [None]:
# Display the SQL statement stored under the "SQL" key of the selected record;
# it is executed against financial.sqlite below to obtain the ground-truth answer.
row["SQL"]

Check Ground truth answer

In [None]:
import sqlite3

# Run the reference SQL of the selected record against the evaluation database.
db_path = "./data/tabular/financial.sqlite"
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute(row["SQL"])
    results = cursor.fetchall()
finally:
    # Always release the connection, even when the query raises
    conn.close()
results

Download the generated SQLite database from the Docker container. For this, you need to change processes.py to disable the deletion of the database file after each query: comment out the line `state = delete_db_file_if_exists(state)`.
- Get the name of the local db file generated by tabular route: 
1. docker exec -it redbox-django-app-1 bash

2. find . -name "*.db"
- Download db file from the docker container to your local host

3. docker cp redbox-django-app-1:/usr/src/app/<name_local_db_file>.db .

Check the answer when no evidence (external knowledge) is supplied. In this case, the prompt is the question

In [None]:
# Open the database generated by the tabular route.
# Replace the file name below with the name of your local db file.
db_path = "./data/tabular/generated_db_a2df5245-db22-4872-911c-6564340f9027.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Run the SQL produced when the prompt is the question alone (no evidence).
# The connection stays open here: the following cell reuses `cursor` and closes `conn`.
results = cursor.execute(row["SQL_redbox_without_evidence"]).fetchall()
results

Check the answer when evidence (external knowledge) is supplied. In this case, the prompt is the question + evidence

In [None]:
# Check the results of the SQL query when evidence is added to the prompt
# (in this case, the prompt is the question + evidence).
# Reuses the cursor opened in the previous cell. The generated SQL may be
# invalid, so the connection is closed in `finally` rather than only on success.
try:
    cursor.execute(row["SQL_redbox_with_evidence"])
    results = cursor.fetchall()
finally:
    conn.close()
results

Then record the accuracy in financial_dataset_results.json.
- `is_accurate_with_evidence` / `is_accurate_without_evidence` is 0 if the result from Redbox does not match the ground truth, otherwise it is 1

## Calculate performance metrics

In [None]:
# Accuracy flags of the records that carry evidence (non-empty "evidence" field).
# A comprehension with its own variable is used so that the record previously
# selected in `row` is not overwritten by a loop variable.
accuracy_with_evidence_list = [
    record["is_accurate_with_evidence"] for record in eval_data if record["evidence"] != ""
]

In [None]:
# Accuracy flags of all records for the question-only prompt.
# A comprehension with its own variable is used so that the record previously
# selected in `row` is not overwritten by a loop variable.
accuracy_without_evidence_list = [record["is_accurate_without_evidence"] for record in eval_data]

In [None]:
# Mean of the accuracy flags; NaN (instead of a ZeroDivisionError) when there are no records.
accuracy_without_evidence = (
    sum(accuracy_without_evidence_list) / len(accuracy_without_evidence_list)
    if accuracy_without_evidence_list
    else float("nan")
)

In [None]:
# accuracy when evidence (external knowledge) is not defined in the prompt
accuracy_without_evidence

In [None]:
# Mean of the accuracy flags; NaN (instead of a ZeroDivisionError) when no record carries evidence.
accuracy_with_evidence = (
    sum(accuracy_with_evidence_list) / len(accuracy_with_evidence_list)
    if accuracy_with_evidence_list
    else float("nan")
)

In [None]:
# accuracy when evidence (external knowledge) is in the prompt
accuracy_with_evidence

In [None]:
# Accuracy flags of the records that carry evidence and are not marked "challenging".
# A comprehension with its own variable is used so that the record previously
# selected in `row` is not overwritten by a loop variable.
accuracy_with_evidence_notchallenging_list = [
    record["is_accurate_with_evidence"]
    for record in eval_data
    if record["evidence"] != "" and record["difficulty"] != "challenging"
]

In [None]:
# Mean of the accuracy flags; NaN (instead of a ZeroDivisionError) when the filtered list is empty.
accuracy_with_evidence_notchallenging = (
    sum(accuracy_with_evidence_notchallenging_list) / len(accuracy_with_evidence_notchallenging_list)
    if accuracy_with_evidence_notchallenging_list
    else float("nan")
)

In [None]:
# accuracy when evidence (external knowledge) is in the prompt excluding challenging questions
accuracy_with_evidence_notchallenging

Final Clean-up : Delete database files

delete database file of evaluation dataset

In [None]:
# Remove the copy of the evaluation database kept under ./data/tabular/.
!rm -r ./data/tabular/financial.sqlite

Delete the local database file created by the tabular agent. Use the following command, replacing the placeholder with the name of the database file:
- `!rm -r ./data/tabular/<name_local_db_file>.db`