In [11]:
import os
import pandas as pd

from agents.EvaluatorForBIRD import EvaluatorForBIRD
from agents.ZeroShotAgent import ZeroShotAgent
from agents.OptimizerAgent import OptimizerAgent
from agents.MultiAgentDiscussion import MultiAgentDiscussion

from config import MODEL, METHOD, OUTPUT_PATH
from utility import read_dataset, get_db_cursor, fetch_BIRD_schemas, get_openai_client, dump_to_json
import api_keys

# Export the key kept in the local `api_keys` module as an environment variable.
# NOTE(review): presumably read by the client built in get_openai_client() -- confirm.
os.environ["OPENAI_API_KEY"] = api_keys.OPENAI_API_KEY

In [12]:
# Load the dataset together with the schema text of every database it references.
df, db_names = read_dataset()
db_schemas = fetch_BIRD_schemas(db_names)
print(f'{db_names=}, {len(df)=}')

# Discussion agent with debugging enabled, for ad-hoc experiments in this notebook.
client = get_openai_client()
multi_agent = MultiAgentDiscussion(
    MODEL,
    client,
    get_db_cursor,
    db_schemas,
    OUTPUT_PATH,
    is_debug=True,
)

# Uncomment to run the discussion on a single question only:
# multi_agent.batched_generate(df[df.question_id == 944])

db_names=['formula_1', 'debit_card_specializing', 'thrombosis_prediction'], len(df)=146




In [13]:
# Show the schema text stored for one database; print() keeps the line breaks readable.
print(db_schemas["debit_card_specializing"])

CREATE TABLE customers
(
    CustomerID INTEGER UNIQUE     not null
        primary key,
    Segment    TEXT null,
    Currency   TEXT null
)
CREATE TABLE gasstations
(
    GasStationID INTEGER    UNIQUE   not null
        primary key,
    ChainID      INTEGER          null,
    Country      TEXT null,
    Segment      TEXT null
)
CREATE TABLE products
(
    ProductID   INTEGER   UNIQUE      not null
        primary key,
    Description TEXT null
)
CREATE TABLE "transactions_1k"
(
    TransactionID INTEGER
        primary key autoincrement,
    Date          DATE,
    Time          TEXT,
    CustomerID    INTEGER,
    CardID        INTEGER,
    GasStationID  INTEGER,
    ProductID     INTEGER,
    Amount        INTEGER,
    Price         REAL
)
CREATE TABLE "yearmonth"
(
    CustomerID  INTEGER not null
        references customers
            on update cascade on delete cascade
        references customers,
    Date        TEXT    not null,
    Consumption REAL,
    primary key (Date,

# Run Experiment

## GPT-4o Zero-shot

In [26]:
if METHOD == 'zero-shot':
    # Single name shared by the banner and the results file.
    experiment_name = f"{MODEL}_{METHOD}"
    print("Experiment: " + experiment_name)

    # --- Setup: dataset, schemas, agent and evaluator ---
    df, db_names = read_dataset()
    db_schemas = fetch_BIRD_schemas(db_names)
    print(f'{db_names=}, {len(df)=}')

    client = get_openai_client()
    agent = ZeroShotAgent(MODEL, client, get_db_cursor, db_schemas, OUTPUT_PATH)
    evaluator = EvaluatorForBIRD(get_db_cursor)

    # --- Generate: one raw model response per row, dumped before any parsing ---
    raw_responses = agent.batched_generate(df)
    dump_to_json('raw_responses', raw_responses)

    # --- Parse: extract the SQL from each raw response ---
    print("Finished Generating. Attempting SQL auto-parsing...")
    cleaned_sql = agent.auto_parse_sql_from_response(raw_responses)
    dump_to_json('cleaned_sql', cleaned_sql)
    print("SQL auto-parsing successful")

    # --- Evaluate: attach the predictions, then the evaluator's labels ---
    df['prediction'] = cleaned_sql
    df['label'] = evaluator.evaluate(df, pred_col_name='prediction')

    # --- Save: one JSON record per row ---
    results_file = OUTPUT_PATH / (experiment_name + "_df.json")
    df.to_json(results_file, orient='records')

Q_1150: OperationalError no such function: YEAR
Q_1156: OperationalError no such function: YEAR
Q_1162: OperationalError no such function: YEAR
Q_1164: OperationalError no such function: YEAR
Q_1168: OperationalError no such function: YEAR
Q_1171: OperationalError no such function: YEAR
Q_1175: OperationalError no such function: YEAR
Q_1195: OperationalError no such column: L.LAB
Q_1201: OperationalError no such function: YEAR
Q_1227: OperationalError no such function: YEAR
Q_1229: OperationalError no such function: YEAR
Q_1231: OperationalError no such function: YEAR
Q_1232: OperationalError no such function: YEAR
Q_1235: OperationalError no such function: YEAR
Q_1239: OperationalError no such function: YEAR
Q_1242: OperationalError no such function: YEAR
Q_1243: OperationalError no such function: YEAR
Q_1254: OperationalError no such function: YEAR
Q_1257: OperationalError no such function: YEAR
Q_955: OperationalError no such column: d.driverId

=== EX Results ===
Accuracy :  35.616

## GPT-4o Zero-shot + Optimizer

In [8]:
if METHOD == 'optimizer-agent':
    print(f"Experiment: {MODEL}_{METHOD}")

    # Setup
    # read_dataset() is called for the database names; the frame it returns is
    # replaced below by the saved zero-shot results.
    df, db_names = read_dataset()
    db_schemas   = fetch_BIRD_schemas(db_names)
    print(f'{db_names=}, {len(df)=}')

    client = get_openai_client()
    agent = OptimizerAgent(MODEL, client, get_db_cursor, db_schemas, OUTPUT_PATH)
    evaluator = EvaluatorForBIRD(get_db_cursor)

    # Generate
    # The optimizer runs on top of the zero-shot results. Load them from the
    # location the zero-shot cell writes to (OUTPUT_PATH / '<MODEL>_zero-shot_df.json')
    # rather than from a hard-coded file name in the current working directory,
    # so the cell keeps working when MODEL or OUTPUT_PATH change.
    zero_shot_results = OUTPUT_PATH / f'{MODEL}_zero-shot_df.json'
    df = pd.read_json(zero_shot_results)
    raw_responses = agent.batched_generate(df)
    dump_to_json('raw_responses', raw_responses)

    # Parse
    print("Finished Generating. Attempting SQL auto-parsing...")
    cleaned_sql = agent.auto_parse_sql_from_response(raw_responses)
    dump_to_json('cleaned_sql', cleaned_sql)
    print("SQL auto-parsing successful")

    # Evaluate
    # Results go into separate columns so the frame's existing columns are kept.
    df['optimized'] = cleaned_sql
    df['opt-label'] = evaluator.evaluate(df, pred_col_name='optimized')

    # Save results
    df.to_json(OUTPUT_PATH / f'{MODEL}_{METHOD}_df.json', orient='records')

Generating SQL: gpt-4o_optimizer-agent
Finished Generating
SQL auto-parsing successful


Q_944: OperationalError incomplete input

=== EX Results ===
Accuracy :  45.890%
Breakdown by Difficulty:
	simple:  61.224% (30 / 49)
	moderate:  43.077% (28 / 65)
	challenging:  28.125% (9 / 32)
=== end ===



## GPT-4o Multi-Agent Discussion

In [10]:
if METHOD == 'discussion':
    print(f"Experiment: {MODEL}_{METHOD}")

    # --- Setup: dataset, schemas, discussion agent and evaluator ---
    df, db_names = read_dataset()
    db_schemas = fetch_BIRD_schemas(db_names)
    print(f'{db_names=}, {len(df)=}')

    client = get_openai_client()
    multi_agent = MultiAgentDiscussion(MODEL, client, get_db_cursor, db_schemas, OUTPUT_PATH)
    evaluator = EvaluatorForBIRD(get_db_cursor)

    # --- Generate: three discussion rounds per question ---
    raw_responses = multi_agent.batched_generate(df, rounds=3)
    dump_to_json('raw_responses', raw_responses)

    # --- Parse: first message of each starter agent, then the final verdict ---
    print("Finished Generating. Attempting SQL auto-parse...")

    zero_shot_openers = [resp['agent_zero_shot'][0] for resp in raw_responses]
    starter_zero = multi_agent.auto_parse_sql_from_response(zero_shot_openers)
    dump_to_json('cleaned_zeroshot_starter', starter_zero)

    meta_prompt_openers = [resp['agent_meta_prompt'][0] for resp in raw_responses]
    starter_meta = multi_agent.auto_parse_sql_from_response(meta_prompt_openers)
    dump_to_json('cleaned_starter_meta', starter_meta)

    verdicts = [resp['verdict'] for resp in raw_responses]
    cleaned_sql = multi_agent.auto_parse_sql_from_response(verdicts)
    dump_to_json('cleaned_sql', cleaned_sql)

    print("SQL auto-parsing successful\n\n")

    # --- Evaluate: same steps for each set of queries, in the original order ---
    evaluation_plan = (
        ("Evaluating Zero-shot starter generated queries",
         'starter_zero_shot', 'zero_shot_labels', starter_zero),
        ("Evaluating meta-prompt starter generated queries",
         'starter_meta_prompt', 'meta_prompt_labels', starter_meta),
        ("Evaluating Multi-Agent Discussion generated queries",
         'prediction', 'label', cleaned_sql),
    )
    for banner, pred_col, label_col, queries in evaluation_plan:
        print(banner)
        df[pred_col] = queries
        df[label_col] = evaluator.evaluate(df, pred_col_name=pred_col)

    # --- Save: one JSON record per row ---
    df.to_json(OUTPUT_PATH / f'{MODEL}_{METHOD}_df.json', orient='records')

Experiment: gpt-4o_discussion
db_names=['formula_1', 'debit_card_specializing', 'thrombosis_prediction'], len(df)=146
hola


Generating SQL:   0%|          | 0/146 [00:00<?, ?it/s]

Generating SQL: 100%|██████████| 146/146 [2:46:51<00:00, 68.57s/it]  


Finished Generating. Attempting SQL auto-parse...
SQL auto-parsing successful


Evaluating Zero-shot starter generated queries
--- Evaluating Performance ---


Executing SQL:   3%|▎         | 5/146 [00:01<00:42,  3.30it/s]

Q_1480: OperationalError incomplete input


Executing SQL:   5%|▍         | 7/146 [00:01<00:43,  3.19it/s]

Q_1482: OperationalError ambiguous column name: CustomerID


Executing SQL:  47%|████▋     | 68/146 [00:03<00:01, 71.90it/s]

Q_1227: OperationalError no such function: YEAR
Q_1231: OperationalError no such function: YEAR


Executing SQL:  84%|████████▎ | 122/146 [00:04<00:00, 48.02it/s] 

Q_955: OperationalError no such column: ds.year


Executing SQL:  90%|█████████ | 132/146 [00:04<00:00, 43.34it/s]

Q_972: OperationalError no such function: year


Executing SQL: 100%|██████████| 146/146 [00:05<00:00, 26.73it/s]


=== EX Results ===
Accuracy :  39.726%
Breakdown by Difficulty:
	simple:  46.939% (23 of 49)
	moderate:  44.615% (29 of 65)
	challenging:  18.750% (6 of 32)
=== end ===

Evaluating meta-prompt starter generated queries
--- Evaluating Performance ---


Executing SQL:  45%|████▌     | 66/146 [00:03<00:00, 80.58it/s]



Executing SQL:  86%|████████▌ | 125/146 [00:04<00:00, 72.02it/s] 

Q_955: OperationalError no such column: ds.year


Executing SQL: 100%|██████████| 146/146 [00:05<00:00, 25.06it/s]


=== EX Results ===
Accuracy :  48.630%
Breakdown by Difficulty:
	simple:  61.224% (30 of 49)
	moderate:  47.692% (31 of 65)
	challenging:  31.250% (10 of 32)
=== end ===

Evaluating Multi-Agent Discussion generated queries
--- Evaluating Performance ---


Executing SQL:   3%|▎         | 5/146 [00:01<00:40,  3.49it/s]

Q_1480: OperationalError incomplete input


Executing SQL:   8%|▊         | 12/146 [00:01<00:15,  8.90it/s]

Q_1482: OperationalError ORDER BY clause should come after UNION ALL not before


Executing SQL: 100%|██████████| 146/146 [00:19<00:00,  7.62it/s] 

=== EX Results ===
Accuracy :  45.890%
Breakdown by Difficulty:
	simple:  55.102% (27 of 49)
	moderate:  43.077% (28 of 65)
	challenging:  37.500% (12 of 32)
=== end ===






# Experiments:
- Zero-shot
    - with/without chain-of-thought (CoT)
- Optimizer (on top of zero-shot)
- Multi-agent:
    - Zero-shot -> Optimizer -> Multi-agent Debate
    - Zero-shot -> Optimizer -> Multi-agent Discussion
    - Best of the above -> Optimizer
- Decomposition and Generation via Multi-agent Debate/Discussion
- Sparse Topology Multi-agent Debate/Discussion
- Augmenting schema with LLM calls:
    - Point out relationships (graph idea)
    - Write short descriptions of tables and columns