In [1]:
import sys
import os
from dotenv import load_dotenv
import pandas as pd

# Add the parent directory to the sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from sqltoolkit.connectors import PostgreSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
from sqltoolkit.compiler import SQLQueryChecker

load_dotenv()

# Define the connection parameters.
# Host, user and password are read from the environment (load_dotenv() above
# loads a local .env file first); os.getenv returns None for unset variables.
# The database name and port are fixed for this notebook.
server = os.getenv('POSTGRES_HOST')
database = 'bird_dev_mini'
username = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PWD')
port = '5432'

# Shared database client used by every later cell that queries or lists tables.
psql_connector = PostgreSQLConnector(server, database, username, password, port)
sql_client = DatabaseClient(psql_connector)

from openai import AzureOpenAI

# Azure OpenAI settings, all read from the environment (None when a variable is unset).
aoai_endpoint = os.getenv('OPENAI_ENDPOINT')
aoai_key = os.getenv('OPENAI_API_KEY')
aoai_deployment = os.getenv('OPENAI_4o_DEPLOYMENT') # should be GPT-4o
aoai_embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT') # should be text-embedding-small

# Shared chat/embedding client passed to the indexer and to the LLM helper functions below.
openai_client = AzureOpenAI(azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    api_version='2024-10-21')

In [6]:
import os
import pandas as pd

def load_schema_descriptions(directory: str) -> list:
    """Collect the CSV column documentation of every schema found under ``directory``.

    Every folder that contains a ``database_description`` sub-folder is treated
    as one schema. Each ``*.csv`` file inside that sub-folder is rendered as a
    markdown table under a ``### Table: <name>`` heading, and the sections are
    concatenated into one description string per schema.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of ``{'description': str, 'schema': str}`` dicts, where
        ``schema`` is the name of the folder that holds ``database_description``.
        The list is empty when ``directory`` does not exist.
    """
    descriptions = []
    for root, dirs, _files in os.walk(directory):
        if 'database_description' not in dirs:
            continue

        db_desc_path = os.path.join(root, 'database_description')
        sections = []
        # Sorted so the generated description is identical from run to run
        # (os.listdir gives no ordering guarantee).
        for filename in sorted(os.listdir(db_desc_path)):
            if not filename.endswith(".csv"):
                continue
            table_name = filename.split('.')[0]
            table_df = pd.read_csv(os.path.join(db_desc_path, filename), encoding='ISO-8859-1')
            table_markdown = table_df.to_markdown(index=False)
            sections.append(f"\n### Table: {table_name}\n\n{table_markdown}\n\n")

        descriptions.append({
            'description': ''.join(sections),
            # basename/normpath instead of split('/'): works with any path
            # separator and with a trailing separator on the root folder.
            'schema': os.path.basename(os.path.normpath(root)),
        })
    return descriptions


directory = 'documentation'

schema_descriptions = load_schema_descriptions(directory)
    


In [9]:
# Display the schema names that were discovered in the documentation folder.
[sc['schema'] for sc in schema_descriptions]

['financial',
 'toxicology',
 'card_games',
 'codebase_community',
 'debit_card_specializing',
 'superhero',
 'thrombosis_prediction',
 'formula_1',
 'california_schools',
 'student_club',
 'european_football_2']

In [2]:

# Azure AI Search settings (endpoint/key from the environment) and the index name.
# These are all that is needed to query an existing index; the indexing cell
# below assigns the same four names again before (re)building the index.
search_endpoint = os.getenv('AI_SEARCH_ENDPOINT')
search_key = os.getenv('AI_SEARCH_API_KEY')
embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')
index_name = "bird-dev-mini"

In [10]:
from sqltoolkit.indexer import DatabaseIndexer

# Azure AI Search target for the table index (endpoint and key come from the environment).
search_endpoint = os.getenv('AI_SEARCH_ENDPOINT')
search_key = os.getenv('AI_SEARCH_API_KEY')
embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')
index_name = "bird-dev-mini"

for schema in schema_descriptions:
    # Only the european_football_2 schema is indexed by this cell; every other
    # entry of schema_descriptions is skipped.
    if schema['schema'] != 'european_football_2':
        continue

    print(f"Indexing schema: {schema['schema']}")
    context = schema['description']
    schema_name = schema['schema']

    # The schema's markdown documentation is passed to the indexer as extra context.
    indexer = DatabaseIndexer(
        sql_client,
        openai_client,
        aoai_deployment,
        aoai_embedding_deployment,
        extra_context=context,
    )

    # NOTE(review): the '.' in this pattern is a regex wildcard, not a literal
    # dot - confirm how DatabaseIndexer applies regex_filter.
    manifest = indexer.fetch_and_describe_tables(regex_filter=f"{schema_name}.*")
    indexer.generate_table_embeddings()

    indexer.create_azure_ai_search_index(
        search_endpoint=search_endpoint,
        search_credential=search_key,
        index_name=index_name,
        embedding_deployment=aoai_embedding_deployment,
        openai_endpoint=aoai_endpoint,
        openai_key=aoai_key,
    )

    # write to AI Search
    indexer.push_to_ai_search()


2025-01-24 12:54:30,614 - DatabaseIndexer - INFO - Fetching tables from the database.
2025-01-24 12:54:30,678 - DatabaseIndexer - INFO - Filtered tables by regex: ['european_football_2.league', 'european_football_2.match', 'european_football_2.country', 'european_football_2.player', 'european_football_2.player_attributes', 'european_football_2.team', 'european_football_2.team_attributes']
2025-01-24 12:54:30,680 - DatabaseIndexer - INFO - Processing table: european_football_2.league


Indexing schema: european_football_2


2025-01-24 12:54:47,979 - DatabaseIndexer - INFO - Completed processing table: european_football_2.league
2025-01-24 12:54:47,981 - DatabaseIndexer - INFO - Processing table: european_football_2.match


KeyboardInterrupt: 

In [3]:
import time
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Client bound to the table index; uses the search settings defined in an earlier cell.
search_client = SearchClient(endpoint=search_endpoint, 
                             credential=AzureKeyCredential(search_key), 
                             index_name=index_name)

# list_database_tables() returns JSON text; the parsed entries are read through
# their 'TABLE_NAME' key by get_tables_from_schema below.
all_tables = json.loads(sql_client.list_database_tables())

def get_tables_from_schema(schema_name: str, all_tables: list) -> str:
    """Return the comma-separated names of the tables that belong to one schema.

    Table names are matched on the ``<schema_name>.`` prefix, so a schema whose
    name merely starts with the same characters (``formula_1`` vs
    ``formula_10``) is not picked up by mistake.

    Args:
        schema_name: Schema to select. An empty string selects every table; a
            value that already ends with ``.`` is used as the prefix unchanged.
        all_tables: Sequence of dicts that each carry a ``'TABLE_NAME'`` entry
            in ``schema.table`` form.

    Returns:
        The matching table names joined with ``,`` (an empty string when
        nothing matches).
    """
    if schema_name and not schema_name.endswith('.'):
        prefix = f"{schema_name}."
    else:
        prefix = schema_name
    return ','.join(
        table['TABLE_NAME'] for table in all_tables if table['TABLE_NAME'].startswith(prefix)
    )

def rewrite_query(openai_client: AzureOpenAI,
                  deployment_name: str,
                  user_question: str,
                  evidence: str = None,
                  tables: list = []) -> str:
    """Ask the chat model to rephrase a question using the database's vocabulary.

    Args:
        openai_client: Client whose ``chat.completions.create`` is called.
        deployment_name: Model/deployment name sent with the request.
        user_question: The natural-language question; sent as the user message.
        evidence: Accepted for signature compatibility; it is not used here.
        tables: Table context that is formatted into the system prompt.

    Returns:
        The text content of the first choice of the completion.
    """
    system_prompt = f"""
    - You are a helpful AI SQL Agent. Your role is to help people translate their questions 
    into better questions that can be translated into SQL queries.
    - You must translate the natural language question into a better version of the question
    - You should make sure each term in the question could be a column name, table name, or a keyword in the database
    
    ## YOU MUST USE THE FOLLOWING CONTEXT ABOUT THE DATABASE TO REWRITE THE QUESTION:
    {tables}

    ## Do not use specific column names or table names but use the provided tables as context for the terminology
    ## Do not make any assumptions about the SQL query that will be generated
    ## Use as many words as needed to make the question better
    
    You must only return a single sentence 
    """

    completion = openai_client.chat.completions.create(
        model=deployment_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_question},
        ],
    )
    return completion.choices[0].message.content


def search_tables_filtered(search_client: SearchClient, user_question: str, schema: str) -> list:
    """Run a hybrid (text + vector) semantic search restricted to one schema's tables.

    The allowed tables come from ``get_tables_from_schema`` applied to the
    module-level ``all_tables``; their names are turned into ``name_key``
    values by replacing ``.`` with ``__`` and used in a ``search.in`` filter.

    Args:
        search_client: Client whose ``search`` method is called.
        user_question: Text used both as the search text and as the vector query.
        schema: Schema whose tables may be returned.

    Returns:
        One dict per hit (at most 10 are requested), holding whichever of
        ``name``, ``business_readable_name``, ``description`` and ``columns``
        the hit contains.
    """
    allowed_keys = get_tables_from_schema(schema, all_tables).replace('.', '__')
    wanted_fields = ['name', 'business_readable_name', 'description', 'columns']

    question_vector = VectorizableTextQuery(
        text=user_question,
        k_nearest_neighbors=50,
        fields="embedding",
    )
    hits = search_client.search(
        search_text=user_question,
        vector_queries=[question_vector],
        filter=f"search.in(name_key, '{allowed_keys}',',')",
        vector_filter_mode="preFilter",
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name='my-semantic-config',
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=10,
    )

    # Keep only the fields the downstream prompts use, skipping any a hit lacks.
    return [
        {field: hit[field] for field in wanted_fields if field in hit}
        for hit in hits
    ]

def table_re_ranker(openai_client: AzureOpenAI,
                    deployment: str,
                    user_question: str,
                    candidate_tables: list) -> str:
    """Ask the chat model to pick and rank the tables relevant to a question.

    Args:
        openai_client: Client whose ``chat.completions.create`` is called.
        deployment: Model/deployment name sent with the request.
        user_question: Question the tables should help answer.
        candidate_tables: Candidate table descriptions; formatted into the
            user message as-is.

    Returns:
        The raw message content of the first choice. The request asks for a
        JSON object of the form ``{"tables": [{"table": ..., "columns": [...]}]}``;
        the text is not parsed here.
    """
    system_prompt = f"""You are a helpful AI SQL Agent.
    Your role is to identify the most relevant tables that can be used to answer the user's question.
    You are provided with a list of tables and their schema.

    ## Important instructions:
    - Your role is to select the tables that might be relevant to the user's question.
    - Don't make assumptions on the SQL query that will be generated, only select the tables that might be relevant to the user's question.
    - You must also select a subset of the columns that contain the information needed to answer the user's question.
    - You must return 5 tables
    - You must rank the tables based on their relevance to the user's question

    You should return the 5 tables in the following JSON format:
    {{ "tables":[
        {{
            "table": "table_name",
            "columns": ["column1", "column2", "column3"]
        }}
    ]}}

    Do not return anything other than the json

    """
    user_prompt = f"""
    <User Question>: {user_question} </User Question>
    <Candidate Tables>: {candidate_tables} </Candidate Tables>"""

    completion = openai_client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

def filter_candidate_tables(candidate_tables, reranked_tables_json):
    """Trim candidate tables down to the tables and columns chosen by the re-ranker.

    Args:
        candidate_tables: Dicts with ``name``, ``description`` and ``columns``
            (each column being a dict with a ``name``).
        reranked_tables_json: Parsed re-ranker output, i.e.
            ``{'tables': [{'table': <name>, 'columns': [<column names>]}]}``.

    Returns:
        One ``{'name', 'description', 'columns'}`` dict for every
        (candidate, selection) pair with matching table name, in candidate
        order, keeping only the columns named by the selection.
    """
    def _trim(candidate, selection):
        kept_columns = [
            column for column in candidate['columns']
            if column['name'] in selection['columns']
        ]
        return {
            'name': candidate['name'],
            'description': candidate['description'],
            'columns': kept_columns,
        }

    return [
        _trim(candidate, selection)
        for candidate in candidate_tables
        for selection in reranked_tables_json['tables']
        if candidate['name'] == selection['table']
    ]

In [4]:
import json

# Try the retrieval steps on a single formula_1 question.
query = "Which driver ranked the first in the Canadian Grand Prix in 2007? Please give his reference name."
evidence = None

# Look up index entries by schema name and keep just name + description
# as the context handed to the question rewriter.
tables = search_client.search(search_text="formula_1")
tables = [{'name':table['name'],
           'description':table['description']} for table in tables]


rewritten_query = rewrite_query(openai_client, 'gpt-4o-mini', query, evidence, tables)
print(rewritten_query)

# Note: the candidate search below is run with the original `query`,
# not with `rewritten_query`.
candidate_tables = search_tables_filtered(search_client, query,schema='formula_1')
print(len(candidate_tables))

# Re-ranking step is currently disabled.
#reranked_tables = table_re_ranker(openai_client, 'gpt-4o-mini', rewritten_query, candidate_tables)


print(candidate_tables)

What is the reference name of the driver who achieved the highest ranking in the Canadian Grand Prix during the 2007 racing season?
10
[{'name': 'formula_1.drivers', 'business_readable_name': 'Driver Profiles', 'description': 'The "drivers" table contains information about Formula 1 drivers, serving as a comprehensive record of individual racers. Each driver is uniquely identified by a driver ID, which links them to their respective data in other related tables. The table includes personal details such as the driver\'s forename, surname, date of birth, and nationality, as well as a driver reference name that serves as a shorthand identifier. Additional information like racing numbers, abbreviated codes, and URL links to driver profiles provides a richer view of each participant\'s identity and career background within the racing database. This table is essential for analyzing driver-specific data, demographics, and career statistics.', 'columns': [{'name': 'driverid', 'type': 'bigint',

In [5]:
def check_query(sql_query: str, tables: list, openai_client, deployment) -> dict:
    """Validate a generated SQL query with ``SQLQueryChecker`` (dialect ``'Postgres'``).

    Args:
        sql_query: The complete SQL statement to validate.
        tables: Table context handed to the checker.
        openai_client: Client handed to the checker.
        deployment: Model/deployment name handed to the checker.

    Returns:
        Whatever ``SQLQueryChecker.validate_query`` returns. Callers in this
        notebook read it as a mapping via ``.get('query_valid')`` and
        ``.get('error')``, hence the ``dict`` annotation (it is not a bool).
    """
    query_checker = SQLQueryChecker(openai_client, deployment, 'Postgres', tables)
    return query_checker.validate_query(sql_query)


def generate_sql_query(openai_client: AzureOpenAI,
                       deployment: str,
                       user_question: str,
                       evidence: str,
                       candidate_tables: list):
    """Generate a PostgreSQL query for a question, with up to two validation-driven retries.

    The model is asked for a JSON object with ``chain_of_thought`` and
    ``sql_query``. The query is validated with ``check_query``; while it is
    reported invalid, the validation error is sent back to the model and a new
    answer is requested (at most two corrections, i.e. three completions).

    Args:
        openai_client: Client whose ``chat.completions.create`` is called.
        deployment: Model/deployment name used for generation and validation.
        user_question: The natural-language question.
        evidence: Additional context, formatted into the user message.
        candidate_tables: Table descriptions (dicts with ``name`` and
            ``columns``); embedded in the system prompt in full, and passed to
            the checker reduced to table and column names.

    Returns:
        The last chat-completion response object (not a string): callers read
        ``response.choices[0].message.content``. It is returned even if the
        final query still failed validation.

    Raises:
        json.JSONDecodeError: If a model answer is not valid JSON.
        KeyError: If a model answer has no ``sql_query`` field.
    """
    max_fix_attempts = 2

    candidate_tables_min = [{'table': table['name'], 'columns': [col['name'] for col in table['columns']]} for table in candidate_tables]
    # NOTE(review): the prompt tells the model to call check_query, but no tool
    # is exposed in the request; validation is done by this function instead.
    system_prompt = f"""
     You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a SQL database,
     The database is PostgreSQL, use the right syntax to generate queries
    
     ### You must use the POSTGRES SQL syntax to query the database
     ### You must use double quotes around the column names but not around table names
     ### Your output must be a single SQL query
     ### You must use the column sample values to infer the value in your WHERE clause
    
     ### These are the available tables in the database alongside their descriptions:
     <table_descriptions>
     {candidate_tables}
     </table_descriptions>
    
     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
     - Identify the tables that contain the information needed to answer the question
     - Only once this is done, create a SQL query to retrieve the information based on your understanding of the table
    
     # You must use the exact table names and column names provided in the table descriptions
     # Do not rewrite any column names in a way that is different from the table descriptions
     # You must write the column names and table names exactly as they are in the table descriptions
     # if the table name provided in the table descriptions is of the format schema.table, you must use the schema.table format in your query
     # You must produce a valid SQL query that can be executed, the query should not have any syntax errors
     # When calculating ratios, you must prevent division by zero errors 

     # Call the check_query function to validate your query before returning it
     # You must provide your entire query to the check_query function, do not split it into parts
    
     Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    ## Your response should be in JSON format with the following structure:
    
    {{"chain_of_thought": "Your reasoning",
    "sql_query": "The generated SQL query, this can only be a single query"}}

    ## You must return a valid JSON
    ## VERY IMPORTANT: You cannot return anything other than the JSON object

    ## If the provided context does not give enough information to generate a query, return an empty query

    """

    user_prompt = f"""<User Question>: {user_question} </User Question>
                <Additional Context>: {evidence} </Additional Context>"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    def _request_completion():
        # Every request sends the full conversation accumulated so far.
        return openai_client.chat.completions.create(
            model=deployment,
            messages=messages,
            response_format={"type": "json_object"},
        )

    response = _request_completion()
    for attempt in range(max_fix_attempts + 1):
        response_message = response.choices[0].message
        # Keep every assistant answer in the history so that each correction
        # request is preceded by the attempt it refers to.
        messages.append(response_message)

        # strict=False tolerates raw control characters (e.g. newlines) inside
        # JSON strings, matching how the caller parses the final answer.
        sql_query = json.loads(response_message.content, strict=False)['sql_query']
        query_checks = check_query(sql_query, candidate_tables_min, openai_client, deployment)

        if query_checks.get('query_valid') or attempt == max_fix_attempts:
            break

        messages.append({
            "role": "user",
            "content": f"Query validation failed, please fix your query: {query_checks.get('error')}"
        })
        response = _request_completion()

    return response




def execute_query_pipeline(user_question,schema, evidence=None):
    """Run the full question-to-SQL pipeline for one question against one schema.

    Steps: fetch the schema's table names/descriptions from the search index,
    rewrite the question with the 'gpt-4o-mini' deployment, retrieve candidate
    tables for the *original* question, then generate the SQL query with
    ``aoai_deployment``. Uses the module-level ``search_client`` and
    ``openai_client``.

    Args:
        user_question: The natural-language question.
        schema: Schema name; used as search text and as a table-name prefix filter.
        evidence: Optional additional context passed to the rewriter and generator.

    Returns:
        A dict with ``sql_query``, ``chain_of_thought``, ``tables`` (candidate
        table names), ``rewritten_question`` and ``table_context`` (the full
        candidate tables).
    """
    schema_overview = []
    for hit in search_client.search(search_text=schema):
        if hit['name'].startswith(schema):
            schema_overview.append({'name': hit['name'], 'description': hit['description']})

    rewritten_question = rewrite_query(openai_client, 'gpt-4o-mini', user_question, evidence, schema_overview)
    candidate_tables = search_tables_filtered(search_client, user_question, schema)
    completion = generate_sql_query(openai_client, aoai_deployment, user_question, evidence, candidate_tables)

    # strict=False: accept raw control characters inside the model's JSON strings.
    answer = json.loads(completion.choices[0].message.content, strict=False)
    table_names = [candidate['name'] for candidate in candidate_tables]

    return {
        'sql_query': answer['sql_query'],
        'chain_of_thought': answer['chain_of_thought'],
        'tables': table_names,
        'rewritten_question': rewritten_question,
        'table_context': candidate_tables,
    }
    
    

# Example usage: run the whole pipeline once and measure wall-clock latency.
user_question = "Which driver ranked the first in the Canadian Grand Prix in 2007? Please give his reference name."
# Here the schema name itself is passed as the "evidence" hint.
hint = "formula_1"
start_time = time.time()
result = execute_query_pipeline(user_question,'formula_1' ,evidence=hint)
end_time = time.time()

latency = end_time - start_time
print("Result:", result)
print("Latency:", latency, "seconds")

Result: {'sql_query': 'SELECT d."driverref" FROM formula_1.drivers d JOIN formula_1.results r ON d."driverid" = r."driverid" JOIN formula_1.races ra ON r."raceid" = ra."raceid" WHERE ra."year" = 2007 AND ra."name" = \'Canadian Grand Prix\' AND r."position" = 1;', 'chain_of_thought': "To determine which driver ranked first in the Canadian Grand Prix in 2007 and get his reference name, I need to follow these steps: 1) Identify the race ID for the Canadian Grand Prix in 2007 from the 'formula_1.races' table using the year of 2007 and the race name 'Canadian Grand Prix'. 2) With the race ID obtained, I will find the driver that ranked first in that race from the 'formula_1.results' table. 3) Using the driver ID who ranked first, I will query the 'formula_1.drivers' table to get the driver's reference name.", 'tables': ['formula_1.drivers', 'formula_1.circuits', 'formula_1.driverstandings', 'formula_1.constructors', 'formula_1.pitstops', 'formula_1.seasons', 'formula_1.constructorstandings'

In [7]:
# Execute the generated SQL against the database to inspect its result.
sql_client.query(result['sql_query'])

'[{"driverref": "hamilton"}]'

## Benchmark on the whole `bird_dev_mini` dataset

In [8]:
import pandas as pd

# Load the benchmark questions and drop the european_football_2 schema.
df = pd.read_json('mini_dev_postgresql.json')
# .copy() makes `eval` an independent frame: later cells add columns to it,
# which on a filtered slice of `df` triggers SettingWithCopyWarning.
# NOTE: the name `eval` shadows the Python builtin; it is kept because the
# following cells refer to it.
eval = df[df['db_id']!='european_football_2'].copy()
eval.shape

(449, 6)

In [10]:
from tqdm import tqdm
from datetime import datetime

# Register DataFrame.progress_apply so the long run shows a progress bar.
tqdm.pandas()

# Run the full pipeline once per benchmark row (question, schema, evidence);
# each call hits the search index and the LLM, so this cell is slow.
eval['llm_result'] = eval.progress_apply(lambda row: execute_query_pipeline(row['question'],row['db_id'], row['evidence']), axis=1)
# Flatten the fields of the per-row result dict into their own columns.
eval['generated_sql_query'] = eval['llm_result'].apply(lambda x: x['sql_query'])
eval['rewritten_question'] = eval['llm_result'].apply(lambda x: x['rewritten_question'])
eval['generated_context_tables'] = eval['llm_result'].apply(lambda x: x['tables'])
eval['generated_chain_of_thought'] = eval['llm_result'].apply(lambda x: x['chain_of_thought'])

# Persist the run under a timestamped file name so earlier runs are not overwritten.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
eval.to_csv(f'mini_dev_postgresql_evaluated_{timestamp}.csv', index=False)

100%|██████████| 449/449 [41:08<00:00,  5.50s/it]


In [None]:
import pandas as pd
from tqdm import tqdm

# Register DataFrame.progress_apply (progress bars for pandas apply).
tqdm.pandas()


# Reload a previously saved benchmark run.
# NOTE(review): the benchmark cell writes 'mini_dev_postgresql_evaluated_<timestamp>.csv';
# no visible code produces this un-suffixed file name - confirm the file exists.
eval = pd.read_csv('mini_dev_postgresql_evaluated.csv')

def run_query(query):
    """Run ``query`` through the module-level ``sql_client``.

    Returns:
        The client's result on success; if the client raises, the exception's
        message text is returned instead (nothing is re-raised).
    """
    try:
        return sql_client.query(query)
    except Exception as error:
        return str(error)
    
class isValidSQL:
    """Callable check that scores a query 1 if it executes and 0 if it raises."""

    def __init__(self, db_client: DatabaseClient):
        # Client whose ``query`` method is used to execute the SQL.
        self.db_client = db_client

    def __call__(self, *, query, **kwds):
        """Execute ``query``; extra keyword arguments are accepted and ignored.

        Returns:
            ``1`` when the client ran the query without raising, ``0`` otherwise.
        """
        try:
            self.db_client.query(query)
        except Exception:
            return 0
        return 1
        

#eval['generated_query_result'] = eval['generated_sql_query'].apply(run_query)
#eval['ground_truth_result'] = eval['SQL'].apply(run_query)
#eval['IsValidSQL']=eval.apply(lambda row: isValidSQL(sql_client)(query=row['generated_sql_query']), axis=1)

# Execute the generated and the ground-truth SQL for the first 15 rows.
# iterrows() yields a copy of each row, so assigning to `row` would be lost;
# the results are written back into the frame through eval.loc instead.
for i, row in eval.head(15).iterrows():
    print(i)
    eval.loc[i, 'generated_query_result'] = run_query(row['generated_sql_query'])
    eval.loc[i, 'ground_truth_result'] = run_query(row['SQL'])



0
1
2
3
4
5
6
7


In [108]:
from langchain_community.utilities.sql_database import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from langchain_openai import AzureChatOpenAI
import psycopg2
from langchain_community.agent_toolkits import create_sql_agent
from langchain.callbacks.base import BaseCallbackHandler

# Construct the database URI.
# Credentials are URL-escaped so that characters such as '@', ':' or '/' in the
# user name or password cannot corrupt the URL, and the port comes from the
# connection settings defined at the top of the notebook instead of a literal.
# quote_plus raises TypeError if POSTGRES_USER / POSTGRES_PWD are unset (None).
from urllib.parse import quote_plus

pg_db_uri = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@{server}:{port}/{database}"
)

# Establish database connections
# Initialize the Azure OpenAI language model
azure_llm = AzureChatOpenAI(
  azure_endpoint = aoai_endpoint,
  api_key=aoai_key,
  api_version="2024-10-21",
  deployment_name='gpt-4o-global',
)

class SQLHandler(BaseCallbackHandler):
    """Callback handler that collects the SQL an agent submits to its SQL tools."""

    def __init__(self):
        # Tool inputs captured so far, in the order the agent issued them.
        self.sql_result = []

    def on_agent_action(self, action, **kwargs):
        """Record the tool input whenever the agent uses one of the SQL tools.

        Inputs sent to ``sql_db_query_checker`` or ``sql_db_query`` are
        appended to ``sql_result``; actions on any other tool are ignored.
        """
        recorded_tools = ("sql_db_query_checker", "sql_db_query")
        if action.tool not in recorded_tools:
            return
        self.sql_result.append(action.tool_input)


def execute_sql_agent(user_query, schema, hint):
    """Answer a question with a LangChain SQL agent and return the SQL it submitted.

    A fresh ``SQLDatabase`` (bound to ``schema``) and agent are created for
    every call, using the module-level ``pg_db_uri`` and ``azure_llm``.

    Args:
        user_query: The natural-language question.
        schema: Database schema the agent is restricted to.
        hint: Additional context inserted into the prompt.

    Returns:
        The list of tool inputs captured by ``SQLHandler`` during the run, or
        ``None`` if the agent run raised. Failures are logged with their
        traceback instead of being dropped silently.
    """
    # Local import keeps this cell self-contained.
    import logging

    pg_db = SQLDatabase.from_uri(pg_db_uri, schema=schema)
    agent_executor = create_sql_agent(azure_llm, db=pg_db, agent_type="openai-tools", verbose=False)
    prompt = f"""
- Here's additional context to help answer the question {hint}

Question: {user_query}
"""
    handler = SQLHandler()
    try:
        response = agent_executor.invoke({'input': prompt}, {'callbacks': [handler]})
    except Exception:
        # Best effort: one failing question must not abort a benchmark run,
        # but the cause is recorded so failed rows can be diagnosed.
        logging.getLogger(__name__).exception(
            "SQL agent failed for question %r (schema %r)", user_query, schema
        )
        return None

    print(response)
    return handler.sql_result

# Run the LangChain SQL agent once per row; the column holds the captured SQL
# tool inputs, or None for rows where the agent run failed.
eval['langchain_result'] = eval.progress_apply(lambda row: execute_sql_agent(row['question'], row['db_id'], row['evidence']), axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

 20%|██        | 2/10 [00:09<00:39,  4.92s/it]

{'input': "\n- Here's additional context to help answer the question the superheroes from Marvel Comics refers to publisher_name = 'Marvel Comics'; most common color refers to COUNT(superhero.id) DESC;\n\nQuestion: Rank superheroes from Marvel Comics by their eye color popularity, starting with the most common color.\n", 'output': 'The ranking of superheroes from Marvel Comics by their eye color popularity, starting with the most common color, is as follows:\n\n1. Blue - 126 superheroes\n2. Brown - 89 superheroes\n3. No Colour - 47 superheroes\n4. Green - 40 superheroes\n5. Red - 31 superheroes\n6. Yellow - 13 superheroes\n7. White - 12 superheroes\n8. Black - 11 superheroes\n9. Grey - 4 superheroes\n10. Hazel - 3 superheroes'}


 30%|███       | 3/10 [00:16<00:39,  5.71s/it]

{'input': "\n- Here's additional context to help answer the question Spanish Grand Prix is the name of race refers to name = 'Spanish Grand Prix'; average fastest lap speed refers to avg(fastestLapSpeed);\n\nQuestion: What is the average of fastest lap speed in the 2009 Spanish Grand Prix race?\n", 'output': 'The average fastest lap speed in the 2009 Spanish Grand Prix race was approximately 199.64 km/h.'}


 40%|████      | 4/10 [00:22<00:35,  5.86s/it]

{'input': "\n- Here's additional context to help answer the question hydrogen refers to element = 'h'; ratio = DIVIDE(SUM(element = 'h'), count(element)) where molecule_id = 'TR006' ; label = '+' mean molecules are carcinogenic; label = '-' means molecules are non-carcinogenic\n\nQuestion: What is the ratio of Hydrogen elements in molecule ID TR006? List the ratio with its label.\n", 'output': "The ratio of Hydrogen elements in molecule ID TR006 is approximately 0.362, and the label for this molecule is '+', indicating it is carcinogenic."}


 50%|█████     | 5/10 [00:28<00:29,  5.92s/it]

{'input': "\n- Here's additional context to help answer the question drivers who finished the race refers to time is not empty (i.e. time IS NOT NULL); race number refers to raceId; date of birth refers to drivers.dob; The larger the birthday value, the younger the person is, and vice versa;\n\nQuestion: For all the drivers who finished the game in race No. 592, who is the oldest?\n", 'output': 'The oldest driver who finished the race in race No. 592 is Jean-Pierre Beltoise, born on April 26, 1937.'}


 60%|██████    | 6/10 [00:49<00:43, 10.77s/it]

{'input': "\n- Here's additional context to help answer the question August of 2012 refers to the Date value = '201208' ; Price per unit of product = Price / Amount;\n\nQuestion: For all the people who paid more than 29.00 per unit of product id No.5. Give their consumption status in the August of 2012.\n", 'output': 'Here are the customers who paid more than $29.00 per unit for product id No.5 and their consumption status in August 2012:\n\n1. Customer ID: 18831, Consumption: 1903.2\n2. Customer ID: 5443, Consumption: 88265.39\n3. Customer ID: 46933, Consumption: 1129.2\n4. Customer ID: 6768, Consumption: 126157.7\n5. Customer ID: 6768, Consumption: 126157.7\n6. Customer ID: 20447, Consumption: 58.19\n7. Customer ID: 45868, Consumption: 1142.95\n8. Customer ID: 23861, Consumption: 8878.07\n9. Customer ID: 24700, Consumption: 69331.72\n10. Customer ID: 7626, Consumption: 45937.22\n\nNote: Customer ID 6768 appears twice, indicating multiple qualifying transactions.'}


 70%|███████   | 7/10 [00:57<00:29,  9.82s/it]

{'input': "\n- Here's additional context to help answer the question List refers to GROUP_CONCAT(DISTINCT ID); total bilirubin (T-BIL) not within normal range refers to T-BIL > = 2.0\n\nQuestion: List and group all patients by sex for total bilirubin (T-BIL) level not within the normal range.\n", 'output': 'Here are the grouped patient IDs by sex for those with total bilirubin (T-BIL) levels not within the normal range (T-BIL >= 2.0):\n\n- Female (F): 1180510, 2308236, 250391, 2931207, 4862013\n- Male (M): 1137040, 2307640, 43003, 444499, 4618443'}


 80%|████████  | 8/10 [01:06<00:19,  9.53s/it]

{'input': "\n- Here's additional context to help answer the question in posts with 1 comment refers to CommentCount = 1;\n\nQuestion: In posts with 1 comment, how many of the comments have 0 score?\n", 'output': 'In posts with 1 comment, there are 10,997 comments that have a score of 0.'}


 90%|█████████ | 9/10 [01:11<00:08,  8.39s/it]

{'input': "\n- Here's additional context to help answer the question user no. 23853 refers to UserId = '23853'; at 9:08:18 on 2013/7/12 refers to CreationDate = '2013-07-12 09:08:18.0'; not well-finished refers to ClosedDate IS NULL and vice versa\n\nQuestion: User No.23853 gave a comment to a post at 9:08:18 on 2013/7/12, was that post well-finished?\n", 'output': 'The post commented on by User No.23853 at 9:08:18 on 2013/7/12 was not well-finished, as its `closeddate` is `NULL`.'}


100%|██████████| 10/10 [01:18<00:00,  7.73s/it]

{'input': "\n- Here's additional context to help answer the question born in 1980 refers to YEAR(BIRTHDAY) = '1980'; 'RA' refers to Diagnosis='RA' ; women refers to SEX = 'F'; calculation = DIVIDE(SUM(SEX = 'F'), COUNT(SEX)) * 100\n\nQuestion: What percentage of patients who were born in 1980 and were diagnosed with RA are women?\n", 'output': 'The percentage of patients who were born in 1980 and were diagnosed with RA that are women is 100%.'}


100%|██████████| 10/10 [01:28<00:00,  8.89s/it]

{'input': "\n- Here's additional context to help answer the question Percent (%) Eligible FRPM (Ages 5-17) can be acquired by `FRPM Count (Ages 5-17)` / `Enrollment (Ages 5-17)` * 100\n\nQuestion: Which schools served a grade span of Kindergarten to 9th grade in the county of Los Angeles and what is its Percent (%) Eligible FRPM (Ages 5-17)?\n", 'output': 'Here are schools in Los Angeles County that served a grade span of Kindergarten to 9th grade and their respective Percent (%) Eligible FRPM (Ages 5-17):\n\n1. White Oak Elementary: 3.76%\n2. The Accelerated: 97.64%'}



