In [1]:
from sqltoolkit.connectors import PostgreSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()


# Define the connection parameters.
# Host and credentials are read from the environment; the database name and
# port are fixed for this notebook.
# NOTE: os.getenv returns None for a missing variable, so an incomplete
# environment is passed through to the connector rather than failing here.
server = os.getenv('POSTGRES_HOST')
database = 'nfl'
username = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PWD')
port = '5432'

# The connector holds the connection parameters; the client wraps it and is the
# object the later cells use (indexer construction and `sql_client.query`).
psql_connector = PostgreSQLConnector(server, database, username, password, port)
sql_client = DatabaseClient(psql_connector)
  

In [2]:
from openai import AzureOpenAI

# Azure OpenAI settings, all read from the environment (None when unset).
aoai_endpoint = os.getenv('OPENAI_ENDPOINT')
aoai_key = os.getenv('OPENAI_API_KEY')
aoai_deployment = os.getenv('OPENAI_4o_DEPLOYMENT') # should be GPT-4o
aoai_embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT') # should be text-embedding-small

# Single client shared by the indexer and by every chat-completion call below.
# The API version is pinned so request/response shapes stay stable.
openai_client = AzureOpenAI(azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    api_version='2024-10-21')

In [3]:
from sqltoolkit.indexer import DatabaseIndexer

# Free-text domain description handed to the indexer as `extra_context`.
context = """
The database contain statistics about the National Football League (NFL).
The tables contain information about the players, teams, games, and statistics.
"""

# The indexer is given the SQL client plus the chat and embedding deployments.
indexer = DatabaseIndexer(
    sql_client,
    openai_client,
    aoai_deployment,
    aoai_embedding_deployment,
    extra_context=context,
)

# Restrict indexing to the 2024 team-stats tables. The dot after the schema name
# is escaped so it matches a literal "." instead of any character, and the
# pattern is a raw string so the backslash reaches the regex engine untouched.
# NOTE(review): this step calls the LLM per table and can take a long time.
manifest = indexer.fetch_and_describe_tables(regex_filter=r'public\.nfl_teams_stats_2024_.*')
indexer.generate_table_embeddings()

# Keep a JSON-style export of the manifest for inspection / reuse.
tables_dict = indexer.export_json_manifest()

2025-01-27 15:55:54,623 - DatabaseIndexer - INFO - Fetching tables from the database.
2025-01-27 15:55:54,955 - DatabaseIndexer - INFO - Filtered tables by regex: ['public.nfl_teams_stats_2024_passing', 'public.nfl_teams_stats_2024_receiving', 'public.nfl_teams_stats_2024_rushing', 'public.nfl_teams_stats_2024_scoring', 'public.nfl_teams_stats_2024_downs']
2025-01-27 15:55:54,956 - DatabaseIndexer - INFO - Processing table: public.nfl_teams_stats_2024_passing


KeyboardInterrupt: 

In [11]:
# Spot-check the generated manifest: show the second indexed table as a plain dict.
indexer.tables[1].model_dump()

{'name': 'public.nfl_teams_stats_2024_receiving',
 'business_readable_name': 'NFL Teams 2024 Receiving Stats',
 'description': 'The table public.nfl_teams_stats_2024_receiving contains statistical data on the receiving performance of NFL teams during the 2024 season. It includes metrics on team performance such as total receptions, receiving yards, receiving touchdowns, and significant plays like receptions of 20 or more yards and 40 or more yards. Additionally, it tracks first down receptions, the percentage of receptions that resulted in first downs, and fumbles made by receivers. This table is used to evaluate and compare the receiving effectiveness and efficiency of different NFL teams for the specified season.',
 'columns': [{'name': 'Team',
   'type': 'text',
   'description': None,
   'definition': 'The "Team" column in the "public.nfl_teams_stats_2024_receiving" table contains textual data representing the names of NFL teams. Each entry in this column specifies the name of a te

In [4]:
# Azure AI Search settings, read from the environment (None when unset).
search_endpoint = os.getenv('AI_SEARCH_ENDPOINT')
search_key = os.getenv('AI_SEARCH_API_KEY')
# NOTE(review): same environment variable as `aoai_embedding_deployment`; the
# visible cells only use `aoai_embedding_deployment` — confirm this one is needed.
embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')
# Name of the search index that the table manifest is written to and queried from.
index_name = 'nfl_database'


In [None]:

# Create the Azure AI Search index for the table manifest. The OpenAI endpoint,
# key and embedding deployment are passed along as well — presumably so the
# index can vectorize query text itself (the search cell sends plain text via
# VectorizableTextQuery); confirm against the indexer implementation.
indexer.create_azure_ai_search_index(
    search_endpoint=search_endpoint,
    search_credential=search_key,
    index_name=index_name,
    embedding_deployment=aoai_embedding_deployment,
    openai_endpoint=aoai_endpoint,
    openai_key=aoai_key,
)

# write to AI Search
indexer.push_to_ai_search()

In [21]:
# Query rewriting: ask a small model to turn the raw user question into a more
# specific one before it is used for vector retrieval.
user_question = "Show me passing yards for the Chicago bears in 2024"

# Plain string (no f-prefix): the prompt contains no placeholders to interpolate.
# The wording matches the copy of this prompt used in execute_query_pipeline.
rewriter_prompt = """
You are a helpful AI SQL Agent. Your role is to help people translate their questions 
into better questions that can be translated into SQL queries.
You should disambiguate the question and provide a more specific version of the question to the best of your ability.
You should rewrite the question in a way that is more likely to be answered by the database.

You must only return a single sentence that is a better version of the user's question.

"""

messages = [
    {
        "role": "system",
        "content": rewriter_prompt
    },
    {
        "role": "user",
        "content": user_question
    }
]

# Keep the raw completion under its own name so `rewriter_response` is only ever
# the rewritten text (it is consumed as a string by the search cell).
rewriter_completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)

# The message content can be empty/None; fall back to the original question so
# downstream retrieval always receives usable text.
rewriter_response = rewriter_completion.choices[0].message.content or user_question
print(rewriter_response)

What are the total passing yards for the Chicago Bears during the 2024 NFL season?


In [22]:
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Client bound to the index that holds the table manifest.
search_client = SearchClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(search_key),
    index_name=index_name,
)

# Vector leg of the hybrid query: the rewritten question is sent as text and
# matched against the "embedding" field.
vector_query = VectorizableTextQuery(
    text=rewriter_response,
    k_nearest_neighbors=50,
    fields="embedding",
)

# Keyword leg uses the original question; semantic ranking keeps the top 3 hits.
results = search_client.search(
    search_text=user_question,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)

# Only these manifest fields are forwarded to the SQL-generation prompt; each
# hit is serialized to pretty-printed JSON, skipping fields the hit lacks.
table_fields = ('name', 'business_readable_name', 'description', 'columns')
candidate_tables = [
    json.dumps({field: hit[field] for field in table_fields if field in hit}, indent=4)
    for hit in results
]

In [23]:
candidate_tables

['{\n    "name": "public.nfl_teams_stats_2024_passing",\n    "business_readable_name": "NFL Teams 2024 Passing Stats",\n    "description": "The \\"public.nfl_teams_stats_2024_passing\\" table contains statistical data on the passing performance of NFL teams during the 2024 season. This table includes metrics such as passing attempts, completions, yards gained, interceptions, touchdowns, and completions resulting in significant gains or first downs. The data helps evaluate multiple aspects of a team\'s passing efficiency, strategy, and offensive effectiveness. Additionally, it provides insights into both the accomplishments and shortcomings of each team\'s passing game, including quarterback performance and offensive line protection.",\n    "columns": [\n        {\n            "name": "Team",\n            "type": "text",\n            "description": null,\n            "definition": "The \\"Team\\" column in the \\"public.nfl_teams_stats_2024_passing\\" table contains the names of the NFL

In [None]:
# Render the retrieved tables as readable JSON text. Interpolating the list
# itself would embed its Python repr (brackets, escaped "\n") into the prompt.
tables_prompt = "\n\n".join(candidate_tables)

# The backend is PostgreSQL (see the connector cell), so the prompt must not
# describe it as SQL Server while also demanding POSTGRES syntax.
system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is PostgreSQL, use the right syntax to generate queries


     ### You must use the POSTGRES SQL syntax to query the database
     ### You must use double quotes around the column names
     ### You must use double quotes around schema names and table names
     
     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
     "sql_query": "The generated sql query"


    ## You must always try to match the formatting and syntax of these example queries
    
    ### Important Examples:    
    question: "Show me the first 5 rows of the sales_data table"
    output: "chain_of_thought": "Your reasoning",
     "sql_query": "SELECT * FROM "public"."sales_data" LIMIT 5"

"""

messages = [
    {
        "role": "system",
        "content": system_prompt
    },
    {
        "role": "user",
        "content": user_question
    }
]

# Repeat the same request to eyeball how stable the generated SQL is; the fixed
# seed and low temperature are there to reduce run-to-run variation.
for _ in range(10):
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0.3,
        seed=42,
    )

    response_message = response.choices[0].message.content
    # JSON mode is requested above, so the content is parsed as a JSON object.
    response_json = json.loads(response_message)

    print(response_json['sql_query'])

SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'
SELECT "Pass Yds" FROM "public"."nfl_teams_stats_2024_passing" WHERE "Team" = 'Bears  Bears'


In [20]:
# Run the last generated query against the database and display the result.
# NOTE(review): the SQL text comes straight from the model and is executed
# unvalidated — assumes the database credentials are read-only; confirm.
sql_client.query(response_json['sql_query'])

'[{"Team": "Bears  Bears", "Att": 468, "Cmp": 290, "Cmp %": 62.0, "Yds/Att": 6.3, "Pass Yds": 2948, "TD": 17, "INT": 5, "Rate": 87.6, "1st": 140, "1st%": 29.9, "20+": 33, "40+": 5, "Lng": "47", "Sck": 58, "SckY": 402}]'

In [20]:
import time

def _rewrite_question(user_question):
    """Ask the rewriter deployment for one clearer, more specific version of the question."""
    # Plain string: the prompt has no placeholders, so no f-prefix is needed.
    rewriter_prompt = """
    You are a helpful AI SQL Agent. Your role is to help people translate their questions 
    into better questions that can be translated into SQL queries.
    You should disambiguate the question and provide a more specific version of the question to the best of your ability.
    You should rewrite the question in a way that is more likely to be answered by the database.

    You must only return a single sentence that is a better version of the user's question.
    """

    messages = [
        {
            "role": "system",
            "content": rewriter_prompt
        },
        {
            "role": "user",
            "content": user_question
        }
    ]

    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )

    # Empty/None content would break the vector query; fall back to the original text.
    return completion.choices[0].message.content or user_question


def _search_candidate_tables(user_question, rewritten_question):
    """Run the hybrid search and return the top table manifest entries as dicts."""
    vector_query = VectorizableTextQuery(text=rewritten_question, k_nearest_neighbors=50, fields="embedding")

    results = search_client.search(
        search_text=user_question,
        vector_queries=[vector_query],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name='my-semantic-config',
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=3
    )

    # Forward only the manifest fields the SQL prompt needs, skipping absent ones.
    manifest_fields = ('name', 'business_readable_name', 'description', 'columns')
    return [
        {key: result[key] for key in manifest_fields if key in result}
        for result in results
    ]


def _generate_sql(user_question, candidate_tables):
    """Ask the SQL-generation deployment for a query and return its parsed JSON reply."""
    # Join the tables as JSON text; interpolating a list would embed its Python
    # repr (brackets and escaped newlines) into the prompt.
    tables_prompt = "\n\n".join(json.dumps(table, indent=4) for table in candidate_tables)

    # The backend is PostgreSQL, so the prompt must not call it SQL Server while
    # also requiring POSTGRES syntax.
    system_prompt = f"""
    You are a helpful AI data analyst assistant, 
    You can execute SQL queries to retrieve information from a SQL database,
    The database is PostgreSQL, use the right syntax to generate queries

    ### You must use the POSTGRES SQL syntax to query the database
    ### You must use double quotes around the column names but not around table names
    ### Your output must be a single SQL query
    ### You must use the column sample values to infer the value in your WHERE clause

    ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

    When asked a question that could be answered with a SQL query: 
    - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contain the information needed to answer the question
    - Only once this is done, create a SQL query to retrieve the information based on your understanding of the table

    Think step by step, before doing anything, share the different steps you'll execute to get the answer

    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
    "sql_query": "The generated SQL query, this can only be a single query"
    """

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_question
        }
    ]

    response = openai_client.chat.completions.create(
        model="gpt-4o-global",
        messages=messages,
        response_format={"type": "json_object"}
    )

    # JSON mode is requested above, so the content is parsed as a JSON object.
    return json.loads(response.choices[0].message.content)


def execute_query_pipeline(user_question):
    """Answer a natural-language question by generating and running a SQL query.

    Steps: rewrite the question, retrieve candidate tables from AI Search, ask
    the model for a single SQL query, then execute it with `sql_client`.
    Progress (rewritten question, candidate table names, reasoning, SQL) is
    printed along the way.

    Args:
        user_question: The question as typed by the user.

    Returns:
        Whatever `sql_client.query` returns for the generated SQL.

    Raises:
        KeyError: If the model reply has no "sql_query" field.
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    rewritten_question = _rewrite_question(user_question)
    print("Rewritten Question:", rewritten_question)

    candidate_tables = _search_candidate_tables(user_question, rewritten_question)
    response_json = _generate_sql(user_question, candidate_tables)

    print("Candidate Tables:", [table.get('name') for table in candidate_tables])
    # The reasoning field is informational only; do not fail when it is missing.
    print("Generated Chain of Thought:", response_json.get('chain_of_thought'))
    print("Generated SQL Query:", response_json['sql_query'])

    # NOTE(review): the SQL is model-generated and executed without validation;
    # this assumes the database credentials are read-only — confirm.
    return sql_client.query(response_json['sql_query'])

# Example usage
user_question = "Show me passing stats for all AFC teams"

# Use perf_counter() for the elapsed time: it is a monotonic, high-resolution
# clock, whereas time.time() follows the wall clock and can jump if the system
# time is adjusted. The two readings are only meaningful as a difference.
start_time = time.perf_counter()
result = execute_query_pipeline(user_question)
end_time = time.perf_counter()

latency = end_time - start_time
print("Result:", result)
print("Latency:", latency, "seconds")

Rewritten Question: What are the passing statistics for each team in the AFC for the current season?
Candidate Tables: ['public.nfl_teams_stats_2024_passing', 'public.nfl_teams_stats_2024_downs', 'public.nfl_teams_stats_2024_receiving']
Generated Chain of Thought: To show passing stats for all AFC teams, we first need to filter the data by teams that belong to the AFC. However, the given tables do not explicitly categorize teams by their conferences (AFC or NFC). We need to manually filter the teams known to be part of the AFC using the 'public.nfl_teams_stats_2024_passing' table. I'll use the 'Team' column and assume values for AFC teams like 'Bills', 'Broncos', 'Browns', 'Chiefs', etc. are known as sample values indicate typical AFC teams. Thus, I'll generate a WHERE clause that filters the results for these specific teams.
Generated SQL Query: SELECT * FROM public.nfl_teams_stats_2024_passing WHERE "Team" IN ('Bills  Bills', 'Broncos  Broncos', 'Browns  Browns', 'Chargers  Chargers'

In [28]:
import time 
import numpy as np

# System prompt for the final step: turn the SQL result into a short answer.
response_generation_prompt = """
You are a helpful AI data analyst assistant,
You are given a question and the result of the SQL query that answers the question

Your role is to generate a response to the question only based on the result of the SQL query
You cannot use anything else than the result of the SQL query to generate the response

Your response must be short and formatted in markdown for any table or list.
"""

response_generation_model = "gpt-4o-mini"

# Built once, so every benchmark run sends the exact same payload (including the
# timestamp captured here) to each deployment.
messages = [
    {
        "role": "system",
        "content": response_generation_prompt
    },
    {
        "role": "user",
        "content": f"""{time.strftime("%Y-%m-%d %H:%M:%S")}
<Question> {user_question} </Question>
<SQL_Result> {result} </SQL_Result>
        
        """
    }
]

# Single source of truth for the deployments under test and the sample size, so
# the result dicts, the request loop and the summary loop cannot drift apart.
BENCHMARK_MODELS = ("gpt-4o-mini", "gpt-4o-global")
BENCHMARK_RUNS = 20

print(f"User Question: {user_question}")
latencies = {name: [] for name in BENCHMARK_MODELS}
tokens = {name: [] for name in BENCHMARK_MODELS}

# Alternate between deployments on every run so both see similar conditions.
for _ in range(BENCHMARK_RUNS):
    for generation_model in BENCHMARK_MODELS:
        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # with wall-clock adjustments and skew individual samples.
        start = time.perf_counter()
        response = openai_client.chat.completions.create(
            model=generation_model,
            messages=messages,
        )
        end = time.perf_counter()
        latencies[generation_model].append(end - start)
        # Only completion (generated) tokens are recorded, not prompt tokens.
        tokens[generation_model].append(response.usage.completion_tokens)

# Calculate average and p95 latency
for model in BENCHMARK_MODELS:
    avg_latency = np.mean(latencies[model])
    p95_latency = np.percentile(latencies[model], 95)
    avg_tokens = np.mean(tokens[model])
    print(f"{model} - Average Latency: {avg_latency:.2f} seconds")
    print(f"{model} - P95 Latency: {p95_latency:.2f} seconds")
    print(f"{model} - Average Tokens: {avg_tokens:.2f}")

User Question: Show me passing stats for all AFC teams
gpt-4o-mini - Average Latency: 9.89 seconds
gpt-4o-mini - P95 Latency: 12.25 seconds
gpt-4o-mini - Average Tokens: 1138.80
gpt-4o-global - Average Latency: 14.94 seconds
gpt-4o-global - P95 Latency: 24.68 seconds
gpt-4o-global - Average Tokens: 1155.25
