In [18]:
from sqltoolkit.connectors import PostgreSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
import pandas as pd
from dotenv import load_dotenv
import os

# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# PostgreSQL connection settings: host and credentials are read from the
# environment, while the database name and port are fixed for this notebook.
server, username, password = (
    os.getenv(env_key)
    for env_key in ('POSTGRES_HOST', 'POSTGRES_USER', 'POSTGRES_PWD')
)
database, port = 'nfl', '5432'

# The connector receives the connection parameters; the client wraps the connector.
psql_connector = PostgreSQLConnector(server, database, username, password, port)
sql_client = DatabaseClient(psql_connector)
  

In [8]:
from openai import AzureOpenAI

# Azure OpenAI endpoint and key, both supplied through environment variables.
aoai_endpoint, aoai_key = os.getenv('OPENAI_ENDPOINT'), os.getenv('OPENAI_API_KEY')

# Deployment names, also from the environment:
#   - chat deployment (expected to be GPT-4o)
#   - embedding deployment (expected to be a small text-embedding model)
aoai_deployment = os.getenv('OPENAI_4o_DEPLOYMENT')
aoai_embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')

# Client object used for the OpenAI calls in this notebook.
openai_client = AzureOpenAI(
    api_version='2024-10-21',
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

In [9]:
from sqltoolkit.indexer import DatabaseIndexer

# Free-text domain description handed to the indexer as `extra_context`.
context = """
The database contain statistics about the National Football League (NFL).
The tables contain information about the players, teams, games, and statistics.
"""

# The indexer is given the SQL client, the OpenAI client and the chat/embedding
# deployment names configured in the earlier cells.
indexer = DatabaseIndexer(
    sql_client,
    openai_client,
    aoai_deployment,
    aoai_embedding_deployment,
    extra_context=context,
)

# Restrict indexing to the 2024 team-stat tables in the `public` schema.
# The dot after "public" is escaped (raw string) so it matches a literal "."
# instead of any character.
manifest = indexer.fetch_and_describe_tables(regex_filter=r'public\.nfl_teams_stats_2024_.*')
indexer.generate_table_embeddings()

# Keep an exported copy of the manifest produced by the indexer.
tables_dict = indexer.export_json_manifest()

2024-12-17 17:13:33,692 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-17 17:13:33,692 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-17 17:13:33,737 - DatabaseIndexer - INFO - Filtered tables by regex: ['public.nfl_teams_stats_2024_passing', 'public.nfl_teams_stats_2024_receiving', 'public.nfl_teams_stats_2024_rushing', 'public.nfl_teams_stats_2024_scoring', 'public.nfl_teams_stats_2024_downs']
2024-12-17 17:13:33,737 - DatabaseIndexer - INFO - Filtered tables by regex: ['public.nfl_teams_stats_2024_passing', 'public.nfl_teams_stats_2024_receiving', 'public.nfl_teams_stats_2024_rushing', 'public.nfl_teams_stats_2024_scoring', 'public.nfl_teams_stats_2024_downs']
2024-12-17 17:13:33,738 - DatabaseIndexer - INFO - Processing table: public.nfl_teams_stats_2024_passing
2024-12-17 17:13:33,738 - DatabaseIndexer - INFO - Processing table: public.nfl_teams_stats_2024_passing
2024-12-17 17:13:58,823 - DatabaseIndexer - INFO - Completed processing 

In [11]:
# Inspect the generated metadata of the table at index 1 (displayed as a dict).
indexer.tables[1].model_dump()

{'name': 'public.nfl_teams_stats_2024_receiving',
 'business_readable_name': 'NFL Teams 2024 Receiving Stats',
 'description': 'The table public.nfl_teams_stats_2024_receiving contains statistical data on the receiving performance of NFL teams during the 2024 season. It includes metrics on team performance such as total receptions, receiving yards, receiving touchdowns, and significant plays like receptions of 20 or more yards and 40 or more yards. Additionally, it tracks first down receptions, the percentage of receptions that resulted in first downs, and fumbles made by receivers. This table is used to evaluate and compare the receiving effectiveness and efficiency of different NFL teams for the specified season.',
 'columns': [{'name': 'Team',
   'type': 'text',
   'description': None,
   'definition': 'The "Team" column in the "public.nfl_teams_stats_2024_receiving" table contains textual data representing the names of NFL teams. Each entry in this column specifies the name of a te

In [12]:
# Azure AI Search settings come from the environment; the index name is fixed.
search_endpoint = os.getenv('AI_SEARCH_ENDPOINT')
search_key = os.getenv('AI_SEARCH_API_KEY')
embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')
index_name = 'nfl_database'

# Create the index, passing the search and OpenAI settings the indexer needs.
indexer.create_azure_ai_search_index(
    search_endpoint=search_endpoint,
    search_credential=search_key,
    index_name=index_name,
    # Use the deployment name read in this cell; it was previously assigned
    # but never used (the call took the same setting from another variable).
    embedding_deployment=embedding_deployment,
    openai_endpoint=aoai_endpoint,
    openai_key=aoai_key,
)

# write to AI Search
indexer.push_to_ai_search()

2024-12-17 18:15:47,215 - DatabaseIndexer - INFO - Creating Azure AI Search Index.
2024-12-17 18:15:47,215 - DatabaseIndexer - INFO - Creating Azure AI Search Index.
2024-12-17 18:15:47,367 - DatabaseIndexer - INFO - SearchIndexClient created.
2024-12-17 18:15:47,367 - DatabaseIndexer - INFO - SearchIndexClient created.
2024-12-17 18:15:47,689 - DatabaseIndexer - INFO - Index 'nfl_database' does not exist. Creating a new one.
2024-12-17 18:15:47,689 - DatabaseIndexer - INFO - Index 'nfl_database' does not exist. Creating a new one.
2024-12-17 18:15:47,691 - DatabaseIndexer - INFO - Fields for the index defined.
2024-12-17 18:15:47,691 - DatabaseIndexer - INFO - Fields for the index defined.
2024-12-17 18:15:47,693 - DatabaseIndexer - INFO - Vector search configuration defined.
2024-12-17 18:15:47,693 - DatabaseIndexer - INFO - Vector search configuration defined.
2024-12-17 18:15:47,696 - DatabaseIndexer - INFO - Semantic configuration defined.
2024-12-17 18:15:47,696 - DatabaseIndexer

Pushing data for table public.nfl_teams_stats_2024_passing to the index.


2024-12-17 18:15:48,561 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_passing pushed to the index.
2024-12-17 18:15:48,561 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_passing pushed to the index.
2024-12-17 18:15:48,706 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_receiving pushed to the index.
2024-12-17 18:15:48,706 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_receiving pushed to the index.


Pushing data for table public.nfl_teams_stats_2024_receiving to the index.
Pushing data for table public.nfl_teams_stats_2024_rushing to the index.


2024-12-17 18:15:48,818 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_rushing pushed to the index.
2024-12-17 18:15:48,818 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_rushing pushed to the index.
2024-12-17 18:15:48,959 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_scoring pushed to the index.
2024-12-17 18:15:48,959 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_scoring pushed to the index.


Pushing data for table public.nfl_teams_stats_2024_scoring to the index.
Pushing data for table public.nfl_teams_stats_2024_downs to the index.


2024-12-17 18:15:49,077 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_downs pushed to the index.
2024-12-17 18:15:49,077 - DatabaseIndexer - INFO - Data for table public.nfl_teams_stats_2024_downs pushed to the index.


In [14]:
# query rewriting
# Raw question as a user might type it (note the typo); the rewriter call
# below turns it into a cleaner, more specific question.
user_question = "which 3 teams have the most receiving yars in 2024"

# Plain string rather than an f-string: the prompt has no placeholders.
# Wording kept in line with the prompt used inside execute_query_pipeline.
rewriter_prompt = """
You are a helpful AI SQL Agent. Your role is to help people translate their questions 
into better questions that can be translated into SQL queries.
You should disambiguate the question and provide a more specific version of the question to the best of your ability.
You should rewrite the question in a way that is more likely to be answered by the database.

You must only return a single sentence that is a better version of the user's question.

"""

messages = [
    {"role": "system", "content": rewriter_prompt},
    {"role": "user", "content": user_question},
]

# The completion object gets its own name so that `rewriter_response` is only
# ever bound to the rewritten question text.
rewriter_completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)

rewriter_response = rewriter_completion.choices[0].message.content
print(rewriter_response)

Which three football teams had the highest total receiving yards during the 2024 season?


In [15]:
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Client bound to the index created earlier, authenticated with the API key.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(search_key),
)

# The vector part of the query is built from the rewritten question text.
vector_query = VectorizableTextQuery(
    text=rewriter_response,
    k_nearest_neighbors=50,
    fields="embedding",
)

# The text part of the query uses the original user question; only the top 3
# hits are requested.
results = search_client.search(
    search_text=user_question,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)

# Keep only the descriptive fields of each hit, serialised as indented JSON.
_table_fields = ('name', 'business_readable_name', 'description', 'columns')
candidate_tables = [
    json.dumps({key: hit[key] for key in _table_fields if key in hit}, indent=4)
    for hit in results
]

In [9]:
# Display the JSON strings collected from the search results.
candidate_tables

['{\n    "name": "public.nfl_teams_stats_2024_receiving",\n    "business_readable_name": "2024 NFL Team Receiving Stats",\n    "description": "The table \\"public.nfl_teams_stats_2024_receiving\\" contains statistical data related to the receiving performance of NFL teams for the 2024 season. It includes various metrics that help evaluate the effectiveness, efficiency, and productivity of the teams\' receiving games, such as total receptions, receiving yards, touchdowns, and notable big plays. This data is essential for analyzing team performance in passing offense, including their ability to gain significant yardage, secure first downs, and maintain ball security.",\n    "columns": [\n        {\n            "name": "Team",\n            "type": "text",\n            "description": null,\n            "definition": "The \\"Team\\" column in the \\"public.nfl_teams_stats_2024_receiving\\" table contains the names of NFL teams. This column identifies which NFL team the statistical data in t

In [14]:
# Join the per-table JSON documents into one block of text. Interpolating the
# list itself would insert its Python repr (escaped "\n" and quotes), which
# throws away the indentation produced by json.dumps(..., indent=4).
tables_prompt = "\n\n".join(candidate_tables)

# The database behind `sql_client` is PostgreSQL, so the prompt names
# PostgreSQL consistently instead of also mentioning SQL Server.
system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is PostgreSQL, use the right syntax to generate queries


     ### You must use the POSTGRES SQL syntax to query the database
     ### You must use double quotes around the column names
     
     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
     "sql_query": "The generated sql query"

    Example:    
    question: "Show me the first 5 rows of the sales_data table"
    output: "chain_of_thought": "Your reasoning",
     "sql_query": "SELECT * FROM sales_data LIMIT 5"

"""

# The original user question (not the rewritten one) is sent as the user turn.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_question},
]

# JSON mode: the reply is requested as a JSON object so it can be parsed below.
response = openai_client.chat.completions.create(
    model="gpt-4o-global",
    messages=messages,
    response_format={"type": "json_object"},
)

response_message = response.choices[0].message.content
response_json = json.loads(response_message)

print(response_json['sql_query'])

SELECT "Team", "Yds" FROM public.nfl_teams_stats_2024_receiving ORDER BY "Yds" DESC LIMIT 3


In [15]:
# Run the generated SQL through the database client and display the result.
sql_client.query(response_json['sql_query'])

'[{"Team": "Bengals  Bengals", "Yds": 3977}, {"Team": "Lions  Lions", "Yds": 3848}, {"Team": "Seahawks  Seahawks", "Yds": 3682}]'

In [20]:
import time

def execute_query_pipeline(user_question, rewriter_model="gpt-4o-mini", sql_model="gpt-4o-global"):
    """Answer a natural-language question by generating and running one SQL query.

    Steps:
      1. Ask the rewriter model for a clearer, single-sentence version of the question.
      2. Search the index (text = original question, vector = rewritten question)
         and keep the descriptive fields of the top 3 hits.
      3. Ask the SQL model, in JSON mode, for a reasoning string and a SQL query
         given those table descriptions.
      4. Run the query through ``sql_client`` and return its result.

    Relies on the notebook-level ``openai_client``, ``search_client`` and
    ``sql_client`` objects.

    Args:
        user_question: The question as typed by the user.
        rewriter_model: Model/deployment name used for the rewriting call.
        sql_model: Model/deployment name used for the SQL-generation call.

    Returns:
        Whatever ``sql_client.query`` returns for the generated query.

    Raises:
        json.JSONDecodeError: If the SQL model's reply is not valid JSON.
        KeyError: If the parsed reply has no ``"sql_query"`` entry.
    """
    # Query rewriting (plain string: the prompt has no placeholders)
    rewriter_prompt = """
    You are a helpful AI SQL Agent. Your role is to help people translate their questions 
    into better questions that can be translated into SQL queries.
    You should disambiguate the question and provide a more specific version of the question to the best of your ability.
    You should rewrite the question in a way that is more likely to be answered by the database.

    You must only return a single sentence that is a better version of the user's question.
    """

    rewrite_completion = openai_client.chat.completions.create(
        model=rewriter_model,
        messages=[
            {"role": "system", "content": rewriter_prompt},
            {"role": "user", "content": user_question},
        ],
    )

    rewriter_response = rewrite_completion.choices[0].message.content
    print("Rewritten Question:", rewriter_response)

    # AI Search
    vector_query = VectorizableTextQuery(text=rewriter_response, k_nearest_neighbors=50, fields="embedding")

    results = search_client.search(
        search_text=user_question,
        vector_queries=[vector_query],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name='my-semantic-config',
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=3
    )

    # Keep only the descriptive fields of each hit, serialised as indented JSON.
    table_fields = ('name', 'business_readable_name', 'description', 'columns')
    candidate_tables = [
        json.dumps({key: result[key] for key in table_fields if key in result}, indent=4)
        for result in results
    ]

    # LLM Call
    # Join the JSON documents into plain text; interpolating the list itself
    # would insert its Python repr, with escaped newlines and quotes.
    tables_prompt = "\n\n".join(candidate_tables)

    # The prompt names PostgreSQL consistently (it previously also said
    # "SQL server", contradicting the POSTGRES instruction below it).
    system_prompt = f"""
    You are a helpful AI data analyst assistant, 
    You can execute SQL queries to retrieve information from a SQL database,
    The database is PostgreSQL, use the right syntax to generate queries

    ### You must use the POSTGRES SQL syntax to query the database
    ### You must use double quotes around the column names
    ### Your output must be a single SQL query

    ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

    When asked a question that could be answered with a SQL query: 
    - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contain the information needed to answer the question
    - Only once this is done, create a SQL query to retrieve the information based on your understanding of the table

    Think step by step, before doing anything, share the different steps you'll execute to get the answer

    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
    "sql_query": "The generated SQL query, this can only be a single query"
    """

    response = openai_client.chat.completions.create(
        model=sql_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_question},
        ],
        response_format={"type": "json_object"}
    )

    response_message = response.choices[0].message.content
    response_json = json.loads(response_message)
    sql_query = response_json['sql_query']

    print("Generated SQL Query:", sql_query)

    # Execute SQL Query
    # NOTE(review): the model-generated SQL is executed verbatim. Confirm that
    # the database credentials in use are read-only before exposing this to
    # untrusted questions.
    query_result = sql_client.query(sql_query)
    return query_result

# Example usage
user_question = "Does buffalo have more passing or rushing touchdowns?"

start_time = time.time()
result = execute_query_pipeline(user_question)
end_time = time.time()

latency = end_time - start_time
print("Result:", result)
print("Latency:", latency, "seconds")

Rewritten Question: How many passing touchdowns and rushing touchdowns does the Buffalo team have this season, and which total is higher?
Generated SQL Query: SELECT r."Team", r."TD" AS rushing_td, p."TD" AS passing_td FROM public.nfl_teams_stats_2024_rushing r JOIN public.nfl_teams_stats_2024_passing p ON r."Team" = p."Team" WHERE r."Team" = 'Bills  Bills';
Result: [{"Team": "Bills  Bills", "rushing_td": 28, "passing_td": 25}]
Latency: 3.8042643070220947 seconds
