In [5]:
import sys
sys.path.append('..')

from sqltoolkit.connectors import SnowflakeConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
import pandas as pd
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Snowflake connection settings, read from the environment in one pass.
# Each value is None when the corresponding variable is not set.
user, password, account, warehouse, database, schema = (
    os.getenv(env_name)
    for env_name in (
        "SNOWFLAKE_USER",
        "SNOWFLAKE_PWD",
        "SNOWFLAKE_ACCOUNT",
        "SNOWFLAKE_WAREHOUSE",
        "SNOWFLAKE_DB",
        "SNOWFLAKE_SCHEMA",
    )
)

# Build the connector first, then wrap it in the client used by the rest of the notebook.
snowflake_connector = SnowflakeConnector(
    user=user,
    password=password,
    account=account,
    warehouse=warehouse,
    database=database,
    schema=schema,
)
sql_client = DatabaseClient(snowflake_connector)

In [6]:
from openai import AzureOpenAI

# Azure OpenAI settings come from the environment (None when a variable is unset).
aoai_endpoint = os.environ.get('OPENAI_ENDPOINT')
aoai_key = os.environ.get('OPENAI_API_KEY')
# Chat deployment: should be GPT-4o.
aoai_deployment = os.environ.get('OPENAI_4o_DEPLOYMENT')
# Embedding deployment: should be text-embedding-small.
aoai_embedding_deployment = os.environ.get('OPENAI_EMBEDDING_DEPLOYMENT')

# Single client instance shared by the chat calls made later in the notebook.
openai_client = AzureOpenAI(
    api_version='2024-10-21',
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

### 1. Generating metadata from the Snowflake Database

In [7]:
from sqltoolkit.indexer import DatabaseIndexer

# Free-text business context passed to the indexer as `extra_context`.
context = """
The database contains table about a food delivery service called Tasty Bytes
Tasty Bytes is a global food truck network with localized menu options, 15 countries, 30 major cities and 15 core brands

"""

indexer = DatabaseIndexer(sql_client, 
                          openai_client, 
                          aoai_deployment,
                          aoai_embedding_deployment,
                          extra_context=context)

# `regex_filter` is a regular expression, not a glob: "RAW_POS*" would mean
# "RAW_PO" followed by any number of "S" and so also match unrelated names.
# Match the literal schema prefix "RAW_POS." instead (dot escaped).
manifest = indexer.fetch_and_describe_tables(regex_filter=r"RAW_POS\.")
indexer.generate_table_embeddings()

# Export of the generated metadata as a JSON manifest.
tables_dict = indexer.export_json_manifest()

2025-03-03 14:17:04,062 - DatabaseIndexer - INFO - Fetching tables from the database.
2025-03-03 14:17:05,759 - DatabaseIndexer - INFO - Filtered tables by regex: ['RAW_POS.COUNTRY', 'RAW_POS.LOCATION', 'RAW_POS.MENU', 'RAW_POS.ORDER_DETAIL', 'RAW_POS.TRUCK', 'RAW_POS.FRANCHISE', 'RAW_POS.ORDER_HEADER']
2025-03-03 14:17:05,760 - DatabaseIndexer - INFO - Processing table: RAW_POS.COUNTRY
2025-03-03 14:17:23,005 - DatabaseIndexer - INFO - Completed processing table: RAW_POS.COUNTRY
2025-03-03 14:17:23,007 - DatabaseIndexer - INFO - Processing table: RAW_POS.LOCATION
2025-03-03 14:17:43,037 - DatabaseIndexer - INFO - Completed processing table: RAW_POS.LOCATION
2025-03-03 14:17:43,038 - DatabaseIndexer - INFO - Processing table: RAW_POS.MENU
2025-03-03 14:18:12,271 - DatabaseIndexer - INFO - Completed processing table: RAW_POS.MENU
2025-03-03 14:18:12,272 - DatabaseIndexer - INFO - Processing table: RAW_POS.ORDER_DETAIL
2025-03-03 14:18:39,330 - DatabaseIndexer - INFO - Completed processi

In [11]:
# Spot-check the generated metadata on the first indexed table.
first_table = indexer.tables[0]
for label, attribute in (('Table Name', 'name'), ('Table Description', 'description')):
    print(f'{label}: {getattr(first_table, attribute)}')

Table Name: RAW_POS.COUNTRY
Table Description: The RAW_POS.COUNTRY table is a key component of the Tasty Bytes database, providing detailed information on the countries and cities where the food delivery service operates. The table includes numerical identifiers and textual details for countries and cities, enhancing data categorization and retrieval. It covers essential data points like ISO currency and country codes, as well as city-specific identifiers and population figures. This data is integral for analyzing regional performance, understanding market demographics, and facilitating financial transactions across Tasty Bytes' global network. The table is foundational for ensuring data accuracy and segmentation based on geographic locations in the network.


### 2. Pushing metadata to an Azure AI Search Index

In [12]:
# Name of the Azure AI Search index used in the following cells.
index_name = 'tasty_bytes_snowflake'

# Service settings are read from the environment (None when a variable is unset).
search_endpoint = os.environ.get('AI_SEARCH_ENDPOINT')
search_key = os.environ.get('AI_SEARCH_API_KEY')
embedding_deployment = os.environ.get('OPENAI_EMBEDDING_DEPLOYMENT')

In [13]:
# Gather the index settings in one mapping, then pass them as keyword arguments.
index_settings = {
    "search_endpoint": search_endpoint,
    "search_credential": search_key,
    "index_name": index_name,
    "embedding_deployment": aoai_embedding_deployment,
    "openai_endpoint": aoai_endpoint,
    "openai_key": aoai_key,
}
indexer.create_azure_ai_search_index(**index_settings)

# write to AI Search
indexer.push_to_ai_search()

2025-03-03 14:27:09,897 - DatabaseIndexer - INFO - Creating Azure AI Search Index.
2025-03-03 14:27:09,912 - DatabaseIndexer - INFO - SearchIndexClient created.
2025-03-03 14:27:10,233 - DatabaseIndexer - INFO - Index 'tasty_bytes_snowflake' does not exist. Creating a new one.
2025-03-03 14:27:10,234 - DatabaseIndexer - INFO - Fields for the index defined.
2025-03-03 14:27:10,234 - DatabaseIndexer - INFO - Vector search configuration defined.
2025-03-03 14:27:10,235 - DatabaseIndexer - INFO - Semantic configuration defined.
2025-03-03 14:27:10,236 - DatabaseIndexer - INFO - Semantic search configuration defined.
2025-03-03 14:27:11,374 - DatabaseIndexer - INFO - Index 'tasty_bytes_snowflake' created successfully.
2025-03-03 14:27:11,378 - DatabaseIndexer - INFO - Pushing metadata for 7 tables to Azure AI Search.


Pushing data for table RAW_POS.COUNTRY to the index.


2025-03-03 14:27:11,734 - DatabaseIndexer - INFO - Data for table RAW_POS.COUNTRY pushed to the index.
2025-03-03 14:27:11,863 - DatabaseIndexer - INFO - Data for table RAW_POS.LOCATION pushed to the index.


Pushing data for table RAW_POS.LOCATION to the index.
Pushing data for table RAW_POS.MENU to the index.


2025-03-03 14:27:11,978 - DatabaseIndexer - INFO - Data for table RAW_POS.MENU pushed to the index.
2025-03-03 14:27:12,113 - DatabaseIndexer - INFO - Data for table RAW_POS.ORDER_DETAIL pushed to the index.


Pushing data for table RAW_POS.ORDER_DETAIL to the index.
Pushing data for table RAW_POS.TRUCK to the index.


2025-03-03 14:27:12,217 - DatabaseIndexer - INFO - Data for table RAW_POS.TRUCK pushed to the index.
2025-03-03 14:27:12,326 - DatabaseIndexer - INFO - Data for table RAW_POS.FRANCHISE pushed to the index.


Pushing data for table RAW_POS.FRANCHISE to the index.
Pushing data for table RAW_POS.ORDER_HEADER to the index.


2025-03-03 14:27:12,429 - DatabaseIndexer - INFO - Data for table RAW_POS.ORDER_HEADER pushed to the index.


### 3. RETRIEVAL: Query rewriter call
This step rewrites the user's question into a more specific one so that the table search returns better results.

In [14]:
# Query rewriting: ask the chat model for a more specific phrasing of the
# user's question before it is used for table retrieval.
user_question = "What is the top selling menu item?"

# Azure OpenAI deployment name used for the rewrite call.
rewriter_deployment = "gpt-4o-mini"

# Plain string on purpose: nothing is interpolated into this prompt.
rewriter_prompt = """
You are a helpful AI SQL Agent. Your role is to help people translate their questions 
into better questions that can be translated into SQL queries.
You should disambiguate the question and provide a more specific version of the question to the best of your ability
You should rewrite the question in a way that is more likely to be answered by the database.

You must only return a single sentence that is a better version of the user's question.

"""

messages = [
    {"role": "system", "content": rewriter_prompt},
    {"role": "user", "content": user_question},
]

# Keep the raw API response under its own name so that `rewriter_response`
# only ever refers to the rewritten question text.
rewriter_completion = openai_client.chat.completions.create(
    model=rewriter_deployment,
    messages=messages,
)

rewriter_response = rewriter_completion.choices[0].message.content
print(rewriter_response)

What is the name and total sales amount of the top-selling menu item in the restaurant's sales database?


### 3. RETRIEVAL: Table metadata retrieval from Azure AI Search

This step narrows down the table metadata that is retrieved and passed to the LLM inference call.

In [15]:
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Fields copied from each search hit into the candidate-table JSON.
metadata_fields = ('name', 'business_readable_name', 'description', 'columns')

search_client = SearchClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(search_key),
    index_name=index_name,
)

# Vector part of the query: built from the rewritten question against the "embedding" field.
vector_query = VectorizableTextQuery(
    text=rewriter_response,
    k_nearest_neighbors=50,
    fields="embedding",
)

# Text part uses the original question; semantic query type, keeping the top 3 hits.
results = search_client.search(
    search_text=user_question,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)

# One pretty-printed JSON document per hit, restricted to the fields the hit actually has.
candidate_tables = [
    json.dumps(
        {field: hit[field] for field in metadata_fields if field in hit},
        indent=4,
    )
    for hit in results
]

In [16]:
# Display the retrieved candidate tables (a list of JSON strings, one per search hit).
candidate_tables

['{\n    "name": "RAW_POS.MENU",\n    "business_readable_name": "Menu Information",\n    "description": "The RAW_POS.MENU table contains detailed information about the various menus offered by Tasty Bytes, a global food truck network that operates across multiple countries and cities. This table includes data on specific menu items, their associated costs, sale prices, and health metrics, as well as classification details such as menu types and categories. It serves as a central resource for managing menu details, linking them to truck brand names and different cuisine styles. The table is essential for understanding the diversity of culinary offerings, pricing strategies, and health considerations of menu items within the Tasty Bytes network.",\n    "columns": [\n        {\n            "name": "MENU_ID",\n            "type": "NUMBER",\n            "description": null,\n            "definition": "The MENU_ID column is a numerical identifier that uniquely distinguishes each menu offered

### 3. RETRIEVAL: SQL Query Generation

In [17]:
# Join the candidate-table JSON documents into one text block. Interpolating the
# list itself would embed its Python repr (brackets, quotes, escaped "\n")
# in the prompt instead of readable JSON.
tables_prompt = "\n\n".join(candidate_tables)

# The target database is Snowflake (see the connector set up at the top of the
# notebook), so the prompt names Snowflake consistently.
system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is Snowflake, use the right syntax to generate queries


     ### You must use the Snowflake SQL syntax to query the database
     ### You must use double quotes around the column names
     ### You must use double quotes around schema names and table names
     
     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
     "sql_query": "The generated sql query"


    ## You must always try to match the formatting and syntax of these example queries
    
    ### Important Examples:    
    question: "Show me the first 5 rows of the sales_data table"
    output: "chain_of_thought": "Your reasoning",
     "sql_query": "SELECT * FROM "public"."sales_data" LIMIT 5"

"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_question},
]

# Use the configured chat deployment (OPENAI_4o_DEPLOYMENT) rather than a
# hard-coded deployment name that exists only on one Azure resource.
response = openai_client.chat.completions.create(
    model=aoai_deployment,
    messages=messages,
    response_format={"type": "json_object"},
    temperature=0.3,
    seed=42,
)

# The reply is requested as a JSON object; parse it and show the generated query.
response_message = response.choices[0].message.content
response_json = json.loads(response_message)

print(response_json['sql_query'])

SELECT "MENU_ITEM_NAME", SUM("QUANTITY") AS total_quantity_sold FROM "RAW_POS"."ORDER_DETAIL" JOIN "RAW_POS"."MENU" ON "ORDER_DETAIL"."MENU_ITEM_ID" = "MENU"."MENU_ITEM_ID" GROUP BY "MENU_ITEM_NAME" ORDER BY total_quantity_sold DESC LIMIT 1


In [18]:
# The statement was written by the model, so treat it as untrusted input:
# only send it to the database when it starts like a read-only query.
generated_sql = response_json['sql_query']
if not generated_sql.strip().lower().startswith(('select', 'with')):
    raise ValueError(f"Refusing to run generated SQL that is not a SELECT/WITH query: {generated_sql!r}")

sql_client.query(generated_sql)

'[{"MENU_ITEM_NAME": "Ice Tea", "TOTAL_QUANTITY_SOLD": 62129143}]'