In [None]:
import sys
sys.path.append('..')

from sqltoolkit.connectors import AzureSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
  
# Azure SQL connection settings for the `adventureworks` database.
server = 'vince-h-sql.database.windows.net'
database = 'adventureworks'

# Build the connector first, then hand it to the database client that the
# rest of the notebook uses (`sql_client`).
azure_connector = AzureSQLConnector(database=database, server=server)
sql_client = DatabaseClient(azure_connector)
  

In [2]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load settings from a .env file before reading them from the environment.
load_dotenv()

# Azure OpenAI endpoint and key; each lookup yields None when the variable is unset.
aoai_endpoint = os.environ.get('OPENAI_ENDPOINT')
aoai_key = os.environ.get('OPENAI_API_KEY')

# Deployment names: the chat deployment should be GPT-4o and the embedding
# deployment should be a text-embedding "small" model.
aoai_deployment = os.environ.get('OPENAI_4o_DEPLOYMENT')
aoai_embedding_deployment = os.environ.get('OPENAI_EMBEDDING_DEPLOYMENT')

# Single client instance shared by the indexer and the chat calls below.
openai_client = AzureOpenAI(
    api_version='2024-10-21',
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

### Generating table descriptions from the database

In [3]:
from sqltoolkit.indexer import DatabaseIndexer

# Build the indexer from the SQL client, the OpenAI client, and the chat and
# embedding deployment names.
indexer = DatabaseIndexer(sql_client,
                          openai_client,
                          aoai_deployment,
                          aoai_embedding_deployment)

# Full list of tables of interest. It gets its own name so it is no longer
# silently discarded by the single-table override below.
full_table_subset = ['SalesLT.SalesOrderHeader',
                     'SalesLT.SalesOrderDetail',
                     'SalesLT.Product',
                     'SalesLT.Customer',
                     'SalesLT.ProductCategory']

# Only this table is processed in this run; assign `full_table_subset`
# instead to describe all five tables.
table_subset = ['SalesLT.SalesOrderHeader']

manifest = indexer.fetch_and_describe_tables(table_list=table_subset)
indexer.generate_table_embeddings()

# NOTE(review): despite the name this is presumably a JSON string, since it is
# passed straight to f.write() — confirm against export_json_manifest().
tables_dict = indexer.export_json_manifest()

# Explicit UTF-8 so the file does not depend on the platform default encoding
# (generated descriptions may contain non-ASCII characters).
with open('tables_manifest.json', 'w', encoding='utf-8') as f:
    f.write(tables_dict)

2024-12-17 12:12:05,296 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-17 12:12:05,662 - DatabaseIndexer - INFO - Filtered tables: ['SalesLT.SalesOrderHeader']
2024-12-17 12:12:05,664 - DatabaseIndexer - INFO - Processing table: SalesLT.SalesOrderHeader
2024-12-17 12:12:37,719 - DatabaseIndexer - INFO - Completed processing table: SalesLT.SalesOrderHeader
2024-12-17 12:12:37,720 - DatabaseIndexer - INFO - Completed fetching and processing all tables.


In [4]:
# Show the generated description of the first table held by the indexer.
print(indexer.tables[0].description)

The SalesLT.SalesOrderHeader table contains information related to sales orders, including details such as order dates, shipping and billing addresses, and payment specifics. This table is essential for managing and tracking sales orders from initiation through fulfillment and delivery. It includes identifiers for sales, customer, and purchase orders, as well as financial data like subtotals, taxes, and totals due. The table also provides status updates and comments for each order, facilitating comprehensive order management and analysis.


### Building AI Search Index and pushing data

In [15]:
# Azure AI Search endpoint and key, read from the environment
# (None when a variable is unset).
search_endpoint = os.environ.get('AI_SEARCH_ENDPOINT')
search_key = os.environ.get('AI_SEARCH_API_KEY')

# Reads the same environment variable as `aoai_embedding_deployment`.
embedding_deployment = os.environ.get('OPENAI_EMBEDDING_DEPLOYMENT')

# Name of the search index that is created and later queried.
index_name = 'adventureworks'


In [None]:

# Create the Azure AI Search index named `index_name`. The OpenAI endpoint,
# key and embedding deployment are passed along as well.
# NOTE(review): presumably these configure the index's vectorizer — confirm
# in DatabaseIndexer.create_azure_ai_search_index.
indexer.create_azure_ai_search_index(
    search_endpoint=search_endpoint,
    search_credential=search_key,
    index_name=index_name,
    embedding_deployment=aoai_embedding_deployment,
    openai_endpoint=aoai_endpoint,
    openai_key=aoai_key,
)

In [6]:
# Write the indexer's table metadata to the Azure AI Search index.
indexer.push_to_ai_search()

2024-12-09 15:58:43,476 - DatabaseIndexer - INFO - Pushing metadata for 12 tables to Azure AI Search.


Pushing data for table SalesLT.Customer to the index.


2024-12-09 15:58:43,812 - DatabaseIndexer - INFO - Data for table SalesLT.Customer pushed to the index.
2024-12-09 15:58:43,898 - DatabaseIndexer - INFO - Data for table SalesLT.ProductModel pushed to the index.
2024-12-09 15:58:43,999 - DatabaseIndexer - INFO - Data for table SalesLT.ProductDescription pushed to the index.


Pushing data for table SalesLT.ProductModel to the index.
Pushing data for table SalesLT.ProductDescription to the index.
Pushing data for table SalesLT.Product to the index.


2024-12-09 15:58:44,116 - DatabaseIndexer - INFO - Data for table SalesLT.Product pushed to the index.
2024-12-09 15:58:44,206 - DatabaseIndexer - INFO - Data for table SalesLT.ProductModelProductDescription pushed to the index.
2024-12-09 15:58:44,284 - DatabaseIndexer - INFO - Data for table SalesLT.ProductCategory pushed to the index.


Pushing data for table SalesLT.ProductModelProductDescription to the index.
Pushing data for table SalesLT.ProductCategory to the index.
Pushing data for table dbo.BuildVersion to the index.


2024-12-09 15:58:44,380 - DatabaseIndexer - INFO - Data for table dbo.BuildVersion pushed to the index.
2024-12-09 15:58:44,455 - DatabaseIndexer - INFO - Data for table dbo.ErrorLog pushed to the index.
2024-12-09 15:58:44,529 - DatabaseIndexer - INFO - Data for table SalesLT.Address pushed to the index.


Pushing data for table dbo.ErrorLog to the index.
Pushing data for table SalesLT.Address to the index.
Pushing data for table SalesLT.CustomerAddress to the index.


2024-12-09 15:58:44,612 - DatabaseIndexer - INFO - Data for table SalesLT.CustomerAddress pushed to the index.
2024-12-09 15:58:44,706 - DatabaseIndexer - INFO - Data for table SalesLT.SalesOrderDetail pushed to the index.


Pushing data for table SalesLT.SalesOrderDetail to the index.
Pushing data for table SalesLT.SalesOrderHeader to the index.


2024-12-09 15:58:44,829 - DatabaseIndexer - INFO - Data for table SalesLT.SalesOrderHeader pushed to the index.


### RAG + Query generation with gpt-4o

In [26]:
# Query rewriting: ask the model for a more specific version of the user's
# question before it is used for retrieval.
user_question = "what's the best product"

# Plain string: the prompt has no placeholders, so no f-prefix is needed.
rewriter_prompt = """
You are a helpful AI SQL Agent. Your role is to help people translate their questions 
into better questions that can be translated into SQL queries.
You should disambiguate the question and provide a more specific version of the question to the best of your ability
You should rewrite the question in a way that is more likely to be answered by the database.

You must only return a single sentence that is a better version of the user's question.

"""

messages = [
    {"role": "system", "content": rewriter_prompt},
    {"role": "user", "content": user_question},
]

# NOTE(review): the model name is hard-coded here while `aoai_deployment` is
# read from the environment elsewhere — confirm a deployment with this name
# exists on the target Azure OpenAI resource.
rewriter_completion = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)

# Keep the raw completion and the extracted text under separate names so that
# `rewriter_response` is always the rewritten question string.
rewriter_response = rewriter_completion.choices[0].message.content
print(rewriter_response)

Which product has the highest customer rating or sales in the database?


In [22]:
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Client bound to the index populated earlier in the notebook.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(search_key),
)

# Vector part of the query: built from the rewritten question and targeting
# the "embedding" field.
vector_query = VectorizableTextQuery(
    text=rewriter_response,
    fields="embedding",
    k_nearest_neighbors=50,
)

# Text + vector query with semantic ranking, keeping the top 3 hits.
# NOTE(review): `search_text` uses the original question while the vector
# query uses the rewritten one — confirm this asymmetry is intended.
results = search_client.search(
    search_text=user_question,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)

# Keep only the fields needed for the prompt and serialise each hit as
# pretty-printed JSON; fields missing from a hit are skipped.
wanted_keys = ['name', 'business_readable_name', 'description', 'columns']
candidate_tables = []
for result in results:
    table_info = {key: result[key] for key in wanted_keys if key in result}
    candidate_tables.append(json.dumps(table_info, indent=4))


In [21]:
# Display the JSON strings collected from the search hits.
candidate_tables

['{\n    "name": "SalesLT.Product",\n    "business_readable_name": "Product Details",\n    "description": "The SalesLT.Product table in the database contains detailed information about products available for sale. It includes identifiers for products and their categories and models, as well as attributes such as product names, numbers, colors, sizes, weights, costs, and prices. The table also records important dates related to product availability and discontinuation, as well as details for managing product images. This table is essential for inventory management, product categorization, and sales analysis.",\n    "columns": [\n        {\n            "name": "ProductID",\n            "type": "int",\n            "description": null,\n            "definition": "The ProductID column in the SalesLT.Product table is an integer field that serves as the primary key for the table, uniquely identifying each product entry. It contains distinct numerical identifiers assigned to each product in th

In [23]:

# Join the retrieved table documents into one block of text. Interpolating
# the list itself would put Python's list repr into the prompt (brackets,
# quoted items, escaped "\n" sequences) instead of readable JSON.
tables_prompt = "\n\n".join(candidate_tables)

system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is SQL server, use the right syntax to generate queries


     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
     "sql_query": "The generated sql query"

    Example:    
    question: "Show me the first 5 rows of the sales_data table"
    output: "chain_of_thought": "Your reasoning",
     "sql_query": "SELECT TOP 5 * FROM sales_data"

"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_question},
]

# NOTE(review): the model name is hard-coded while `aoai_deployment` is read
# from the environment elsewhere — confirm a deployment with this name exists.
# The JSON-object response format is requested so the reply can be parsed with
# json.loads below.
response = openai_client.chat.completions.create(
    model="gpt-4o-global",
    messages=messages,
    response_format={"type": "json_object"},
)

response_message = response.choices[0].message.content
response_json = json.loads(response_message)

# Last expression: display the parsed reply (chain of thought + SQL query).
response_json



{'chain_of_thought': "To determine the difference between the most popular and least popular products, we first need to identify what metric we are using to determine popularity. Usually, this metric could be based on sales numbers, quantities sold, or customer reviews. Since the provided information does not include such metrics, we'll assume sales numbers as the determining factor.\n We assume that there is another table that tracks sales transactions including product sales metrics. Let's break this down step-by-step:\n\n1. Identify the table that includes sales information related to products (this information is not given, but I'll assume a common name for such a table such as `SalesLT.SalesOrderDetail`).\n2. Calculate the total sales for each product.\n3. Identify the most popular product (highest sales) and the least popular product (lowest sales).\n4. Calculate the difference between the sales of the most popular and least popular products.",
 'sql_query': 'WITH ProductSales AS

In [24]:
# Print the generated SQL so its line breaks render instead of showing as "\n".
print(response_json['sql_query'])

WITH ProductSales AS (
    SELECT
        ProductID,
        SUM(OrderQty) AS TotalSales
    FROM SalesLT.SalesOrderDetail
    GROUP BY ProductID
),
MostPopularProduct AS (
    SELECT TOP 1
        ProductID,
        TotalSales
    FROM ProductSales
    ORDER BY TotalSales DESC
),
LeastPopularProduct AS (
    SELECT TOP 1
        ProductID,
        TotalSales
    FROM ProductSales
    ORDER BY TotalSales ASC
)
SELECT
    mp.ProductID AS MostPopularProductID,
    mp.TotalSales AS MostPopularSales,
    lp.ProductID AS LeastPopularProductID,
    lp.TotalSales AS LeastPopularSales,
    (mp.TotalSales - lp.TotalSales) AS SalesDifference
FROM MostPopularProduct mp,
     LeastPopularProduct lp


In [25]:
# Run the generated SQL through the database client and display the result.
# NOTE(review): the statement comes straight from the model and is executed
# unchecked — consider a read-only database login or verifying that it is a
# single SELECT statement before running it.
sql_client.query(response_json['sql_query'])

'[{"MostPopularProductID": 864, "MostPopularSales": 87, "LeastPopularProductID": 886, "LeastPopularSales": 1, "SalesDifference": 86}]'