In [1]:
from sqltoolkit.connectors import AzureSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
  
# Connection settings for the Azure SQL database.
server = 'vince-h-sql.database.windows.net'
database = 'adventureworks'

# Build the connector first, then wrap it in the client that later cells use.
azure_connector = AzureSQLConnector(
    server=server,
    database=database,
)
sql_client = DatabaseClient(azure_connector)
  

In [2]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load settings from a .env file (if present) into the process environment.
load_dotenv()

# Azure OpenAI settings; each value is None when its variable is not set.
aoai_endpoint = os.environ.get('OPENAI_ENDPOINT')
aoai_key = os.environ.get('OPENAI_API_KEY')
aoai_deployment = os.environ.get('OPENAI_4o_DEPLOYMENT')

# Client shared by the indexer and by the query-generation cell.
openai_client = AzureOpenAI(
    api_version='2024-10-21',
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
)

### Generating table descriptions from the database

In [3]:
from sqltoolkit.indexer import DatabaseIndexer

# The indexer combines the SQL client (schema access) with the OpenAI client.
indexer = DatabaseIndexer(sql_client, openai_client)

# NOTE(review): table_subset is never passed to the indexer below, and the
# logged run processed tables outside this list (e.g. SalesLT.ProductModel).
# Confirm whether fetch_and_describe_tables accepts a table filter and pass
# this list if only these tables should be indexed.
table_subset = ['SalesLT.SalesOrderHeader',
                'SalesLT.SalesOrderDetail',
                'SalesLT.Product',
                'SalesLT.Customer',
                'SalesLT.ProductCategory']

# Describe the tables first, then compute embeddings for them.
manifest = indexer.fetch_and_describe_tables()
indexer.generate_table_embeddings()

# The exported manifest is written to disk as text below.
tables_dict = indexer.export_json_manifest()

# Explicit UTF-8: the platform default encoding may not be able to represent
# every character in the generated descriptions (e.g. cp1252 on Windows).
with open('tables_manifest.json', 'w', encoding='utf-8') as f:
    f.write(tables_dict)

2024-12-09 15:54:47,365 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-09 15:54:47,631 - DatabaseIndexer - INFO - Processing table: SalesLT.Customer
2024-12-09 15:55:10,278 - DatabaseIndexer - INFO - Completed processing table: SalesLT.Customer
2024-12-09 15:55:10,280 - DatabaseIndexer - INFO - Processing table: SalesLT.ProductModel
2024-12-09 15:55:19,636 - DatabaseIndexer - INFO - Completed processing table: SalesLT.ProductModel
2024-12-09 15:55:19,638 - DatabaseIndexer - INFO - Processing table: SalesLT.ProductDescription
2024-12-09 15:55:26,172 - DatabaseIndexer - INFO - Completed processing table: SalesLT.ProductDescription
2024-12-09 15:55:26,173 - DatabaseIndexer - INFO - Processing table: SalesLT.Product
2024-12-09 15:55:50,248 - DatabaseIndexer - INFO - Completed processing table: SalesLT.Product
2024-12-09 15:55:50,249 - DatabaseIndexer - INFO - Processing table: SalesLT.ProductModelProductDescription
2024-12-09 15:56:02,152 - DatabaseIndexer - INFO - C

In [4]:
# Spot-check the generated description of the first table held by the indexer.
print(indexer.tables[0].description)

The SalesLT.Customer table contains detailed information about customers within a sales or business database. Each entry represents a unique customer, identified by a primary key. The table includes various fields such as names, titles, contact details, company affiliation, sales representatives, and security-related information like password hashes and salts. Additionally, it tracks record modifications through a timestamp, ensuring that customer data can be accurately maintained and updated.


### Building the Azure AI Search index and pushing data

In [5]:
# Name of the search index to create; reused when querying the index later.
index_name = 'adventureworks'

# Azure AI Search and embedding settings, read from the environment.
search_endpoint = os.environ.get('AI_SEARCH_ENDPOINT')
search_key = os.environ.get('AI_SEARCH_API_KEY')
embedding_deployment = os.environ.get('OPENAI_EMBEDDING_DEPLOYMENT')

# Create the index, passing both the search and the Azure OpenAI settings.
indexer.create_azure_ai_search_index(
    index_name=index_name,
    search_endpoint=search_endpoint,
    search_credential=search_key,
    openai_endpoint=aoai_endpoint,
    openai_key=aoai_key,
    embedding_deployment=embedding_deployment,
)

2024-12-09 15:57:49,726 - DatabaseIndexer - INFO - Creating Azure AI Search Index.
2024-12-09 15:57:49,728 - DatabaseIndexer - INFO - SearchIndexClient created.


2024-12-09 15:57:50,023 - DatabaseIndexer - INFO - Index 'adventureworks' does not exist. Creating a new one.
2024-12-09 15:57:50,024 - DatabaseIndexer - INFO - Fields for the index defined.
2024-12-09 15:57:50,025 - DatabaseIndexer - INFO - Vector search configuration defined.
2024-12-09 15:57:50,026 - DatabaseIndexer - INFO - Semantic configuration defined.
2024-12-09 15:57:50,027 - DatabaseIndexer - INFO - Semantic search configuration defined.
2024-12-09 15:57:50,466 - DatabaseIndexer - INFO - Index 'adventureworks' created successfully.


In [6]:
# Upload the table metadata held by the indexer to the Azure AI Search index.
indexer.push_to_ai_search()

2024-12-09 15:58:43,476 - DatabaseIndexer - INFO - Pushing metadata for 12 tables to Azure AI Search.


Pushing data for table SalesLT.Customer to the index.


2024-12-09 15:58:43,812 - DatabaseIndexer - INFO - Data for table SalesLT.Customer pushed to the index.
2024-12-09 15:58:43,898 - DatabaseIndexer - INFO - Data for table SalesLT.ProductModel pushed to the index.
2024-12-09 15:58:43,999 - DatabaseIndexer - INFO - Data for table SalesLT.ProductDescription pushed to the index.


Pushing data for table SalesLT.ProductModel to the index.
Pushing data for table SalesLT.ProductDescription to the index.
Pushing data for table SalesLT.Product to the index.


2024-12-09 15:58:44,116 - DatabaseIndexer - INFO - Data for table SalesLT.Product pushed to the index.
2024-12-09 15:58:44,206 - DatabaseIndexer - INFO - Data for table SalesLT.ProductModelProductDescription pushed to the index.
2024-12-09 15:58:44,284 - DatabaseIndexer - INFO - Data for table SalesLT.ProductCategory pushed to the index.


Pushing data for table SalesLT.ProductModelProductDescription to the index.
Pushing data for table SalesLT.ProductCategory to the index.
Pushing data for table dbo.BuildVersion to the index.


2024-12-09 15:58:44,380 - DatabaseIndexer - INFO - Data for table dbo.BuildVersion pushed to the index.
2024-12-09 15:58:44,455 - DatabaseIndexer - INFO - Data for table dbo.ErrorLog pushed to the index.
2024-12-09 15:58:44,529 - DatabaseIndexer - INFO - Data for table SalesLT.Address pushed to the index.


Pushing data for table dbo.ErrorLog to the index.
Pushing data for table SalesLT.Address to the index.
Pushing data for table SalesLT.CustomerAddress to the index.


2024-12-09 15:58:44,612 - DatabaseIndexer - INFO - Data for table SalesLT.CustomerAddress pushed to the index.
2024-12-09 15:58:44,706 - DatabaseIndexer - INFO - Data for table SalesLT.SalesOrderDetail pushed to the index.


Pushing data for table SalesLT.SalesOrderDetail to the index.
Pushing data for table SalesLT.SalesOrderHeader to the index.


2024-12-09 15:58:44,829 - DatabaseIndexer - INFO - Data for table SalesLT.SalesOrderHeader pushed to the index.


### RAG + Query generation with gpt-4o

In [8]:
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
import json

# Question that drives both the retrieval step and the SQL generation step.
user_question = "What is the first name of the customer that put in the most orders?"

search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(search_key),
)

# Hybrid retrieval: the question is sent both as keyword search text and as a
# text-to-vector query against the "embedding" field.
vector_query = VectorizableTextQuery(
    text=user_question,
    k_nearest_neighbors=50,
    fields="embedding",
)

results = search_client.search(
    search_text=user_question,
    vector_queries=[vector_query],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3,
)

# Keep only these fields from each hit (skipping any that are absent) and
# serialize every table as pretty-printed JSON.
table_fields = ('name', 'business_readable_name', 'description', 'columns')

candidate_tables = []
for result in results:
    table_info = {key: result[key] for key in table_fields if key in result}
    candidate_tables.append(json.dumps(table_info, indent=4))


In [14]:
# Inspect the first search hit, already serialized as indented JSON.
print(candidate_tables[0])

{
    "name": "SalesLT.Customer",
    "business_readable_name": "Customer Details",
    "description": "The SalesLT.Customer table contains detailed information about customers within a sales or business database. Each entry represents a unique customer, identified by a primary key. The table includes various fields such as names, titles, contact details, company affiliation, sales representatives, and security-related information like password hashes and salts. Additionally, it tracks record modifications through a timestamp, ensuring that customer data can be accurately maintained and updated.",
    "columns": [
        {
            "name": "CustomerID",
            "type": "int",
            "description": null,
            "definition": "CustomerID is an integer column in the SalesLT.Customer table. This column serves as the primary key, uniquely identifying each customer in the table. It contains unique numeric values assigned to each customer, ensuring that no two customers shar

In [10]:

# Join the candidate-table JSON documents into a single text block.
# Interpolating the list itself into the f-string would embed its Python repr
# (brackets, quotes and escaped "\n" sequences) instead of readable JSON.
tables_prompt = "\n\n".join(candidate_tables)

system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is SQL server, use the right syntax to generate queries


     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    
    Your response should be in JSON format with the following structure:
    "chain_of_thought": "Your reasoning",
     "sql_query": "The generated sql query"

    Example:    
    question: "Show me the first 5 rows of the sales_data table"
    output: "chain_of_thought": "Your reasoning",
     "sql_query": "SELECT TOP 5 * FROM sales_data"

"""

# System prompt carries the retrieved table metadata; the user turn carries
# the natural-language question.
messages = [
    {
        "role": "system",
        "content": system_prompt
    },
    {
        "role": "user",
        "content": user_question
    }
]

# Use the deployment read from OPENAI_4o_DEPLOYMENT when it is set; otherwise
# fall back to the deployment name this notebook originally hard-coded.
response = openai_client.chat.completions.create(
    model=aoai_deployment or "gpt-4o-global",
    messages=messages,
    response_format={"type": "json_object"},
)

# The reply is requested as a JSON object; parse it into a dict with the
# "chain_of_thought" and "sql_query" keys asked for in the prompt.
response_message = response.choices[0].message.content
response_json = json.loads(response_message)

response_json



{'chain_of_thought': 'To determine the first name of the customer who placed the most orders, we will follow these steps:\n1. Count the number of orders placed by each customer in the SalesLT.SalesOrderHeader table, grouping by CustomerID.\n2. Identify the CustomerID with the highest count of orders.\n3. Retrieve the first name of the customer with the identified CustomerID from the SalesLT.Customer table.',
 'sql_query': 'WITH OrderCounts AS (\n    SELECT CustomerID, COUNT(*) AS OrderCount\n    FROM SalesLT.SalesOrderHeader\n    GROUP BY CustomerID\n), MaxOrderCustomer AS (\n    SELECT TOP 1 CustomerID\n    FROM OrderCounts\n    ORDER BY OrderCount DESC\n)\nSELECT c.FirstName\nFROM SalesLT.Customer c\nINNER JOIN MaxOrderCustomer moc ON c.CustomerID = moc.CustomerID;'}

In [12]:
# Print the generated SQL so its newlines render instead of showing as \n escapes.
print(response_json['sql_query'])

WITH OrderCounts AS (
    SELECT CustomerID, COUNT(*) AS OrderCount
    FROM SalesLT.SalesOrderHeader
    GROUP BY CustomerID
), MaxOrderCustomer AS (
    SELECT TOP 1 CustomerID
    FROM OrderCounts
    ORDER BY OrderCount DESC
)
SELECT c.FirstName
FROM SalesLT.Customer c
INNER JOIN MaxOrderCustomer moc ON c.CustomerID = moc.CustomerID;
