In [1]:
from sqltoolkit.connectors import AzureSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
  
# Azure SQL connection: the target server and database are fixed for this notebook.
server, database = 'vince-h-sql.database.windows.net', 'adventureworks'

# Build the connector first, then hand it to the DatabaseClient that later
# cells pass to the indexer.
azure_connector = AzureSQLConnector(database=database, server=server)
sql_client = DatabaseClient(azure_connector)
  

In [3]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) before reading the settings.
load_dotenv()

# Endpoint, key and deployment name all come from the environment; each value
# is None when its variable is not set.
aoai_endpoint, aoai_key, aoai_deployment = (
    os.getenv(env_name)
    for env_name in ('OPENAI_ENDPOINT', 'OPENAI_API_KEY', 'OPENAI_DEPLOYMENT')
)

# Client shared by the indexer and the chat-completion cell further down.
openai_client = AzureOpenAI(
    api_version='2024-02-01',
    api_key=aoai_key,
    azure_endpoint=aoai_endpoint,
)

### Generating table descriptions from the database

In [10]:
from sqltoolkit.indexer import DatabaseIndexer

# Index a fixed set of SalesLT tables: fetch and describe them, generate
# embeddings, then persist the exported manifest to disk.
indexer = DatabaseIndexer(sql_client, openai_client)

# The return value is kept in `manifest`, although the visible cells read the
# results back through `indexer.tables` instead.
manifest = indexer.fetch_and_describe_tables(
    table_list=[
        'SalesLT.SalesOrderHeader',
        'SalesLT.SalesOrderDetail',
        'SalesLT.Product',
        'SalesLT.Customer',
        'SalesLT.ProductCategory',
    ]
)
indexer.generate_table_embeddings()

# NOTE(review): despite the name, this value is passed straight to f.write(),
# so it is presumably a JSON string rather than a dict - confirm.
tables_dict = indexer.export_json_manifest()

# Write with an explicit UTF-8 encoding: the platform default (e.g. cp1252 on
# Windows) can fail or produce a non-portable file if a generated description
# contains non-ASCII characters.
with open('tables_manifest.json', 'w', encoding='utf-8') as f:
    f.write(tables_dict)

2024-12-04 09:59:25,923 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-04 09:59:25,923 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-04 09:59:26,018 - DatabaseIndexer - INFO - Filtered tables: ['SalesLT.Customer', 'SalesLT.Product', 'SalesLT.ProductCategory', 'SalesLT.SalesOrderDetail', 'SalesLT.SalesOrderHeader']
2024-12-04 09:59:26,018 - DatabaseIndexer - INFO - Filtered tables: ['SalesLT.Customer', 'SalesLT.Product', 'SalesLT.ProductCategory', 'SalesLT.SalesOrderDetail', 'SalesLT.SalesOrderHeader']
2024-12-04 09:59:26,020 - DatabaseIndexer - INFO - Processing table: SalesLT.Customer
2024-12-04 09:59:26,020 - DatabaseIndexer - INFO - Processing table: SalesLT.Customer
2024-12-04 10:00:22,272 - DatabaseIndexer - INFO - Completed processing table: SalesLT.Customer
2024-12-04 10:00:22,272 - DatabaseIndexer - INFO - Completed processing table: SalesLT.Customer
2024-12-04 10:00:22,274 - DatabaseIndexer - INFO - Processing table: SalesLT.Produ

In [11]:
# Spot-check the generated description of the first indexed table.
first_table = indexer.tables[0]
print(first_table.description)

The SalesLT.Customer table stores information about customers, including their personal details and contact information. Each customer is uniquely identified by a CustomerID, which serves as the primary key. The table includes attributes such as names, related company details, contact details like email and phone, and information about associated sales personnel. It also contains security-related information for customer authentication, such as hashed passwords and cryptographic salts. Data modification tracking is facilitated through the ModifiedDate column.


### Building AI Search Index and pushing data

### Testing

In [13]:
# Dump every indexed table without its `embedding` field so the prompt only
# carries the remaining fields. `exclude` is given as a set, which is the type
# documented for this argument (assuming the tables are pydantic models).
tables_prompt = {"tables": [table.model_dump(exclude={'embedding'}) for table in indexer.tables]}

# System prompt: the table dump is interpolated as-is into the f-string.
# The text below is sent to the model verbatim, whitespace included.
system_prompt = f"""
You are a helpful AI data analyst assistant, 
     You can execute SQL queries to retrieve information from a sql database,
     The database is SQL server, use the right syntax to generate queries


     ### These are the available tables in the database alongside their descriptions:
    {tables_prompt}

     When asked a question that could be answered with a SQL query: 
     - Break down the question into smaller parts that can be answered with SQL queries
    - Identify the tables that contains the information needed to answer the question
     - Only once this is done, create a sql query to retrieve the information based on your understanding of the table
 
    Think step by step, before doing anything, share the different steps you'll execute to get the answer
    When you get to a final answer use the following structure to provide the answer:
    <RESPONSE>: The SQL query to execute

    Example:    
    question: "Show me the first 5 rows of the sales_data table"
    output: Your thought process <RESPONSE>: SELECT TOP 5 * FROM sales_data  

"""

# Single-turn conversation: the instructions above plus one test question.
messages = [
    {
        "role": "system",
        "content": system_prompt
    },
    {
        "role": "user",
        "content": "What are the top 3 selling product categories?"
    }
]

# Use the deployment configured through OPENAI_DEPLOYMENT (read into
# `aoai_deployment` in the client-setup cell) instead of a hard-coded name;
# fall back to the previous literal when the variable is unset or empty.
response = openai_client.chat.completions.create(
    model=aoai_deployment or "gpt-4o-global",
    messages=messages)

# Show the text of the first returned choice.
response_message = response.choices[0].message.content
print(response_message)



To determine the top 3 selling product categories, we need to follow these steps:

1. **Identify the tables involved:**
   - We need to determine where the sales details (quantity and product) are stored. This information can be found in the `SalesLT.SalesOrderDetail` table.
   - We need to determine the product information (including product category). This information can be found in the `SalesLT.Product` table.
   - We need to determine the category information (names of the categories). This information can be found in the `SalesLT.ProductCategory` table.

2. **Determine the necessary columns:**
   - From `SalesLT.SalesOrderDetail`: `ProductID`, `OrderQty`.
   - From `SalesLT.Product`: `ProductID`, `ProductCategoryID`.
   - From `SalesLT.ProductCategory`: `ProductCategoryID`, `Name`.

3. **Write the SQL query:**
   - We need to join these three tables on their respective keys (`ProductID` and `ProductCategoryID`).
   - We will sum up the `OrderQty` for each product category.
   - W