In [1]:
from sqltoolkit.connectors import PostgreSQLConnector
from sqltoolkit.client import DatabaseClient
from sqltoolkit.entities import Table, TableColumn
import pandas as pd
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process
# environment so the os.getenv() lookups below can see them.
load_dotenv()


# Define the connection parameters
server = os.getenv('POSTGRES_HOST')
database = 'nfl'
username = os.getenv('POSTGRES_USER')
password = os.getenv('POSTGRES_PWD')
port = '5432'

# os.getenv() returns None for an unset variable. Fail here, naming the
# missing settings, instead of passing None (or '') into the connector.
_missing_env = [
    env_name
    for env_name, env_value in (
        ('POSTGRES_HOST', server),
        ('POSTGRES_USER', username),
        ('POSTGRES_PWD', password),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_env)
        + '. Set them in the environment or in a .env file.'
    )

# sql_client wraps the connector; it is handed to DatabaseIndexer later in the notebook.
psql_connector = PostgreSQLConnector(server, database, username, password, port)
sql_client = DatabaseClient(psql_connector)
  

In [2]:
from openai import AzureOpenAI

# Azure OpenAI connection settings, read from the environment.
aoai_endpoint = os.getenv('OPENAI_ENDPOINT')
aoai_key = os.getenv('OPENAI_API_KEY')

# Deployment names, also read from the environment:
#   - chat/completions deployment: expected to be a GPT-4o deployment
#   - embedding deployment: expected to be a small text-embedding model
#     (e.g. text-embedding-3-small)
aoai_deployment = os.getenv('OPENAI_4o_DEPLOYMENT')
aoai_embedding_deployment = os.getenv('OPENAI_EMBEDDING_DEPLOYMENT')

# Client for the Azure OpenAI service; the API version is pinned explicitly.
openai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    api_version='2024-10-21',
)

In [3]:
from sqltoolkit.indexer import DatabaseIndexer

# Build the indexer from the database client, the Azure OpenAI client and the
# two deployment names configured in the earlier cells.
indexer = DatabaseIndexer(
    sql_client,
    openai_client,
    aoai_deployment,
    aoai_embedding_deployment,
)

# Step 1: fetch the tables from the database and process each one.
# NOTE: this is slow. In the recorded run below it took roughly 100 seconds
# for a single table (public.nfl_team_stats).
manifest = indexer.fetch_and_describe_tables()

# Step 2: generate embeddings for the tables collected above
# (presumably via the embedding deployment passed to the constructor - confirm).
indexer.generate_table_embeddings()

# Step 3: export the resulting manifest and keep it for later use.
tables_dict = indexer.export_json_manifest()

2024-12-17 11:54:26,199 - DatabaseIndexer - INFO - Fetching tables from the database.
2024-12-17 11:54:26,379 - DatabaseIndexer - INFO - Processing table: public.nfl_team_stats
2024-12-17 11:56:05,509 - DatabaseIndexer - INFO - Completed processing table: public.nfl_team_stats
2024-12-17 11:56:05,511 - DatabaseIndexer - INFO - Completed fetching and processing all tables.


In [4]:
# Inspect the first indexed table as a plain dict; the cell output shows its
# name, business-readable name, description and per-column metadata.
# NOTE(review): model_dump() suggests the table entity is a Pydantic model - confirm.
indexer.tables[0].model_dump()

{'name': 'public.nfl_team_stats',
 'business_readable_name': 'NFL Team Statistics',
 'description': 'The "public.nfl_team_stats" table contains detailed statistical data about NFL games, split by season, week, and specific matches. It provides information on team performances, including scores, first downs, passing, rushing, and defensive plays, and includes fields for the home and away teams. Additionally, it records contextual details such as the date, game start time, and whether the game was played at a neutral location. This table is essential for analyzing team performance metrics, game outcomes, and player statistics across different games and seasons.',
 'columns': [{'name': 'season',
   'type': 'bigint',
   'description': None,
   'definition': 'The column "season" in the table "public.nfl_team_stats" contains data representing individual NFL seasons. The values in this column are of the type "bigint" and typically consist of years, such as 2002, 2003, 2004, etc. This column e