## Notebook to demo IDRIS on Databricks with an Azure OpenAI endpoint.

Please make sure to install the following package:
```shell
uv pip install -U databricks-sql-connector
```

- https://learn.microsoft.com/en-us/azure/databricks/dev-tools/python-sql-connector

### TODO

- Add dump_xxxx methods to save create_table/context/question_sql

In [None]:
import os
import re

import gait as G
from databricks import sql
from rich import print

In [None]:
# Connection to the Databricks SQL warehouse. All three settings are read from
# the environment, so a missing variable raises KeyError right here.
rdb = G.IdrisDatabricks(
    server_hostname=os.environ["DATABRICKS_SERVER_HOSTNAME"],
    http_path=os.environ["DATABRICKS_HTTP_PATH"],
    access_token=os.environ["DATABRICKS_TOKEN"],
)

# Embedding model, addressed through the Azure endpoint in AZURE_API_URL.
# NOTE(review): no api_key is passed for Azure - presumably it is resolved
# from the environment by the underlying client; confirm.
emb = G.IdrisLiteEmb(
    # Alternative: a local OpenAI-compatible server.
    # model_name="openai/mxbai-embed-large:latest",
    # api_base="http://localhost:11434/v1",
    # api_key="ollama",
    model_name="azure/text-embedding-ada-002",
    api_base=os.environ["AZURE_API_URL"] + "/text-embedding-ada-002",
)

# Chat/completion model, configured the same way as the embedding model.
llm = G.IdrisLiteLLM(
    # Alternative: a local OpenAI-compatible server.
    # model_name="openai/phi4:14b-q8_0",
    # api_base="http://localhost:11434/v1",
    # api_key="ollama",
    model_name="azure/gpt-4o-mini",
    api_base=os.environ["AZURE_API_URL"] + "/gpt-4o-mini",
)

# Tie the database, embedding model and LLM together. Every IDRIS class is
# referenced through the `G` alias, the only name under which gait is imported.
idris = G.Idris(rdb, emb, llm)

In [None]:
# rdb.execute_sql("select count(*) from samples.nyctaxi.trips")

### Read the data as a Pandas dataframe so we can train on it.

**MAKE SURE TO GET A SAMPLE OF THE DATA**

- https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-qry-select-sampling.html

In [None]:
# Pull a bounded number of rows to train on.
# NOTE: `limit` caps the row count but is not a random sample; see the
# TABLESAMPLE documentation linked above if a representative sample is needed.
sample_query = "select * from samples.nyctaxi.trips limit 10000"
pdf = rdb.execute_sql(sample_query)

In [None]:
# Preview the first 10 entries of the fetched data.
pdf[:10]

### Create an alias mapping for column names and for `column:value` pairs.

In [None]:
# Human-readable aliases handed to the trainer.
# Keys prefixed with "_col:" name a column; the value is its alias.
aliases = {
    "_col:tpep_pickup_datetime": "pickup datetime",
    "_col:tpep_dropoff_datetime": "dropoff datetime",
    "_col:trip_distance": "trip distance in miles",
    "_col:fare_amount": "fare_amount in dollars",
    "_col:pickup_zip": "pickup zipcode",
    "_col:dropoff_zip": "dropoff zipcode",
    #
    # What to substitute when we see a value for a field.
    # The entry below is a placeholder showing the "field:value" key format.
    #
    "field_name:field_value": "new_value",
}

In [None]:
# Build a trainer around the alias mapping, then train it on the sampled rows.
trainer = G.IdrisTrainer(aliases)
train_result = trainer.train(
    pdf,
    table_name="samples.nyctaxi.trips",
    alias_name="trips",
)

In [None]:
# Inspect the `create_table` attribute of the training result (rich's print).
print(train_result.create_table)

In [None]:
# Inspect the `context` attribute of the training result (rich's print).
print(train_result.context)

In [None]:
# Inspect the `question_sql` attribute of the training result (rich's print).
print(train_result.question_sql)

### Load training data into IDRIS.

In [None]:
# Hand the three training artifacts (create_table, context, question_sql)
# to the IDRIS instance.
# NOTE(review): call order is kept as-is; whether it matters is not visible here.
idris.add_create_table(train_result.create_table)
idris.load_context(train_result.context)
idris.load_question_sql(train_result.question_sql)

### Let's start asking questions.

In [None]:
def clean_sql(sql: str) -> str:
    """Return *sql* with a surrounding Markdown code fence removed.

    Surrounding whitespace is stripped first so that a fence preceded by a
    newline or followed by trailing spaces is still recognised. The opening
    fence may carry an optional ``sql`` language tag in any letter case.
    Text without a fence is returned stripped but otherwise unchanged.

    Args:
        sql: Raw text, possibly wrapped in a fenced code block.

    Returns:
        The statement without the fence markers or surrounding whitespace.
    """
    unfenced = re.sub(
        r"^```(?:sql)?\s*|\s*```$",
        "",
        sql.strip(),
        flags=re.IGNORECASE,
    )
    return unfenced.strip()

In [None]:
# Natural-language question to translate into SQL.
# Another question to try:
# "What is the average fare at dropoff zip code 10803 on Monday?"
question = """
    What is the average fare on Mondays broken down by 1 mile increments?
    Please make sure to put a dollar sign on the fare.
    """

# Generate the statement, drop any Markdown fence around it, and show it.
sql = clean_sql(idris.generate_sql(question))
print(sql)

In [None]:
# Run the cleaned, generated statement through the IDRIS instance.
idris.execute_sql(sql)