In [0]:

%pip install typing-extensions
%pip install openai
%pip install sqlparse==0.5.0
%pip install mlflow>=2.9.0
dbutils.library.restartPython()

### Registering the models with MLflow

In [0]:
%pip install mlflow[databricks]
import mlflow

In [0]:
# Import the necessary libraries
import mlflow
from mlflow.pyfunc import PythonModel, ModelSignature
from mlflow.types import DataType, Schema
import mlflow.pyfunc
import mlflow.deployments
import os

def get_table_schema(table_name, spark_object):
    """Run ``DESCRIBE <table_name>`` and return the collected result.

    Args:
        table_name: Name of the table, interpolated as-is into the statement.
        spark_object: Object exposing ``sql(statement)`` whose result exposes
            ``collect()`` (the notebook passes its ``spark`` session).

    Returns:
        A ``(table_name, rows)`` tuple, where ``rows`` is whatever
        ``collect()`` returned for the DESCRIBE statement.
    """
    describe_statement = "DESCRIBE {}".format(table_name)
    described_rows = spark_object.sql(describe_statement).collect()
    return table_name, described_rows

In [0]:

# Define the custom model class for TxtToSQL
class TxtToSQLModel(PythonModel):
    """Custom MLflow ``PythonModel`` that asks a chat endpoint to turn a question into SQL.

    NOTE(review): ``predict`` takes ``(client, message, table_name,
    spark_object)`` rather than the ``(context, model_input)`` arguments MLflow
    passes to a loaded pyfunc model, so it presumably only works when the
    instance is called directly, as this notebook does -- confirm before
    serving the registered model.
    """

    # Constructor
    def __init__(self):
        pass

    def nl_to_sql(self, client, nl_query, schema):
        """Send the schema and the question to the chat endpoint.

        Args:
            client: Object exposing ``predict(endpoint=..., inputs=...)``
                (the notebook passes an MLflow deployments client).
            nl_query: Natural-language request to convert to SQL.
            schema: Table schema description, interpolated into the prompt.

        Returns:
            The first element of the response's ``choices``.
        """
        chat_response = client.predict(
            endpoint="databricks-meta-llama-3-70b-instruct",
            inputs={
                "messages": [
                    {"role": "system", "content": "You are an AI assistant"},
                    {
                        "role": "user",
                        "content": f"Table schema:\n{schema}\n\nConvert the following natural language query to SQL: {nl_query}\n\nSQL: and in the output give only the SQL query without text",
                    },
                ],
                "temperature": 0.1,
                "max_tokens": 256,
            },
        )

        return chat_response.choices[0]

    # Method for loading the model
    def load_context(self, context):
        """No-op: this model does nothing at load time."""
        pass

    @staticmethod
    def _strip_code_fences(text):
        """Return the reply text with markdown code fences removed.

        When the reply contains a fenced block, only the content of the first
        block is returned, without an optional ``sql``-style language tag after
        the opening fence. Otherwise any stray fence marker is dropped. The
        result is stripped of surrounding whitespace.
        """
        import re  # local import: the notebook's import cell does not bring in `re`

        # Opening fence, optionally followed by a language tag ending in "sql".
        fence_open = r"```(?:\w*sql\b[ \t]*)?"
        fenced = re.search(
            fence_open + r"(.*?)```", text, flags=re.DOTALL | re.IGNORECASE
        )
        if fenced:
            return fenced.group(1).strip()
        # No closed fence (e.g. a truncated reply): remove whatever markers exist.
        return re.sub(fence_open, "", text, flags=re.IGNORECASE).strip()

    def predict(self, client, message, table_name, spark_object):
        """Generate a SQL query for ``message`` against ``table_name``.

        Args:
            client: Passed through to ``nl_to_sql``.
            message: Natural-language request.
            table_name: Table whose schema is included in the prompt.
            spark_object: Passed to ``get_table_schema`` to describe the table.

        Returns:
            The reply text with markdown code fences (and a ``sql`` language
            tag, if present) removed. The query is not executed here.
        """
        table_schema = get_table_schema(table_name, spark_object)
        first_choice = self.nl_to_sql(client, message, table_schema)
        return self._strip_code_fences(first_choice["message"]["content"])

# Create an instance of the custom model
txtToSQLModel = TxtToSQLModel()

# Model signature to be added to the MLflow registration:
# two string input columns ("message", "table_name") and one string output
# column ("sql_query").
from mlflow.types import DataType, Schema, ColSpec
input_schema = Schema([
    ColSpec("string", "message"),
    ColSpec("string", "table_name")
])
output_schema = Schema([
    ColSpec("string", "sql_query")
])
model_signature = ModelSignature(
    inputs=input_schema,
    outputs=output_schema
)

# Log the model with MLflow under the artifact path "my_custom_model" and
# register it under the given registered model name.
# NOTE(review): TxtToSQLModel.predict does not take a single model input
# matching this signature -- confirm the logged model can actually be served.
mlflow.pyfunc.log_model(
    "my_custom_model",
    python_model=txtToSQLModel,
    artifacts={},
    signature=model_signature,
    registered_model_name="workspace.default.txt_to_sql_llama3",
)

In [0]:

# Define the custom model class that describes how to analyze a question
class DefineTxtQuery(PythonModel):
    """Custom MLflow ``PythonModel`` that asks a chat endpoint how to analyze a question.

    Given a table schema and a question, the prompt requests a description of
    how the question would be analyzed with that table.
    """

    def __init__(self):
        """Nothing to initialize."""

    def nl_to_sql(self, client, nl_query, schema):
        """Send the schema and the question to the chat endpoint.

        Args:
            client: Object exposing ``predict(endpoint=..., inputs=...)``.
            nl_query: Question to be analyzed.
            schema: Table schema description, interpolated into the prompt.

        Returns:
            The first element of the response's ``choices``.
        """
        user_prompt = f"From the table having schema :\n{schema}\n\nWhat would you look at to answer this question : {nl_query}\n\n Give an answer that contains a description of how you would analyze this"
        conversation = [
            {"role": "system", "content": "You are an AI assistant"},
            {"role": "user", "content": user_prompt},
        ]
        request_body = {
            "messages": conversation,
            "temperature": 0.1,
            "max_tokens": 256,
        }
        reply = client.predict(
            endpoint="databricks-meta-llama-3-70b-instruct",
            inputs=request_body,
        )
        return reply.choices[0]

    def load_context(self, context):
        """Nothing to load for this model."""

    def predict(self, client, message, table_name, spark_object):
        """Return the endpoint's analysis description for ``message``.

        The schema of ``table_name`` is fetched through ``get_table_schema``
        and sent with the question; triple-backtick markers are removed from
        the reply text before it is returned.
        """
        schema_info = get_table_schema(table_name, spark_object)
        first_choice = self.nl_to_sql(client, message, schema_info)
        return first_choice["message"]["content"].replace("```", "")

# Create an instance of the custom model
defineTxtQueryModel = DefineTxtQuery()

# Model signature to be added to the MLflow registration:
# two string input columns ("message", "table_name") and one string output column.
# NOTE(review): the output column is named "sql_query", but this model's prompt
# asks for a description of the analysis rather than SQL -- confirm the name.
from mlflow.types import DataType, Schema, ColSpec
input_schema = Schema([
    ColSpec("string", "message"),
    ColSpec("string", "table_name")
])
output_schema = Schema([
    ColSpec("string", "sql_query")
])
model_signature = ModelSignature(
    inputs=input_schema,
    outputs=output_schema
)

# Log the model with MLflow under the artifact path "my_custom_model" and
# register it under the given registered model name.
mlflow.pyfunc.log_model(
    "my_custom_model",
    python_model=defineTxtQueryModel,
    artifacts={},
    signature=model_signature,
    registered_model_name="workspace.default.define_txt_query_llama",
)

### Testing the Model 


In [0]:
# Deployment client for the "databricks" target; it is passed as `client` to
# the models' predict methods in the cells below.
client = mlflow.deployments.get_deploy_client("databricks")

## Step 1: Ask a generic question; an agent will describe how to answer it with the data we have at hand

In [0]:
# Ask the DefineTxtQuery model how it would analyze the question against the
# Zillow properties table; `query` holds the model's text reply.
query = defineTxtQueryModel.predict(client, "How would you describe the housing market for house Sold per city and zip code", "bright_data_real_estate_listings.datasets.zillow_properties", spark)


print(query)

## Step 2: Transform your answer into SQL

In [0]:

# Feed the step-1 reply to the TxtToSQL model for the same table; `result`
# holds the generated SQL text.
result = txtToSQLModel.predict(client, query, "bright_data_real_estate_listings.datasets.zillow_properties", spark)

print(result)


In [0]:
%sql
-- Per city and zip code, for rows with HOMESTATUS = 'SOLD': number of rows,
-- average price and median price, ordered by city and zip code.
SELECT 
  CITY, 
  ZIPCODE, 
  COUNT(*) AS num_houses_sold, 
  AVG(PRICE) AS avg_sold_price, 
  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY PRICE) AS median_sold_price
FROM 
  bright_data_real_estate_listings.datasets.zillow_properties
WHERE 
  HOMESTATUS = 'SOLD'
GROUP BY 
  CITY, 
  ZIPCODE
ORDER BY 
  CITY, 
  ZIPCODE;

## Step 3: Execute the query

In [0]:
# Run the SQL text produced in step 2 and display the result.
# NOTE(review): `result` is unvalidated LLM output executed as-is -- review the
# statement (e.g. confirm it is a read-only SELECT) before running this cell.
display(spark.sql(result))