In [0]:
from mlflow.deployments import get_deploy_client
from databricks.sdk import WorkspaceClient

def _get_endpoint_task_type(endpoint_name: str) -> str:
    """Get the task type of a serving endpoint."""
    w = WorkspaceClient()
    ep = w.serving_endpoints.get(endpoint_name)
    return ep.task

def is_endpoint_supported(endpoint_name: str) -> bool:
    """Check if the endpoint has a supported task type."""
    task_type = _get_endpoint_task_type(endpoint_name)
    supported_task_types = ["agent/v1/chat", "agent/v2/chat", "llm/v1/chat"]
    return task_type in supported_task_types

def _validate_endpoint_task_type(endpoint_name: str) -> None:
    """Validate that the endpoint has a supported task type."""
    if not is_endpoint_supported(endpoint_name):
        raise Exception(
            f"Detected unsupported endpoint type for this basic chatbot template. "
            f"This chatbot template only supports chat completions-compatible endpoints. "
            f"For a richer chatbot template with support for all conversational endpoints on Databricks, "
            f"see https://docs.databricks.com/aws/en/generative-ai/agent-framework/chat-app"
        )

def _query_endpoint(endpoint_name: str, messages: list[dict[str, str]], max_tokens) -> list[dict[str, str]]:
    """Calls a model serving endpoint."""
    _validate_endpoint_task_type(endpoint_name)
    
    res = get_deploy_client('databricks').predict(
        endpoint=endpoint_name,
        inputs={'messages': messages, "max_tokens": max_tokens},
    )
    if "messages" in res:
        return res["messages"]
    elif "choices" in res:
        choice_message = res["choices"][0]["message"]
        choice_content = choice_message.get("content")
        
        # Case 1: The content is a list of structured objects
        if isinstance(choice_content, list):
            combined_content = "".join([part.get("text", "") for part in choice_content if part.get("type") == "text"])
            reformatted_message = {
                "role": choice_message.get("role"),
                "content": combined_content
            }
            return [reformatted_message]
        
        # Case 2: The content is a simple string
        elif isinstance(choice_content, str):
            return [choice_message]
    raise Exception("This app can only run against:"
                    "1) Databricks foundation model or external model endpoints with the chat task type (described in https://docs.databricks.com/aws/en/machine-learning/model-serving/score-foundation-models#chat-completion-model-query)"
                    "2) Databricks agent serving endpoints that implement the conversational agent schema documented "
                    "in https://docs.databricks.com/aws/en/generative-ai/agent-framework/author-agent")

def query_endpoint(endpoint_name, messages, max_tokens):
    """
    Query a chat-completions or agent serving endpoint
    If querying an agent serving endpoint that returns multiple messages, this method
    returns the last message
    ."""
    return _query_endpoint(endpoint_name, messages, max_tokens)[-1]


In [0]:

import json
from mlflow.deployments import get_deploy_client
from databricks.sdk import WorkspaceClient

endpoint_name = "databricks-gpt-oss-120b" # any model from "Serving" left menu
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "What is the capital of Japan?"
    }
]
max_tokens = 400

def query_endpoint(endpoint_name: str, messages: list[dict[str, str]], max_tokens) -> list[dict[str, str]]:
    """Calls a model serving endpoint."""
   
    res = get_deploy_client('databricks').predict(
        endpoint=endpoint_name,
        inputs={'messages': messages, "max_tokens": max_tokens},
    )
    if "messages" in res:
        return res["messages"]
    elif "choices" in res:
        choice_message = res["choices"][0]["message"]
        choice_content = choice_message.get("content")
        
        # Case 1: The content is a list of structured objects
        if isinstance(choice_content, list):
            combined_content = "".join([part.get("text", "") for part in choice_content if part.get("type") == "text"])
            reformatted_message = {
                "role": choice_message.get("role"),
                "content": combined_content
            }
            return [reformatted_message]
        
        # Case 2: The content is a simple string
        elif isinstance(choice_content, str):
            return [choice_message]
    raise Exception("This app can only run against:"
                    "1) Databricks foundation model or external model endpoints with the chat task type (described in https://docs.databricks.com/aws/en/machine-learning/model-serving/score-foundation-models#chat-completion-model-query)"
                    "2) Databricks agent serving endpoints that implement the conversational agent schema documented "
                    "in https://docs.databricks.com/aws/en/generative-ai/agent-framework/author-agent")

res = query_endpoint(endpoint_name, messages, max_tokens)
print(json.dumps(res, indent=2))

[
  {
    "role": "assistant",
    "content": "The capital of Japan is **Tokyo**."
  }
]


In [0]:
!pip install databricks-sdk[openai]>=0.35.0

Collecting openai (from databricks-sdk[openai]>=0.35.0)
  Downloading openai-2.8.1-py3-none-any.whl.metadata (29 kB)
Collecting langchain-openai (from databricks-sdk[openai]>=0.35.0)
  Downloading langchain_openai-1.0.3-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<2.0.0,>=1.0.2 (from langchain-openai->databricks-sdk[openai]>=0.35.0)
  Downloading langchain_core-1.0.5-py3-none-any.whl.metadata (3.6 kB)
Collecting tiktoken<1.0.0,>=0.7.0 (from langchain-openai->databricks-sdk[openai]>=0.35.0)
  Downloading tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Collecting jiter<1,>=0.10.0 (from openai->databricks-sdk[openai]>=0.35.0)
  Downloading jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting tqdm>4 (from openai->databricks-sdk[openai]>=0.35.0)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting jsonpatch<2.0.0,>=1.33.0 (from langchain-core<2.0.0,>=1.0.2->langchain-openai->databricks-

In [0]:

dbutils.library.restartPython()

In [0]:

from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
openai_client = w.serving_endpoints.get_open_ai_client()

response = openai_client.chat.completions.create(
    model="databricks-meta-llama-3-3-70b-instruct",
    messages=[
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": "What is a mixture of experts model?",
      }
    ],
    max_tokens=256
)

print(response.choices[0].message.content)

A Mixture of Experts (MoE) model is a type of neural network architecture that combines the predictions of multiple expert models to produce a final output. The basic idea is to divide the input space into different regions, each handled by a specialized expert model. The experts are typically simple models, such as linear or small neural networks, that are trained to be proficient in a specific subset of the input data.

The MoE model consists of three main components:

1. **Gating network**: This is a neural network that takes the input data and produces a set of weights or probabilities that determine the contribution of each expert to the final output. The gating network essentially decides which expert is most relevant for a given input.
2. **Expert models**: These are the individual models that are responsible for making predictions in different regions of the input space. Each expert is trained on a subset of the data and is specialized to handle a specific type of input.
3. **O

In [0]:

import os

os.environ["YOUR_DATABRICKS_TOKEN"] = "dapi1123999e2fd1ea9c51255f99e8d495d3"

In [0]:


import os
import json
from openai import OpenAI

current_workspace = f'https://{spark.conf.get("spark.databricks.workspaceUrl")}'

DATABRICKS_TOKEN = os.environ.get('YOUR_DATABRICKS_TOKEN')
DATABRICKS_BASE_URL = f'{current_workspace}/serving-endpoints'

client = OpenAI(
  api_key=DATABRICKS_TOKEN,
  base_url=DATABRICKS_BASE_URL
  )

messages = [{"role": "user", "content": "What is the weather in Tokyo?"}]

response = client.chat.completions.create(
    model="databricks-claude-3-7-sonnet",
    messages=messages,
)

print(response.choices[0].message.content)

I don't have the ability to check current weather conditions or forecasts for Tokyo or any other location. To get accurate weather information for Tokyo, you could check a weather website like Weather.com or AccuWeather, use a weather app on your device, or search for "Tokyo weather" on a search engine.


In [0]:
%pip install -U mlflow
dbutils.library.restartPython()

Collecting mlflow
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.6.0 (from mlflow)
  Downloading mlflow_skinny-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.6.0 (from mlflow)
  Downloading mlflow_tracing-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.17.2-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.4-py3-none-a

In [0]:
import mlflow
mlflow.openai.autolog()

In [0]:

import os
import json
from openai import OpenAI

current_workspace = f'https://{spark.conf.get("spark.databricks.workspaceUrl")}'
os.environ["YOUR_DATABRICKS_TOKEN"] = "dapi1123999e2fd1ea9c51255f99e8d495d3"
DATABRICKS_TOKEN = os.environ.get('YOUR_DATABRICKS_TOKEN')
DATABRICKS_BASE_URL = f'{current_workspace}/serving-endpoints'

client = OpenAI(
  api_key=DATABRICKS_TOKEN,
  base_url=DATABRICKS_BASE_URL
  )

tools = [
  {
    "type": "function",
    "function": {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
          },
          "unit": {
            "type": "string",
            "enum": [
              "celsius",
              "fahrenheit"
            ]
          }
        }
      }
    }
  }
]

messages = [{"role": "user", "content": "What is the weather in Tokyo in celsius?"}]

response = client.chat.completions.create(
    model="databricks-claude-3-7-sonnet",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)

print(json.dumps(response.choices[0].message.model_dump()['tool_calls'], indent=2))

[
  {
    "id": "toolu_bdrk_01RqrgXDu94UXSGBTSX7Brnh",
    "function": {
      "arguments": "{\"location\":\"Tokyo\",\"unit\":\"celsius\"}",
      "name": "get_current_weather"
    },
    "type": "function"
  }
]


Trace(trace_id=tr-81b0a7888526a19482c081cef3033eb4)