# Martian SDK Quickstart Guide

In [31]:
# Imports
import openai
from openai.types.chat import (
    chat_completion,
    chat_completion_message,
)

from martian_apart_hack_sdk import judge_specs, martian_client, utils
from martian_apart_hack_sdk.models import RouterConstraints

## Load Credentials
You must have a .env file with the following values set:

1. `MARTIAN_API_URL` - withmartian.com/api
1. `MARTIAN_ORG_ID` - your Martian organization ID
1. `MARTIAN_API_KEY` - your personal API key

In [2]:
# Load the Martian settings through the SDK helper (the API URL, API key and
# organization ID described above), then build the client that the judge and
# router cells below call into.
config = utils.load_config()
client = martian_client.MartianClient(
    api_url=config.api_url,
    api_key=config.api_key,
    org_id=config.org_id,
)

## Martian Gateway

You can use Martian as a gateway to access a number of different LLM providers.
To do so, you start by making an OpenAI client with the base_url set to the Martian API URL + "/openai/v2".
Then you can use the client as you would when working with OpenAI.

The available models are:

- gpt-4.5-preview
- gpt-4.1
- gpt-4.1-mini
- gpt-4.1-nano
- gpt-4o
- gpt-4o-mini
- o3-mini

- claude-3-opus-latest
- claude-3-5-haiku-latest
- claude-3-5-sonnet-latest
- claude-3-7-sonnet-latest

- together/deepseek-ai/DeepSeek-R1
- together/deepseek-ai/DeepSeek-V3
- together/mistralai/Mistral-Small-24B-Instruct-2501
- together/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
- together/meta-llama/Llama-3.3-70B-Instruct-Turbo
- together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
- together/Qwen/Qwen2.5-72B-Instruct-Turbo
- together/Qwen/Qwen2.5-Coder-32B-Instruct
- together/google/gemma-2-27b-it

In [8]:
# Build an OpenAI client whose base URL is the Martian gateway endpoint.
openai_client = openai.OpenAI(
    base_url=config.api_url + "/openai/v2",
    api_key=config.api_key,
)

# Ask three different gateway models the same single-turn question.
gpt_nano_chat_completion_response = openai_client.chat.completions.create(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    model="gpt-4.1-nano",
)
claude_3_haiku_chat_completion_response = openai_client.chat.completions.create(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    model="claude-3-5-haiku-latest",
)
gemma_2_chat_completion_response = openai_client.chat.completions.create(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    model="together/google/gemma-2-27b-it",
)

# Print the text of the first choice from each response, in request order.
for _prefix, _reply in (
    ("gpt-4.1-nano says:", gpt_nano_chat_completion_response),
    ("claude-3-5-haiku-latest says:", claude_3_haiku_chat_completion_response),
    ("gemma-2-27b-it says:", gemma_2_chat_completion_response),
):
    print(_prefix, _reply.choices[0].message.content)

gpt-4.1-nano says: The capital of France is Paris.
claude-3-5-haiku-latest says: The capital of France is Paris.
gemma-2-27b-it says: The capital of France is **Paris**. 🇫🇷 



## Judging

To create a basic rubric-based judge with a numeric scoring model, you just need to provide the rubric, the minimum and maximum scores, and the model you'd like to use as the judge.

MARTIAN TIP: It's better to use discrete numbers—and, if possible, a binary scale—to minimize potential biases.

In [9]:
# Describe the judge: a 1-5 rubric scored by gpt-4o.

# The rubric text, one scoring level per line (no leading/trailing newline).
rubric = "\n".join(
    [
        "You are tasked with evaluating whether a restaurant recommendation is good.",
        "The scoring is as follows:",
        "- 1: If the recommendation doesn't meet any of the criteria.",
        "- 2: If the recommendation meets only some small part of the criteria.",
        "- 3: If the recommendation is reasonable, but not perfect.",
        "- 4: If the recommendation is almost perfect.",
        "- 5: If the recommendation is perfect.",
    ]
)

rubric_judge_spec = judge_specs.RubricJudgeSpec(
    rubric=rubric,
    model="openai/openai/gpt-4o",
    model_type="rubric_judge",
    max_score=5,
    min_score=1,
)

In [10]:
# Try the (not yet saved) judge spec on a hand-written exchange.

chat_request_text = "What is a good Chinese restaurant in downtown San Francisco?"
chat_response_text = "I couldn't find a good Mexican restaurant near you."

# Request half of the exchange, in chat-completions request form.
completion_request = {
    "model": "openai/openai/gpt-4o-mini",
    "messages": [{"role": "user", "content": chat_request_text}],
}
# Response half: a single-choice completion carrying the canned reply.
chat_completion_response = chat_completion.ChatCompletion(
    object="chat.completion",
    model="gpt-4o",
    created=0,
    id="123",
    service_tier=None,
    choices=[
        chat_completion.Choice(
            index=0,
            finish_reason="stop",
            message=chat_completion_message.ChatCompletionMessage(
                content=chat_response_text,
                role="assistant",
            ),
        )
    ],
)

# The spec is passed as a plain dict together with the exchange to be judged.
evaluation_result = client.judges.evaluate_judge_spec(
    rubric_judge_spec.to_dict(),
    completion_response=chat_completion_response,
    completion_request=completion_request,
)

print(
    f"User: {chat_request_text}",
    f"Assistant: {chat_response_text}",
    f"Evaluation result: {evaluation_result}",
    sep="\n",
)

User: What is a good Chinese restaurant in downtown San Francisco?
Assistant: I couldn't find a good Mexican restaurant near you.
Evaluation result: JudgeEvaluation(score=1, reason="\nThe assistant's response does not meet the user's request at all. The user specifically asked for a recommendation for a Chinese restaurant in downtown San Francisco, but the assistant responded about not finding a Mexican restaurant. This indicates a lack of relevance and does not address the user's query in any capacity. According to the rubric, this response doesn't meet any of the criteria for a good recommendation, which results in the lowest possible score.\n", cost=0.0016425)


Once you're satisfied with the judge, you can save it.

In [11]:
judge_id = "restaurant-recommendation-reviewer"

# Look the judge up first so that re-running this cell does not try to
# create the same judge twice.
judge = client.judges.get(judge_id=judge_id)
if judge is not None:
    print(f"Judge {judge_id} already exists. Skipping creation.")
else:
    judge = client.judges.create_judge(
        judge_id=judge_id,
        description="A judge that rates how good restaurant recommendations are.",
        judge_spec=rubric_judge_spec,
    )
    print(f"Created a judge: {judge}")

Judge restaurant-recommendation-reviewer already exists. Skipping creation.


You can now also evaluate with the saved judge instead of passing a raw spec.

In [12]:
# Score the same request/response pair again, this time passing the judge
# object from the previous cell (fetched or newly created) instead of a
# spec dict.
evaluation_result = client.judges.evaluate_judge(
    judge,
    completion_request=completion_request,
    completion_response=chat_completion_response,
)

print(f"Judge response: {evaluation_result}")

Judge response: JudgeEvaluation(score=1, reason="\nThe assistant's response does not meet any of the criteria for a good recommendation because it fails to provide a relevant suggestion. The user asked specifically for a Chinese restaurant in downtown San Francisco, but the assistant mistakenly mentioned not finding a Mexican restaurant, which is not relevant to the user's query. Therefore, the recommendation is not applicable to the user's request for information.\n", cost=0.0015225)


### Once you have created some judges, you may want to list them all

In [15]:
# List every judge returned by the API and print one per line.
all_judges = client.judges.list()
print("Judges:")
# print() inserts its separator (a single space by default) between
# arguments. Embedding "\n" in each item therefore left a stray leading space
# on every line after the first; passing sep="\n" keeps the items aligned.
print(*(f"\t- {j}" for j in all_judges), sep="\n")

Judges:
	- Judge(id='my-rubric-judge', version=18, description='', createTime='2025-05-26T14:49:06.634150Z', name='organizations/68275a4af88f39d6440c1050/judges/my-rubric-judge', judgeSpec=None)
 	- Judge(id='restaurant-recommendation-reviewer-extended', version=1, description='A judge that rates how helpful the question for the target is', createTime='2025-05-23T20:04:22.152450Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer-extended', judgeSpec=None)
 	- Judge(id='restaurant-recommendation-reviewer', version=4, description='A judge that rates how good restaurant recommendations are.', createTime='2025-05-23T20:02:34.730158Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer', judgeSpec=None)



If you already know the ID, you can retrieve the judge.

In [16]:
# Fetch one specific version of the saved judge by ID; `version=1` pins the
# first version rather than whichever version `get` would return by default.
retrieved_judge = client.judges.get(
    judge_id="restaurant-recommendation-reviewer",
    version=1,
)
print(f"\nRetrieved judge: {retrieved_judge}")



Retrieved judge: Judge(id='restaurant-recommendation-reviewer', version=1, description='A judge that rates how good restaurant recommendations are.', createTime='2025-05-23T15:52:53.326120Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer', judgeSpec={'extract_judgement': {'extraction_fields': [{'extraction_pattern': '<rationale>(.*?)</rationale>', 'field_type': 'STRING', 'match_index': -1, 'name': 'rationale', 'required': True}, {'extraction_pattern': '<score>(.*?)</score>', 'field_type': 'FLOAT', 'match_index': -1, 'name': 'score', 'required': True}], 'model_type': 'regex_extractor'}, 'extract_variables': {'extraction_fields': [{'extraction_pattern': '', 'field_type': 'STRING', 'match_index': 0, 'name': 'content', 'required': True}], 'model_type': 'default_extractor'}, 'max_score': 5, 'min_score': 1, 'model': 'openai/openai/gpt-4o', 'model_type': 'rubric_judge', 'postscript': "Here's the conversation you are judging:\n<content>\n${content}\n</

Or you can update the judge to create a new version.

MARTIAN TIP: Judges are immutable, so there's no risk of breaking anything when you update. For example, if your current production setup is using version 2 and you update the judge, it will simply create version 3 without affecting your existing production environment.

In [17]:
# Rubric for the next version of the judge. Edit this text to change how the
# updated judge scores recommendations.
new_rubric = """
You are tasked with evaluating whether a restaurant recommendation is good.
The scoring is as follows:
- 1: If the recommendation doesn't meet any of the criteria.
- 2: If the recommendation meets only some small part of the criteria.
- 3: If the recommendation is reasonable, but not perfect.
- 4: If the recommendation is almost perfect.
- 5: If the recommendation is perfect.
""".strip()

new_rubric_judge_spec = judge_specs.RubricJudgeSpec(
    model_type="rubric_judge",
    # Use the rubric defined in this cell. The cell previously passed the
    # older `rubric` variable, so edits to `new_rubric` were silently ignored.
    rubric=new_rubric,
    # TODO: Clearly communicate which models are available.
    model="openai/openai/gpt-4o",
    min_score=1,
    max_score=5,
)

# Updating stores the spec as a new version of the existing judge.
new_judge_spec = client.judges.update_judge(
    judge_id="restaurant-recommendation-reviewer",
    judge_spec=new_rubric_judge_spec,
)

print(f"\nNew judge spec: {new_judge_spec}")


New judge spec: Judge(id='restaurant-recommendation-reviewer', version=5, description='A judge that rates how good restaurant recommendations are.', createTime='2025-05-26T14:57:59.746789Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer', judgeSpec={'extract_judgement': {'extraction_fields': [{'extraction_pattern': '<rationale>(.*?)</rationale>', 'field_type': 'STRING', 'match_index': -1, 'name': 'rationale', 'required': True}, {'extraction_pattern': '<score>(.*?)</score>', 'field_type': 'FLOAT', 'match_index': -1, 'name': 'score', 'required': True}], 'model_type': 'regex_extractor'}, 'extract_variables': {'extraction_fields': [{'extraction_pattern': '', 'field_type': 'STRING', 'match_index': 0, 'name': 'content', 'required': True}], 'model_type': 'default_extractor'}, 'max_score': 5, 'min_score': 1, 'model': 'openai/openai/gpt-4o', 'model_type': 'rubric_judge', 'postscript': "Here's the conversation you are judging:\n<content>\n${content}\n</c

### Example of a More Complex Judge
We can start with a specification that includes not only the rubric but also a customized prescript (which is added to the judge prompt before the rubric) and postscript (which goes after the rubric). This setup is typically used to customize how your judge processes requests.

In [25]:
json_spec = '{\n  "model_type": "rubric_judge",\n  "rubric": "Is important question helps to advance to the target?",\n  "model": "gpt-4o-mini",\n  "min_score": 1.0,\n  "max_score": 4.0,\n  "prescript": "target of the conversation: ${target}.\\nConversation: ${conversation}.important question: ${important_question}.\\n\\n",\n  "postscript": "Please evaluate conversation according to the rubric.\\nThink step-by-step to produce a score, and please provide a rationale for your score.\\nYour score should be between ${min_score} and ${max_score}.\\n\\nYour response MUST include:\\n1. A <rationale>...</rationale> tag containing your explanation\\n2. A <score>...</score> tag containing your numerical score\\n",\n  "extract_variables": {\n    "model_type": "combined_extractor",\n    "extraction_fields": [\n      {\n        "name": "target",\n        "field_type": "STRING",\n        "required": true,\n        "extraction_pattern": "\\"target\\":\\"([\\\\S\\\\s]*?)\\"",\n        "match_index": 0\n      },\n      {\n        "name": "important_question",\n        "field_type": "STRING",\n        "required": true,\n        "extraction_pattern": "\\"important\\", \\"question\\": \\"([\\\\s\\\\S]*?)\\"",\n        "match_index": -1\n      },\n      {\n        "name": "conversation",\n        "field_type": "STRING",\n        "required": true,\n        "extraction_pattern": null,\n        "match_index": 0\n      }\n    ],\n    "extractors": [\n      {\n        "model_type": "regex_extractor",\n        "extraction_fields": [\n          {\n            "name": "target",\n            "field_type": "STRING",\n            "required": true,\n            "extraction_pattern": "\\"target\\":\\"([\\\\S\\\\s]*?)\\"",\n            "match_index": 0\n          }\n        ]\n      },\n      {\n        "model_type": "response_regex_extractor",\n        "extraction_fields": [\n          {\n            "name": "important_question",\n            "field_type": "STRING",\n            "required": true,\n 
           "extraction_pattern": "\\"important\\", \\"question\\": \\"([\\\\s\\\\S]*?)\\"",\n            "match_index": -1\n          }\n        ]\n      },\n      {\n        "model_type": "conversation_extractor",\n        "extraction_fields": [\n          {\n            "name": "conversation",\n            "field_type": "STRING",\n            "required": true,\n            "extraction_pattern": null,\n            "match_index": 0\n          }\n        ]\n      }\n    ]\n  },\n  "extract_judgement": {\n    "model_type": "regex_extractor",\n    "extraction_fields": [\n      {\n        "name": "rationale",\n        "field_type": "STRING",\n        "required": true,\n        "extraction_pattern": "<rationale>(.*?)</rationale>",\n        "match_index": -1\n      },\n      {\n        "name": "score",\n        "field_type": "FLOAT",\n        "required": true,\n        "extraction_pattern": "<score>(.*?)</score>",\n        "match_index": -1\n      }\n    ]\n  }\n}'

In [26]:
import json
# Parse the JSON text into a plain dict. As the last expression in the cell,
# the parsed spec is displayed so it can be inspected before it is used.
json.loads(json_spec)

{'model_type': 'rubric_judge',
 'rubric': 'Is important question helps to advance to the target?',
 'model': 'gpt-4o-mini',
 'min_score': 1.0,
 'max_score': 4.0,
 'prescript': 'target of the conversation: ${target}.\nConversation: ${conversation}.important question: ${important_question}.\n\n',
 'postscript': 'Please evaluate conversation according to the rubric.\nThink step-by-step to produce a score, and please provide a rationale for your score.\nYour score should be between ${min_score} and ${max_score}.\n\nYour response MUST include:\n1. A <rationale>...</rationale> tag containing your explanation\n2. A <score>...</score> tag containing your numerical score\n',
 'extract_variables': {'model_type': 'combined_extractor',
  'extraction_fields': [{'name': 'target',
    'field_type': 'STRING',
    'required': True,
    'extraction_pattern': '"target":"([\\S\\s]*?)"',
    'match_index': 0},
   {'name': 'important_question',
    'field_type': 'STRING',
    'required': True,
    'extracti

In [28]:
judge_id = "restaurant-recommendation-reviewer-extended"

# Same get-or-create pattern as before; here the spec is the dict parsed
# from the JSON text rather than a RubricJudgeSpec object.
judge = client.judges.get(judge_id=judge_id)
if judge is not None:
    print(f"Judge {judge_id} already exists. Skipping creation.")
else:
    judge = client.judges.create_judge(
        judge_id=judge_id,
        description="A judge that rates how helpful the question for the target is",
        judge_spec=json.loads(json_spec),
    )
    print(f"Created a judge: {judge}")

Created a judge: Judge(id='restaurant-recommendation-reviewer-extended', version=1, description='A judge that rates how helpful the question for the target is', createTime='2025-05-23T20:04:22.152450Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer-extended', judgeSpec={'extract_judgement': {'extraction_fields': [{'extraction_pattern': '<rationale>(.*?)</rationale>', 'field_type': 'STRING', 'match_index': -1, 'name': 'rationale', 'required': True}, {'extraction_pattern': '<score>(.*?)</score>', 'field_type': 'FLOAT', 'match_index': -1, 'name': 'score', 'required': True}], 'model_type': 'regex_extractor'}, 'extract_variables': {'extraction_fields': [{'extraction_pattern': '"target":"([\\S\\s]*?)"', 'field_type': 'STRING', 'match_index': 0, 'name': 'target', 'required': True}, {'extraction_pattern': '"important", "question": "([\\s\\S]*?)"', 'field_type': 'STRING', 'match_index': -1, 'name': 'important_question', 'required': True}, {'extraction_pa

Let's test it with an example request and response:

In [29]:
# Build an exchange whose text contains the fragments that the extended
# judge's extraction patterns look for.
completion_request = {
    "messages": [
        {
            "role": "system",
            # The `"target":"..."` fragment is the text matched by the spec's
            # `target` extraction pattern.
            "content": 'Help me to get to my target: "target":"solve P=NP"',
        },
        {
            "role": "user",
            "content": "Pls answer the question",
        },
    ]
}
chat_completion_response = chat_completion.ChatCompletion(
    id="123",
    choices=[
        chat_completion.Choice(
            finish_reason="stop",
            index=0,
            message=chat_completion_message.ChatCompletionMessage(
                role="assistant",
                # The `"important", "question": "..."` fragment is the text
                # matched by the spec's `important_question` pattern; the
                # exact spacing and quoting matter for the regex.
                content='{"type": "important", "question": "Would like to use differential equations to solve P=NP?"}',
            ),
        )
    ],
    created=0,
    model="gpt-4o",
    object="chat.completion",
    service_tier=None,
)


In [30]:
# Run the extended judge (the `judge` object from the get-or-create cell) on
# the request/response pair built in the previous cell.
evaluation_result = client.judges.evaluate_judge(
    judge,
    completion_request=completion_request,
    completion_response=chat_completion_response,
)

print(f"Judge response: {evaluation_result}")

Judge response: JudgeEvaluation(score=2, reason='In evaluating the conversation, the important question posed by the user about using differential equations to solve P=NP appears to be somewhat off-target. The P=NP problem is a major unsolved question in computer science concerning computational complexity, while differential equations are typically used in modeling continuous change across various scientific and engineering fields. Although there may be abstract or novel approaches that could theoretically connect these two areas, the question as it stands does not directly advance the conversation towards solving P=NP. \n\nThus, while the question is interesting, it does not effectively contribute to advancing the target of the conversation, which is to solve P=NP. Consequently, I would rate this conversation a score of 2.0, indicating that while there is some relevance, the connection is tenuous and not particularly constructive.', cost=0.00012705)


Let's measure the IAA (Inter-Annotator Agreement) of our judge using the gold scores provided by domain experts.

In [31]:
from sklearn.metrics import cohen_kappa_score
# Define test examples
def create_chat_completion(request_text: str, response_text: str) -> chat_completion.ChatCompletion:
    """Wrap ``response_text`` in a minimal single-choice ChatCompletion.

    Args:
        request_text: The user prompt. Not used by the body; the request is
            passed to the judge separately.
        response_text: Text placed in the assistant message.

    Returns:
        A ChatCompletion with fixed placeholder metadata and exactly one
        assistant choice whose finish reason is "stop".
    """
    assistant_message = chat_completion_message.ChatCompletionMessage(
        content=response_text,
        role="assistant",
    )
    only_choice = chat_completion.Choice(
        message=assistant_message,
        index=0,
        finish_reason="stop",
    )
    return chat_completion.ChatCompletion(
        object="chat.completion",
        model="gpt-4o",
        created=0,
        id="test-completion",
        choices=[only_choice],
        service_tier=None,
    )

# Labelled test cases as (request, response, golden_score) triples, one per
# rubric level.
_labelled_cases = (
    (
        "What's a good Chinese restaurant in San Francisco?",
        "I recommend China Live in Chinatown. It's known for its excellent dim sum, modern atmosphere, and authentic dishes. The prices are moderate, and they're located at 644 Broadway.",
        5,  # Perfect recommendation with details
    ),
    (
        "Where can I get good pizza in NYC?",
        "Sorry, I don't have access to restaurant information.",
        1,  # Completely unhelpful
    ),
    (
        "What's a good Mexican restaurant in Chicago?",
        "There's a Mexican restaurant downtown.",
        2,  # Very minimal information
    ),
    (
        "Recommend an Italian restaurant in Boston.",
        "Giacomo's in the North End is a popular Italian restaurant. They serve pasta and seafood.",
        3,  # Basic but reasonable recommendation
    ),
    (
        "What's a good sushi place in LA?",
        "Nobu Malibu is an excellent sushi restaurant with ocean views. They're known for their fresh fish and signature dishes, though they are on the expensive side.",
        4,  # Almost perfect, but could include location details
    ),
)

# Expand the triples into the dict shape the evaluation loop reads.
examples = [
    {"request": request, "response": response, "golden_score": golden_score}
    for request, response, golden_score in _labelled_cases
]

# Judge under test: the 1-5 restaurant rubric, scored by gpt-4o.
rubric = (
    "You are tasked with evaluating whether a restaurant recommendation is good.\n"
    "The scoring is as follows:\n"
    "- 1: If the recommendation doesn't meet any of the criteria.\n"
    "- 2: If the recommendation meets only some small part of the criteria.\n"
    "- 3: If the recommendation is reasonable, but not perfect.\n"
    "- 4: If the recommendation is almost perfect.\n"
    "- 5: If the recommendation is perfect."
)

judge_spec = judge_specs.RubricJudgeSpec(
    rubric=rubric,
    model="openai/openai/gpt-4o",
    model_type="rubric_judge",
    max_score=5,
    min_score=1,
)

# Evaluate examples
# Parallel lists: judge_scores[k] and golden_scores[k] refer to the same example.
judge_scores = []
golden_scores = []

print("\nEvaluating examples...")
print("-" * 80)

# Start the counter at 1 so the printed example numbers are human-friendly.
for i, example in enumerate(examples, 1):
    # Create completion request and response
    completion_request = {
        "model": "openai/openai/gpt-4o",
        "messages": [{"role": "user", "content": example["request"]}],
    }
    completion_response = create_chat_completion(example["request"], example["response"])

    # Get judge's evaluation (one judge call per example)
    evaluation = client.judges.evaluate_judge_spec(
        judge_spec.to_dict(),
        completion_request=completion_request,
        completion_response=completion_response,
    )

    # Cohen's kappa compares categorical labels, so the score is cast to int.
    # NOTE: int() truncates, so a fractional judge score such as 4.5 is
    # recorded as 4 here while the print below shows the raw value.
    judge_scores.append(int(evaluation.score))
    golden_scores.append(example["golden_score"])

    print(f"\nExample {i}:")
    print(f"User: {example['request']}")
    print(f"Assistant: {example['response']}")
    print(f"Judge Score: {evaluation.score}")
    print(f"Golden Score: {example['golden_score']}")
    # Uncomment to also show the judge's explanation. The recorded
    # JudgeEvaluation outputs expose it as `reason` (not `rationale`):
    # print(f"Judge Rationale: {evaluation.reason}")
    print("-" * 80)

# Agreement between the golden labels and the judge's labels.
kappa = cohen_kappa_score(golden_scores, judge_scores)

# Summary followed by a legend for reading the kappa value.
print(
    "\nResults Summary:",
    f"Number of examples evaluated: {len(examples)}",
    f"Cohen's Kappa Score: {kappa:.3f}",
    "\nInterpretation of Kappa Score:",
    "< 0.00: Poor agreement",
    "0.00-0.20: Slight agreement",
    "0.21-0.40: Fair agreement",
    "0.41-0.60: Moderate agreement",
    "0.61-0.80: Substantial agreement",
    "0.81-1.00: Almost perfect agreement",
    sep="\n",
)


Evaluating examples...
--------------------------------------------------------------------------------

Example 1:
User: What's a good Chinese restaurant in San Francisco?
Assistant: I recommend China Live in Chinatown. It's known for its excellent dim sum, modern atmosphere, and authentic dishes. The prices are moderate, and they're located at 644 Broadway.
Judge Score: 4.5
Golden Score: 5
--------------------------------------------------------------------------------

Example 2:
User: Where can I get good pizza in NYC?
Assistant: Sorry, I don't have access to restaurant information.
Judge Score: 1
Golden Score: 1
--------------------------------------------------------------------------------

Example 3:
User: What's a good Mexican restaurant in Chicago?
Assistant: There's a Mexican restaurant downtown.
Judge Score: 2
Golden Score: 2
--------------------------------------------------------------------------------

Example 4:
User: Recommend an Italian restaurant in Boston.
Assista

## Routing

Let's start with the base model. You can access it through the OpenAI client pointed at the Martian endpoint.

In [18]:
# TODO change to enum from SDK
# Model that the router is created with in a later cell.
base_model = "openai/openai/gpt-4o"
# OpenAI client whose base URL is the Martian gateway endpoint.
openai_client = openai.OpenAI(
    api_key=config.api_key,
    base_url=config.api_url + "/openai/v2"
)
# Prepare the OpenAI chat completion request
# NOTE(review): the request targets gpt-4o-mini rather than `base_model`
# (gpt-4o) — confirm this is intentional.
# `chat_request_text` is defined in the judging section earlier in the notebook.
openai_completion_request = {
    "model": "openai/openai/gpt-4o-mini",
    "messages": [
        {
            "role": "user",
            "content": chat_request_text
        }
    ],
    "max_tokens": 100
}
# Last expression: display the request dict as the cell output.
openai_completion_request

{'model': 'openai/openai/gpt-4o-mini',
 'messages': [{'role': 'user',
   'content': 'What is a good Chinese restaurant in downtown San Francisco?'}],
 'max_tokens': 100}

In [20]:
# Send the prepared request through the gateway, then show the text of the
# first choice as the cell output.
response = openai_client.chat.completions.create(
    **openai_completion_request
)
response.choices[0].message.content

"One popular Chinese restaurant in downtown San Francisco is **Z & Y Restaurant**. It's known for its Sichuan cuisine and offers a variety of flavorful dishes, including spicy noodles and mapo tofu. Another great option is **Hakkasan**, which provides a modern twist on traditional Chinese dishes in an upscale environment. If you're looking for dim sum, **Yank Sing** is a well-regarded choice with a wide selection. Be sure to check current reviews and availability, as restaurants can change over time"

In [21]:
# The response returned through the gateway exposes the cost of the LLM request.
response.cost

6.269999999999999e-05

In [22]:
# Get-or-create the router so the cell can be re-run safely.
router_id = "restaurant-recommendation-router"
router = client.routers.get(router_id)
if router is not None:
    print(f"Router {router_id} already exists. Skipping creation.")
else:
    # A new router is created with the base model chosen earlier.
    router = client.routers.create_router(
        router_id,
        base_model,
        description="It's a brand new router to select the best model on restaurant recommendations.",
    )
    print(f"Created a router: {router}")

Router restaurant-recommendation-router already exists. Skipping creation.


In [23]:
# List every router returned by the API and print one per line.
all_routers = client.routers.list()
print("Routers:")
# print() inserts its separator (a single space by default) between
# arguments. Embedding "\n" in each item therefore left a stray leading space
# on every line after the first; passing sep="\n" keeps the items aligned.
print(*(f"\t- {r}" for r in all_routers), sep="\n")

Routers:
	- Router(id='new-super-router-id', version=77, description="It's a new cool router", createTime='2025-05-26T14:49:17.276744Z', name='organizations/68275a4af88f39d6440c1050/routers/new-super-router-id', routerSpec=None)
 	- Router(id='restaurant-recommendation-router', version=1, description="It's a brand new router to select the best model on restaurant recommendations.", createTime='2025-05-23T17:19:54.252429Z', name='organizations/68275a4af88f39d6440c1050/routers/restaurant-recommendation-router', routerSpec=None)
 	- Router(id='new-super-router-is', version=1, description="It's a new cool router", createTime='2025-05-22T20:20:14.657673Z', name='organizations/68275a4af88f39d6440c1050/routers/new-super-router-is', routerSpec=None)



In [24]:
# Fetch the router by its ID. The returned object's `name` is used as the
# model name when routing requests in the cells below.
retrieved_router = client.routers.get(router_id)
print(f"\nRetrieved router: {retrieved_router}")


Retrieved router: Router(id='restaurant-recommendation-router', version=1, description="It's a brand new router to select the best model on restaurant recommendations.", createTime='2025-05-23T17:19:54.252429Z', name='organizations/68275a4af88f39d6440c1050/routers/restaurant-recommendation-router', routerSpec={'points': [{'point': {'x': 0, 'y': 0}, 'executor': {'spec': {'executor_type': 'ModelExecutor', 'model_name': 'openai/openai/gpt-4o'}}}, {'point': {'x': 1, 'y': 1}, 'executor': {'spec': {'executor_type': 'ModelExecutor', 'model_name': 'openai/openai/gpt-4o'}}}]})


Before the router is trained, it will use the base model for inference

To run the router:
- change the model name in the request to the router name,
- add routing constraints.

In [25]:
# Cost routing constraint: a RoutingConstraint wrapping a CostConstraint
# with a numeric value of 0.5.
cost_constraint = RouterConstraints.RoutingConstraint(
    cost_constraint=RouterConstraints.CostConstraint(
        value=RouterConstraints.ConstraintValue(numeric_value=0.5)
    )
)
# Last expression: display the constraint object as the cell output.
cost_constraint

RoutingConstraint(cost_constraint=CostConstraint(value=ConstraintValue(numeric_value=0.5, model_name=None)), quality_constraint=None)

In [26]:
# Copy the earlier request and override only "model". The dict-union
# operator `|` builds a new dict, so `openai_completion_request` is unchanged.
router_completion_request = openai_completion_request | {
    "model": retrieved_router.name  # using router name instead of base model name
}

In [39]:
# Send the request to the router. The routing constraint is rendered by the
# SDK helper and passed via `extra_body`, alongside the standard request
# fields.
router_response = openai_client.chat.completions.create(
    **router_completion_request,
    extra_body=RouterConstraints.render_extra_body_router_constraint(cost_constraint)
)
router_response.choices[0].message.content

'Downtown San Francisco has several excellent Chinese restaurants. Here are a few popular ones:\n\n1. **Hakkasan** - Known for its upscale dim sum and modern Cantonese dishes, Hakkasan offers a refined dining experience in a stylish setting.\n\n2. **City View Restaurant** - A favorite spot for traditional dim sum, City View is known for its delicious and authentic offerings in a bustling and lively atmosphere.\n\n3. **House of Nanking** - This restaurant has a bit of a cult'

In [40]:
# Show the cost of the routed request and the model the response reports
# (its `model` field), i.e. the model the router sent the request to.
print(f"Request cost: {router_response.cost}")
print(f"Request router to model: {router_response.model}")

Request cost: 0.001045
Request router to model: gpt-4o-2024-08-06


In [41]:
# Let's evaluate the router response
# `retrieved_judge` is version 1 of "restaurant-recommendation-reviewer",
# fetched in the judging section earlier in the notebook.
# TODO: OpenAI  response version was changed, need to be reshaped before sending to the judge
client.judges.evaluate_judge(retrieved_judge, router_completion_request, router_response)

JudgeEvaluation(score=4, reason="\nThe assistant's response provides three recommendations for Chinese restaurants in downtown San Francisco, which addresses the user's question directly. The recommendations include a variety of options: Hakkasan for a refined and upscale experience, City View Restaurant for traditional dim sum in a lively setting, and House of Nanking, which suggests a unique or cult favorite. This covers different styles of Chinese dining, catering to potential different preferences the user might have. However, the response cuts off in the middle of the House of Nanking description, which leaves the recommendation incomplete. Despite this, the assistant has offered a good range of options and seems informed about the locations and specialties of each restaurant. To be perfect, the response would need to be complete and possibly include more fitting details about why each place is a strong choice. Overall, the response is almost perfect but is slightly marred by the 

### Training router

In [51]:
# TODO