# Martian SDK Quickstart Guide

In [17]:
# Imports
import openai
from openai.types.chat import (
    chat_completion,
    chat_completion_message,
)

from martian_apart_hack_sdk import judge_specs, martian_client, utils
from martian_apart_hack_sdk.models import RouterConstraints

## Load Credentials
You must have a .env file with the following values set:

1. `MARTIAN_API_URL` - withmartian.com/api
1. `MARTIAN_ORG_ID` - your Martian organization ID
1. `MARTIAN_API_KEY` - your personal API key

In [2]:
# Read the Martian settings through the SDK helper, then build the API client from them.
config = utils.load_config()
client = martian_client.MartianClient(
    org_id=config.org_id,
    api_key=config.api_key,
    api_url=config.api_url,
)

## Judging

In [3]:
# Define the judge: a scoring rubric plus the model that applies it.

# Scoring instructions for the judge, one rubric line per list entry.
rubric = "\n".join(
    [
        "You are tasked with evaluating whether a restaurant recommendation is good.",
        "The scoring is as follows:",
        "- 1: If the recommendation doesn't meet any of the criteria.",
        "- 2: If the recommendation meets only some small part of the criteria.",
        "- 3: If the recommendation is reasonable, but not perfect.",
        "- 4: If the recommendation is almost perfect.",
        "- 5: If the recommendation is perfect.",
    ]
)

# TODO: Clearly communicate which models are available.
rubric_judge_spec = judge_specs.RubricJudgeSpec(
    model_type="rubric_judge",
    model="openai/openai/gpt-4o",
    rubric=rubric,
    # Score bounds match the 1-5 scale listed in the rubric.
    min_score=1,
    max_score=5,
)

In [4]:
# Evaluate the (unsaved) judge spec against a hand-built request/response pair.
# TODO: Go through advanced judge building.

chat_request_text = "What is a good Chinese restaurant in downtown San Francisco?"
# The reply talks about a different cuisine than the one requested.
chat_response_text = "I couldn't find a good Mexican restaurant near you."

user_message = {"role": "user", "content": chat_request_text}
completion_request = {
    "model": "openai/openai/gpt-4o-mini",
    "messages": [user_message],
}

# Assemble the response bottom-up: message -> choice -> completion.
assistant_message = chat_completion_message.ChatCompletionMessage(
    role="assistant",
    content=chat_response_text,
)
assistant_choice = chat_completion.Choice(
    finish_reason="stop",
    index=0,
    message=assistant_message,
)
# id/created/model are placeholder metadata: this response is constructed by
# hand here rather than returned by an API call.
chat_completion_response = chat_completion.ChatCompletion(
    id="123",
    choices=[assistant_choice],
    created=0,
    model="gpt-4o",
    object="chat.completion",
    service_tier=None,
)

evaluation_result = client.judges.evaluate_judge_spec(
    rubric_judge_spec.to_dict(),
    completion_request=completion_request,
    completion_response=chat_completion_response,
)

print(
    f"User: {chat_request_text}",
    f"Assistant: {chat_response_text}",
    f"Evaluation result: {evaluation_result}",
    sep="\n",
)

User: What is a good Chinese restaurant in downtown San Francisco?
Assistant: I couldn't find a good Mexican restaurant near you.
Evaluation result: JudgeEvaluation(score=1, reason="\nThe assistant's response does not meet the user's request at all. The user asked for a recommendation for a Chinese restaurant in downtown San Francisco, but the assistant responded with information about a Mexican restaurant, which is irrelevant and entirely unrelated to the user's query. The response fails to address any part of the user's request for a Chinese restaurant recommendation. Thus, the recommendation does not meet any of the criteria outlined in the rubric.\n", cost=0.0016425)


In [5]:
# Save the judge spec under an ID once it behaves the way you want.

judge_id = "restaurant-recommendation-reviewer"
judge_description = "A judge that rates how good restaurant recommendations are."

judge = client.judges.create_judge(
    judge_id=judge_id,
    judge_spec=rubric_judge_spec,
    description=judge_description,
)

print(f"Created a judge: {judge}")

Created a judge: Judge(id='restaurant-recommendation-reviewer', version=1, description='A judge that rates how good restaurant recommendations are.', createTime='2025-05-23T15:52:53.326120Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer', judgeSpec={'extract_judgement': {'extraction_fields': [{'extraction_pattern': '<rationale>(.*?)</rationale>', 'field_type': 'STRING', 'match_index': -1, 'name': 'rationale', 'required': True}, {'extraction_pattern': '<score>(.*?)</score>', 'field_type': 'FLOAT', 'match_index': -1, 'name': 'score', 'required': True}], 'model_type': 'regex_extractor'}, 'extract_variables': {'extraction_fields': [{'extraction_pattern': '', 'field_type': 'STRING', 'match_index': 0, 'name': 'content', 'required': True}], 'model_type': 'default_extractor'}, 'max_score': 5, 'min_score': 1, 'model': 'openai/openai/gpt-4o', 'model_type': 'rubric_judge', 'postscript': "Here's the conversation you are judging:\n<content>\n${content}\n</c

In [6]:
# Evaluate the same request/response pair, this time with the saved judge.

evaluation_result = client.judges.evaluate_judge(
    judge,
    completion_response=chat_completion_response,
    completion_request=completion_request,
)

print(f"Judge response: {evaluation_result}")

{'judgement': {'score': 1, 'reason': "\nThe assistant's response is entirely irrelevant to the user's question. The user asked for a recommendation of a Chinese restaurant in downtown San Francisco, but the assistant responded with information about a Mexican restaurant, which is not what was requested. This means that the response does not meet any of the criteria set out for evaluating a good restaurant recommendation. Therefore, based on the rubric, the assistant's response merits the lowest possible score because it fails to provide any helpful information regarding the user's request.\n", 'cost': 0.0017625}}
Judge response: JudgeEvaluation(score=1, reason="\nThe assistant's response is entirely irrelevant to the user's question. The user asked for a recommendation of a Chinese restaurant in downtown San Francisco, but the assistant responded with information about a Mexican restaurant, which is not what was requested. This means that the response does not meet any of the criteria 

In [32]:
# Once you have created some judges, you may want to list them all.
# sep="\n" prints one judge per line; print's default " " separator would
# run every entry together on a single line.

all_judges = client.judges.list()
print("Judges:")
print(*(f"\t- {j}" for j in all_judges), sep="\n")

# Or get a specific version of a specific judge:

retrieved_judge = client.judges.get(
    judge_id="restaurant-recommendation-reviewer",
    version=1,
)
print(f"\nRetrieved judge: {retrieved_judge}")

# Or update the judge, creating a new version.
# Edit the rubric text below to change how the judge scores.

new_rubric = """
You are tasked with evaluating whether a restaurant recommendation is good.
The scoring is as follows:
- 1: If the recommendation doesn't meet any of the criteria.
- 2: If the recommendation meets only some small part of the criteria.
- 3: If the recommendation is reasonable, but not perfect.
- 4: If the recommendation is almost perfect.
- 5: If the recommendation is perfect.
""".strip()

new_rubric_judge_spec = judge_specs.RubricJudgeSpec(
    model_type="rubric_judge",
    # Pass the rubric defined in this cell; passing the original `rubric`
    # would silently ignore any edits made to `new_rubric`.
    rubric=new_rubric,
    # TODO: Clearly communicate which models are available.
    model="openai/openai/gpt-4o",
    min_score=1,
    max_score=5,
)

new_judge_spec = client.judges.update_judge(
    judge_id="restaurant-recommendation-reviewer",
    judge_spec=new_rubric_judge_spec,
)

print(f"\nNew judge spec: {new_judge_spec}")

Judges:
	- Judge(id='restaurant-recommendation-reviewer', version=2, description='A judge that rates how good restaurant recommendations are.', createTime='2025-05-23T15:53:12.734947Z', name='organizations/68275a4af88f39d6440c1050/judges/restaurant-recommendation-reviewer', judgeSpec=None) 	- Judge(id='my-rubric-judge', version=11, description='', createTime='2025-05-23T15:40:09.091441Z', name='organizations/68275a4af88f39d6440c1050/judges/my-rubric-judge', judgeSpec=None) 	- Judge(id='rubric-judge', version=1, description='', createTime='2025-05-23T15:22:05.194690Z', name='organizations/68275a4af88f39d6440c1050/judges/rubric-judge', judgeSpec=None) 	- Judge(id='new-quality-rubric-judge', version=2, description='A judge that evaluates response quality based on accuracy, completeness, clarity, and helpfulness', createTime='2025-05-22T14:01:07.021971Z', name='organizations/68275a4af88f39d6440c1050/judges/new-quality-rubric-judge', judgeSpec=None) 	- Judge(id='rubric-judge-test-id', versi

## Routing

Let's start with the base model. You can access the base model through the OpenAI client by pointing it at the Martian endpoint.

In [5]:
# TODO change to enum from SDK
base_model = "openai/openai/gpt-4o"

# OpenAI client pointed at the Martian API URL and authenticated with the Martian API key.
openai_client = openai.OpenAI(
    base_url=config.api_url + "/openai/v2",
    api_key=config.api_key,
)

# Chat completion request payload, reusing the user question from the judging section.
# NOTE(review): this targets gpt-4o-mini rather than `base_model` - confirm which is intended.
openai_completion_request = {
    "model": "openai/openai/gpt-4o-mini",
    "messages": [{"role": "user", "content": chat_request_text}],
    "max_tokens": 100,
}
openai_completion_request

{'model': 'openai/openai/gpt-4o-mini',
 'messages': [{'role': 'user',
   'content': 'What is a good Chinese restaurant in downtown San Francisco?'}],
 'max_tokens': 100}

In [6]:
# Send the request and display the message of the first choice.
response = openai_client.chat.completions.create(**openai_completion_request)
response.llm_response["choices"][0]["message"]

{'content': "One highly recommended Chinese restaurant in downtown San Francisco is **Yank Sing**. It's well-known for its dim sum and offers a variety of traditional dishes in a stylish setting. Another popular option is **R&G Lounge**, which is famous for its salt and pepper crab and other Cantonese dishes. Both restaurants are praised for their quality food and vibrant atmosphere. Be sure to check their hours and make a reservation if needed, as they can get quite busy!",
 'refusal': None,
 'role': 'assistant',
 'annotations': [],
 'audio': None,
 'function_call': None,
 'tool_calls': None}

In [7]:
# The response also reports the cost of the LLM request.
response.llm_response["cost"]

5.73e-05

In [11]:
# Create a router, passing it the base model defined earlier.
router_id = "restaurant-recommendation-router"
router = client.routers.create_router(
    router_id,
    base_model,
    description="It's a brand new router to select the best model on restaurant recommendations.",
)

In [9]:
# List all your routers, one per line.
all_routers = client.routers.list()
print("Routers:")
# sep="\n" separates the entries; embedding "\n" in each item while keeping
# print's default " " separator would indent every entry after the first
# with a stray leading space.
print(*(f"\t- {r}" for r in all_routers), sep="\n")

Routers:
	- Router(id='restaurant-recommendation-router', version=1, description="It's a brand new router to select the best model on restaurant recommendations.", createTime='2025-05-23T17:19:54.252429Z', name='organizations/68275a4af88f39d6440c1050/routers/restaurant-recommendation-router', routerSpec=None)
 	- Router(id='new-super-router-id', version=73, description="It's a new cool router", createTime='2025-05-23T15:40:14.051683Z', name='organizations/68275a4af88f39d6440c1050/routers/new-super-router-id', routerSpec=None)
 	- Router(id='new-super-router-is', version=1, description="It's a new cool router", createTime='2025-05-22T20:20:14.657673Z', name='organizations/68275a4af88f39d6440c1050/routers/new-super-router-is', routerSpec=None)
 	- Router(id='new-super-router', version=1, description="It's a new cool router", createTime='2025-05-22T20:16:53.097515Z', name='organizations/68275a4af88f39d6440c1050/routers/new-super-router', routerSpec=None)
 	- Router(id='quality-cost-router

In [12]:
# Fetch a single router by its ID.
retrieved_router = client.routers.get(router_id)
print(f"\nRetrieved router: {retrieved_router}")


Retrieved router: Router(id='restaurant-recommendation-router', version=1, description="It's a brand new router to select the best model on restaurant recommendations.", createTime='2025-05-23T17:19:54.252429Z', name='organizations/68275a4af88f39d6440c1050/routers/restaurant-recommendation-router', routerSpec={'points': [{'point': {'x': 0, 'y': 0}, 'executor': {'spec': {'executor_type': 'ModelExecutor', 'model_name': 'openai/openai/gpt-4o'}}}, {'point': {'x': 1, 'y': 1}, 'executor': {'spec': {'executor_type': 'ModelExecutor', 'model_name': 'openai/openai/gpt-4o'}}}]})


Before the router is trained, it uses the base model for inference.

To run the router:
- change the model name in the request to the router name,
- add routing constraints.

In [20]:
# Cost routing constraint, built from the inside out: value -> cost constraint -> routing constraint.
cost_value = RouterConstraints.ConstraintValue(numeric_value=0.5)
cost_limit = RouterConstraints.CostConstraint(value=cost_value)
cost_constraint = RouterConstraints.RoutingConstraint(cost_constraint=cost_limit)
cost_constraint

RoutingConstraint(cost_constraint=CostConstraint(value=ConstraintValue(numeric_value=0.5, model_name=None)), quality_constraint=None)

In [35]:
# Same request as before, but with the router name in place of the model name.
router_completion_request = {
    **openai_completion_request,
    "model": retrieved_router.name,
}

In [38]:
# The cost routing constraint travels in the request's extra body,
# under the key the SDK defines for it.
routing_extra_body = {
    RouterConstraints.ROUTING_CONSTRAINT_KEY: cost_constraint.to_dict(),
}
router_response = openai_client.chat.completions.create(
    **router_completion_request,
    extra_body=routing_extra_body,
)
router_response.llm_response["choices"][0]["message"]

{'content': "Downtown San Francisco offers a variety of excellent Chinese restaurants. Some popular choices include:\n\n1. **Hunan Home's Restaurant**: Located in Chinatown, it is renowned for its authentic Hunan and Szechuan-style dishes.\n\n2. **Chong Qing Xiao Mian**: Known for its flavorful and spicy Chongqing noodles and a wide range of traditional Sichuan dishes.\n\n3. **R&G Lounge**: Famous for its salt and pepper crab and a favorite for both locals and tourists.\n\n",
 'refusal': None,
 'role': 'assistant',
 'annotations': [],
 'audio': None,
 'function_call': None,
 'tool_calls': None}

In [39]:
# Show what the routed request cost and which model served it.
router_llm_response = router_response.llm_response
print(f"Request cost: {router_llm_response['cost']}")
print(f"Request router to model: {router_llm_response['model']}")

Request cost: 0.001045
Request router to model: gpt-4o-2024-08-06


In [None]:
# Score the router's answer with the judge retrieved earlier in the notebook.
client.judges.evaluate_judge(
    retrieved_judge,
    completion_request=router_completion_request,
    completion_response=router_response,
)

### Training router

In [None]:
# TODO