From 5abd79691a4d1cab59cdfac73dc6b6c21d0113c8 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 20 May 2026 16:47:38 +0000 Subject: [PATCH] feat(cli): add eval compare bias correction flag Co-authored-by: Blaine Kasten --- src/together/lib/cli/api/evals/create.py | 8 +++++ src/together/lib/cli/utils/_help_examples.py | 3 +- tests/cli/test_evals.py | 38 ++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/together/lib/cli/api/evals/create.py b/src/together/lib/cli/api/evals/create.py index 400397744..345794160 100644 --- a/src/together/lib/cli/api/evals/create.py +++ b/src/together/lib/cli/api/evals/create.py @@ -77,6 +77,13 @@ async def create( pass_threshold: Annotated[ Optional[float], Parameter(help="Threshold for passing (required for score type)") ] = None, + disable_position_bias_correction: Annotated[ + bool, + Parameter( + negative=(), + help="For compare evals, run only the original-order judge pass without position-bias correction", + ), + ] = False, model_a_field: Annotated[ Optional[str], Parameter( @@ -274,6 +281,7 @@ async def create( parameters=ParametersEvaluationCompareParameters( input_data_file_path=training_file, judge=judge_config, + disable_position_bias_correction=disable_position_bias_correction, model_a=cast(ParametersEvaluationCompareParametersModelAEvaluationModelRequest, model_a_final), model_b=cast(ParametersEvaluationCompareParametersModelBEvaluationModelRequest, model_b_final), ), diff --git a/src/together/lib/cli/utils/_help_examples.py b/src/together/lib/cli/utils/_help_examples.py index 817858769..2bbe64ea8 100644 --- a/src/together/lib/cli/utils/_help_examples.py +++ b/src/together/lib/cli/utils/_help_examples.py @@ -246,7 +246,8 @@ --model-b deepseek-ai/DeepSeek-V3.1 \\ --model-b-source serverless \\ --model-b-system-template "You are a concise assistant." \\ - --model-b-input-template $'Answer the following:\\n\\n{{prompt}}'[/primary] + --model-b-input-template $'Answer the following:\\n\\n{{prompt}}' \\ + --disable-position-bias-correction[/primary] """ ## Beta clusters API commands diff --git a/tests/cli/test_evals.py b/tests/cli/test_evals.py index 319a95a61..0bc318f17 100644 --- a/tests/cli/test_evals.py +++ b/tests/cli/test_evals.py @@ -59,3 +59,41 @@ def test_status(self, respx_mock: MockRouter, cli_runner: CliRunner) -> None: result = cli_runner.invoke(["evals", "status", "eval-wf-1"]) assert result.exit_code == 0 assert "Status: completed" in result.output + + +class TestEvalsCreate: + @pytest.mark.respx(base_url=base_url) + def test_compare_passes_disable_position_bias_correction( + self, respx_mock: MockRouter, cli_runner: CliRunner + ) -> None: + route = respx_mock.post("/evaluation").mock( + return_value=httpx.Response(200, json={"workflow_id": "eval-wf-1", "status": "pending"}) + ) + + result = cli_runner.invoke( + [ + "evals", + "create", + "--type", + "compare", + "--judge-model", + "Qwen/Qwen3.5-9B", + "--judge-model-source", + "serverless", + "--judge-system-template", + "Choose the better response.", + "--input-data-file-path", + "file-123", + "--model-a-field", + "response_a", + "--model-b-field", + "response_b", + "--disable-position-bias-correction", + ] + ) + + assert result.exit_code == 0 + req = cast(Call, route.calls[0]).request + payload = json.loads(req.content) + assert payload["type"] == "compare" + assert payload["parameters"]["disable_position_bias_correction"] is True