Skip to content

Commit b935fa4

Browse files
authored
Bump SGLang on Modal to fix flaky e2e tests (#4009)
* Bump SGLang on Modal to fix flaky e2e tests

  SGLang now emits many duplicate tool calls, instead of not emitting any tool calls at all. I've adjusted our tests to allow this for sglang. When running locally, the tests almost always pass on the first try (with the occasional retry), instead of the ~9 retries we were seeing on our daily cron job.

* Fix clippy

* Change warmup URL
1 parent d9d45b5 commit b935fa4

File tree

6 files changed

+91
-41
lines changed

6 files changed

+91
-41
lines changed

.github/workflows/merge-queue.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ jobs:
8383
- name: Warm up Modal instances
8484
run: |
8585
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-inference-vllm-inference.modal.run/docs > vllm_modal_logs.txt &
86-
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
86+
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ > sglang_modal_logs.txt &
8787
# TODO: Re-enable once we can switch to a T4 GPU
8888
# curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" https://tensorzero--vllm-gpt-oss-20b-serve.modal.run/ > vllm_gpt_oss_modal_logs.txt &
8989

ci/buildkite/modal-warmup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" \
2323

2424
echo "Warming up SGLang Modal instance..."
2525
curl -H "Modal-Key: $MODAL_KEY" -H "Modal-Secret: $MODAL_SECRET" \
26-
https://tensorzero--sglang-inference-sglang-inference.modal.run/ \
26+
https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/ \
2727
> sglang_modal_logs.txt &
2828

2929
echo "Waiting for warmup requests to complete..."

tensorzero-core/fixtures/deployment/sgl-modal/sgl_inference.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77

88
sgl_image = (
99
modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
10-
.pip_install("sglang[all]>=0.4.6.post1", "huggingface_hub[hf_transfer]==0.30.2")
10+
# This is the last version of sglang that supports SM75 GPUs (e.g. the Nvidia T4)
11+
.pip_install("sglang[all]==0.4.10.post2", "huggingface_hub[hf_transfer]==0.34")
12+
.apt_install("numactl")
1113
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
1214
)
1315

@@ -20,7 +22,7 @@
2022
sglang_cache_vol = modal.Volume.from_name("sglang-cache", create_if_missing=True)
2123

2224
N_GPU = 1
23-
app = modal.App(name="sglang-inference")
25+
app = modal.App(name="sglang-0.4.10-inference")
2426

2527

2628
@app.function(
@@ -47,6 +49,8 @@ def sglang_inference():
4749
"python",
4850
"-m",
4951
"sglang.launch_server",
52+
# This prevents the container from OOMing on startup
53+
"--disable-cuda-graph",
5054
"--model-path",
5155
MODEL_NAME,
5256
"--tool-call-parser",

tensorzero-core/tests/e2e/providers/common.rs

Lines changed: 80 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7010,11 +7010,14 @@ pub async fn check_tool_use_tool_choice_specific_inference_response(
70107010
.collect();
70117011

70127012
// Assert at most one tool call (a model could decide to call no tools if it reads the `self_destruct` description).
7013-
assert!(
7014-
tool_call_blocks.len() <= 1,
7015-
"Expected at most one tool call, found {}",
7016-
tool_call_blocks.len()
7017-
);
7013+
// Sglang likes to emit lots of tool calls
7014+
if provider.model_provider_name != "sglang" {
7015+
assert!(
7016+
tool_call_blocks.len() <= 1,
7017+
"Expected at most one tool call, found {}",
7018+
tool_call_blocks.len()
7019+
);
7020+
}
70187021

70197022
let tool_call_block = tool_call_blocks.first();
70207023
match tool_call_block {
@@ -7143,10 +7146,19 @@ pub async fn test_tool_use_tool_choice_specific_streaming_inference_request_with
71437146
let block_tool_id = block.get("id").unwrap().as_str().unwrap();
71447147
match &tool_id {
71457148
None => tool_id = Some(block_tool_id.to_string()),
7146-
Some(tool_id) => assert_eq!(
7147-
tool_id, block_tool_id,
7148-
"Provider returned multiple tool calls"
7149-
),
7149+
Some(tool_id) => {
7150+
if provider.model_provider_name == "sglang" {
7151+
// Sglang likes to emit lots of duplicate tool calls
7152+
if tool_id != block_tool_id {
7153+
continue;
7154+
}
7155+
} else {
7156+
assert_eq!(
7157+
tool_id, block_tool_id,
7158+
"Provider returned multiple tool calls"
7159+
);
7160+
}
7161+
}
71507162
}
71517163

71527164
let chunk_arguments = block.get("raw_arguments").unwrap().as_str().unwrap();
@@ -7417,11 +7429,14 @@ pub async fn test_tool_use_tool_choice_specific_streaming_inference_request_with
74177429
.collect();
74187430

74197431
// Assert at most one tool call (a model could decide to call no tools if it reads the `self_destruct` description).
7420-
assert!(
7421-
tool_call_blocks.len() <= 1,
7422-
"Expected at most one tool call, found {}",
7423-
tool_call_blocks.len()
7424-
);
7432+
// Sglang likes to emit lots of tool calls
7433+
if provider.model_provider_name != "sglang" {
7434+
assert!(
7435+
tool_call_blocks.len() <= 1,
7436+
"Expected at most one tool call, found {}",
7437+
tool_call_blocks.len()
7438+
);
7439+
}
74257440

74267441
let tool_call_block = tool_call_blocks.first();
74277442
match tool_call_block {
@@ -7683,16 +7698,25 @@ pub async fn check_tool_use_tool_choice_allowed_tools_inference_response(
76837698
.filter(|block| matches!(block, StoredContentBlock::ToolCall(_)))
76847699
.collect();
76857700

7686-
// Assert exactly one tool call
7687-
assert_eq!(tool_call_blocks.len(), 1, "Expected exactly one tool call");
7701+
if provider.model_provider_name == "sglang" {
7702+
// Sglang likes to emit lots of duplicate tool calls
7703+
assert!(
7704+
!tool_call_blocks.is_empty(),
7705+
"Expected at least one tool call"
7706+
);
7707+
} else {
7708+
// Assert exactly one tool call
7709+
assert_eq!(tool_call_blocks.len(), 1, "Expected exactly one tool call");
7710+
}
76887711

7689-
let tool_call_block = tool_call_blocks[0];
7690-
match tool_call_block {
7691-
StoredContentBlock::ToolCall(tool_call) => {
7692-
assert_eq!(tool_call.name, "get_humidity");
7693-
serde_json::from_str::<Value>(&tool_call.arguments.to_lowercase()).unwrap();
7712+
for tool_call_block in tool_call_blocks {
7713+
match tool_call_block {
7714+
StoredContentBlock::ToolCall(tool_call) => {
7715+
assert_eq!(tool_call.name, "get_humidity");
7716+
serde_json::from_str::<Value>(&tool_call.arguments.to_lowercase()).unwrap();
7717+
}
7718+
_ => panic!("Unreachable"),
76947719
}
7695-
_ => panic!("Unreachable"),
76967720
}
76977721
}
76987722

@@ -7789,10 +7813,13 @@ pub async fn test_tool_use_allowed_tools_streaming_inference_request_with_provid
77897813
if let Some(block_raw_name) = block.get("raw_name") {
77907814
match tool_name {
77917815
Some(_) => {
7792-
assert!(
7793-
block_raw_name.as_str().unwrap().is_empty(),
7794-
"Raw name already seen, got {block:#?}"
7795-
);
7816+
// Sglang likes to emit lots of duplicate tool calls
7817+
if provider.model_provider_name != "sglang" {
7818+
assert!(
7819+
block_raw_name.as_str().unwrap().is_empty(),
7820+
"Raw name already seen, got {block:#?}"
7821+
);
7822+
}
77967823
}
77977824
None => {
77987825
tool_name = Some(block_raw_name.as_str().unwrap().to_string());
@@ -7803,7 +7830,16 @@ pub async fn test_tool_use_allowed_tools_streaming_inference_request_with_provid
78037830
let block_tool_id = block.get("id").unwrap().as_str().unwrap();
78047831
match &tool_id {
78057832
None => tool_id = Some(block_tool_id.to_string()),
7806-
Some(tool_id) => assert_eq!(tool_id, block_tool_id),
7833+
Some(tool_id) => {
7834+
if provider.model_provider_name == "sglang" {
7835+
// Sglang likes to emit lots of duplicate tool calls
7836+
if tool_id != block_tool_id {
7837+
continue;
7838+
}
7839+
} else {
7840+
assert_eq!(tool_id, block_tool_id);
7841+
}
7842+
}
78077843
}
78087844

78097845
let chunk_arguments = block.get("raw_arguments").unwrap().as_str().unwrap();
@@ -8035,16 +8071,25 @@ pub async fn test_tool_use_allowed_tools_streaming_inference_request_with_provid
80358071
.filter(|block| matches!(block, StoredContentBlock::ToolCall(_)))
80368072
.collect();
80378073

8038-
// Assert exactly one tool call
8039-
assert_eq!(tool_call_blocks.len(), 1, "Expected exactly one tool call");
8074+
// Sglang likes to emit lots of tool calls
8075+
if provider.model_provider_name == "sglang" {
8076+
assert!(
8077+
!tool_call_blocks.is_empty(),
8078+
"Expected at least one tool call"
8079+
);
8080+
} else {
8081+
// Assert exactly one tool call
8082+
assert_eq!(tool_call_blocks.len(), 1, "Expected exactly one tool call");
8083+
}
80408084

8041-
let tool_call_block = tool_call_blocks[0];
8042-
match tool_call_block {
8043-
StoredContentBlock::ToolCall(tool_call) => {
8044-
assert_eq!(tool_call.name, "get_humidity");
8045-
serde_json::from_str::<Value>(&tool_call.arguments.to_lowercase()).unwrap();
8085+
for tool_call_block in tool_call_blocks {
8086+
match tool_call_block {
8087+
StoredContentBlock::ToolCall(tool_call) => {
8088+
assert_eq!(tool_call.name, "get_humidity");
8089+
serde_json::from_str::<Value>(&tool_call.arguments.to_lowercase()).unwrap();
8090+
}
8091+
_ => panic!("Unreachable"),
80468092
}
8047-
_ => panic!("Unreachable"),
80488093
}
80498094
}
80508095

tensorzero-core/tests/e2e/providers/sglang.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ async fn get_providers() -> E2ETestProviders {
6464
),
6565
(
6666
"api_base".to_string(),
67-
"https://tensorzero--sglang-inference-sglang-inference.modal.run/v1/".to_string(),
67+
"https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/v1/"
68+
.to_string(),
6869
),
6970
]),
7071
use_modal_headers: true,

tensorzero-core/tests/e2e/tensorzero.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ routing = ["sglang"]
403403
[models."Qwen/Qwen2.5-1.5B-Instruct".providers.sglang]
404404
type = "sglang"
405405
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
406-
api_base = "https://tensorzero--sglang-inference-sglang-inference.modal.run/v1/"
406+
api_base = "https://tensorzero--sglang-0-4-10-inference-sglang-inference.modal.run/v1/"
407407

408408
[models."qwen2.5-0.5b-instruct-vllm"]
409409
routing = ["vllm"]

0 commit comments

Comments
 (0)