Closed
Description
It seems that similar issues (#17248, #17313) were reported and fixed a month ago (#17623); however, an invalid guided regex still crashes vLLM 0.9.0.1.
For example, start vLLM with:
vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 8192
and run:
import requests
endpoint = "http://localhost:8000/v1"
url = f"{endpoint}/chat/completions"
payload = {
"model": "meta-llama/Llama-3.2-3B-Instruct",
"messages": [
{"role": "user", "content": "write a poem"},
],
"guided_regex": "foo**",
"max_tokens": 128,
}
r = requests.post(url, json=payload)
r.raise_for_status()
print(r.json()["choices"][0]["message"]["content"])
This crashes the server with the following stack trace:
INFO 06-06 13:52:49 [async_llm.py:261] Added request chatcmpl-af33f030d2544c3a97ff066adfd9e6bd.
ERROR 06-06 13:52:49 [core.py:502] EngineCore encountered a fatal error.
ERROR 06-06 13:52:49 [core.py:502] Traceback (most recent call last):
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 493, in run_engine_core
ERROR 06-06 13:52:49 [core.py:502] engine_core.run_busy_loop()
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 520, in run_busy_loop
ERROR 06-06 13:52:49 [core.py:502] self._process_engine_step()
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 545, in _process_engine_step
ERROR 06-06 13:52:49 [core.py:502] outputs = self.step_fn()
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 225, in step
ERROR 06-06 13:52:49 [core.py:502] scheduler_output = self.scheduler.schedule()
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 336, in schedule
ERROR 06-06 13:52:49 [core.py:502] if structured_output_req and structured_output_req.grammar:
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 44, in grammar
ERROR 06-06 13:52:49 [core.py:502] completed = self._check_grammar_completion()
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 32, in _check_grammar_completion
ERROR 06-06 13:52:49 [core.py:502] self._grammar = self._grammar.result(timeout=0.0001)
ERROR 06-06 13:52:49 [core.py:502] File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result
ERROR 06-06 13:52:49 [core.py:502] return self.__get_result()
ERROR 06-06 13:52:49 [core.py:502] File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
ERROR 06-06 13:52:49 [core.py:502] raise self._exception
ERROR 06-06 13:52:49 [core.py:502] File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
ERROR 06-06 13:52:49 [core.py:502] result = self.fn(*self.args, **self.kwargs)
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 106, in _async_create_grammar
ERROR 06-06 13:52:49 [core.py:502] return self.backend.compile_grammar(request_type, grammar_spec)
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/backend_xgrammar.py", line 100, in compile_grammar
ERROR 06-06 13:52:49 [core.py:502] ctx = self.compiler.compile_regex(grammar_spec)
ERROR 06-06 13:52:49 [core.py:502] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/xgrammar/compiler.py", line 150, in compile_regex
ERROR 06-06 13:52:49 [core.py:502] return CompiledGrammar._create_from_handle(self._handle.compile_regex(regex))
ERROR 06-06 13:52:49 [core.py:502] RuntimeError: [13:52:49] /project/cpp/regex_converter.cc:73: Regex parsing error at position 5: Two consecutive repetition modifiers are not allowed.
ERROR 06-06 13:52:49 [core.py:502]
Process EngineCore_0:
Traceback (most recent call last):
File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
ERROR 06-06 13:52:49 [async_llm.py:408] AsyncLLM output_handler failed.
ERROR 06-06 13:52:49 [async_llm.py:408] Traceback (most recent call last):
ERROR 06-06 13:52:49 [async_llm.py:408] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 366, in output_handler
ERROR 06-06 13:52:49 [async_llm.py:408] outputs = await engine_core.get_output_async()
ERROR 06-06 13:52:49 [async_llm.py:408] File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 806, in get_output_async
ERROR 06-06 13:52:49 [async_llm.py:408] raise self._format_exception(outputs) from None
ERROR 06-06 13:52:49 [async_llm.py:408] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 504, in run_engine_core
raise e
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 493, in run_engine_core
engine_core.run_busy_loop()
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 520, in run_busy_loop
self._process_engine_step()
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 545, in _process_engine_step
outputs = self.step_fn()
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 225, in step
scheduler_output = self.scheduler.schedule()
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/core/sched/scheduler.py", line 336, in schedule
if structured_output_req and structured_output_req.grammar:
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 44, in grammar
completed = self._check_grammar_completion()
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/request.py", line 32, in _check_grammar_completion
self._grammar = self._grammar.result(timeout=0.0001)
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/__init__.py", line 106, in _async_create_grammar
return self.backend.compile_grammar(request_type, grammar_spec)
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/vllm/v1/structured_output/backend_xgrammar.py", line 100, in compile_grammar
ctx = self.compiler.compile_regex(grammar_spec)
File "/home/user/code/debug/.venv/lib/python3.10/site-packages/xgrammar/compiler.py", line 150, in compile_regex
return CompiledGrammar._create_from_handle(self._handle.compile_regex(regex))
INFO 06-06 13:52:49 [async_llm.py:333] Request chatcmpl-af33f030d2544c3a97ff066adfd9e6bd failed (engine dead).
RuntimeError: [13:52:49] /project/cpp/regex_converter.cc:73: Regex parsing error at position 5: Two consecutive repetition modifiers are not allowed.
INFO: 127.0.0.1:57354 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
INFO: Shutting down
INFO: Waiting for application shutdown.
INFO: Application shutdown complete.
INFO: Finished server process [211847]
It would be fine for the request to fail with a 4xx error, but instead this crashes the whole vLLM server.
This happens with the default xgrammar backend. With the guidance backend, it seems to behave correctly and just fails the request when the regex is invalid.
Thanks