You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I must completely tear down the container and start it again to get it unstuck.
If I adjust my service to be more gentle - sending just 1 request at a time, it seems to hold steady.
vllm-llava | async for res in result_generator:
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 746, in generate
vllm-llava | async for output in self._process_request(
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 859, in _process_request
vllm-llava | raise e
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 855, in _process_request
vllm-llava | async for request_output in stream:
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 90, in __anext__
vllm-llava | raise result
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 43, in _log_task_completion
vllm-llava | return_value = task.result()
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 595, in run_engine_loop
vllm-llava | result = task.result()
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 540, in engine_step
vllm-llava | request_outputs = await self.engine.step_async(virtual_engine)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 241, in step_async
vllm-llava | output = await self.model_executor.execute_model_async(
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/executor/distributed_gpu_executor.py", line 173, in execute_model_async
vllm-llava | return await self._driver_execute_model_async(execute_model_req)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/executor/multiproc_gpu_executor.py", line 160, in _driver_execute_model_async
vllm-llava | return await self.driver_exec_model(execute_model_req)
vllm-llava | File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
vllm-llava | result = self.fn(*self.args, **self.kwargs)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker_base.py", line 246, in execute_model
vllm-llava | broadcast_tensor_dict(broadcast_data, src=0)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/communication_op.py", line 32, in broadcast_tensor_dict
vllm-llava | return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 505, in broadcast_tensor_dict
vllm-llava | self.broadcast_object(metadata_list, src=src)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 382, in broadcast_object
vllm-llava | return self.shm_broadcaster.broadcast_object(obj)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py", line 266, in broadcast_object
vllm-llava | self.enqueue(obj)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py", line 248, in enqueue
vllm-llava | raise RuntimeError(
vllm-llava | RuntimeError: len(serialized_obj)=18969348 larger than the allowed value 4194304,Please increase the max_chunk_bytes parameter.
vllm-llava |
vllm-llava | The above exception was the direct cause of the following exception:
vllm-llava |
vllm-llava | Traceback (most recent call last):
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 399, in run_asgi
vllm-llava | result = await app( # type: ignore[func-returns-value]
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
vllm-llava | return await self.app(scope, receive, send)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/fastapi/applications.py", line 1054, in __call__
vllm-llava | await super().__call__(scope, receive, send)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 123, in __call__
vllm-llava | await self.middleware_stack(scope, receive, send)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 186, in __call__
vllm-llava | raise exc
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 164, in __call__
vllm-llava | await self.app(scope, receive, _send)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/base.py", line 189, in __call__
vllm-llava | with collapse_excgroups():
vllm-llava | File "/usr/lib/python3.10/contextlib.py", line 153, in __exit__
vllm-llava | self.gen.throw(typ, value, traceback)
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/_utils.py", line 93, in collapse_excgroups
vllm-llava | raise exc
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 261, in wrap
vllm-llava | await func()
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 250, in stream_response
vllm-llava | async for chunk in self.body_iterator:
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/serving_chat.py", line 334, in chat_completion_stream_generator
vllm-llava | async for res in result_generator:
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 746, in generate
vllm-llava | async for output in self._process_request(
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 845, in _process_request
vllm-llava | stream = await self.add_request(
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 654, in add_request
vllm-llava | self.start_background_loop()
vllm-llava | File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 476, in start_background_loop
vllm-llava | raise AsyncEngineDeadError(
vllm-llava | vllm.engine.async_llm_engine.AsyncEngineDeadError: Background loop has errored already.
The text was updated successfully, but these errors were encountered:
@faileon I talked to Kaichao offline and in fact this won't be an issue if you build the docker image from the main branch and serve the model from that one. This is because of the recently merged #6183.
Feel free to raise another issue if you still see any other error from the latest main branch!
Your current environment
I have the following docker compose service running vLLM and llava-hf/llava-v1.6-mistral-7b-hf
I have a service sending 5 parallel requests to the exposed
/v1/chat/completions
, which will seize it with the following error. After that, the container is stuck in a state with 5 pending requests, where it doesn't accept any new requests:
I must completely tear down the container and start it again to get it unstuck.
If I adjust my service to be more gentle - sending just 1 request at a time, it seems to hold steady.
This is an example request that I am sending:
A bit more from the stack trace:
The text was updated successfully, but these errors were encountered: