Description
System Info
- CPU: Intel Core i7-14700K
- GPU: NVIDIA RTX 4090
- tensorrt_llm: 0.13
- Docker image: tritonserver:24.09-trtllm-python-py3
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Request:
POST /v1/chat/completions HTTP/1.1
{
"model": "tensorrt_llm_bls",
"max_tokens": 80,
"temperature": 0.8,
"top_p": 1,
"presence_penalty": 0.5,
"frequency_penalty": 0.5,
"messages": [
{
"role": "system",
"content": "xxx"
},
{
"role": "user",
"content": "yyy"
},
{
"role": "assistant",
"content": "lulu:"
}
],
"stop": [
"\n\n",
"User:",
"<|",
"***"
],
"stream": true
}
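To make the reproduction copy-pasteable, the same request can be replayed with a short Python script. This is a minimal sketch: the endpoint http://localhost:9000/v1/chat/completions and the use of the requests library are illustrative assumptions; only the JSON body above comes from the report.

import requests

# Hypothetical frontend endpoint; adjust host/port to your deployment.
URL = "http://localhost:9000/v1/chat/completions"

payload = {
    "model": "tensorrt_llm_bls",
    "max_tokens": 80,
    "temperature": 0.8,
    "top_p": 1,
    "presence_penalty": 0.5,
    "frequency_penalty": 0.5,
    "messages": [
        {"role": "system", "content": "xxx"},
        {"role": "user", "content": "yyy"},
        {"role": "assistant", "content": "lulu:"},
    ],
    "stop": ["\n\n", "User:", "<|", "***"],
    "stream": True,
}

# stream=True keeps the HTTP connection open so SSE chunks can be printed as they arrive.
with requests.post(URL, json=payload, stream=True) as resp:
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))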
Expected behavior
The answer should be returned in streaming (chunked) format, as requested with "stream": true.
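For context, a successful streaming response would arrive as a sequence of server-sent events, each carrying an OpenAI-style chat.completion.chunk object, roughly of this shape (illustrative, not captured output):

data: {"object": "chat.completion.chunk", "model": "tensorrt_llm_bls", "choices": [{"index": 0, "delta": {"content": "..."}, "finish_reason": null}]}
data: [DONE]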
Actual behavior
E1020 03:43:08.358547 5558 model.py:120] Traceback (most recent call last):
  File "/llm/tensorrt_llm/model_repo/tensorrt_llm_bls/1/model.py", line 108, in execute
    for res in res_gen:
  File "/llm/tensorrt_llm/model_repo/tensorrt_llm_bls/1/lib/decode.py", line 220, in decode
    gen_response = self._generate_non_streaming(
  File "/llm/tensorrt_llm/model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py", line 347, in _generate_non_streaming
    r = self._exec_triton_request_single(triton_req)
  File "/llm/tensorrt_llm/model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py", line 146, in _exec_triton_request_single
    raise pb_utils.TritonModelException(responses.error().message())
c_python_backend_utils.TritonModelException: Streaming is only supported if model is deployed using decoupled mode. (/tmp/tritonbuild/tensorrtllm/inflight_batcher_llm/src/utils.cc:617)
1 0x7f7bdd54e194 /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so(+0x12194) [0x7f7bdd54e194]
2 0x7f7bdd5575c9 /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so(+0x1b5c9) [0x7f7bdd5575c9]
3 0x7f7bdd562c6b /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so(+0x26c6b) [0x7f7bdd562c6b]
4 0x7f7bdd551745 TRITONBACKEND_ModelInstanceExecute + 101
5 0x7f7c519c90d4 /opt/tritonserver/lib/libtritonserver.so(+0x1a70d4) [0x7f7c519c90d4]
6 0x7f7c519c944b /opt/tritonserver/lib/libtritonserver.so(+0x1a744b) [0x7f7c519c944b]
7 0x7f7c51ae7ccd /opt/tritonserver/lib/libtritonserver.so(+0x2c5ccd) [0x7f7c51ae7ccd]
8 0x7f7c519cd884 /opt/tritonserver/lib/libtritonserver.so(+0x1ab884) [0x7f7c519cd884]
9 0x7f7c516d2253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7f7c516d2253]
10 0x7f7c80fb7ac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7f7c80fb7ac3]
11 0x7f7c81049850 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x126850) [0x7f7c81049850]
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 265, in call
await wrap(partial(self.listen_for_disconnect, receive))
File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 261, in wrap
await func()
File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 238, in listen_for_disconnect
message = await receive()
File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 555, in receive
await self.message_event.wait()
File "/usr/lib/python3.10/asyncio/locks.py", line 214, in wait
await fut
asyncio.exceptions.CancelledError: Cancelled by cancel scope 7f7c60eb5ea0
During handling of the above exception, another exception occurred:
+ Exception Group Traceback (most recent call last):
| File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 401, in run_asgi
| result = await app( # type: ignore[func-returns-value]
| File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in call
| return await self.app(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/fastapi/applications.py", line 1054, in call
| await super().call(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 123, in call
| await self.middleware_stack(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 186, in call
| raise exc
| File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 164, in call
| await self.app(scope, receive, _send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/cors.py", line 85, in call
| await self.app(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/exceptions.py", line 65, in call
| await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
| raise exc
| File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
| await app(scope, receive, sender)
| File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 756, in call
| await self.middleware_stack(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 776, in app
| await route.handle(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 297, in handle
| await self.app(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 77, in app
| await wrap_app_handling_exceptions(app, request)(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
| raise exc
| File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
| await app(scope, receive, sender)
| File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 75, in app
| await response(scope, receive, send)
| File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 258, in call
| async with anyio.create_task_group() as task_group:
| File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 736, in aexit
| raise BaseExceptionGroup(
| exceptiongroup.ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
| File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 261, in wrap
| await func()
| File "/usr/local/lib/python3.10/dist-packages/starlette/responses.py", line 250, in stream_response
| async for chunk in self.body_iterator:
| File "/llm/openai/openai_frontend/engine/triton_engine.py", line 339, in _streaming_chat_iterator
| chunk = self._get_nth_streaming_chat_response(
| File "/usr/local/lib/python3.10/dist-packages/tritonserver/_api/_response.py", line 148, in anext
| raise response.error
| File "/usr/local/lib/python3.10/dist-packages/tritonserver/_api/_response.py", line 378, in _from_tritonserver_inference_response
| response.throw_if_response_error()
| tritonserver.InternalError: ('Traceback (most recent call last): ... c_python_backend_utils.TritonModelException: Streaming is only supported if model is deployed using decoupled mode. (/tmp/tritonbuild/tensorrtllm/inflight_batcher_llm/src/utils.cc:617) ...', InferenceResponse(model={'name': 'tensorrt_llm_bls', 'version': 1, 'state': None}, request_id='', parameters={}, outputs={}, error=InternalError(...), classification_label=None, final=True))
|   (the elided traceback inside the error string is identical to the E1020 log above)
+------------------------------------
INFO: 172.17.0.1:44170 - "POST /v1/chat/completions HTTP/1.1" 200 OK
E1020 03:43:08.442689 5558 model.py:120] (second request, same TritonModelException as above: "Streaming is only supported if model is deployed using decoupled mode.")
ERROR: Exception in ASGI application
(traceback identical to the first occurrence, apart from the cancel scope id 7f7c616ef5e0)
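The TritonModelException above names the root cause: the request asks for "stream": true, but the model is not deployed with a decoupled transaction policy. As a point of reference (this is an inference from the error message, not a confirmed fix from this report), streaming in the tensorrtllm_backend templates is gated by the model_transaction_policy stanza in the config.pbtxt of the involved models (tensorrt_llm and tensorrt_llm_bls), along the lines of:

model_transaction_policy {
  decoupled: true
}

In the stock model repository this value is typically filled in via tools/fill_template.py (e.g. through a decoupled_mode template variable); the exact variable name is an assumption here and may differ across tensorrtllm_backend versions.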
Additional notes
None