diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3668dd7ee37f..c4aea06ba12f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -127,6 +127,12 @@ def _init_cache(self) -> None:
         # FIXME(woosuk): Change to debug log.
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
+
+        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
+                             "initializing the engine.")
+
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index ebb5c19df0ad..384ca020985d 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -53,6 +53,7 @@ class RequestOutput:
         prompt: The prompt string of the request.
         prompt_token_ids: The token IDs of the prompt.
        outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
     """
 
     def __init__(
         self,
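
Not part of the diff: the new check points users at `gpu_memory_utilization` when no cache blocks can be allocated. A minimal sketch of how that knob might be raised when constructing the engine, assuming the `vllm.LLM` entry point and an illustrative model name:

```python
from vllm import LLM

# Give the KV cache a larger share of GPU memory so that the engine
# computes num_gpu_blocks > 0 and the ValueError above is not raised.
# The model name and the 0.95 value are illustrative, not taken from the diff.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.95)
```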