Skip to content

GPU blocks: -1 #184

@cemalgndzz

Description

@cemalgndzz

I am using an AWS EC2 'g4dn.xlarge' GPU machine, which has CUDA installed.

I am using following code:
from vllm import LLM, SamplingParams
llm = LLM(model="lmsys/vicuna-7b-v1.3")

When loading the model, the info log outputs:
INFO 06-21 10:00:18 llm_engine.py:128] # GPU blocks: -1, # CPU blocks: 512
and it raises a runtime error: RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]

When I try to initialize the model facebook/opt-125m as shown in the quickstart doc, it works fine, but when I try to load Vicuna it gives an error.
Here is the full error:


RuntimeError Traceback (most recent call last)
Cell In[3], line 2
1 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
----> 2 llm = LLM(model="lmsys/vicuna-7b-v1.3")

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/entrypoints/llm.py:55, in LLM.init(self, model, tensor_parallel_size, dtype, seed, **kwargs)
47 kwargs["disable_log_stats"] = True
48 engine_args = EngineArgs(
49 model=model,
50 tensor_parallel_size=tensor_parallel_size,
(...)
53 **kwargs,
54 )
---> 55 self.llm_engine = LLMEngine.from_engine_args(engine_args)
56 self.request_counter = Counter()

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:145, in LLMEngine.from_engine_args(cls, engine_args)
143 distributed_init_method, devices = initialize_cluster(parallel_config)
144 # Create the LLM engine.
--> 145 engine = cls(*engine_configs, distributed_init_method, devices,
146 log_stats=not engine_args.disable_log_stats)
147 return engine

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:102, in LLMEngine.init(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, stage_devices, log_stats)
100 self.workers.append(worker)
101 # Profile the memory usage and initialize the cache.
--> 102 self._init_cache()
104 # Create the scheduler.
105 self.scheduler = Scheduler(scheduler_config, cache_config, log_stats)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:134, in LLMEngine._init_cache(self)
131 self.cache_config.num_cpu_blocks = num_cpu_blocks
133 # Initialize the cache.
--> 134 self._run_workers("init_cache_engine", cache_config=self.cache_config)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:307, in LLMEngine._run_workers(self, method, get_all_outputs, *args, **kwargs)
304 if self.parallel_config.worker_use_ray:
305 executor = executor.remote
--> 307 output = executor(*args, **kwargs)
308 all_outputs.append(output)
310 if self.parallel_config.worker_use_ray:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/worker.py:126, in Worker.init_cache_engine(self, cache_config)
124 self.cache_config = cache_config
125 self.block_size = cache_config.block_size
--> 126 self.cache_engine = CacheEngine(
127 self.cache_config, self.model_config, self.parallel_config)
128 self.cache_events = self.cache_engine.events
129 self.gpu_cache = self.cache_engine.gpu_cache

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:40, in CacheEngine.init(self, cache_config, model_config, parallel_config)
37 self.num_cpu_blocks = cache_config.num_cpu_blocks
39 # Initialize the cache.
---> 40 self.gpu_cache = self.allocate_gpu_cache()
41 self.cpu_cache = self.allocate_cpu_cache()
43 # Initialize the stream for caching operations.

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:71, in CacheEngine.allocate_gpu_cache(self)
69 value_block_shape = self.get_value_block_shape()
70 for _ in range(self.num_layers):
---> 71 key_blocks = torch.empty(
72 size=(self.num_gpu_blocks, *key_block_shape),
73 dtype=self.dtype,
74 device="cuda",
75 )
76 value_blocks = torch.empty(
77 size=(self.num_gpu_blocks, *value_block_shape),
78 dtype=self.dtype,
79 device="cuda",
80 )
81 gpu_cache.append((key_blocks, value_blocks))

RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions