Description
I am using an AWS EC2 g4dn.xlarge GPU instance, which has CUDA installed.
I am using the following code:
```python
from vllm import LLM, SamplingParams

llm = LLM(model="lmsys/vicuna-7b-v1.3")
```
When loading the model, the INFO output contains:

```
INFO 06-21 10:00:18 llm_engine.py:128] # GPU blocks: -1, # CPU blocks: 512
```

and it raises `RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]`.

When I initialize the model facebook/opt-125m as shown in the quickstart docs, it works fine, but loading Vicuna gives this error.
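If I read the log correctly, `# GPU blocks: -1` means vLLM's memory profiling decided there is no room left for the KV cache after accounting for the model weights and activations, and that negative block count is then used directly as a tensor dimension. Below is a rough back-of-the-envelope sketch of how that can happen on the 16 GB T4 in a g4dn.xlarge; the variable names and the exact formula are my assumptions for illustration, not vLLM's actual profiling code:

```python
# Back-of-the-envelope sketch of how a negative KV-cache block count can
# arise on the 16 GB T4 of a g4dn.xlarge. All names and numbers here are
# illustrative assumptions, not vLLM's actual profiling code.

GIB = 1024 ** 3

total_gpu_memory = 16 * GIB        # Tesla T4
gpu_memory_utilization = 0.90      # vLLM's default memory fraction
peak_memory = 15 * GIB             # assumed: ~13 GB of fp16 weights for a
                                   # 7B model plus activations while profiling

# Size of one 16-token cache block for vicuna-7b, matching the shapes in the
# error: key block [32, 16, 16, 8] plus value block [32, 128, 16], in fp16,
# across 32 layers.
num_layers, num_heads, head_size = 32, 32, 128
block_size = 16                    # tokens per block
dtype_bytes = 2                    # fp16
cache_block_bytes = 2 * num_layers * num_heads * head_size * block_size * dtype_bytes

num_gpu_blocks = int(
    (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_bytes
)
print(num_gpu_blocks)  # negative whenever peak_memory exceeds the budget
```

With a 7B model taking roughly 13 GB in fp16, the default memory fraction leaves essentially nothing for the cache, which would explain why facebook/opt-125m works while Vicuna does not.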
Here is the full error:
```
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 2
      1 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
----> 2 llm = LLM(model="lmsys/vicuna-7b-v1.3")

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/entrypoints/llm.py:55, in LLM.__init__(self, model, tensor_parallel_size, dtype, seed, **kwargs)
     47 kwargs["disable_log_stats"] = True
     48 engine_args = EngineArgs(
     49     model=model,
     50     tensor_parallel_size=tensor_parallel_size,
   (...)
     53     **kwargs,
     54 )
---> 55 self.llm_engine = LLMEngine.from_engine_args(engine_args)
     56 self.request_counter = Counter()

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:145, in LLMEngine.from_engine_args(cls, engine_args)
    143 distributed_init_method, devices = initialize_cluster(parallel_config)
    144 # Create the LLM engine.
--> 145 engine = cls(*engine_configs, distributed_init_method, devices,
    146              log_stats=not engine_args.disable_log_stats)
    147 return engine

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:102, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, stage_devices, log_stats)
    100     self.workers.append(worker)
    101 # Profile the memory usage and initialize the cache.
--> 102 self._init_cache()
    104 # Create the scheduler.
    105 self.scheduler = Scheduler(scheduler_config, cache_config, log_stats)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:134, in LLMEngine._init_cache(self)
    131 self.cache_config.num_cpu_blocks = num_cpu_blocks
    133 # Initialize the cache.
--> 134 self._run_workers("init_cache_engine", cache_config=self.cache_config)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:307, in LLMEngine._run_workers(self, method, get_all_outputs, *args, **kwargs)
    304 if self.parallel_config.worker_use_ray:
    305     executor = executor.remote
--> 307 output = executor(*args, **kwargs)
    308 all_outputs.append(output)
    310 if self.parallel_config.worker_use_ray:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/worker.py:126, in Worker.init_cache_engine(self, cache_config)
    124 self.cache_config = cache_config
    125 self.block_size = cache_config.block_size
--> 126 self.cache_engine = CacheEngine(
    127     self.cache_config, self.model_config, self.parallel_config)
    128 self.cache_events = self.cache_engine.events
    129 self.gpu_cache = self.cache_engine.gpu_cache

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:40, in CacheEngine.__init__(self, cache_config, model_config, parallel_config)
     37 self.num_cpu_blocks = cache_config.num_cpu_blocks
     39 # Initialize the cache.
---> 40 self.gpu_cache = self.allocate_gpu_cache()
     41 self.cpu_cache = self.allocate_cpu_cache()
     43 # Initialize the stream for caching operations.

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:71, in CacheEngine.allocate_gpu_cache(self)
     69 value_block_shape = self.get_value_block_shape()
     70 for _ in range(self.num_layers):
---> 71     key_blocks = torch.empty(
     72         size=(self.num_gpu_blocks, *key_block_shape),
     73         dtype=self.dtype,
     74         device="cuda",
     75     )
     76     value_blocks = torch.empty(
     77         size=(self.num_gpu_blocks, *value_block_shape),
     78         dtype=self.dtype,
     79         device="cuda",
     80     )
     81     gpu_cache.append((key_blocks, value_blocks))

RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]
```
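In case it helps with triage, here is a quick way to see how much GPU memory is actually free before loading, plus the knob I would try first. `torch.cuda.mem_get_info` is a standard PyTorch call; treating `gpu_memory_utilization=0.95` as a sufficient workaround for a 7B model on this card is only my assumption, not something I have verified:

```python
import torch
from vllm import LLM

# Check how much GPU memory is actually free before loading anything.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"free: {free_bytes / 1024**3:.1f} GiB / total: {total_bytes / 1024**3:.1f} GiB")

# Assumption on my part: letting vLLM use a larger share of the T4's 16 GB
# might leave enough headroom for the KV cache. gpu_memory_utilization is an
# existing engine argument (default 0.90); I have not verified that 0.95 is
# actually enough for a 7B model on this card.
llm = LLM(model="lmsys/vicuna-7b-v1.3", gpu_memory_utilization=0.95)
```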