Description
I am using an AWS EC2 g4dn.xlarge GPU instance, which has CUDA installed.
I am using the following code:
```python
from vllm import LLM, SamplingParams

llm = LLM(model="lmsys/vicuna-7b-v1.3")
```
When loading the model, the INFO output contains:

```
INFO 06-21 10:00:18 llm_engine.py:128] # GPU blocks: -1, # CPU blocks: 512
```

and it raises `RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]`.

When I initialize the model facebook/opt-125m as shown in the quickstart docs, it works fine, but loading Vicuna gives this error.
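If I read the log correctly, `# GPU blocks: -1` means vLLM's memory profiling decided there is no room left for the KV cache after accounting for the model weights and activations, and that negative block count is then used directly as a tensor dimension. Below is a rough back-of-the-envelope sketch of how that can happen on the 16 GB T4 in a g4dn.xlarge; the variable names and the exact formula are my assumptions for illustration, not vLLM's actual profiling code:

```python
# Back-of-the-envelope sketch of how a negative KV-cache block count can
# arise on the 16 GB T4 of a g4dn.xlarge. All names and numbers here are
# illustrative assumptions, not vLLM's actual profiling code.

GIB = 1024 ** 3

total_gpu_memory = 16 * GIB        # Tesla T4
gpu_memory_utilization = 0.90      # vLLM's default memory fraction
peak_memory = 15 * GIB             # assumed: ~13 GB of fp16 weights for a
                                   # 7B model plus activations while profiling

# Size of one 16-token cache block for vicuna-7b, matching the shapes in the
# error: key block [32, 16, 16, 8] plus value block [32, 128, 16], in fp16,
# across 32 layers.
num_layers, num_heads, head_size = 32, 32, 128
block_size = 16                    # tokens per block
dtype_bytes = 2                    # fp16
cache_block_bytes = 2 * num_layers * num_heads * head_size * block_size * dtype_bytes

num_gpu_blocks = int(
    (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_bytes
)
print(num_gpu_blocks)  # negative whenever peak_memory exceeds the budget
```

With a 7B model taking roughly 13 GB in fp16, the default memory fraction leaves essentially nothing for the cache, which would explain why facebook/opt-125m works while Vicuna does not.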
Here is the full error:
```
RuntimeError                              Traceback (most recent call last)
Cell In[3], line 2
      1 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
----> 2 llm = LLM(model="lmsys/vicuna-7b-v1.3")

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/entrypoints/llm.py:55, in LLM.__init__(self, model, tensor_parallel_size, dtype, seed, **kwargs)
     47 kwargs["disable_log_stats"] = True
     48 engine_args = EngineArgs(
     49     model=model,
     50     tensor_parallel_size=tensor_parallel_size,
   (...)
     53     **kwargs,
     54 )
---> 55 self.llm_engine = LLMEngine.from_engine_args(engine_args)
     56 self.request_counter = Counter()

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:145, in LLMEngine.from_engine_args(cls, engine_args)
    143 distributed_init_method, devices = initialize_cluster(parallel_config)
    144 # Create the LLM engine.
--> 145 engine = cls(*engine_configs, distributed_init_method, devices,
    146              log_stats=not engine_args.disable_log_stats)
    147 return engine

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:102, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, stage_devices, log_stats)
    100     self.workers.append(worker)
    101 # Profile the memory usage and initialize the cache.
--> 102 self._init_cache()
    104 # Create the scheduler.
    105 self.scheduler = Scheduler(scheduler_config, cache_config, log_stats)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:134, in LLMEngine._init_cache(self)
    131 self.cache_config.num_cpu_blocks = num_cpu_blocks
    133 # Initialize the cache.
--> 134 self._run_workers("init_cache_engine", cache_config=self.cache_config)

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/engine/llm_engine.py:307, in LLMEngine._run_workers(self, method, get_all_outputs, *args, **kwargs)
    304 if self.parallel_config.worker_use_ray:
    305     executor = executor.remote
--> 307 output = executor(*args, **kwargs)
    308 all_outputs.append(output)
    310 if self.parallel_config.worker_use_ray:

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/worker.py:126, in Worker.init_cache_engine(self, cache_config)
    124 self.cache_config = cache_config
    125 self.block_size = cache_config.block_size
--> 126 self.cache_engine = CacheEngine(
    127     self.cache_config, self.model_config, self.parallel_config)
    128 self.cache_events = self.cache_engine.events
    129 self.gpu_cache = self.cache_engine.gpu_cache

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:40, in CacheEngine.__init__(self, cache_config, model_config, parallel_config)
     37 self.num_cpu_blocks = cache_config.num_cpu_blocks
     39 # Initialize the cache.
---> 40 self.gpu_cache = self.allocate_gpu_cache()
     41 self.cpu_cache = self.allocate_cpu_cache()
     43 # Initialize the stream for caching operations.

File /opt/conda/envs/textgen/lib/python3.10/site-packages/vllm/worker/cache_engine.py:71, in CacheEngine.allocate_gpu_cache(self)
     69 value_block_shape = self.get_value_block_shape()
     70 for _ in range(self.num_layers):
---> 71     key_blocks = torch.empty(
     72         size=(self.num_gpu_blocks, *key_block_shape),
     73         dtype=self.dtype,
     74         device="cuda",
     75     )
     76     value_blocks = torch.empty(
     77         size=(self.num_gpu_blocks, *value_block_shape),
     78         dtype=self.dtype,
     79         device="cuda",
     80     )
     81     gpu_cache.append((key_blocks, value_blocks))

RuntimeError: Trying to create tensor with negative dimension -1: [-1, 32, 16, 16, 8]
```
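In case it helps with triage, here is a quick way to see how much GPU memory is actually free before loading, plus the knob I would try first. `torch.cuda.mem_get_info` is a standard PyTorch call; treating `gpu_memory_utilization=0.95` as a sufficient workaround for a 7B model on this card is only my assumption, not something I have verified:

```python
import torch
from vllm import LLM

# Check how much GPU memory is actually free before loading anything.
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"free: {free_bytes / 1024**3:.1f} GiB / total: {total_bytes / 1024**3:.1f} GiB")

# Assumption on my part: letting vLLM use a larger share of the T4's 16 GB
# might leave enough headroom for the KV cache. gpu_memory_utilization is an
# existing engine argument (default 0.90); I have not verified that 0.95 is
# actually enough for a 7B model on this card.
llm = LLM(model="lmsys/vicuna-7b-v1.3", gpu_memory_utilization=0.95)
```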