From 48a3dea9ef174ff83d182bfad1b4b9c030a4843e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:11:48 +0800
Subject: [PATCH 1/5] add logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 7963fb15c419..30078bff6511 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -16,8 +16,11 @@
 
 import torch
 
+from vllm.logger import init_logger
 from vllm.utils import is_pin_memory_available
 
+logger = init_logger(__name__)
+
 
 def find_loaded_library(lib_name) -> Optional[str]:
     """
@@ -165,6 +168,9 @@ def _python_malloc_callback(self, allocation_handle: HandleType) -> None:
         py_d_mem = allocation_handle[2]
         self.pointer_to_data[py_d_mem] = AllocationData(
             allocation_handle, self.current_tag)
+        logger.debug(
+            "Allocated %s bytes for %s with address %s from cumem allocator",
+            allocation_handle[1], self.current_tag, py_d_mem)
         return
 
     def _python_free_callback(self, ptr: int) -> HandleType:
@@ -174,6 +180,9 @@ def _python_free_callback(self, ptr: int) -> HandleType:
         data = self.pointer_to_data.pop(ptr)
         if data.cpu_backup_tensor is not None:
             data.cpu_backup_tensor = None
+        logger.debug(
+            "Freed %s bytes for %s with address %s from cumem allocator",
+            data.handle[1], data.tag, ptr)
         return data.handle
 
     def sleep(

From 172ed461fdb084337e5de907269f850de1a0358e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:19:26 +0800
Subject: [PATCH 2/5] manually free

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 30078bff6511..a49591edf241 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -276,12 +276,17 @@ def use_memory_pool(self, tag: Optional[str] = None):
             # when using pluggable allocator, see
             # https://github.com/pytorch/pytorch/issues/145168 .
             # if we have some memory allocated and then freed,
-            # the memory will not be released.
-            # right now it is fine, because we only use this allocator
-            # during weight loading and kv cache creation, where we only
-            # allocate memory.
-            # TODO: we need to find a way to release the memory,
-            # i.e. calling torch.cuda.empty_cache()
+            # the memory will not be released, e.g. in online quantization,
+            # where the model is created in higher precision, and then
+            # quantized in lower precision.
+            # Find all unused allocations and manually release them.
+            # TODO: we should expose `empty_cache` method in the memory pool.
+            # TODO: ask for help from PyTorch team to expose this method.
+            allocations = data[0].snapshot()
+            for allocation in allocations:
+                if allocation["allocated_size"] == 0:
+                    handle = self._python_free_callback(allocation["ptr"])
+                    unmap_and_release(handle)
             self.current_tag = old_tag
 
     def get_current_usage(self) -> int:

From 534f044d6eb52fdfdf32962a6511cc18f27fb994 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:22:54 +0800
Subject: [PATCH 3/5] fix

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index a49591edf241..d2db87dbfcd2 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -285,7 +285,7 @@ def use_memory_pool(self, tag: Optional[str] = None):
             allocations = data[0].snapshot()
             for allocation in allocations:
                 if allocation["allocated_size"] == 0:
-                    handle = self._python_free_callback(allocation["ptr"])
+                    handle = self._python_free_callback(allocation["address"])
                     unmap_and_release(handle)
             self.current_tag = old_tag
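The cleanup added in PATCH 2/5 and corrected in PATCH 3/5 works because `data[0]` is the memory pool yielded by `use_memory_pool_with_allocator`, and its `snapshot()` reports one entry per cached segment; a segment whose `allocated_size` is 0 no longer backs any live tensor, so its handle can be unmapped. Below is a minimal, self-contained sketch of that bookkeeping. The handle tuples, the hard-coded snapshot list, and the print-based `unmap_and_release` are illustrative stand-ins rather than vLLM's real driver calls; only the loop at the bottom mirrors the patched code.

    # Sketch of the snapshot-driven cleanup (illustration only).
    # Handles mimic vLLM's HandleType, where handle[1] is the size in
    # bytes and handle[2] is the device pointer.
    pointer_to_data = {
        0x7000: ("weights", (0, 1 << 20, 0x7000, 1 << 20)),  # freed upstream
        0x8000: ("weights", (0, 2 << 20, 0x8000, 2 << 20)),  # still live
    }

    # Stand-in for what the pool's snapshot() reports per segment.
    # PATCH 3/5 fixes the lookup key: entries are keyed by "address",
    # not "ptr".
    snapshot = [
        {"address": 0x7000, "allocated_size": 0},        # cached but unused
        {"address": 0x8000, "allocated_size": 2 << 20},  # live tensors inside
    ]

    def python_free_callback(ptr):
        # Mirrors _python_free_callback: drop the bookkeeping entry and
        # return the handle so the caller can unmap the physical memory.
        _, handle = pointer_to_data.pop(ptr)
        return handle

    def unmap_and_release(handle):
        # Stand-in for the real CUDA virtual-memory release.
        print(f"unmapped {handle[1]} bytes at {hex(handle[2])}")

    # The loop from the patch: release only segments with no live blocks.
    for allocation in snapshot:
        if allocation["allocated_size"] == 0:
            unmap_and_release(python_free_callback(allocation["address"]))

    assert 0x7000 not in pointer_to_data  # dead segment gone, live one kept

Checking `allocated_size == 0` before freeing is what makes the pass safe: a nonzero value means the segment still backs tensors the model may touch, so only fully drained cache segments are returned to the driver.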
From 8ff1b9ee807918213d979d94db1731588189958a Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:36:27 +0800
Subject: [PATCH 4/5] add more logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index d2db87dbfcd2..9aa5abd1b27c 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -206,9 +206,14 @@ def sleep(
 
         assert isinstance(offload_tags, tuple)
 
+        total_bytes = 0
+        backup_bytes = 0
+
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
+            total_bytes += handle[1]
             if data.tag in offload_tags:
+                backup_bytes += handle[1]
                 size_in_bytes = handle[1]
                 cpu_backup_tensor = torch.empty(
@@ -220,6 +225,12 @@ def sleep(
                 data.cpu_backup_tensor = cpu_backup_tensor
             unmap_and_release(handle)
 
+        logger.info(
+            "CuMemAllocator: sleep freed %s GiB memory in total, of which"
+            "%s GiB is backed up in CPU and the rest %s GiB is discarded"
+            "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
+            (total_bytes - backup_bytes) / 1024**3)
+
         gc.collect()
         torch.cuda.empty_cache()

From 2b97b4e8344e2bd8bb8748185488bd5116e87adc Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:38:35 +0800
Subject: [PATCH 5/5] add more logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 9aa5abd1b27c..af7ca6be1fca 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -226,8 +226,8 @@ def sleep(
             unmap_and_release(handle)
 
         logger.info(
-            "CuMemAllocator: sleep freed %s GiB memory in total, of which"
-            "%s GiB is backed up in CPU and the rest %s GiB is discarded"
+            "CuMemAllocator: sleep freed %.2f GiB memory in total, of which "
+            "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded "
             "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
             (total_bytes - backup_bytes) / 1024**3)
 
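PATCH 5/5 repairs two readability bugs in the log line that PATCH 4/5 introduced: adjacent Python string literals are concatenated with no separator, so the missing trailing spaces ran words together, and %s on a float prints its full repr rather than a tidy two-decimal figure. A standalone check of both behaviors, using made-up byte counts rather than values from the patch:

    # Why the format string changed: implicit literal concatenation adds
    # no spaces, and %s prints floats at full precision.
    total_bytes = 16 * 1024**3 + 12_345_678  # made-up example values
    backup_bytes = 10 * 1024**3

    broken = ("sleep freed %s GiB memory in total, of which"
              "%s GiB is backed up in CPU") % (
                  total_bytes / 1024**3, backup_bytes / 1024**3)
    fixed = ("sleep freed %.2f GiB memory in total, of which "
             "%.2f GiB is backed up in CPU") % (
                 total_bytes / 1024**3, backup_bytes / 1024**3)

    print(broken)  # "...16.011497... GiB memory in total, of which10.0 GiB..."
    print(fixed)   # "...16.01 GiB memory in total, of which 10.00 GiB..."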