From 48a3dea9ef174ff83d182bfad1b4b9c030a4843e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:11:48 +0800
Subject: [PATCH 1/5] add logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 7963fb15c419..30078bff6511 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -16,8 +16,11 @@
 
 import torch
 
+from vllm.logger import init_logger
 from vllm.utils import is_pin_memory_available
 
+logger = init_logger(__name__)
+
 
 def find_loaded_library(lib_name) -> Optional[str]:
     """
@@ -165,6 +168,9 @@ def _python_malloc_callback(self, allocation_handle: HandleType) -> None:
         py_d_mem = allocation_handle[2]
         self.pointer_to_data[py_d_mem] = AllocationData(
             allocation_handle, self.current_tag)
+        logger.debug(
+            "Allocated %s bytes for %s with address %s from cumem allocator",
+            allocation_handle[1], self.current_tag, py_d_mem)
         return
 
     def _python_free_callback(self, ptr: int) -> HandleType:
@@ -174,6 +180,9 @@ def _python_free_callback(self, ptr: int) -> HandleType:
         data = self.pointer_to_data.pop(ptr)
         if data.cpu_backup_tensor is not None:
             data.cpu_backup_tensor = None
+        logger.debug(
+            "Freed %s bytes for %s with address %s from cumem allocator",
+            data.handle[1], data.tag, ptr)
         return data.handle
 
     def sleep(

From 172ed461fdb084337e5de907269f850de1a0358e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:19:26 +0800
Subject: [PATCH 2/5] manually free

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 30078bff6511..a49591edf241 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -276,12 +276,17 @@ def use_memory_pool(self, tag: Optional[str] = None):
             # when using pluggable allocator, see
             # https://github.com/pytorch/pytorch/issues/145168 .
             # if we have some memory allocated and then freed,
-            # the memory will not be released.
-            # right now it is fine, because we only use this allocator
-            # during weight loading and kv cache creation, where we only
-            # allocate memory.
-            # TODO: we need to find a way to release the memory,
-            # i.e. calling torch.cuda.empty_cache()
+            # the memory will not be released, e.g. in online quantization,
+            # where the model is created in higher precision, and then
+            # quantized in lower precision.
+            # Find all unused allocations and manually release them.
+            # TODO: we should expose `empty_cache` method in the memory pool.
+            # TODO: ask for help from PyTorch team to expose this method.
+            allocations = data[0].snapshot()
+            for allocation in allocations:
+                if allocation["allocated_size"] == 0:
+                    handle = self._python_free_callback(allocation["ptr"])
+                    unmap_and_release(handle)
             self.current_tag = old_tag
 
     def get_current_usage(self) -> int:

From 534f044d6eb52fdfdf32962a6511cc18f27fb994 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:22:54 +0800
Subject: [PATCH 3/5] fix

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index a49591edf241..d2db87dbfcd2 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -285,7 +285,7 @@ def use_memory_pool(self, tag: Optional[str] = None):
             allocations = data[0].snapshot()
             for allocation in allocations:
                 if allocation["allocated_size"] == 0:
-                    handle = self._python_free_callback(allocation["ptr"])
+                    handle = self._python_free_callback(allocation["address"])
                     unmap_and_release(handle)
             self.current_tag = old_tag
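The cleanup added in PATCH 2/5 and corrected in PATCH 3/5 works because `data[0]` is the memory pool yielded by `use_memory_pool_with_allocator`, and its `snapshot()` reports one entry per cached segment; a segment whose `allocated_size` is 0 no longer backs any live tensor, so its handle can be unmapped. Below is a minimal, self-contained sketch of that bookkeeping. The handle tuples, the hard-coded snapshot list, and the print-based `unmap_and_release` are illustrative stand-ins rather than vLLM's real driver calls; only the loop at the bottom mirrors the patched code.

    # Sketch of the snapshot-driven cleanup (illustration only).
    # Handles mimic vLLM's HandleType, where handle[1] is the size in
    # bytes and handle[2] is the device pointer.
    pointer_to_data = {
        0x7000: ("weights", (0, 1 << 20, 0x7000, 1 << 20)),  # freed upstream
        0x8000: ("weights", (0, 2 << 20, 0x8000, 2 << 20)),  # still live
    }

    # Stand-in for what the pool's snapshot() reports per segment.
    # PATCH 3/5 fixes the lookup key: entries are keyed by "address",
    # not "ptr".
    snapshot = [
        {"address": 0x7000, "allocated_size": 0},        # cached but unused
        {"address": 0x8000, "allocated_size": 2 << 20},  # live tensors inside
    ]

    def python_free_callback(ptr):
        # Mirrors _python_free_callback: drop the bookkeeping entry and
        # return the handle so the caller can unmap the physical memory.
        _, handle = pointer_to_data.pop(ptr)
        return handle

    def unmap_and_release(handle):
        # Stand-in for the real CUDA virtual-memory release.
        print(f"unmapped {handle[1]} bytes at {hex(handle[2])}")

    # The loop from the patch: release only segments with no live blocks.
    for allocation in snapshot:
        if allocation["allocated_size"] == 0:
            unmap_and_release(python_free_callback(allocation["address"]))

    assert 0x7000 not in pointer_to_data  # dead segment gone, live one kept

Checking `allocated_size == 0` before freeing is what makes the pass safe: a nonzero value means the segment still backs tensors the model may touch, so only fully drained cache segments are returned to the driver.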
From 8ff1b9ee807918213d979d94db1731588189958a Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:36:27 +0800
Subject: [PATCH 4/5] add more logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index d2db87dbfcd2..9aa5abd1b27c 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -206,9 +206,14 @@ def sleep(
 
         assert isinstance(offload_tags, tuple)
 
+        total_bytes = 0
+        backup_bytes = 0
+
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
+            total_bytes += handle[1]
             if data.tag in offload_tags:
+                backup_bytes += handle[1]
                 size_in_bytes = handle[1]
                 cpu_backup_tensor = torch.empty(
@@ -220,6 +225,12 @@ def sleep(
                 data.cpu_backup_tensor = cpu_backup_tensor
             unmap_and_release(handle)
 
+        logger.info(
+            "CuMemAllocator: sleep freed %s GiB memory in total, of which"
+            "%s GiB is backed up in CPU and the rest %s GiB is discarded"
+            "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
+            (total_bytes - backup_bytes) / 1024**3)
+
         gc.collect()
         torch.cuda.empty_cache()

From 2b97b4e8344e2bd8bb8748185488bd5116e87adc Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 12 Sep 2025 15:38:35 +0800
Subject: [PATCH 5/5] add more logging

Signed-off-by: youkaichao
---
 vllm/device_allocator/cumem.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 9aa5abd1b27c..af7ca6be1fca 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -226,8 +226,8 @@ def sleep(
             unmap_and_release(handle)
 
         logger.info(
-            "CuMemAllocator: sleep freed %s GiB memory in total, of which"
-            "%s GiB is backed up in CPU and the rest %s GiB is discarded"
+            "CuMemAllocator: sleep freed %.2f GiB memory in total, of which "
+            "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded "
             "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
             (total_bytes - backup_bytes) / 1024**3)
 
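PATCH 5/5 repairs two readability bugs in the log line that PATCH 4/5 introduced: adjacent Python string literals are concatenated with no separator, so the missing trailing spaces ran words together, and %s on a float prints its full repr rather than a tidy two-decimal figure. A standalone check of both behaviors, using made-up byte counts rather than values from the patch:

    # Why the format string changed: implicit literal concatenation adds
    # no spaces, and %s prints floats at full precision.
    total_bytes = 16 * 1024**3 + 12_345_678  # made-up example values
    backup_bytes = 10 * 1024**3

    broken = ("sleep freed %s GiB memory in total, of which"
              "%s GiB is backed up in CPU") % (
                  total_bytes / 1024**3, backup_bytes / 1024**3)
    fixed = ("sleep freed %.2f GiB memory in total, of which "
             "%.2f GiB is backed up in CPU") % (
                 total_bytes / 1024**3, backup_bytes / 1024**3)

    print(broken)  # "...16.011497... GiB memory in total, of which10.0 GiB..."
    print(fixed)   # "...16.01 GiB memory in total, of which 10.00 GiB..."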