[PyTorch Pinned Allocator] Create per thread task pool for mapping memory space (pytorch#111545)

Differential Revision: D50443865

Pull Request resolved: pytorch#111545
Approved by: https://github.com/zdevito
banitag1 authored and xuhancn committed Nov 8, 2023
1 parent 44bdbe9 commit e7c60cf
Showing 2 changed files with 37 additions and 8 deletions.
26 changes: 20 additions & 6 deletions aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -10,6 +10,7 @@
 #include <cuda_runtime_api.h>
 #include <stdint.h>
 #include <deque>
+#include <future>
 #include <memory>
 #include <mutex>
 #include <set>
@@ -397,7 +398,7 @@ class CUDAHostAllocator {
         (void*)devptr == (void*)ptr,
         "Host and device pointer dont match with cudaHostRegister. "
         "Please dont use this feature by setting "
-        "PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)",
+        "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
         "");
   }

@@ -412,18 +413,31 @@ class CUDAHostAllocator {
     size_t numMapThreads = c10::cuda::CUDACachingAllocator::
         CUDAAllocatorConfig::pinned_num_register_threads();
     if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
       // parallelize the mapping of pages with a threadpool
       auto* pool = getThreadPool();
+      std::vector<std::promise<void>> promises;
+      std::vector<std::future<void>> futures;
+      promises.reserve(numMapThreads);
+      futures.reserve(numMapThreads);
+
       for (size_t i = 0; i < numMapThreads; i++) {
-        pool->run(std::bind(
-            &CUDAHostAllocator::mapPagesForRegister,
-            this,
+        promises.emplace_back();
+        futures.push_back(promises[i].get_future());
+        auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable {
+          mapPagesForRegister(
              *ptr,
              roundSize,
              i, // thread task-id
              numMapThreads,
-             pageSize));
+             pageSize);
+          // set the promise when mapping pages are done
+          promises[i].set_value();
+        };
+        pool->run(task);
       }
+      for (auto& future : futures) {
+        future.wait();
+      }
-      pool->waitWorkComplete();
     } else {
       // Map pages in the same thread
       mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
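For reference, a minimal self-contained sketch of the completion pattern introduced above: each mapping task signals its own std::promise, and the submitting thread blocks on the matching std::futures instead of on a pool-wide waitWorkComplete(). Plain std::thread workers and the hypothetical mapChunkStub() stand in for the internal TaskThreadPool and CUDAHostAllocator::mapPagesForRegister(), which are not reproduced here.

// Sketch of the per-task promise/future completion pattern from the diff
// above. Assumptions: std::thread replaces the internal TaskThreadPool and
// mapChunkStub() replaces CUDAHostAllocator::mapPagesForRegister().
#include <cstddef>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

// Hypothetical stand-in for mapPagesForRegister(): touches the taskId-th
// slice of [base, base + size), split into numThreads equal chunks.
void mapChunkStub(char* base, size_t size, size_t taskId, size_t numThreads) {
  const size_t chunk = size / numThreads;
  char* begin = base + taskId * chunk;
  // Fault in the pages of this slice; placeholder for the cudaHostRegister
  // work done per chunk by the real allocator.
  for (size_t off = 0; off < chunk; off += 4096) {
    begin[off] = 0;
  }
}

int main() {
  constexpr size_t kSize = 1 << 24;  // 16 MiB buffer to "map"
  constexpr size_t kNumThreads = 8;  // cf. pinned_num_register_threads:8
  std::vector<char> buffer(kSize);

  std::vector<std::promise<void>> promises(kNumThreads);
  std::vector<std::future<void>> futures;
  std::vector<std::thread> workers;
  futures.reserve(kNumThreads);
  workers.reserve(kNumThreads);

  for (size_t i = 0; i < kNumThreads; i++) {
    futures.push_back(promises[i].get_future());
    // Each task maps its own slice and then fulfills its own promise,
    // mirroring the lambda handed to pool->run() in the diff.
    workers.emplace_back([&, i]() {
      mapChunkStub(buffer.data(), kSize, i, kNumThreads);
      promises[i].set_value();
    });
  }

  // Block only on this allocation's tasks, not on the whole pool.
  for (auto& future : futures) {
    future.wait();
  }
  for (auto& worker : workers) {
    worker.join();
  }
  std::cout << "mapped " << kSize << " bytes with " << kNumThreads
            << " tasks" << std::endl;
  return 0;
}

Waiting on per-task futures rather than on pool->waitWorkComplete() presumably matters because the register thread pool is shared: when several caller threads pin memory at once, as the new test_pinned_memory_with_cudaregister_multithread below does, each caller now synchronizes only with its own mapping tasks instead of with everything queued on the pool.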
19 changes: 17 additions & 2 deletions test/test_cuda.py
@@ -80,8 +80,23 @@ def test_pinned_memory_with_cudaregister(self):
         torch.cuda.memory._set_allocator_settings("pinned_use_cuda_host_register:True,pinned_num_register_threads:8")
         t = torch.ones(20)
         self.assertFalse(t.is_pinned())
-        pinned_t = torch.ones(1 << 21).pin_memory()
-        self.assertTrue(pinned_t.is_pinned())
+        try:
+            pinned_t = torch.ones(1 << 21).pin_memory()
+            self.assertTrue(pinned_t.is_pinned())
+            pinned_t = torch.ones(1 << 24).pin_memory()
+            self.assertTrue(pinned_t.is_pinned())
+        except RuntimeError as e:
+            # Some GPUs don't support same address space on host and device side
+            pass

+    def test_pinned_memory_with_cudaregister_multithread(self):
+        num_threads = 4
+        threads = [threading.Thread(target=self.test_pinned_memory_with_cudaregister)
+                   for t in range(num_threads)]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()

     def test_cudart_register(self):
         t = torch.ones(20)
