[PyTorch Pinned Allocator] Create per thread task pool for mapping memory space (pytorch#111545)

Differential Revision: D50443865

Pull Request resolved: pytorch#111545
Approved by: https://github.com/zdevito
banitag1 authored and xuhancn committed Nov 8, 2023
1 parent 44bdbe9 commit e7c60cf
Showing 2 changed files with 37 additions and 8 deletions.
26 changes: 20 additions & 6 deletions aten/src/ATen/cuda/CachingHostAllocator.cpp
@@ -10,6 +10,7 @@
 #include <cuda_runtime_api.h>
 #include <stdint.h>
 #include <deque>
+#include <future>
 #include <memory>
 #include <mutex>
 #include <set>
@@ -397,7 +398,7 @@ class CUDAHostAllocator {
         (void*)devptr == (void*)ptr,
         "Host and device pointer dont match with cudaHostRegister. "
         "Please dont use this feature by setting "
-        "PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)",
+        "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)",
         "");
   }

@@ -412,18 +413,31 @@ class CUDAHostAllocator {
     size_t numMapThreads = c10::cuda::CUDACachingAllocator::
         CUDAAllocatorConfig::pinned_num_register_threads();
     if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) {
       // parallelize the mapping of pages with a threadpool
       auto* pool = getThreadPool();
+      std::vector<std::promise<void>> promises;
+      std::vector<std::future<void>> futures;
+      promises.reserve(numMapThreads);
+      futures.reserve(numMapThreads);
+
       for (size_t i = 0; i < numMapThreads; i++) {
-        pool->run(std::bind(
-            &CUDAHostAllocator::mapPagesForRegister,
-            this,
+        promises.emplace_back();
+        futures.push_back(promises[i].get_future());
+        auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable {
+          mapPagesForRegister(
              *ptr,
              roundSize,
              i, // thread task-id
              numMapThreads,
-             pageSize));
+             pageSize);
+          // set the promise when mapping pages are done
+          promises[i].set_value();
+        };
+        pool->run(task);
       }
+      for (auto& future : futures) {
+        future.wait();
+      }
-      pool->waitWorkComplete();
     } else {
       // Map pages in the same thread
       mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize);
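For reference, a minimal self-contained sketch of the completion pattern introduced above: each mapping task signals its own std::promise, and the submitting thread blocks on the matching std::futures instead of on a pool-wide waitWorkComplete(). Plain std::thread workers and the hypothetical mapChunkStub() stand in for the internal TaskThreadPool and CUDAHostAllocator::mapPagesForRegister(), which are not reproduced here.

// Sketch of the per-task promise/future completion pattern from the diff
// above. Assumptions: std::thread replaces the internal TaskThreadPool and
// mapChunkStub() replaces CUDAHostAllocator::mapPagesForRegister().
#include <cstddef>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

// Hypothetical stand-in for mapPagesForRegister(): touches the taskId-th
// slice of [base, base + size), split into numThreads equal chunks.
void mapChunkStub(char* base, size_t size, size_t taskId, size_t numThreads) {
  const size_t chunk = size / numThreads;
  char* begin = base + taskId * chunk;
  // Fault in the pages of this slice; placeholder for the cudaHostRegister
  // work done per chunk by the real allocator.
  for (size_t off = 0; off < chunk; off += 4096) {
    begin[off] = 0;
  }
}

int main() {
  constexpr size_t kSize = 1 << 24;  // 16 MiB buffer to "map"
  constexpr size_t kNumThreads = 8;  // cf. pinned_num_register_threads:8
  std::vector<char> buffer(kSize);

  std::vector<std::promise<void>> promises(kNumThreads);
  std::vector<std::future<void>> futures;
  std::vector<std::thread> workers;
  futures.reserve(kNumThreads);
  workers.reserve(kNumThreads);

  for (size_t i = 0; i < kNumThreads; i++) {
    futures.push_back(promises[i].get_future());
    // Each task maps its own slice and then fulfills its own promise,
    // mirroring the lambda handed to pool->run() in the diff.
    workers.emplace_back([&, i]() {
      mapChunkStub(buffer.data(), kSize, i, kNumThreads);
      promises[i].set_value();
    });
  }

  // Block only on this allocation's tasks, not on the whole pool.
  for (auto& future : futures) {
    future.wait();
  }
  for (auto& worker : workers) {
    worker.join();
  }
  std::cout << "mapped " << kSize << " bytes with " << kNumThreads
            << " tasks" << std::endl;
  return 0;
}

Waiting on per-task futures rather than on pool->waitWorkComplete() presumably matters because the register thread pool is shared: when several caller threads pin memory at once, as the new test_pinned_memory_with_cudaregister_multithread below does, each caller now synchronizes only with its own mapping tasks instead of with everything queued on the pool.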
19 changes: 17 additions & 2 deletions test/test_cuda.py
@@ -80,8 +80,23 @@ def test_pinned_memory_with_cudaregister(self):
         torch.cuda.memory._set_allocator_settings("pinned_use_cuda_host_register:True,pinned_num_register_threads:8")
         t = torch.ones(20)
         self.assertFalse(t.is_pinned())
-        pinned_t = torch.ones(1 << 21).pin_memory()
-        self.assertTrue(pinned_t.is_pinned())
+        try:
+            pinned_t = torch.ones(1 << 21).pin_memory()
+            self.assertTrue(pinned_t.is_pinned())
+            pinned_t = torch.ones(1 << 24).pin_memory()
+            self.assertTrue(pinned_t.is_pinned())
+        except RuntimeError as e:
+            # Some GPUs don't support same address space on host and device side
+            pass

+    def test_pinned_memory_with_cudaregister_multithread(self):
+        num_threads = 4
+        threads = [threading.Thread(target=self.test_pinned_memory_with_cudaregister)
+                   for t in range(num_threads)]
+        for thread in threads:
+            thread.start()
+        for thread in threads:
+            thread.join()

     def test_cudart_register(self):
         t = torch.ones(20)
