From b49d9e273f39ea567a6cff7ea16e431ac15be063 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Mon, 10 Nov 2025 11:19:12 -0500 Subject: [PATCH 1/2] fix: Fix SEGFAULT in BLS Model Loading This change contains the minimal change to avoid SEGFAULT failures during the BLS Model Loading test. The crash itself is caused by deleting a shared-memory region's control allocation, which can happen when we somehow end up with handle{1} (the control region) in our accounting, and then delete it when its refcount reaches zero. This change does not fix the root cause of how we're accounting for handle{1} (which we should never have). --- src/shm_manager.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/shm_manager.h b/src/shm_manager.h index e0799a07..745bf758 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -43,6 +43,9 @@ namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; +static constexpr bi::managed_external_buffer::handle_t + SHM_CONTROL_REGION_HANDLE{1}; + class CUDAMemoryPoolManager { public: CUDAMemoryPoolManager() : triton_memory_manager_(nullptr) {} @@ -166,6 +169,10 @@ class SharedMemoryManager { void Deallocate(bi::managed_external_buffer::handle_t handle) { + // Do not delete the control region, to avoid undefined behavior. + if (handle == SHM_CONTROL_REGION_HANDLE) { + return; + } bi::scoped_lock guard{*shm_mutex_}; GrowIfNeeded(0); void* ptr = managed_buffer_->get_address_from_handle(handle); @@ -174,6 +181,10 @@ class SharedMemoryManager { void DeallocateUnsafe(bi::managed_external_buffer::handle_t handle) { + // Do not delete the control region, to avoid undefined behavior. 
+ if (handle == SHM_CONTROL_REGION_HANDLE) { + return; + } void* ptr = managed_buffer_->get_address_from_handle(handle); managed_buffer_->deallocate(ptr); } From 3bfd11eae0bbec5126412102951bf1983ad578d3 Mon Sep 17 00:00:00 2001 From: J Wyman Date: Mon, 10 Nov 2025 17:36:27 -0500 Subject: [PATCH 2/2] rename variable --- src/shm_manager.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/shm_manager.h b/src/shm_manager.h index 745bf758..8517faf3 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -43,8 +43,8 @@ namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; -static constexpr bi::managed_external_buffer::handle_t - SHM_CONTROL_REGION_HANDLE{1}; +static constexpr bi::managed_external_buffer::handle_t kShmControlRegionHandle{ + 1}; class CUDAMemoryPoolManager { public: @@ -170,7 +170,7 @@ class SharedMemoryManager { void Deallocate(bi::managed_external_buffer::handle_t handle) { // Do not delete the control region, to avoid undefined behavior. - if (handle == SHM_CONTROL_REGION_HANDLE) { + if (handle == kShmControlRegionHandle) { return; } bi::scoped_lock guard{*shm_mutex_}; @@ -182,7 +182,7 @@ class SharedMemoryManager { void DeallocateUnsafe(bi::managed_external_buffer::handle_t handle) { // Do not delete the control region, to avoid undefined behavior. - if (handle == SHM_CONTROL_REGION_HANDLE) { + if (handle == kShmControlRegionHandle) { return; } void* ptr = managed_buffer_->get_address_from_handle(handle);