Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add shared memory leak debugging introspection #305

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,20 +95,18 @@ FetchContent_Declare(
)
FetchContent_MakeAvailable(dlpack)

set(BOOST_ENABLE_CMAKE ON)
set(BOOST_INCLUDE_LIBRARIES stacktrace)
#
# Boost
#
ExternalProject_Add(
boostorg
URL https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz
URL_HASH SHA256=273f1be93238a068aba4f9735a4a2b003019af067b9c183ed227780b8f36062c
PREFIX "boost-src"
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory
<SOURCE_DIR>/boost/ ${CMAKE_BINARY_DIR}/boost
INSTALL_COMMAND ""
BUILD_COMMAND ""
FetchContent_Declare(
Boost
GIT_REPOSITORY https://github.com/boostorg/boost.git
GIT_TAG boost-1.81.0
GIT_SHALLOW ON
)
set(boostorg_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/boost/")
FetchContent_MakeAvailable(Boost)

#
# CUDA
Expand All @@ -125,6 +123,10 @@ if(${TRITON_ENABLE_NVTX})
add_definitions(-DTRITON_ENABLE_NVTX=1)
endif() # TRITON_ENABLE_NVTX

add_definitions(-DBOOST_STACKTRACE_USE_ADDR2LINE=1)
add_definitions(-DBOOST_STACKTRACE_USE_BACKTRACE=1)


find_package(ZLIB REQUIRED)
find_package(Threads REQUIRED)

Expand Down Expand Up @@ -220,8 +222,6 @@ add_executable(
${PYTHON_BACKEND_STUB_SRCS}
)

add_dependencies(triton-python-backend boostorg)
add_dependencies(triton-python-backend-stub boostorg)

set_property(TARGET triton-python-backend-stub PROPERTY OUTPUT_NAME triton_python_backend_stub)

Expand Down Expand Up @@ -255,6 +255,9 @@ target_link_libraries(
triton-core-serverstub # from repo-core
ZLIB::ZLIB
-larchive
-ldl
Boost::stacktrace_backtrace
Boost::stacktrace_addr2line
)

target_link_libraries(
Expand All @@ -267,6 +270,9 @@ target_link_libraries(
pybind11::embed
-lrt # shared memory
-larchive # libarchive
-ldl
Boost::stacktrace_backtrace
Boost::stacktrace_addr2line
)

set_target_properties(
Expand Down
4 changes: 1 addition & 3 deletions src/pb_response_iterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ ResponseIterator::Next()
}
}

py::iterator
void
ResponseIterator::Iter()
{
if (is_finished_) {
Expand All @@ -111,8 +111,6 @@ ResponseIterator::Iter()
idx_ = 0;
}
}

return py::cast(*this);
}

void
Expand Down
2 changes: 1 addition & 1 deletion src/pb_response_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class ResponseIterator {
~ResponseIterator();

std::shared_ptr<InferResponse> Next();
py::iterator Iter();
void Iter();
void EnqueueResponse(std::shared_ptr<InferResponse> infer_response);
void* Id();
void Clear();
Expand Down
7 changes: 6 additions & 1 deletion src/pb_stub.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1544,7 +1544,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
py::class_<ResponseIterator, std::shared_ptr<ResponseIterator>>(
module, "ResponseIterator")
.def(py::init<const std::shared_ptr<InferResponse>&>())
.def("__iter__", &ResponseIterator::Iter, py::keep_alive<0, 1>())
.def(
"__iter__",
[](ResponseIterator& it) -> ResponseIterator& {
it.Iter();
return it;
})
.def("__next__", &ResponseIterator::Next);

py::class_<Logger> logger(module, "Logger");
Expand Down
2 changes: 2 additions & 0 deletions src/shm_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,12 @@ SharedMemoryManager::SharedMemoryManager(
shm_obj_ = std::make_unique<bi::shared_memory_object>(
bi::create_only, shm_region_name.c_str(), bi::read_write);
shm_obj_->truncate(shm_size);
shm_debug_info_.open("shm_allocations_debug");
} else {
// Open the existing region.
shm_obj_ = std::make_unique<bi::shared_memory_object>(
bi::open_only, shm_region_name.c_str(), bi::read_write);
shm_debug_info_.open("shm_allocations_debug_stub");
}

current_capacity_ = shm_size;
Expand Down
32 changes: 32 additions & 0 deletions src/shm_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,14 @@
#include <type_traits>
#include <typeinfo>
#include <vector>
#define BOOST_STACKTRACE_USE_ADDR2LINE 1

#include <boost/stacktrace.hpp>
#include <fstream>
#include <sstream>

#include "pb_exception.h"
using namespace std;

namespace triton { namespace backend { namespace python {
namespace bi = boost::interprocess;
Expand Down Expand Up @@ -108,6 +114,13 @@ class SharedMemoryManager {

handle = managed_buffer_->get_handle_from_address(
reinterpret_cast<void*>(shm_ownership_data));
std::string stack_trace =
boost::stacktrace::to_string(boost::stacktrace::stacktrace());
std::replace(stack_trace.begin(), stack_trace.end(), '\n', '|');
std::replace(stack_trace.begin(), stack_trace.end(), ',', ' ');
shm_debug_info_ << handle << ",ALLOC"
<< "," << stack_trace << std::endl;
shm_debug_info_.flush();
}

return WrapObjectInUniquePtr(obj, shm_ownership_data, handle);
Expand Down Expand Up @@ -143,12 +156,30 @@ class SharedMemoryManager {
bi::scoped_lock<bi::interprocess_mutex> guard{*shm_mutex_};
GrowIfNeeded(0);
void* ptr = managed_buffer_->get_address_from_handle(handle);

std::string stack_trace =
boost::stacktrace::to_string(boost::stacktrace::stacktrace());
std::replace(stack_trace.begin(), stack_trace.end(), '\n', '|');
std::replace(stack_trace.begin(), stack_trace.end(), ',', ' ');
shm_debug_info_ << handle << ",DEALLOC"
<< "," << stack_trace << std::endl;
shm_debug_info_.flush();

managed_buffer_->deallocate(ptr);
}

// Releases the shared-memory allocation identified by `handle` and appends a
// "<handle>,DEALLOC,<stack trace>" record to the allocation debug log.
//
// This function does not acquire any lock itself.
// NOTE(review): presumably the caller is expected to already hold the
// shared-memory mutex -- confirm against the call sites.
//
// \param handle Handle of the allocation inside the managed buffer.
void DeallocateUnsafe(bi::managed_external_buffer::handle_t handle)
{
  void* ptr = managed_buffer_->get_address_from_handle(handle);

  // Capturing and symbolizing a stack trace is expensive, so only do it when
  // the debug log was actually opened; writing to a closed stream would
  // discard the record anyway.
  if (shm_debug_info_.is_open()) {
    std::string stack_trace =
        boost::stacktrace::to_string(boost::stacktrace::stacktrace());
    // Keep each record on a single CSV line: newlines inside the trace become
    // '|' and commas become spaces so they cannot be confused with the field
    // separator.
    std::replace(stack_trace.begin(), stack_trace.end(), '\n', '|');
    std::replace(stack_trace.begin(), stack_trace.end(), ',', ' ');
    // Write '\n' and flush exactly once (std::endl followed by flush() would
    // flush the stream twice per record).
    shm_debug_info_ << handle << ",DEALLOC," << stack_trace << '\n';
    shm_debug_info_.flush();
  }

  managed_buffer_->deallocate(ptr);
}

Expand All @@ -171,6 +202,7 @@ class SharedMemoryManager {
uint64_t* total_size_;
bool create_;
bool delete_region_;
std::ofstream shm_debug_info_;

template <typename T>
AllocatedSharedMemory<T> WrapObjectInUniquePtr(
Expand Down
Loading