From badc9d290bcf8d98d2d0327fa85d93593f2139cb Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Mon, 6 Oct 2025 18:36:08 -0500 Subject: [PATCH 1/5] use call_once to prevent repeated thread count setting --- src/libtorch.cc | 53 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 26a2960..f78762b 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -28,6 +28,7 @@ #include #include +#include <mutex> #include "libtorch_utils.h" #include "triton/backend/backend_common.h" @@ -66,6 +67,11 @@ // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API. // +namespace { + std::once_flag pytorch_interop_threads_flag; + std::once_flag pytorch_intraop_threads_flag; +} + namespace triton { namespace backend { namespace pytorch { // @@ -509,13 +515,17 @@ ModelState::ParseParameters() } } else { if (intra_op_thread_count > 0) { - at::set_num_threads(intra_op_thread_count); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Intra op thread count is set to ") + - std::to_string(intra_op_thread_count) + " for model instance '" + - Name() + "'") - .c_str()); + // at::set_num_threads() does not throw if called more than once, but issues warnings. + // std::call_once() is useful to limit these. 
+ std::call_once(pytorch_intraop_threads_flag, [this, intra_op_thread_count](){ + at::set_num_threads(intra_op_thread_count); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(intra_op_thread_count) + " for model instance '" + + this->Name() + "'") + .c_str()); + }); } } @@ -533,13 +543,28 @@ ModelState::ParseParameters() } } else { if (inter_op_thread_count > 0) { - at::set_num_interop_threads(inter_op_thread_count); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is set to ") + - std::to_string(inter_op_thread_count) + " for model instance '" + - Name() + "'") - .c_str()); + // at::set_num_interop_threads() throws if called more than once. + // std::call_once() should prevent this, but try/catch is additionally used for safety. + std::call_once(pytorch_interop_threads_flag, [this, inter_op_thread_count](){ + try { + at::set_num_interop_threads(inter_op_thread_count); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(inter_op_thread_count) + " for model instance '" + + Name() + "'") + .c_str()); + } catch (const c10::Error& e) { + int current_inter_op_thread_count = at::get_num_interop_threads(); + bool current_is_requested = inter_op_thread_count == current_inter_op_thread_count; + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is already set to ") + + std::to_string(current_inter_op_thread_count) + + (current_is_requested ? "" : " and cannot be changed. 
Setting ignored") + + " for model instance '" + this->Name() + "'").c_str()); + } + }); } } } From a4146b29cc386687565ced7ad44e04674933f37e Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Mon, 6 Oct 2025 19:27:33 -0500 Subject: [PATCH 2/5] update docs for thread parameters --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b27283..34ec2dc 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ key: "ENABLE_CACHE_CLEANING" * `INTER_OP_THREAD_COUNT`: PyTorch allows using multiple CPU threads during TorchScript model inference. -One or more inference threads execute a model’s forward pass on the given +One or more inference threads execute a model's forward pass on the given inputs. Each inference thread invokes a JIT interpreter that executes the ops of a model inline, one by one. This parameter sets the size of this thread pool. The default value of this setting is the number of cpu cores. Please refer @@ -218,6 +218,10 @@ key: "INTER_OP_THREAD_COUNT" } ``` +**NOTE**: This parameter is set globally for the PyTorch backend. +The value from the first model config file that specifies this parameter will be used. +Subsequent values from other model config files, if different, will be ignored. + * `INTRA_OP_THREAD_COUNT`: In addition to the inter-op parallelism, PyTorch can also utilize multiple threads @@ -238,6 +242,10 @@ key: "INTRA_OP_THREAD_COUNT" } ``` +**NOTE**: This parameter is set globally for the PyTorch backend. +The value from the first model config file that specifies this parameter will be used. +Subsequent values from other model config files, if different, will be ignored. + * Additional Optimizations: Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with complex execution modes and dynamic shapes. If not specified, all are enabled by default. 
From 3037b2737173daa1f625849fe83b9a9ecf7b4263 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Tue, 7 Oct 2025 11:27:03 -0500 Subject: [PATCH 3/5] always emit a message with number of threads --- src/libtorch.cc | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index f78762b..27bb488 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -517,15 +517,15 @@ ModelState::ParseParameters() if (intra_op_thread_count > 0) { // at::set_num_threads() does not throw if called more than once, but issues warnings. // std::call_once() is useful to limit these. - std::call_once(pytorch_intraop_threads_flag, [this, intra_op_thread_count](){ + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count](){ at::set_num_threads(intra_op_thread_count); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Intra op thread count is set to ") + - std::to_string(intra_op_thread_count) + " for model instance '" + - this->Name() + "'") - .c_str()); }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Intra op thread count is set to ") + + std::to_string(at::get_num_threads()) + " for model instance '" + + Name() + "'") + .c_str()); } } @@ -545,26 +545,19 @@ ModelState::ParseParameters() if (inter_op_thread_count > 0) { // at::set_num_interop_threads() throws if called more than once. // std::call_once() should prevent this, but try/catch is additionally used for safety. 
- std::call_once(pytorch_interop_threads_flag, [this, inter_op_thread_count](){ + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count](){ try { at::set_num_interop_threads(inter_op_thread_count); - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is set to ") + - std::to_string(inter_op_thread_count) + " for model instance '" + - Name() + "'") - .c_str()); } catch (const c10::Error& e) { - int current_inter_op_thread_count = at::get_num_interop_threads(); - bool current_is_requested = inter_op_thread_count == current_inter_op_thread_count; - LOG_MESSAGE( - TRITONSERVER_LOG_INFO, - (std::string("Inter op thread count is already set to ") + - std::to_string(current_inter_op_thread_count) + - (current_is_requested ? "" : " and cannot be changed. Setting ignored") + - " for model instance '" + this->Name() + "'").c_str()); + // do nothing } }); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Inter op thread count is set to ") + + std::to_string(at::get_num_interop_threads()) + " for model instance '" + + Name() + "'") + .c_str()); } } } From 5ee93fd0827c2b4d45ba7ef037712aeb886f9b90 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Thu, 9 Oct 2025 13:44:32 -0500 Subject: [PATCH 4/5] apply formatting --- src/libtorch.cc | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/libtorch.cc b/src/libtorch.cc index 27bb488..c873375 100644 --- a/src/libtorch.cc +++ b/src/libtorch.cc @@ -68,9 +68,9 @@ // namespace { - std::once_flag pytorch_interop_threads_flag; - std::once_flag pytorch_intraop_threads_flag; -} +std::once_flag pytorch_interop_threads_flag; +std::once_flag pytorch_intraop_threads_flag; +} // namespace namespace triton { namespace backend { namespace pytorch { @@ -515,9 +515,9 @@ ModelState::ParseParameters() } } else { if (intra_op_thread_count > 0) { - // at::set_num_threads() does not throw if called more than once, but issues warnings. 
- // std::call_once() is useful to limit these. - std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count](){ + // at::set_num_threads() does not throw if called more than once, but + // issues warnings. std::call_once() is useful to limit these. + std::call_once(pytorch_intraop_threads_flag, [intra_op_thread_count]() { at::set_num_threads(intra_op_thread_count); }); LOG_MESSAGE( @@ -544,19 +544,21 @@ ModelState::ParseParameters() } else { if (inter_op_thread_count > 0) { // at::set_num_interop_threads() throws if called more than once. - // std::call_once() should prevent this, but try/catch is additionally used for safety. - std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count](){ + // std::call_once() should prevent this, but try/catch is additionally + // used for safety. + std::call_once(pytorch_interop_threads_flag, [inter_op_thread_count]() { try { at::set_num_interop_threads(inter_op_thread_count); - } catch (const c10::Error& e) { + } + catch (const c10::Error& e) { // do nothing } }); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Inter op thread count is set to ") + - std::to_string(at::get_num_interop_threads()) + " for model instance '" + - Name() + "'") + std::to_string(at::get_num_interop_threads()) + + " for model instance '" + Name() + "'") .c_str()); } } From f0cb13fd53da812d67f493fd9da34400926287bf Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Thu, 9 Oct 2025 13:46:00 -0500 Subject: [PATCH 5/5] readme formatting --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 34ec2dc..d948954 100644 --- a/README.md +++ b/README.md @@ -218,9 +218,10 @@ key: "INTER_OP_THREAD_COUNT" } ``` -**NOTE**: This parameter is set globally for the PyTorch backend. -The value from the first model config file that specifies this parameter will be used. -Subsequent values from other model config files, if different, will be ignored. 
+> [!NOTE] +> This parameter is set globally for the PyTorch backend. +> The value from the first model config file that specifies this parameter will be used. +> Subsequent values from other model config files, if different, will be ignored. * `INTRA_OP_THREAD_COUNT`: @@ -242,9 +243,10 @@ key: "INTRA_OP_THREAD_COUNT" } ``` -**NOTE**: This parameter is set globally for the PyTorch backend. -The value from the first model config file that specifies this parameter will be used. -Subsequent values from other model config files, if different, will be ignored. +> [!NOTE] +> This parameter is set globally for the PyTorch backend. +> The value from the first model config file that specifies this parameter will be used. +> Subsequent values from other model config files, if different, will be ignored. * Additional Optimizations: Three additional boolean parameters are available to disable certain Torch optimizations that can sometimes cause latency regressions in models with