Add request parameters to Python models (#213)
* Add request parameters to Python models

* Add documentation about the inference request parameters
Tabrizian committed Mar 15, 2023
1 parent f007255 commit 7f5f32e
Showing 5 changed files with 90 additions and 13 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -50,6 +50,7 @@ any C++ code.
- [Known Issues](#known-issues)
- [`finalize`](#finalize)
- [Model Config File](#model-config-file)
- [Inference Request Parameters](#inference-request-parameters)
- [Managing Python Runtime and Libraries](#managing-python-runtime-and-libraries)
- [Building Custom Python Backend Stub](#building-custom-python-backend-stub)
- [Creating Custom Execution Environments](#creating-custom-execution-environments)
@@ -560,6 +561,17 @@ models
└── config.pbtxt
```

## Inference Request Parameters

You can retrieve the parameters associated with an inference request
using the `inference_request.parameters()` function. This function returns
the parameters serialized as a JSON string, where the keys are the parameter
names and the values are the corresponding parameter values.

You can read more about the inference request parameters in the [parameters
extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md)
documentation.
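
As a minimal sketch (not one of the shipped example models), a model's
`execute` function might read these parameters as shown below. This assumes
that `parameters()` returns the JSON-serialized string described above and
that the client attached a hypothetical parameter named `my_param`:

```python
import json


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # Parse the JSON-serialized request parameters into a dict.
            parameters = json.loads(request.parameters())

            # "my_param" is a hypothetical parameter name that a client might
            # have attached to the request; fall back to None if it is absent.
            my_param = parameters.get("my_param")

            # ... use my_param to build and append an InferenceResponse ...
        return responses
```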

## Managing Python Runtime and Libraries

Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/)
38 changes: 31 additions & 7 deletions src/infer_request.cc
@@ -27,6 +27,7 @@
#include "infer_request.h"

#include <boost/interprocess/sync/scoped_lock.hpp>

#include "pb_utils.h"
#include "scoped_defer.h"
#ifdef TRITON_PB_STUB
@@ -40,12 +41,12 @@ InferRequest::InferRequest(
const std::vector<std::shared_ptr<PbTensor>>& inputs,
const std::set<std::string>& requested_output_names,
const std::string& model_name, const int64_t model_version,
const uint32_t flags, const int32_t timeout,
const std::string& parameters, const uint32_t flags, const int32_t timeout,
const intptr_t response_factory_address, const intptr_t request_address)
: request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs),
requested_output_names_(requested_output_names), model_name_(model_name),
model_version_(model_version), flags_(flags), timeout_(timeout),
response_factory_address_(response_factory_address),
model_version_(model_version), parameters_(parameters), flags_(flags),
timeout_(timeout), response_factory_address_(response_factory_address),
request_address_(request_address)
{
for (auto& input : inputs) {
@@ -79,6 +80,12 @@ InferRequest::Inputs()
return inputs_;
}

const std::string&
InferRequest::Parameters()
{
return parameters_;
}

const std::string&
InferRequest::RequestId()
{
@@ -160,7 +167,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
sizeof(bi::managed_external_buffer::handle_t)) +
(Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)) +
PbString::ShmStructSize(ModelName()) +
PbString::ShmStructSize(RequestId()));
PbString::ShmStructSize(RequestId()) +
PbString::ShmStructSize(Parameters()));

infer_request_shm_ptr_ =
reinterpret_cast<InferRequestShm*>(infer_request_shm.data_.get());
@@ -222,10 +230,18 @@ InferRequest::SaveToSharedMemory(std::unique_ptr<SharedMemoryManager>& shm_pool)
reinterpret_cast<char*>(infer_request_shm_ptr_) + request_id_offset,
infer_request_shm.handle_ + request_id_offset);

size_t parameters_offset =
request_id_offset + PbString::ShmStructSize(RequestId());
std::unique_ptr<PbString> parameters_shm = PbString::Create(
Parameters(),
reinterpret_cast<char*>(infer_request_shm_ptr_) + parameters_offset,
infer_request_shm.handle_ + parameters_offset);

// Save the references to shared memory.
infer_request_shm_ = std::move(infer_request_shm);
request_id_shm_ = std::move(request_id_shm);
model_name_shm_ = std::move(model_name_shm);
parameters_shm_ = std::move(parameters_shm);
shm_handle_ = infer_request_shm_.handle_;
requested_output_names_shm_ = std::move(requested_output_names_shm);
}
@@ -286,21 +302,28 @@ InferRequest::LoadFromSharedMemory(
request_handle + request_id_offset,
reinterpret_cast<char*>(infer_request_shm_ptr) + request_id_offset);

size_t parameters_offset = request_id_offset + request_id_shm->Size();
std::unique_ptr<PbString> parameters_shm = PbString::LoadFromSharedMemory(
request_handle + parameters_offset,
reinterpret_cast<char*>(infer_request_shm_ptr) + parameters_offset);

return std::unique_ptr<InferRequest>(new InferRequest(
infer_request_shm, request_id_shm, requested_output_names_shm,
model_name_shm, input_tensors));
model_name_shm, input_tensors, parameters_shm));
}

InferRequest::InferRequest(
AllocatedSharedMemory<char>& infer_request_shm,
std::unique_ptr<PbString>& request_id_shm,
std::vector<std::unique_ptr<PbString>>& requested_output_names_shm,
std::unique_ptr<PbString>& model_name_shm,
std::vector<std::shared_ptr<PbTensor>>& input_tensors)
std::vector<std::shared_ptr<PbTensor>>& input_tensors,
std::unique_ptr<PbString>& parameters_shm)
: infer_request_shm_(std::move(infer_request_shm)),
request_id_shm_(std::move(request_id_shm)),
requested_output_names_shm_(std::move(requested_output_names_shm)),
model_name_shm_(std::move(model_name_shm))
model_name_shm_(std::move(model_name_shm)),
parameters_shm_(std::move(parameters_shm))
{
infer_request_shm_ptr_ =
reinterpret_cast<InferRequestShm*>(infer_request_shm_.data_.get());
@@ -325,6 +348,7 @@ InferRequest::InferRequest(
}

request_id_ = request_id_shm_->String();
parameters_ = parameters_shm_->String();
requested_output_names_ = std::move(requested_output_names);
model_name_ = model_name_shm_->String();
flags_ = infer_request_shm_ptr_->flags;
10 changes: 7 additions & 3 deletions src/infer_request.h
@@ -61,12 +61,13 @@ class InferRequest {
const std::vector<std::shared_ptr<PbTensor>>& inputs,
const std::set<std::string>& requested_output_names,
const std::string& model_name, const int64_t model_version,
const uint32_t flags = 0, const int32_t timeout = 0,
const intptr_t response_factory_address = 0,
const std::string& parameters, const uint32_t flags = 0,
const int32_t timeout = 0, const intptr_t response_factory_address = 0,
const intptr_t request_address = 0);

const std::vector<std::shared_ptr<PbTensor>>& Inputs();
const std::string& RequestId();
const std::string& Parameters();
uint64_t CorrelationId();
const std::string& ModelName();
int64_t ModelVersion();
@@ -116,14 +117,16 @@ class InferRequest {
std::unique_ptr<PbString>& request_id_shm,
std::vector<std::unique_ptr<PbString>>& requested_output_names_shm,
std::unique_ptr<PbString>& model_name_shm,
std::vector<std::shared_ptr<PbTensor>>& input_tensors);
std::vector<std::shared_ptr<PbTensor>>& input_tensors,
std::unique_ptr<PbString>& parameters_shm);

std::string request_id_;
uint64_t correlation_id_;
std::vector<std::shared_ptr<PbTensor>> inputs_;
std::set<std::string> requested_output_names_;
std::string model_name_;
int64_t model_version_;
std::string parameters_;
uint32_t flags_;
int32_t timeout_;
intptr_t response_factory_address_;
@@ -140,6 +143,7 @@ class InferRequest {
bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr_;
bi::managed_external_buffer::handle_t* input_tensors_handle_ptr_;
bi::managed_external_buffer::handle_t shm_handle_;
std::unique_ptr<PbString> parameters_shm_;

#ifdef TRITON_PB_STUB
std::shared_ptr<ResponseSender> response_sender_;
6 changes: 5 additions & 1 deletion src/pb_stub.cc
@@ -29,6 +29,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <atomic>
#include <boost/interprocess/sync/interprocess_condition.hpp>
#include <boost/interprocess/sync/interprocess_mutex.hpp>
@@ -41,6 +42,7 @@
#include <regex>
#include <thread>
#include <unordered_map>

#include "infer_response.h"
#include "pb_error.h"
#include "pb_map.h"
@@ -1272,9 +1274,10 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
for (auto& requested_output_name : requested_output_names) {
requested_outputs.emplace(requested_output_name);
}
// FIXME: InferenceRequest parameters are not supported in BLS now.
return std::make_shared<InferRequest>(
request_id, correlation_id, inputs, requested_outputs,
model_name, model_version, flags, timeout);
model_name, model_version, "" /*parameters*/, flags, timeout);
}),
py::arg("request_id").none(false) = "",
py::arg("correlation_id").none(false) = 0,
@@ -1291,6 +1294,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
.def("flags", &InferRequest::Flags)
.def("set_flags", &InferRequest::SetFlags)
.def("timeout", &InferRequest::Timeout)
.def("parameters", &InferRequest::Parameters)
.def(
"exec",
[](std::shared_ptr<InferRequest>& infer_request,
37 changes: 35 additions & 2 deletions src/python_be.cc
@@ -356,6 +356,39 @@ ModelInstanceState::SaveRequestsToSharedMemory(
requested_output_names.emplace(requested_output_name);
}

triton::common::TritonJson::Value parameters_json(
triton::common::TritonJson::ValueType::OBJECT);
uint32_t parameter_count;
RETURN_IF_ERROR(
TRITONBACKEND_RequestParameterCount(request, &parameter_count));
for (size_t i = 0; i < parameter_count; i++) {
const char* name;
TRITONSERVER_ParameterType type;
const void* vvalue;
RETURN_IF_ERROR(
TRITONBACKEND_RequestParameter(request, i, &name, &type, &vvalue));
if (type == TRITONSERVER_PARAMETER_INT) {
RETURN_IF_ERROR(parameters_json.AddInt(
name, *(reinterpret_cast<const int64_t*>(vvalue))));
} else if (type == TRITONSERVER_PARAMETER_BOOL) {
RETURN_IF_ERROR(parameters_json.AddBool(
name, *(reinterpret_cast<const bool*>(vvalue))));
} else if (type == TRITONSERVER_PARAMETER_STRING) {
std::string string = reinterpret_cast<const char*>(vvalue);
RETURN_IF_ERROR(parameters_json.AddString(name, string));
} else {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("Unsupported parameter type for parameter '") + name +
"'.")
.c_str());
}
}

triton::common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(parameters_json.Write(&buffer));
const auto& parameters_string = buffer.Contents();

// request id
const char* id;
RETURN_IF_ERROR(TRITONBACKEND_RequestId(request, &id));
@@ -373,13 +406,13 @@ ModelInstanceState::SaveRequestsToSharedMemory(
RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request));
infer_request = std::make_unique<InferRequest>(
id, correlation_id, pb_input_tensors, requested_output_names,
model_state->Name(), model_state->Version(), flags,
model_state->Name(), model_state->Version(), parameters_string, flags,
0 /* BLS request timeout*/, reinterpret_cast<intptr_t>(factory_ptr),
reinterpret_cast<intptr_t>(request));
} else {
infer_request = std::make_unique<InferRequest>(
id, correlation_id, pb_input_tensors, requested_output_names,
model_state->Name(), model_state->Version(), flags,
model_state->Name(), model_state->Version(), parameters_string, flags,
0 /* BLS request timeout*/, 0 /* response_factory_address */,
reinterpret_cast<intptr_t>(request));
}
