diff --git a/src/request_executor.cc b/src/request_executor.cc
index c0241784..89866726 100644
--- a/src/request_executor.cc
+++ b/src/request_executor.cc
@@ -77,7 +77,8 @@ void
 InferResponseComplete(
     TRITONSERVER_InferenceResponse* response, const uint32_t flags,
     void* userp)
 {
-  auto p = reinterpret_cast<InferPayload*>(userp);
+  auto infer_payload =
+      *(reinterpret_cast<std::shared_ptr<InferPayload>*>(userp));
   std::unique_ptr<InferResponse> infer_response;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
   std::shared_ptr<PbError> pb_error;
@@ -146,7 +147,7 @@ InferResponseComplete(
       output_tensors.clear();
     }
 
-    if (!p->IsDecoupled()) {
+    if (!infer_payload->IsDecoupled()) {
       infer_response = std::make_unique<InferResponse>(
           output_tensors, pb_error, true /* is_last_response */);
     } else {
@@ -167,7 +168,8 @@ InferResponseComplete(
         TRITONSERVER_InferenceResponseDelete(response),
         "Failed to release BLS inference response.");
   } else if (
-      p->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
+      infer_payload->IsDecoupled() &&
+      (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
     // An empty response may be the last reponse for decoupled models.
     infer_response = std::make_unique<InferResponse>(
         output_tensors, pb_error, true /* is_last_response */, userp /* id */);
@@ -177,7 +179,7 @@ InferResponseComplete(
         output_tensors, pb_error, true /* is_last_response */, userp /* id */);
   }
 
-  p->SetValue(std::move(infer_response));
+  infer_payload->SetValue(std::move(infer_response));
 }
 
 TRITONSERVER_Error*
@@ -333,8 +335,8 @@ RequestExecutor::Infer(
           std::string("Model ") + model_name +
           " is using the decoupled. The current BLS request call doesn't "
           "support models using the decoupled transaction policy. Please use "
-          "stream API 'stream_exec()' or 'async_stream_exec() for decoupled "
-          "models.'");
+          "'decoupled=True' argument to the 'exec' or 'async_exec' calls for "
+          "decoupled models.'");
     }
 
     // Inference
@@ -379,11 +381,13 @@ RequestExecutor::Infer(
     ResponseAllocatorUserp response_allocator_userp(
         shm_pool_.get(), infer_request->GetPreferredMemory());
     infer_payload->SetResponseAllocUserp(response_allocator_userp);
+    std::shared_ptr<InferPayload>* infer_payload_p =
+        new std::shared_ptr<InferPayload>(infer_payload);
 
     THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
         irequest, response_allocator_,
         reinterpret_cast<void*>(infer_payload->ResponseAllocUserp().get()),
-        InferResponseComplete, reinterpret_cast<void*>(infer_payload.get())));
+        InferResponseComplete, reinterpret_cast<void*>(infer_payload_p)));
 
     THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
         server_, irequest, nullptr /* trace */));
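
Note on the lifetime change above: TRITONSERVER_InferenceRequestSetResponseCallback only carries a raw void* userp, so the patch heap-allocates a std::shared_ptr<InferPayload> and passes its address; InferResponseComplete then dereferences and copies it, keeping the payload alive until the (possibly multiple, for decoupled models) response callbacks have fired, even if the caller's reference is gone. The sketch below illustrates that ownership pattern in isolation. It is a minimal stand-alone example, not the python_backend code: the names Payload, fire_callback, and OnComplete are hypothetical, and deleting the heap-allocated shared_ptr on the final callback is an assumption here, since the hunks above do not show where the real one is released.

// Sketch only: extending object lifetime across a C-style async callback by
// heap-allocating a shared_ptr and passing its address as void*.
#include <iostream>
#include <memory>
#include <string>

struct Payload {
  std::string name;
  void SetValue(const std::string& v) { std::cout << name << ": " << v << "\n"; }
};

// Stand-in for an async API that only accepts a raw `void* userp`.
using Callback = void (*)(void* userp, bool is_final);

void fire_callback(Callback cb, void* userp, bool is_final) { cb(userp, is_final); }

// The callback receives the address of a heap-allocated shared_ptr. Copying it
// bumps the reference count, so the payload stays alive even after the
// caller's own shared_ptr has gone out of scope.
void OnComplete(void* userp, bool is_final) {
  auto payload = *(reinterpret_cast<std::shared_ptr<Payload>*>(userp));
  payload->SetValue(is_final ? "final response" : "partial response");
  if (is_final) {
    // Assumption for this sketch: the final invocation frees the heap-allocated
    // shared_ptr itself (the payload is freed once all copies are gone).
    delete reinterpret_cast<std::shared_ptr<Payload>*>(userp);
  }
}

int main() {
  void* userp = nullptr;
  {
    auto payload = std::make_shared<Payload>(Payload{"bls_request"});
    // Heap-allocate a shared_ptr copy so its lifetime is decoupled from this scope.
    userp = new std::shared_ptr<Payload>(payload);
  }  // The local shared_ptr is gone; the heap copy keeps the payload alive.

  fire_callback(OnComplete, userp, /*is_final=*/false);
  fire_callback(OnComplete, userp, /*is_final=*/true);  // releases the heap copy
  return 0;
}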