Fix lifetime of infer payload
Tabrizian committed May 14, 2023
1 parent 3623215 commit f0cf5d1
Showing 1 changed file with 11 additions and 7 deletions.
18 changes: 11 additions & 7 deletions src/request_executor.cc
@@ -77,7 +77,8 @@ void
 InferResponseComplete(
     TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
 {
-  auto p = reinterpret_cast<InferPayload*>(userp);
+  auto infer_payload =
+      *(reinterpret_cast<std::shared_ptr<InferPayload>*>(userp));
   std::unique_ptr<InferResponse> infer_response;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
   std::shared_ptr<PbError> pb_error;
@@ -146,7 +147,7 @@ InferResponseComplete(
       output_tensors.clear();
     }

-    if (!p->IsDecoupled()) {
+    if (!infer_payload->IsDecoupled()) {
       infer_response = std::make_unique<InferResponse>(
           output_tensors, pb_error, true /* is_last_response */);
     } else {
@@ -167,7 +168,8 @@
         TRITONSERVER_InferenceResponseDelete(response),
         "Failed to release BLS inference response.");
   } else if (
-      p->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
+      infer_payload->IsDecoupled() &&
+      (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
     // An empty response may be the last reponse for decoupled models.
     infer_response = std::make_unique<InferResponse>(
         output_tensors, pb_error, true /* is_last_response */, userp /* id */);
@@ -177,7 +179,7 @@
         output_tensors, pb_error, true /* is_last_response */, userp /* id */);
   }

-  p->SetValue(std::move(infer_response));
+  infer_payload->SetValue(std::move(infer_response));
 }

 TRITONSERVER_Error*
@@ -333,8 +335,8 @@ RequestExecutor::Infer(
           std::string("Model ") + model_name +
           " is using the decoupled. The current BLS request call doesn't "
           "support models using the decoupled transaction policy. Please use "
-          "stream API 'stream_exec()' or 'async_stream_exec() for decoupled "
-          "models.'");
+          "'decoupled=True' argument to the 'exec' or 'async_exec' calls for "
+          "decoupled models.'");
     }

     // Inference
@@ -379,11 +381,13 @@ RequestExecutor::Infer(
       ResponseAllocatorUserp response_allocator_userp(
           shm_pool_.get(), infer_request->GetPreferredMemory());
       infer_payload->SetResponseAllocUserp(response_allocator_userp);
+      std::shared_ptr<InferPayload>* infer_payload_p =
+          new std::shared_ptr<InferPayload>(infer_payload);

       THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(
           irequest, response_allocator_,
           reinterpret_cast<void*>(infer_payload->ResponseAllocUserp().get()),
-          InferResponseComplete, reinterpret_cast<void*>(infer_payload.get())));
+          InferResponseComplete, reinterpret_cast<void*>(infer_payload_p)));

       THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync(
           server_, irequest, nullptr /* trace */));

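The change replaces the raw InferPayload* previously passed as userp with a heap-allocated std::shared_ptr<InferPayload>*, so the payload cannot be destroyed before the asynchronous response callback runs. Below is a minimal, self-contained sketch of that lifetime pattern, assuming a made-up async API (Payload, StartAsyncWork, and OnComplete are illustrative names, not Triton's): a shared_ptr is copied onto the heap, its address is passed through an opaque void*, and it is released from the callback once no further completions can arrive.

// Sketch only: Payload, StartAsyncWork, and OnComplete are invented names
// illustrating the lifetime pattern; this is not Triton code.
#include <iostream>
#include <memory>
#include <string>
#include <thread>

struct Payload {
  std::string name;
  void SetValue(const std::string& v) { std::cout << name << " <- " << v << "\n"; }
};

// C-style async API that only forwards an opaque void* to the callback,
// analogous to how userp is forwarded to InferResponseComplete.
using Callback = void (*)(void* userp);

std::thread StartAsyncWork(Callback cb, void* userp)
{
  return std::thread([cb, userp]() { cb(userp); });
}

void OnComplete(void* userp)
{
  // Recover the heap-allocated shared_ptr and copy it; the copy keeps the
  // payload alive for the rest of the callback.
  auto* owner = reinterpret_cast<std::shared_ptr<Payload>*>(userp);
  std::shared_ptr<Payload> payload = *owner;
  payload->SetValue("done");
  // No more callbacks will fire for this request, so release the owner.
  delete owner;
}

int main()
{
  std::thread worker;
  {
    auto payload = std::make_shared<Payload>();
    payload->name = "request-0";
    // Heap-allocate a second shared_ptr so a reference survives after
    // `payload` goes out of scope, and hand its address to the async API.
    auto* owner = new std::shared_ptr<Payload>(payload);
    worker = StartAsyncWork(OnComplete, reinterpret_cast<void*>(owner));
  }  // `payload` is destroyed here; the heap-allocated owner keeps the object alive.
  worker.join();
  return 0;
}

Copying the shared_ptr onto the heap is what lets a C-style callback interface, which only carries a void*, participate in shared ownership; the matching delete marks where that ownership finally ends. In the diff above, the same userp pointer is also forwarded as the response id for decoupled responses.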