tensorflow · learning-to-play · Jun 15, 2023 · Jun 13, 2023
diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD
@@ -274,6 +274,8 @@ cc_library(
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 

diff --git a/tensorflow/core/tpu/tpu_execute.cc b/tensorflow/core/tpu/tpu_execute.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "absl/cleanup/cleanup.h"
 #include "absl/log/log.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/hlo/ir/hlo_module.h"
@@ -462,9 +464,6 @@ xla::StatusOr<xla::ExecutionOutput> TPUExecute(
                       host_transfer_manager->Initialize(
                           host_transfers, rendezvous_key_base, ctx));
 
-  VLOG(2) << "Cloud TPU: Executing computation on device "
-          << node_context->device_ordinal();
-
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
   run_options.set_device_assignment(device_assignment);
@@ -504,17 +503,25 @@ xla::StatusOr<xla::ExecutionOutput> TPUExecute(
         prefetch.offset());
   }
 
+  VLOG(1) << "TPUExecute: Updating dynamic HLO inputs on "
+          << node_context->device_ordinal();
+
   TF_RETURN_IF_ERROR(UpdateDynamicInputs(stream, backend->memory_allocator(),
                                          &arguments, input_shapes));
 
   // Retrieve the TPU embedding memory addresses to be fed to the TPU. The
   // memory addresses are communicated with a dynamically allocated C array
   // (which needs to be free'd once the function terminates).
-  SE_DeviceMemoryBase* device_memory_addrs;
+  VLOG(1) << "TPUExecute: Updating TPUEmbedding memory addresses on "
+          << node_context->device_ordinal();
+
+  SE_DeviceMemoryBase* device_memory_addrs = nullptr;
   size_t device_memory_addrs_count;
   auto device_memory_cleanup = absl::MakeCleanup([&device_memory_addrs]() {
-    stream_executor::tpu::OpsApiFn()->SE_DeviceMemoryBase_FreeArrayFn(
-        device_memory_addrs);
+    if (device_memory_addrs != nullptr) {
+      stream_executor::tpu::OpsApiFn()->SE_DeviceMemoryBase_FreeArrayFn(
+          device_memory_addrs);
+    }
   });
 
   SE_StreamExecutor executor{stream->parent()};
@@ -529,10 +536,15 @@ xla::StatusOr<xla::ExecutionOutput> TPUExecute(
 
   // Add the TPU embedding memory addresses as additional arguments for the TPU
   // executable.
+  VLOG(1) << "TPUExecute: Adding " << device_memory_addrs_count
+          << " TPUEmbedding memory addresses to HLO parameters.";
   for (int i = 0; i < device_memory_addrs_count; ++i) {
     xla::ShapeTree<xla::MaybeOwningDeviceMemory> tree(
         xla::ShapeUtil::MakeOpaqueShape());
-    *tree.mutable_element({}) = ApiConverter::FromC(device_memory_addrs[i]);
+    const SE_DeviceMemoryBase& addr = device_memory_addrs[i];
+    VLOG(2) << absl::StrFormat("Device memory addr[%i] = {%p, %llu, %llu}", i,
+                               addr.opaque, addr.size, addr.payload);
+    *tree.mutable_element({}) = ApiConverter::FromC(addr);
     xla::ExecutionInput input(std::move(tree));
     arguments.push_back(std::move(input));
   }