PR #13039: [XLA:GPU] Add debug code to VLOG allocation info if GpuExecutable has any memory addresses changed.

Imported from GitHub PR openxla/xla#13039. Helps to identify command buffer perf issues. Copybara import of the project: -- 05c24d4707017755ee75a1d6058928fccfac1f83 by Shawn Wang <shawnw@nvidia.com>: Add debug code to dump if GpuExecutable has any memory addresses changed -- 5fabaab97573a8059f0fe993514c03332c7e6ae8 by Shawn Wang <shawnw@nvidia.com>: add comments. Merging this change closes #13039. PiperOrigin-RevId: 636827911
tensorflow · May 24, 2024 · ec47405 · ec47405
1 parent ff2121f
commit ec47405
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 2 deletions.
diff --git a/third_party/xla/xla/service/buffer_assignment.cc b/third_party/xla/xla/service/buffer_assignment.cc
@@ -336,7 +336,7 @@ static const HloInstruction* GetOutputInstruction(
   return nullptr;
 }
 
-std::string BufferAllocation::ToString() const {
+std::string BufferAllocation::ToShortString() const {
   std::string output;
   StrAppendFormat(&output, "allocation %d: size %d", index_, size());
   if (color() != 0) {
@@ -366,6 +366,11 @@ std::string BufferAllocation::ToString() const {
     StrAppend(&output, ", preallocated-temp");
   }
   StrAppend(&output, ":\n");
+  return output;
+}
+
+std::string BufferAllocation::ToString() const {
+  std::string output = ToShortString();
   // Dump the assigned buffers ordered by id.
   std::vector<const HloValue*> sorted_buffers;
   for (const auto& buffer_offset_size : assigned_buffers_) {

diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
@@ -216,6 +216,7 @@ class BufferAllocation {
   Slice GetSlice(const HloValue& buffer) const;
 
   std::string ToString() const;
+  std::string ToShortString() const;
   BufferAllocationProto ToProto() const;
 
   // Whether the buffer is a parameter to or live out of the entry computation.

diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -857,6 +857,36 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
       GenerateBufferAllocations(arguments, globals, memory_allocator,
                                 device_ordinal));
   VLOG(3) << buffer_allocations.ToString();
+  absl::Span<const BufferAllocation> allocations = GetAllocations();
+
+  if (VLOG_IS_ON(5)) {
+    // Debug code that compares each allocation's current address with the
+    // address from the previous run and reports the allocation info if the
+    // memory address changed. Useful for identifying whether a user's model is
+    // command-buffer perf friendly (no command buffer update cost).
+    absl::MutexLock lock(&module_handle_mutex_);
+    if (module_allocations_.find(executor) == module_allocations_.end()) {
+      std::vector<se::DeviceMemoryBase> allocs_addr;
+      allocs_addr.reserve(buffer_allocations.size());
+      for (int i = 0; i < buffer_allocations.size(); i++) {
+        allocs_addr.push_back(buffer_allocations.GetDeviceAddress(i));
+      }
+      module_allocations_[executor] = std::move(allocs_addr);
+    } else {
+      for (int i = 0; i < buffer_allocations.size(); i++) {
+        if (module_allocations_[executor][i].IsSameAs(
+                buffer_allocations.GetDeviceAddress(i))) {
+          continue;
+        }
+        module_allocations_[executor][i] =
+            buffer_allocations.GetDeviceAddress(i);
+        VLOG(5) << "Gpu address changed for module " << module_name_
+                << ", allocation info: \n"
+                << allocations[i].ToShortString();
+      }
+    }
+  }
+
   std::set<se::DeviceMemoryBase> buffers_in_result;
 
   const bool is_entire_tuple_contents_aliased = [&] {
@@ -872,7 +902,6 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
     return true;
   }();
 
-  absl::Span<const BufferAllocation> allocations = GetAllocations();
   for (auto& p : result.MutableResult()->buffers()) {
     const ShapeIndex& index = p.first;
     if (!output_info_.contains(index)) {

diff --git a/third_party/xla/xla/service/gpu/gpu_executable.h b/third_party/xla/xla/service/gpu/gpu_executable.h
@@ -291,6 +291,12 @@ class GpuExecutable : public Executable {
                       std::unique_ptr<BufferAllocToDeviceMemoryMap>>
       module_globals_ ABSL_GUARDED_BY(module_handle_mutex_);
 
+  // Caches the previous run's memory allocations for the current module. Used
+  // to help identify whether a user's model has unstable pointers when VLOG(5)
+  // is turned on.
+  absl::flat_hash_map<stream_executor::StreamExecutor*,
+                      std::vector<se::DeviceMemoryBase>>
+      module_allocations_ ABSL_GUARDED_BY(module_handle_mutex_);
+
   std::vector<ConstantInfo> constants_;
   const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;
   // Retains shared ownership of on-device constants that are managed by XLA and