Skip to content

Commit

Permalink
PR #13039: [XLA:GPU] Add debug code to VLOG allocation info if GpuExe…
Browse files Browse the repository at this point in the history
…cutable has any memory addresses changed.

Imported from GitHub PR openxla/xla#13039

Helps identify command buffer performance issues.

Copybara import of the project:

--
05c24d4707017755ee75a1d6058928fccfac1f83 by Shawn Wang <shawnw@nvidia.com>:

Add debug code to dump if GpuExecutable has any memory addresses changed

--
5fabaab97573a8059f0fe993514c03332c7e6ae8 by Shawn Wang <shawnw@nvidia.com>:

add comments

Merging this change closes #13039

PiperOrigin-RevId: 636827911
  • Loading branch information
shawnwang18 authored and tensorflower-gardener committed May 24, 2024
1 parent ff2121f commit ec47405
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 2 deletions.
7 changes: 6 additions & 1 deletion third_party/xla/xla/service/buffer_assignment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ static const HloInstruction* GetOutputInstruction(
return nullptr;
}

std::string BufferAllocation::ToString() const {
std::string BufferAllocation::ToShortString() const {
std::string output;
StrAppendFormat(&output, "allocation %d: size %d", index_, size());
if (color() != 0) {
Expand Down Expand Up @@ -366,6 +366,11 @@ std::string BufferAllocation::ToString() const {
StrAppend(&output, ", preallocated-temp");
}
StrAppend(&output, ":\n");
return output;
}

std::string BufferAllocation::ToString() const {
std::string output = ToShortString();
// Dump the assigned buffers ordered by id.
std::vector<const HloValue*> sorted_buffers;
for (const auto& buffer_offset_size : assigned_buffers_) {
Expand Down
1 change: 1 addition & 0 deletions third_party/xla/xla/service/buffer_assignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ class BufferAllocation {
Slice GetSlice(const HloValue& buffer) const;

std::string ToString() const;
std::string ToShortString() const;
BufferAllocationProto ToProto() const;

// Whether the buffer is a parameter to or live out of the entry computation.
Expand Down
31 changes: 30 additions & 1 deletion third_party/xla/xla/service/gpu/gpu_executable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,36 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
GenerateBufferAllocations(arguments, globals, memory_allocator,
device_ordinal));
VLOG(3) << buffer_allocations.ToString();
absl::Span<const BufferAllocation> allocations = GetAllocations();

if (VLOG_IS_ON(5)) {
// Debug code that compares each allocation's current address with the
// address recorded on the previous run, and reports the allocation info if
// the memory address changed. Useful for identifying whether a user's model
// is command buffer perf friendly (no command buffer update cost).
absl::MutexLock lock(&module_handle_mutex_);
if (module_allocations_.find(executor) == module_allocations_.end()) {
std::vector<se::DeviceMemoryBase> allocs_addr;
allocs_addr.reserve(buffer_allocations.size());
for (int i = 0; i < buffer_allocations.size(); i++) {
allocs_addr.push_back(buffer_allocations.GetDeviceAddress(i));
}
module_allocations_[executor] = std::move(allocs_addr);
} else {
for (int i = 0; i < buffer_allocations.size(); i++) {
if (module_allocations_[executor][i].IsSameAs(
buffer_allocations.GetDeviceAddress(i))) {
continue;
}
module_allocations_[executor][i] =
buffer_allocations.GetDeviceAddress(i);
VLOG(5) << "Gpu address changed for module " << module_name_
<< ", allocation info: \n"
<< allocations[i].ToShortString();
}
}
}

std::set<se::DeviceMemoryBase> buffers_in_result;

const bool is_entire_tuple_contents_aliased = [&] {
Expand All @@ -872,7 +902,6 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
return true;
}();

absl::Span<const BufferAllocation> allocations = GetAllocations();
for (auto& p : result.MutableResult()->buffers()) {
const ShapeIndex& index = p.first;
if (!output_info_.contains(index)) {
Expand Down
6 changes: 6 additions & 0 deletions third_party/xla/xla/service/gpu/gpu_executable.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,12 @@ class GpuExecutable : public Executable {
std::unique_ptr<BufferAllocToDeviceMemoryMap>>
module_globals_ ABSL_GUARDED_BY(module_handle_mutex_);

// Caches the previous memory allocations for the current module. This is used
// to help identify whether a user's model has unstable pointers when VLOG(5)
// is turned on.
absl::flat_hash_map<stream_executor::StreamExecutor*,
std::vector<se::DeviceMemoryBase>>
module_allocations_ ABSL_GUARDED_BY(module_handle_mutex_);

std::vector<ConstantInfo> constants_;
const absl::flat_hash_map<ShapeIndex, OutputInfo> output_info_;
// Retains shared ownership of on-device constants that are managed by XLA and
Expand Down

0 comments on commit ec47405

Please sign in to comment.