[XLA:GPU] Use EstimateRunTimeData::ToString instead of VLogResult.

The same logic to print roughly the same parameters is duplicated between two places. PiperOrigin-RevId: 636197839
tensorflow · May 22, 2024 · 000a07c · 000a07c
1 parent 850c889
commit 000a07c
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 41 deletions.
diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
@@ -175,19 +175,29 @@ GpuPerformanceModelWithIndexingAnalysis::EstimateRunTimeForFusion(
       compute_time, memory_access_time,
       GpuPerformanceModelOptions::PriorityFusion());
 
-  VLogResult(flops, bytes_read, bytes_written, num_threads, compute_time,
-             read_time, write_time, exec_time);
-
-  return EstimateRunTimeData{flops,      bytes_written, num_threads, read_time,
-                             write_time, compute_time,  exec_time};
+  EstimateRunTimeData runtime_data = {flops,        bytes_read, bytes_written,
+                                      num_threads,  read_time,  write_time,
+                                      compute_time, exec_time};
+  VLOG(3) << "Runtime data for HLO fusion: " << fusion_adaptor.ToString()
+          << "\n"
+          << runtime_data.ToString();
+
+  return runtime_data;
 }
 
 EstimateRunTimeData
 GpuPerformanceModelWithIndexingAnalysis::EstimateRunTimeForInstruction(
     const HloInstruction* producer) {
   // Stand-alone bitcast is always no-op during runtime.
   if (producer->opcode() == HloOpcode::kBitcast) {
-    return {0, 0, 0, absl::ZeroDuration(), absl::ZeroDuration()};
+    return EstimateRunTimeData{/*flops=*/0,
+                               /*bytes_read=*/0,
+                               /*bytes_written=*/0,
+                               /*num_threads=*/0,
+                               /*read_time=*/absl::ZeroDuration(),
+                               /*write_time=*/absl::ZeroDuration(),
+                               /*compute_time=*/absl::ZeroDuration(),
+                               /*exec_time=*/absl::ZeroDuration()};
   }
 
   auto fusion_analysis = AnalyzeFusion(*producer, *device_info_);

diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc
@@ -95,12 +95,9 @@ GpuPerformanceModel::EstimateRunTimeForInstruction(
   absl::Duration exec_time = CombineComputeAndMemoryAccessTime(
       compute_time, read_time + write_time, config);
 
-  VLogResult(flops, bytes_read, bytes_written, num_threads, compute_time,
-             read_time, write_time, exec_time);
-
-  EstimateRunTimeData runtime_data = {flops,     bytes_written, num_threads,
-                                      read_time, write_time,    compute_time,
-                                      exec_time};
+  EstimateRunTimeData runtime_data = {flops,        bytes_read, bytes_written,
+                                      num_threads,  read_time,  write_time,
+                                      compute_time, exec_time};
   VLOG(3) << "Runtime data for HLO: " << instr->name() << "\n"
           << runtime_data.ToString();
   return runtime_data;
@@ -240,8 +237,18 @@ absl::Duration GpuPerformanceModel::EstimateUnfusedExecTime(
   auto exec_time = CombineComputeAndMemoryAccessTime(
       compute_time, read_time + consumer_runtime.write_time, config);
 
-  VLogResult(flops, bytes_read, consumer_runtime.bytes_written, num_threads,
-             compute_time, read_time, consumer_runtime.write_time, exec_time);
+  VLOG(3) << "Runtime data for producer-consumer fusion:\n"
+          << " producer: " << producer->name() << "\n"
+          << " consumer: " << consumer->name() << "\n"
+          << EstimateRunTimeData{/*flops=*/flops,
+                                 /*bytes_read=*/bytes_read,
+                                 /*bytes_written=*/consumer_runtime.bytes_written,
+                                 /*num_threads=*/num_threads,
+                                 /*read_time=*/read_time,
+                                 /*write_time=*/consumer_runtime.write_time,
+                                 /*compute_time=*/compute_time,
+                                 /*exec_time=*/exec_time}
+                 .ToString();
 
   return exec_time;
 }

diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc
@@ -411,22 +411,5 @@ void GpuPerformanceModelBase::VLogOperandRead(const HloInstruction* operand,
           << ", n_bytes_net: " << n_bytes_net << ", coalesced: " << coalesced;
 }
 
-/*static*/
-void GpuPerformanceModelBase::VLogResult(
-    int64_t flops, int64_t bytes_read, int64_t bytes_written,
-    int64_t num_threads, absl::Duration compute_time, absl::Duration read_time,
-    absl::Duration write_time, absl::Duration exec_time) {
-  if (VLOG_IS_ON(8)) {
-    LOG(INFO) << "FLOPs: " << flops;
-    LOG(INFO) << "Bytes read: " << bytes_read;
-    LOG(INFO) << "Bytes written: " << bytes_written;
-    LOG(INFO) << "Num threads: " << num_threads;
-    LOG(INFO) << "Compute time: " << compute_time;
-    LOG(INFO) << "Input read time: " << read_time;
-    LOG(INFO) << "Output write time: " << write_time;
-    LOG(INFO) << "Exec time: " << exec_time;
-  }
-}
-
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h
@@ -18,14 +18,14 @@ limitations under the License.
 
 #include <cstdint>
 #include <optional>
+#include <string>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
-#include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/model/fusion_analysis_cache.h"
 #include "xla/service/gpu/model/gpu_hlo_cost_analysis.h"
@@ -36,6 +36,7 @@ namespace gpu {
 
 struct EstimateRunTimeData {
   int64_t flops;
+  int64_t bytes_read;
   int64_t bytes_written;
   int64_t num_threads;
   absl::Duration read_time;
@@ -47,16 +48,17 @@ struct EstimateRunTimeData {
     return absl::StrFormat(
         "EstimateRunTimeData{\n"
         " flops: %d\n"
+        " bytes_read: %d\n"
         " bytes_written: %d\n"
         " num_threads: %d\n"
         " read_time: %s\n"
         " write_time: %s\n"
         " compute_time: %s\n"
         " exec_time: %s\n"
         "}",
-        flops, bytes_written, num_threads, absl::FormatDuration(read_time),
-        absl::FormatDuration(write_time), absl::FormatDuration(compute_time),
-        absl::FormatDuration(exec_time));
+        flops, bytes_read, bytes_written, num_threads,
+        absl::FormatDuration(read_time), absl::FormatDuration(write_time),
+        absl::FormatDuration(compute_time), absl::FormatDuration(exec_time));
   }
 };
 
@@ -226,12 +228,6 @@ class GpuPerformanceModelBase {
   static void VLogOperandRead(const HloInstruction* operand,
                               int64_t n_bytes_total, int64_t n_bytes_net,
                               bool coalesced);
-
-  // Logs estimate results of the performance model if VLOG is enabled.
-  static void VLogResult(int64_t flops, int64_t bytes_read,
-                         int64_t bytes_written, int64_t num_threads,
-                         absl::Duration compute_time, absl::Duration read_time,
-                         absl::Duration write_time, absl::Duration exec_time);
 };
 
 }  // namespace gpu