tensorflow · copybara-service · Apr 23, 2021 · Sep 16, 2020 · Apr 22, 2021 · Apr 23, 2021
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -352,7 +352,7 @@ Status GpuCompiler::OptimizeHloModule(
     fusion.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
-    HloPassPipeline horizontal_fusion("horizontal_fusion");
+    HloPassFix<HloPassPipeline> horizontal_fusion("horizontal_fusion");
     horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
     horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
     horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -541,5 +541,35 @@ bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
   });
 }
 
+size_t GetInstrCountOfFusible(const HloInstruction& instr) {
+  // A fusion reports its fused instruction count; anything else counts as 1.
+  if (instr.opcode() == HloOpcode::kFusion) {
+    return instr.fused_instruction_count();
+  }
+  return 1;
+}
+
+absl::InlinedVector<HloInstruction*, 2> GetOutputsOfFusible(
+    const HloInstruction& instr) {
+  // A non-fusion instruction is its own single output.
+  if (instr.opcode() != HloOpcode::kFusion) {
+    return {const_cast<HloInstruction*>(&instr)};
+  }
+  // For a fusion: the operands of a tuple root, otherwise the root itself.
+  HloInstruction* fused_root = instr.fused_expression_root();
+  if (fused_root->opcode() == HloOpcode::kTuple) {
+    return fused_root->operands();
+  }
+  return {fused_root};
+}
+
+size_t GetOutputSizeOfFusible(const HloInstruction& instr) {
+  // A multi-output fusion has one output per tuple element of its root shape.
+  if (instr.IsMultiOutputFusion()) {
+    return ShapeUtil::TupleElementCount(instr.fused_expression_root()->shape());
+  }
+  return 1;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -109,6 +109,17 @@ HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& producer,
 bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr,
                                   const HloInstruction& consumer);
 
+// Returns the number of instructions in the fusible `instr`. If `instr` is not
+// a fusion instruction, 1 is returned.
+size_t GetInstrCountOfFusible(const HloInstruction& instr);
+
+// Returns the outputs of the fusible `instr`.
+absl::InlinedVector<HloInstruction*, 2> GetOutputsOfFusible(
+    const HloInstruction& instr);
+
+// Returns the number of outputs of the fusible `instr`.
+size_t GetOutputSizeOfFusible(const HloInstruction& instr);
+
 }  // namespace gpu
 }  // namespace xla
 

diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc
@@ -78,12 +78,12 @@ std::vector<HloInstruction*> FindAndSortFusionCandidates(
     HloInstruction* consumer) {
   absl::flat_hash_set<HloInstruction*> fusion_instr_set;
   std::vector<HloInstruction*> fusion_instrs;
-  for (auto opnd : consumer->operands()) {
+  for (HloInstruction* opnd : consumer->operands()) {
     HloInstruction* predecessor = opnd->LatestNonGteAncestor();
     // Find out the input fusion instructions whose only consumer is `consumer`.
     // This guarantees that fusing these candidates will never create cycles, as
     // there is no back edge.
-    if (IsReduceInputFusion(*predecessor) &&
+    if (IsInputFusibleReduction(*predecessor) &&
         IsConsumerTheOnlyNonRootUser(*predecessor, *consumer)) {
       if (fusion_instr_set.insert(predecessor).second) {
         fusion_instrs.push_back(predecessor);
@@ -102,8 +102,7 @@ std::vector<HloInstruction*> FindAndSortFusionCandidates(
               }
               // Sort `fusion_instrs` according to instruction counts, because
               // we'd like to fuse together computations of similar sizes.
-              return a->fused_instruction_count() <
-                     b->fused_instruction_count();
+              return GetInstrCountOfFusible(*a) < GetInstrCountOfFusible(*b);
             });
 
   return fusion_instrs;
@@ -116,12 +115,24 @@ StatusOr<bool> HorizontalInputFusionImpl::Run() {
   // Using def-to-use order is sound since we do not modify users.
   std::vector<HloInstruction*> def_to_use_order =
       computation_->MakeInstructionPostOrder();
-  for (auto consumer : def_to_use_order) {
+  for (HloInstruction* consumer : def_to_use_order) {
     auto candidates = FindAndSortFusionCandidates(consumer);
-    if (candidates.empty()) {
+    if (candidates.size() <= 1) {
       continue;
     }
 
+    // Convert candidates into fusions if needed.
+    for (size_t j = 0; j < candidates.size(); ++j) {
+      if (candidates[j]->opcode() != HloOpcode::kFusion) {
+        TF_ASSIGN_OR_RETURN(
+            HloInstruction * fusion_instr,
+            MakeFusionInstruction(candidates[j],
+                                  HloInstruction::FusionKind::kInput));
+        candidates[j] = fusion_instr;
+        changed = true;
+      }
+    }
+
     size_t fusion_anchor_id = 0;
     for (size_t j = 1; j < candidates.size(); ++j) {
       HloInstruction* fusion_anchor = candidates[fusion_anchor_id];
@@ -155,7 +166,7 @@ StatusOr<bool> GpuHorizontalInputFusion::RunOnComputation(
 StatusOr<bool> GpuHorizontalInputFusion::Run(HloModule* module) {
   bool changed = false;
   VLOG(2) << "Run horizontal input fusion.";
-  for (auto* comp : module->MakeNonfusionComputations()) {
+  for (HloComputation* comp : module->MakeNonfusionComputations()) {
     TF_ASSIGN_OR_RETURN(changed, RunOnComputation(comp));
   }
 

diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
@@ -211,6 +211,39 @@ TEST_F(HorizontalInputFusionTest, MultiOutputFusionTest) {
   EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie());
 }
 
+TEST_F(HorizontalInputFusionTest, NonfusionInstrs) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+ HloModule NonfusionInstrs
+
+ %add_f16 {
+   %x = f16[] parameter(0)
+   %y = f16[] parameter(1)
+   ROOT %add = f16[] add(%x, %y)
+ }
+
+ ENTRY entry_computation {
+   arg.0 = f16[1024]{0} parameter(0)
+   arg.1 = f16[1024]{0} parameter(1)
+   constant0 = f16[] constant(0)
+   reduce.0 = f16[] reduce(arg.0, constant0), dimensions={0}, to_apply=%add_f16
+   reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+   ROOT tuple.0 = (f16[], f16[]) tuple(reduce.0, reduce.1)
+ }
+)").ValueOrDie();
+
+  // Two unfused reduces share one consumer, so the pass must report a change.
+  EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie());
+  // Both operands of the entry root must now be GTEs of a fusion.
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Tuple(op::GetTupleElement(op::Fusion()),
+                              op::GetTupleElement(op::Fusion())));
+  // That fusion must be multi-output with both reduces under its tuple root.
+  const HloInstruction* merged = root->operand(0)->operand(0);
+  ASSERT_TRUE(merged->IsMultiOutputFusion());
+  EXPECT_THAT(merged->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce()));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla