tensorflow · copybara-service · Apr 23, 2021 · Sep 16, 2020 · Apr 22, 2021 · Apr 23, 2021
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -352,7 +352,7 @@ Status GpuCompiler::OptimizeHloModule(
     fusion.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
-    HloPassPipeline horizontal_fusion("horizontal_fusion");
+    HloPassFix<HloPassPipeline> horizontal_fusion("horizontal_fusion");
     horizontal_fusion.AddPass<GpuHorizontalLoopFusion>();
     horizontal_fusion.AddPass<GpuHorizontalInputFusion>();
     horizontal_fusion.AddPass<HloCSE>(/*is_layout_sensitive=*/true,

diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc
@@ -44,6 +44,25 @@ Shape GetInputShapeForMultiOutputFusion(const HloInstruction& instr) {
   }
 }
 
+// Builds a kInput fusion that fuses `fused`, adds it to the computation that
+// owns `fused`, swaps it in for `fused` (under TF_CHECK_OK) and returns it.
+HloInstruction* MakeInputFusionInstruction(HloInstruction* fused) {
+  HloComputation* const parent = fused->parent();
+  const auto kind = HloInstruction::FusionKind::kInput;
+  HloInstruction* const fusion = parent->AddInstruction(
+      HloInstruction::CreateFusion(fused->shape(), kind, fused));
+  TF_CHECK_OK(parent->ReplaceInstruction(fused, fusion));
+  return fusion;
+}
+
+// Returns 1 for a non-fusion `instr`, else its fused instruction count.
+size_t GetInstrCountOfFusible(const HloInstruction& instr) {
+  if (instr.opcode() == HloOpcode::kFusion) {
+    return instr.fused_instruction_count();
+  }
+  return 1;
+}
+
 class HorizontalInputFusionImpl {
  public:
   explicit HorizontalInputFusionImpl(HloComputation* computation)
@@ -83,7 +102,7 @@ std::vector<HloInstruction*> FindAndSortFusionCandidates(
     // Find out the input fusion instructions whose only consumer is `consumer`.
     // This guarantees that fusing these candidates will never create cycles, as
     // there is no back edge.
-    if (IsReduceInputFusion(*predecessor) &&
+    if (IsInputFusibleReduction(*predecessor) &&
         IsConsumerTheOnlyNonRootUser(*predecessor, *consumer)) {
       if (fusion_instr_set.insert(predecessor).second) {
         fusion_instrs.push_back(predecessor);
@@ -102,8 +121,7 @@ std::vector<HloInstruction*> FindAndSortFusionCandidates(
               }
               // Sort `fusion_instrs` according to instruction counts, because
               // we'd like to fuse together computations of similar sizes.
-              return a->fused_instruction_count() <
-                     b->fused_instruction_count();
+              return GetInstrCountOfFusible(*a) < GetInstrCountOfFusible(*b);
             });
 
   return fusion_instrs;
@@ -118,10 +136,18 @@ StatusOr<bool> HorizontalInputFusionImpl::Run() {
       computation_->MakeInstructionPostOrder();
   for (auto consumer : def_to_use_order) {
     auto candidates = FindAndSortFusionCandidates(consumer);
-    if (candidates.empty()) {
+    if (candidates.size() <= 1) {
       continue;
     }
 
+    // Convert candidates into fusions if needed.
+    for (size_t j = 0; j < candidates.size(); ++j) {
+      if (candidates[j]->opcode() != HloOpcode::kFusion) {
+        candidates[j] = MakeInputFusionInstruction(candidates[j]);
+        changed = true;
+      }
+    }
+
     size_t fusion_anchor_id = 0;
     for (size_t j = 1; j < candidates.size(); ++j) {
       HloInstruction* fusion_anchor = candidates[fusion_anchor_id];

diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc
@@ -211,6 +211,39 @@ TEST_F(HorizontalInputFusionTest, MultiOutputFusionTest) {
   EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie());
 }
 
+TEST_F(HorizontalInputFusionTest, NonfusionInstrs) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+ HloModule NonfusionInstrs
+
+ %add_f16 {
+   %x = f16[] parameter(0)
+   %y = f16[] parameter(1)
+   ROOT %add = f16[] add(%x, %y)
+ }
+
+ ENTRY entry_computation {
+   arg.0 = f16[1024]{0} parameter(0)
+   arg.1 = f16[1024]{0} parameter(1)
+   constant0 = f16[] constant(0)
+   reduce.0 = f16[] reduce(arg.0, constant0), dimensions={0}, to_apply=%add_f16
+   reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16
+   ROOT tuple.0 = (f16[], f16[]) tuple(reduce.0, reduce.1)
+ }
+)").ValueOrDie();
+  // The pass must report a change for the two non-fusion reduces.
+  GpuHorizontalInputFusion pass;
+  EXPECT_TRUE(pass.Run(module.get()).ValueOrDie());
+  // Both root tuple operands are expected to read from a fusion.
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Tuple(op::GetTupleElement(op::Fusion()),
+                              op::GetTupleElement(op::Fusion())));
+  // The producer behind the first operand must contain both reduces.
+  const HloInstruction* multi_output = root->operand(0)->operand(0);
+  ASSERT_TRUE(multi_output->IsMultiOutputFusion());
+  EXPECT_THAT(multi_output->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce()));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla