diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 4103605df99450..5e7c03bc2cee56 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -195,20 +195,12 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
     return Status::OK();
   }
 
-  // Skip multiple output fusion. It's not yet supported.
-  if (fusion->IsMultiOutputFusion()) {
-    VLOG(3) << "Not merging " << fusion->name() << ": Is multi-output fusion.";
-    ++num_fail_not_loop_fusion_;
-    return Status::OK();
-  }
   // Skip 'fusion' instruction if we cannot merge into all of its users.
   // Merging into all users enables the removal of 'fusion' from the
   // computation.
   if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
-               (user->IsLoopFusion() ||
-                (IsReduceInputFusion(*user) &&
-                 LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
+               IsProducerConsumerFusible(*fusion, *user);
       })) {
     VLOG(3) << "Not merging " << fusion->name()
             << ": Some of its users are not loop/input fusion kernels.";
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index d5b351f69e3dea..896abd4080ab8a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -178,7 +178,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
          instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
          (instr.opcode() == HloOpcode::kFusion &&
           instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
-         instr.opcode() == HloOpcode::kGather ||
+         instr.opcode() == HloOpcode::kGather ||
          instr.opcode() == HloOpcode::kIota ||
          instr.opcode() == HloOpcode::kPad ||
          (instr.opcode() == HloOpcode::kReduce &&
@@ -187,6 +187,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
          instr.opcode() == HloOpcode::kReshape ||
          instr.opcode() == HloOpcode::kReverse ||
          instr.opcode() == HloOpcode::kSlice ||
+         instr.opcode() == HloOpcode::kConstant ||
          instr.opcode() == HloOpcode::kTranspose);
 }
 
@@ -194,6 +195,127 @@ bool IsFusible(const HloInstruction& instr) {
   return IsInputFusible(instr) || IsLoopFusible(instr);
 }
 
+bool IsProducerConsumerFusible(const HloInstruction& producer,
+                               const HloInstruction& consumer) {
+  if (!IsLoopFusible(producer) || !IsFusible(consumer)) {
+    return false;
+  }
+
+  // Skip multiple output fusion. It's not yet supported.
+  if (producer.IsMultiOutputFusion()) {
+    return false;
+  }
+
+  // Do not fuse into reduce input fusions if the resulting kernel would suffer
+  // from poor data locality (due to unfriendly input layouts).
+  if (IsInputFusibleReduction(consumer) &&
+      !LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+    return false;
+  }
+
+  // We can't fuse library calls, so if a user of such an op could become a
+  // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
+  // further rationale.
+  if (producer.CouldBeBitcast() &&
+      ImplementedAsLibraryCall(*producer.operand(0))) {
+    return false;
+  }
+
+  // Fuse scalar constants into loop fusion nodes. This reduces the number of
+  // parameters and makes matching scalar broadcasts easier.
+  //
+  // Don't fuse other constants: Unfused constants in GPU land can be
+  // represented as an external constant (i.e. not emitted in LLVM IR / PTX),
+  // but fused constants are handled by shared CPU/GPU code and always emitted
+  // in the IR/PTX. The external constant representation makes for faster
+  // compiles and significantly smaller assembly code.
+  if (producer.opcode() == HloOpcode::kConstant) {
+    return ShapeUtil::IsEffectiveScalar(producer.shape()) &&
+           consumer.opcode() == HloOpcode::kFusion;
+  }
+
+  return true;
+}
+
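A quick illustration of what the kConstant clause above accepts: ShapeUtil::IsEffectiveScalar is true not only for rank-0 shapes but for any array shape whose dimensions are all 1. This is a standalone sketch, not part of the patch; the shapes are made up:

  // Which constants the scalar-constant rule lets fuse into a consumer fusion.
  Shape s0 = ShapeUtil::MakeShape(F32, {});      // f32[]    -> effective scalar
  Shape s1 = ShapeUtil::MakeShape(F32, {1, 1});  // f32[1,1] -> effective scalar
  Shape s2 = ShapeUtil::MakeShape(F32, {16});    // f32[16]  -> not a scalar
  CHECK(ShapeUtil::IsEffectiveScalar(s0));
  CHECK(ShapeUtil::IsEffectiveScalar(s1));
  CHECK(!ShapeUtil::IsEffectiveScalar(s2));

An f32[16] constant therefore stays unfused and keeps the cheaper external-constant representation, while f32[] and f32[1,1] constants may be folded into a consuming fusion node.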
+bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
+                                          const HloInstruction& consumer) {
+  if (!IsFusibleAsMultiOutputFusionRoot(producer) ||
+      !IsFusibleAsMultiOutputFusionRoot(consumer)) {
+    return false;
+  }
+
+  if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
+    return false;
+  }
+
+  if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+    return false;
+  }
+
+  return true;
+}
+
+// This function limits the maximum number of operands to a fusion.
+//
+// There's a cap on how many parameters we can pass to a CUDA kernel, but
+// exactly what that limit is hazy, as it depends on (among other things) how
+// much GPU constant memory is in use for other purposes.
+//
+// Moreover, we don't even know at the point that we're running fusion how many
+// arguments the CUDA kernel for a fusion node will have: It depends on buffer
+// assignment, where we will decide which of the fusion's operands live in
+// XLA's big temp buffer versus in other allocations.
+//
+// As a heuristic, we simply cap the number of fusion operands plus outputs at
+// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
+// parameters to the kernel, working around the correctness problem.
+//
+// This limit is also often good for performance. In a fusion with many
+// operands, each GPU thread likely has to do a lot of work, and so possibly
+// uses a lot of registers, thus limiting occupancy.
+bool FusionWouldBeTooLarge(const HloInstruction& instr1,
+                           const HloInstruction& instr2) {
+  // Compute the number of outputs of the (possibly multi-output) fusion node
+  // we're considering creating.
+  //
+  // This isn't precise; we may be off by one if
+  //  - We're creating a multi-output fusion out of two non-MOFs. Creating a
+  //    MOF adds a new buffer, namely, the tuple buffer.
+  //  - We're merging two MOFs. In this case, we should count the tuple buffer
+  //    only once.
+  //  - WLOG there's an edge from `instr1` to `instr2` and `instr2` is the only
+  //    consumer of `instr1`. In this case the result of `instr1` is not part
+  //    of the output of the fusion.
+  //
+  // But because this is a heuristic and our limit
+  // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
+  // big difference), we ignore this small inaccuracy in favor of simplicity.
+  int64 num_output_buffers = ShapeUtil::SubshapeCount(instr1.shape()) +
+                             ShapeUtil::SubshapeCount(instr2.shape());
+
+  // The new fusion will have no more operands and outputs than
+  //   producer_operands + consumer_operands - 1 + num_output_buffers
+  // (minus one because we may be fusing a producer->consumer edge between
+  // `instr1` and `instr2`).
+  //
+  // This fact may be enough to let us avoid having to compute the true total
+  // number of operands, which can be expensive.
+  if (instr1.operand_count() + instr2.operand_count() - 1 +
+          num_output_buffers <=
+      kMaxOperandsAndOutputsPerFusion) {
+    return false;
+  }
+
+  // Compute the precise number of operands to the new fusion.
+  absl::flat_hash_set<const HloInstruction*> operands(
+      instr1.operands().begin(), instr1.operands().end());
+  operands.insert(instr2.operands().begin(), instr2.operands().end());
+  // If there's an edge between `instr1` and `instr2`, don't count it: We're
+  // fusing that producer -> consumer relationship.
+  operands.erase(&instr1);
+  operands.erase(&instr2);
+  return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
+}
+
 bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr) {
   // We can fuse reduces and loop fusions. Elementwise instructions can be fused
   // with any other instruction.
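To make the heuristic above concrete, here is a hypothetical worked example (all numbers invented):

  // instr1: loop fusion, 40 operands, one array output -> SubshapeCount = 1
  // instr2: loop fusion, 30 operands, one array output -> SubshapeCount = 1
  // num_output_buffers = 1 + 1 = 2
  // Cheap bound: 40 + 30 - 1 + 2 = 71 > 64, so compute the exact operand set.
  // If the two fusions share 10 operands and instr1 feeds instr2, the set has
  // 40 + 30 - 10 - 1 = 59 entries, and 59 + 2 = 61 <= 64: not too large.

Had instr2 instead produced a two-element tuple, ShapeUtil::SubshapeCount would contribute 3 for it (one tuple buffer plus two leaf buffers), which is exactly the tuple-buffer effect described in the off-by-one comment.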
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index a4501fd31dc5e4..073896530008b3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -24,6 +24,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
+
 // Whether 'instr' can occur inside fusions, i.e. whether it is a candidate
 // for being fused. Note that further restrictions apply, e.g. Scatter must
 // be the root of an input fusion.
@@ -59,6 +61,11 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
 // is either an unfused scatter op or a scatter input fusion.
 bool IsInputFusibleScatter(const HloInstruction& instr);
 
+// Determines whether the combination of `instr1` and `instr2` into a (possibly
+// multi-output) fusion would be "too large" -- i.e., have more operands and
+// outputs than is allowed.
+bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2);
+
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
 // This function works for both, sibling and producer-consumer multi-output
@@ -69,6 +76,17 @@ bool IsInputFusibleScatter(const HloInstruction& instr);
 bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
                                           const HloInstruction& instr2);
 
+// Whether the instructions are compatible for producer-consumer fusion,
+// i.e. whether the producer and the consumer are loop/input fusible and
+// neither is a library call.
+bool IsProducerConsumerFusible(const HloInstruction& producer,
+                               const HloInstruction& consumer);
+
+// Whether the instructions are producer-consumer fusible with multiple
+// outputs. That is, the root tuple of the multi-output fusion will contain
+// the results of both the producer and the consumer.
+bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
+                                          const HloInstruction& consumer);
 // Whether `instr` is a candidate for sibling fusion or as a consumer in
 // a producer-consumer multi-output fusion.
 bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
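A sketch of how a fusion pass might combine the predicates declared above; the caller code and the producer/consumer pointers are illustrative assumptions, not part of this patch:

  // Prefer regular producer-consumer fusion, fall back to multi-output
  // fusion, and respect the operand/output cap in both cases.
  if (FusionWouldBeTooLarge(*producer, *consumer)) {
    // Too many parameters and outputs for a single kernel; leave unfused.
  } else if (IsProducerConsumerFusible(*producer, *consumer)) {
    // Merge `producer` into `consumer` as a regular loop/input fusion.
  } else if (IsProducerConsumerMultiOutputFusible(*producer, *consumer)) {
    // Emit a multi-output fusion whose root tuple carries both results.
  }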
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index cee678e290253b..ea60f641969bda 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -643,5 +643,264 @@ TEST_F(GpuFusibleTest,
   EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
 }
 
+TEST_F(GpuFusibleTest, IsFusibleAsMultiOutputFusionRoot) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_TRUE(IsFusibleAsMultiOutputFusionRoot(*root));
+}
+
+TEST_F(GpuFusibleTest, ScatterIsNotFusibleAsMultiOutputFusionRoot) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY Scatter {
+      p0 = s32[3,3] parameter(0)
+      operand = s32[3,3] add(p0, p0)
+      p1 = s32[2] parameter(1)
+      indices = s32[2] add(p1, p1)
+      p2 = s32[2,3] parameter(2)
+      updates = s32[2,3] add(p2, p2)
+      ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=add,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* scatter_inst =
+      module->entry_computation()->root_instruction();
+  EXPECT_FALSE(IsFusibleAsMultiOutputFusionRoot(*scatter_inst));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionElementwiseAndReduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      exp = f32[32,32,32]{2,1,0} exponential(p0)
+      reduce = f32[32,32]{1,0} reduce(exp, c0), dimensions={2},
+        to_apply=scalar_add
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionLoopFusionAndReduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_add {
+      p0.1 = f32[32,32,32]{2,1,0} parameter(0)
+      p1.1 = f32[32,32,32]{2,1,0} parameter(1)
+      ROOT add = f32[32,32,32]{2,1,0} add(p0.1, p1.1)
+    }
+
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      p1 = f32[32,32,32]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      add = f32[32,32,32]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_add
+      reduce = f32[32,32]{1,0} reduce(add, c0), dimensions={2},
+        to_apply=scalar_add
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(reduce, add)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionLoopFusionAndReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_select {
+      p1.1 = f32[32,32,32]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      broadcast = f32[32,32,32]{2,1,0} broadcast(f32[] c0), dimensions={}
+      greater-than = pred[32,32,32]{2,1,0} compare(f32[32,32,32]{2,1,0} p1.1,
+        f32[32,32,32]{2,1,0} broadcast), direction=GT
+      p0.1 = f32[32,32,32]{2,1,0} parameter(0)
+      ROOT select = f32[32,32,32]{2,1,0} select(pred[32,32,32]{2,1,0}
+        greater-than, f32[32,32,32]{2,1,0} p0.1, f32[32,32,32]{2,1,0} broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f32[32,32,32]{2,1,0} parameter(0)
+      c1 = f32[] constant(0)
+      r1 = f32[32,32]{1,0} reduce(p0.2, c1), dimensions={2},
+        to_apply=scalar_add
+      mul = f32[32,32,32]{2,1,0} multiply(p0.2, p0.2)
+      r2 = f32[32,32]{1,0} reduce(mul, c1), dimensions={2},
+        to_apply=scalar_add
+      ROOT tuple = (f32[32,32]{1,0}, f32[32,32]{1,0}) tuple(r1, r2)
+    }
+
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      p1 = f32[32,32,32]{2,1,0} parameter(1)
+      select = f32[32,32,32]{2,1,0} fusion(p0, p1), kind=kLoop,
+        calls=fused_select
+      fusion = (f32[32,32]{1,0}, f32[32,32]{1,0}) fusion(select), kind=kInput,
+        calls=fused_reduce
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(fusion, select)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionDoNotFuseLoopReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_element_wise {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      ROOT root = f32[2,2,2]{2,1,0} add(p0.1, p1.1)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2,
+        f32[2,2,2]{2,1,0} p0.2)
+      broadcast = f32[2,2,2,2]{3,2,1,0} broadcast(mul), dimensions={3,2,1}
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2,2]{3,2,1,0} broadcast,
+        f32[] c1), dimensions={1,3}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop,
+        calls=fused_element_wise
+      fusion = f32[2,2]{1,0} fusion(element_wise), kind=kLoop,
+        calls=fused_reduce
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  // The consumer is a kLoop fusion with a reduce root, so it is not fusible
+  // as a multi-output fusion root.
+  EXPECT_FALSE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionReduceUnfriendlyLoopFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    mixed_input_layouts_computation {
+      p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1)
+      copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1)
+      c0 = f16[] constant(0)
+      broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={}
+      greater-than = pred[128,1024,32,32]{1,3,2,0} compare(copy, broadcast),
+        direction=GT
+      ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1,
+        broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2)
+      c0.2 = f32[] constant(0)
+      ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3},
+        to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0)
+      p1 = f16[128,1024,32,32]{1,3,2,0} parameter(1)
+      loop_fusion = f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop,
+        calls=mixed_input_layouts_computation
+      reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput,
+        calls=fused_reduce
+      ROOT root = (f32[1024]{0}, f16[128,1024,32,32]{1,3,2,0})
+        tuple(reduce_fusion, loop_fusion)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_FALSE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, NonscalarConstantsNotFused) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY BroadcastIntoReduce {
+      constant = f32[16] constant({0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})
+      broadcast = f32[16,16,16,16]{3,2,1,0} broadcast(constant), dimensions={0}
+      constant.1 = f32[] constant(0)
+      reduce = f32[] reduce(broadcast, constant.1), dimensions={0,1,2,3},
+        to_apply=add
+      ROOT root = (f32[], f32[], f32[16,16,16,16], f32[16])
+        tuple(reduce, constant.1, broadcast, constant)
+    })")
+                    .ValueOrDie();
+  // Do not fuse if the producer is a non-scalar constant or if the consumer
+  // is not a fusion node.
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  const HloInstruction* consumer2 = root->operand(2);
+  const HloInstruction* producer2 = root->operand(3);
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer, *consumer));
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer2, *consumer2));
+}
+
+TEST_F(GpuFusibleTest, DoNotFuseLayoutChangingOpWithReduce) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY entry {
+      p0 = f32[16,16,16,16]{3,2,1,0} parameter(0)
+      copy = f32[16,16,16,16]{0,1,2,3} copy(p0)
+      constant.1 = f32[] constant(0)
+      ROOT reduce = f32[16] reduce(copy, constant.1), dimensions={0,1,2},
+        to_apply=add
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* consumer =
+      module->entry_computation()->root_instruction();
+  const HloInstruction* producer = consumer->operand(0);
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, FuseLayoutChangingOpWithElementwise) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+    ENTRY entry {
+      p0 = f32[16,16,16,16]{3,2,1,0} parameter(0)
+      copy = f32[16,16,16,16]{0,1,2,3} copy(p0)
+      ROOT add = f32[16,16,16,16]{0,1,2,3} add(copy, copy)
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* consumer =
+      module->entry_computation()->root_instruction();
+  const HloInstruction* producer = consumer->operand(0);
+  EXPECT_TRUE(IsProducerConsumerFusible(*producer, *consumer));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 54cab21ab4c240..0083e3e87a6d8b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -53,67 +53,6 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   }
 }
 
-// This function limits the maximum number of operands to a fusion.
-//
-// There's a cap on how many parameters we can pass to a CUDA kernel, but
-// exactly what that limit is hazy, as it depends on (among other things) how
-// much GPU constant memory is in use for other purposes.
-//
-// Moreover, we don't even know at the point that we're running fusion how many
-// arguments the CUDA kernel for a fusion node will have: It depends on buffer
-// assignment, where we will decide which of the fusion's operands live in XLA's
-// big temp buffer versus in other allocations.
-//
-// As a heuristic, we simply cap the number of fusion operands plus outputs at
-// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
-// parameters to the kernel, working around the correctness problem.
-//
-// This limit is also often good for performance. In a fusion with many
-// operands, each GPU thread likely has to do a lot of work, and so possibly
-// uses a lot of registers, thus limiting occupancy.
-/*static*/ bool GpuInstructionFusion::FusionWouldBeTooLarge(
-    const HloInstruction* a, const HloInstruction* b) {
-  // Compute the number of outputs of the (possibly multi-output) fusion node
-  // we're considering creating.
-  //
-  // This isn't precise; we may be off by one if
-  //  - We're creating a multi-output fusion out of two non-MOFs. Creating a
-  //    MOF adds a new buffer, namely, the tuple buffer.
-  //  - We're merging two MOFs. In this case, we should count the tuple buffer
-  //    only once.
-  //  - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
-  //    `a`. In this case the result of `a` is not part of the output of the
-  //    fusion.
-  //
-  // But because this is a heuristic and our limit
-  // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
-  // big difference), we ignore this small inaccuracy in favor of simplicity.
-  int64 num_output_buffers = ShapeUtil::SubshapeCount(a->shape()) +
-                             ShapeUtil::SubshapeCount(b->shape());
-
-  // The new fusion will have no more operands and outputs than
-  //   producer_operands + consumer_operands - 1 + num_output_buffers
-  // (minus one because we may be fusing a producer->consumer edge between `a`
-  // and `b`).
-  //
-  // This fact may be enough to let us avoid having to compute the true total
-  // number of operands, which can be expensive.
-  if (a->operand_count() + b->operand_count() - 1 + num_output_buffers <=
-      kMaxOperandsAndOutputsPerFusion) {
-    return false;
-  }
-
-  // Compute the precise number of operands to the new fusion.
-  absl::flat_hash_set<const HloInstruction*> operands(a->operands().begin(),
-                                                      a->operands().end());
-  operands.insert(b->operands().begin(), b->operands().end());
-  // If there's an edge between `a` and `b`, don't count it: We're fusing that
-  // producer -> consumer relationship.
-  operands.erase(a);
-  operands.erase(b);
-  return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
-}
-
 bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
                                                        int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
@@ -184,42 +123,6 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
     return false;
   }
 
-  // Other output fusions are not currently supported on GPUs.
-  if (producer->opcode() == HloOpcode::kFusion) {
-    return false;
-  }
-
-  // RNG operations are not currently parallel-friendly on GPU.
-  if (producer->opcode() == HloOpcode::kRng) {
-    return false;
-  }
-
-  // Do not fuse to-vector reduction into other consumers. They should be
-  // unfused or the root of a kInput fusion.
-  if (IsReductionFromOrToContiguousDimensions(*producer)) {
-    return false;
-  }
-
-  // Scatter is only supported at the root of a kInput fusion.
-  if (producer->opcode() == HloOpcode::kScatter) {
-    return false;
-  }
-
-  // Do not fuse into reduce input fusions if the resulting kernel would suffer
-  // from poor data locality (due to unfriendly input layouts).
-  if (IsInputFusibleReduction(*consumer) &&
-      !LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) {
-    return false;
-  }
-
-  // We can't fuse library calls, so if a user of such an op could become a
-  // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
-  // further rationale.
-  if (producer->CouldBeBitcast() &&
-      ImplementedAsLibraryCall(*producer->operand(0))) {
-    return false;
-  }
-
   // Cost condition: not fuse (simple, expensive producers) and (consumers who
   // reuse operand elements).
   if (producer->opcode() != HloOpcode::kFusion &&
@@ -228,20 +131,7 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
     return false;
   }
 
-  // Fuse scalar constants into loop fusion nodes. This reduces the number of
-  // parameters and makes matching scalar broadcasts easier.
-  //
-  // Don't fuse other constants: Unfused constants in GPU land can be
-  // represented as an external constant (i.e. not emitted in LLVM IR / PTX),
-  // but fused constants are handled by shrared CPU/GPU code and always emitted
-  // in the IR/PTX. The external constant representation makes for faster
-  // compiles and significantly smaller assembly code.
-  if (producer->opcode() == HloOpcode::kConstant) {
-    return ShapeUtil::IsEffectiveScalar(producer->shape()) &&
-           consumer->opcode() == HloOpcode::kFusion;
-  }
-
-  if (!IsFusible(*producer) || !IsFusible(*consumer) ||
+  if (!IsProducerConsumerFusible(*producer, *consumer) ||
       !InstructionFusion::ShouldFuse(consumer, operand_index)) {
     return false;
   }
@@ -259,8 +149,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
   if (consumer->opcode() == HloOpcode::kReduce && consumer->shape().IsTuple()) {
     return false;
   }
-  // The following checks are potentially expensive.
-  if (FusionWouldBeTooLarge(consumer, producer)) {
+  // The following checks are potentially expensive.
+  if (FusionWouldBeTooLarge(*consumer, *producer)) {
     return false;
   }
   // Also check that our emitter can handle the fusion node. We currently can
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index 2f8f40b4b5ef4f..482b7e8f42aedc 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -27,19 +27,6 @@ class GpuInstructionFusion : public InstructionFusion {
   explicit GpuInstructionFusion(bool may_duplicate)
       : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {}
 
-  // Maximum number of operands plus outputs allowed on a single fusion node.
-  // Exposed publicly mainly for tests.
-  static constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
-
-  // Determines whether the combination of `a` and `b` into a (possibly
-  // multi-output) fusion would be "too large" -- i.e., have more operands and
-  // outputs than is allowed.
-  //
-  // `ShouldFuse` and `ShouldFuseIntoMultiOutput` call this; it's public so that
-  // other fusion passes (e.g. GPU multi-output fusion) can also call this.
-  static bool FusionWouldBeTooLarge(const HloInstruction* a,
-                                    const HloInstruction* b);
-
   static bool IsExpensive(const HloInstruction& instruction);
 
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index edb6ecf6247734..dd5bdc3a245580 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -560,7 +561,7 @@ TEST_F(InstructionFusionTest, FuseScalarConstant) {
 // Check that we limit the number of operands to fusions we create.
 TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
   constexpr int64 kNumParams = 200;
-  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+  ASSERT_GT(kNumParams, kMaxOperandsAndOutputsPerFusion);
 
   // Compute p0 + p1 + ... + pN.
   HloComputation::Builder b(TestName());
@@ -582,7 +583,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
   SCOPED_TRACE(module->ToString());
   for (const HloInstruction* instr : computation->instructions()) {
     EXPECT_LE(instr->operand_count(),
-              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+              kMaxOperandsAndOutputsPerFusion)
         << instr->ToString();
   }
 }
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index a00900fabab7b9..5018c87de0926f 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -102,7 +102,7 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
   }
 
   // Do this check last, as it may be expensive.
-  return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
+  return !FusionWouldBeTooLarge(*instr1, *instr2);
 }
 
 bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
@@ -127,38 +127,24 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
       VLOG(3) << consumer->name() << " has no users.";
       continue;
     }
-    if (!IsInputFusibleReduction(*consumer)) {
-      VLOG(3) << consumer->name() << " is not an input-fusible reduction.";
-      continue;
-    }
     VLOG(3) << consumer->name()
             << " is a fusion candidate. Looking for fuseable operands.";
    auto consumer_operands = consumer->operands();
     for (size_t i = 0; i < consumer_operands.size(); ++i) {
       HloInstruction* producer = consumer_operands[i];
-      if (!producer->IsFusible()) {
-        VLOG(3) << producer->name() << " is not fusible.";
+      if (!IsProducerConsumerMultiOutputFusible(*producer, *consumer)) {
+        VLOG(3) << producer->name() << " and " << consumer->name()
+                << " are not fusible.";
         continue;
       }
+
       // Never multi-output fuse constants. To the extent that we want to fuse
       // constants, that should be handled by the regular fusion pass.
       if (producer->opcode() == HloOpcode::kConstant) {
         VLOG(3) << producer->name() << " is a constant.";
         continue;
       }
-      if (!producer->IsElementwise() && !producer->IsLoopFusion()) {
-        VLOG(3) << producer->name() << " is not a loop fusion.";
-        continue;
-      }
-      if (!ShapesCompatibleForMultiOutputFusion(*producer, *consumer)) {
-        VLOG(3) << producer->name() << " has an incompatible shape.";
-        continue;
-      }
-      if (!LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) {
-        VLOG(3) << producer->name() << " has inputs with mixed layouts.";
-        continue;
-      }
+
       // If we have already decided to fuse this producer, skip it.
       if (ContainsKey(to_fuse, producer)) {
         VLOG(3) << producer->name() << " will be fused with another consumer.";
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 2aa61b8951ad4a..99e2511cb6e0c0 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
@@ -601,7 +602,7 @@ TEST_F(MultiOutputFusionTest,
 // Check that we limit the number of operands to fusions we create.
 TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   constexpr int64 kNumParams = 200;
-  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+  ASSERT_GT(kNumParams, kMaxOperandsAndOutputsPerFusion);
 
   // Compute
   //   p0 * p1,
@@ -646,7 +647,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   SCOPED_TRACE(module->ToString());
   for (const HloInstruction* instr : computation->instructions()) {
     EXPECT_LE(instr->operand_count() + ShapeUtil::SubshapeCount(instr->shape()),
-              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+              kMaxOperandsAndOutputsPerFusion)
         << instr->ToString();
   }
 }