gpu_fusible refactoring change: Create methods to check if two instructions are fusible #28399

Merged
merged 16 commits on Jun 13, 2019
10 changes: 1 addition & 9 deletions tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -195,20 +195,12 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
return Status::OK();
}

- // Skip multiple output fusion. It's not yet supported.
- if (fusion->IsMultiOutputFusion()) {
- VLOG(3) << "Not merging " << fusion->name() << ": Is multi-output fusion.";
- ++num_fail_not_loop_fusion_;
- return Status::OK();
- }
// Skip 'fusion' instruction if we cannot merge into all of its users.
// Merging into all users enables the removal of 'fusion' from the
// computation.
if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
return user->opcode() == HloOpcode::kFusion &&
- (user->IsLoopFusion() ||
- (IsReduceInputFusion(*user) &&
- LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
+ IsProducerConsumerFusible(*fusion, *user);
})) {
VLOG(3) << "Not merging " << fusion->name()
<< ": Some of its users are not loop/input fusion kernels.";
124 changes: 123 additions & 1 deletion tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -178,7 +178,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
(instr.opcode() == HloOpcode::kFusion &&
instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
instr.opcode() == HloOpcode::kGather ||
instr.opcode() == HloOpcode::kIota ||
instr.opcode() == HloOpcode::kPad ||
(instr.opcode() == HloOpcode::kReduce &&
@@ -187,13 +187,135 @@ bool IsLoopFusible(const HloInstruction& instr) {
instr.opcode() == HloOpcode::kReshape ||
instr.opcode() == HloOpcode::kReverse ||
instr.opcode() == HloOpcode::kSlice ||
instr.opcode() == HloOpcode::kConstant ||
instr.opcode() == HloOpcode::kTranspose);
}

bool IsFusible(const HloInstruction& instr) {
return IsInputFusible(instr) || IsLoopFusible(instr);
}

bool IsProducerConsumerFusible(const HloInstruction& producer,
const HloInstruction& consumer) {
if (!IsLoopFusible(producer) || !IsFusible(consumer)) {
return false;
}

// Skip multiple output fusion. It's not yet supported.
if (producer.IsMultiOutputFusion()) {
return false;
}

// Do not fuse into reduce input fusions if the resulting kernel would suffer
// from poor data locality (due to unfriendly input layouts).
if (IsInputFusibleReduction(consumer) &&
!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
return false;
}

// We can't fuse library calls, so if a user of such an op could become a
// bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
// further rationale.
if (producer.CouldBeBitcast() &&
ImplementedAsLibraryCall(*producer.operand(0))) {
return false;
}

// Fuse scalar constants into loop fusion nodes. This reduces the number of
// parameters and makes matching scalar broadcasts easier.
//
// Don't fuse other constants: unfused constants in GPU land can be
// represented as external constants (i.e. not emitted in LLVM IR / PTX),
// but fused constants are handled by shared CPU/GPU code and always emitted
// in the IR/PTX. The external constant representation makes for faster
// compiles and significantly smaller assembly code.
if (producer.opcode() == HloOpcode::kConstant) {
return ShapeUtil::IsEffectiveScalar(producer.shape()) &&
consumer.opcode() == HloOpcode::kFusion;
}

return true;
}

bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
const HloInstruction& consumer) {
if (!IsFusibleAsMultiOutputFusionRoot(producer) ||
!IsFusibleAsMultiOutputFusionRoot(consumer)) {
return false;
}

if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
return false;
}

if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
return false;
}

return true;
}

// This function limits the maximum number of operands to a fusion.
//
// There's a cap on how many parameters we can pass to a CUDA kernel, but
// exactly what that limit is remains hazy, as it depends on (among other
// things) how much GPU constant memory is in use for other purposes.
//
// Moreover, we don't even know at the point that we're running fusion how many
// arguments the CUDA kernel for a fusion node will have: It depends on buffer
// assignment, where we will decide which of the fusion's operands live in XLA's
// big temp buffer versus in other allocations.
//
// As a heuristic, we simply cap the number of fusion operands plus outputs at
// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
// parameters to the kernel, working around the correctness problem.
//
// This limit is also often good for performance. In a fusion with many
// operands, each GPU thread likely has to do a lot of work, and so possibly
// uses a lot of registers, thus limiting occupancy.
bool FusionWouldBeTooLarge(const HloInstruction& instr1,
const HloInstruction& instr2) {
// Compute the number of outputs of the (possibly multi-output) fusion node
// we're considering creating.
//
// This isn't precise; we may be off by one if
// - We're creating a multi-output fusion out of two non-MOFs. Creating a
// MOF adds a new buffer, namely, the tuple buffer.
// - We're merging two MOFs. In this case, we should count the tuple buffer
// only once.
// - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
// `a`. In this case the result of `a` is not part of the output of the
// fusion.
//
// But because this is a heuristic and our limit
// kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
// big difference), we ignore this small inaccuracy in favor of simplicity.
int64 num_output_buffers = ShapeUtil::SubshapeCount(instr1.shape()) +
ShapeUtil::SubshapeCount(instr2.shape());

// The new fusion will have no more operands and outputs than
// producer_operands + consumer_operands - 1 + num_output_buffers
// (minus one because we may be fusing a producer->consumer edge between `a`
// and `b`).
//
// This fact may be enough to let us avoid having to compute the true total
// number of operands, which can be expensive.
if (instr1.operand_count() + instr2.operand_count() - 1 + num_output_buffers <=
kMaxOperandsAndOutputsPerFusion) {
return false;
}

// Compute the precise number of operands to the new fusion.
absl::flat_hash_set<const HloInstruction*> operands(instr1.operands().begin(),
instr1.operands().end());
operands.insert(instr2.operands().begin(), instr2.operands().end());
// If there's an edge between `a` and `b`, don't count it: We're fusing that
// producer -> consumer relationship.
operands.erase(&instr1);
operands.erase(&instr2);
return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
}

bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr) {
// We can fuse reduces and loop fusions. Elementwise instructions can be fused
// with any other instruction.
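To make the operand-cap heuristic concrete, here is a hedged walk-through of the arithmetic in FusionWouldBeTooLarge; all operand counts below are invented for illustration and do not come from the PR:

// Illustrative numbers only, not taken from the PR.
// Suppose instr1 has 40 operands, instr2 has 30, and each produces a
// non-tuple result, so ShapeUtil::SubshapeCount() is 1 for each shape:
//   num_output_buffers = 1 + 1 = 2
// Cheap upper bound on operands + outputs of the merged fusion:
//   40 + 30 - 1 + 2 = 71 > kMaxOperandsAndOutputsPerFusion (64)
// so the early-exit check fails and the precise count is computed.
// If the two instructions share 10 operands and instr1 feeds instr2,
// the deduplicated `operands` set holds 40 + 30 - 10 - 1 = 59 entries,
// and 59 + 2 = 61 <= 64, so the fusion is not rejected after all.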
18 changes: 18 additions & 0 deletions tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -24,6 +24,8 @@ limitations under the License.
namespace xla {
namespace gpu {

constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;

// Whether 'instr' can occur inside fusions, i.e. whether it is a candidate
// for being fused. Note that further restrictions apply, e.g. Scatter must
// be the root of an input fusion.
@@ -59,6 +61,11 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
// is either an unfused scatter op or a scatter input fusion.
bool IsInputFusibleScatter(const HloInstruction& instr);

// Determines whether the combination of `instr1` and `instr2` into a (possibly
// multi-output) fusion would be "too large" -- i.e., have more operands and
// outputs than is allowed.
bool FusionWouldBeTooLarge(const HloInstruction& instr1,
const HloInstruction& instr2);

// Whether instruction shapes are compatible for multi-output fusion, i.e.
// whether the emitters support lowering the resulting fusion.
// This function works for both sibling and producer-consumer multi-output
@@ -69,6 +76,17 @@ bool IsInputFusibleScatter(const HloInstruction& instr);
bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
const HloInstruction& instr2);

// Whether the instructions are compatible for producer-consumer fusion,
// i.e. whether the producer and consumer are loop/input fusible and
// neither is a library call.
bool IsProducerConsumerFusible(const HloInstruction& producer,
const HloInstruction& consumer);

// Whether the instructions are producer-consumer fusible with multiple
// outputs. That is, the root tuple of the multi-output fusion will contain
// the results of both the producer and the consumer.
bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
const HloInstruction& consumer);

// Whether `instr` is a candidate for sibling fusion or as a consumer in
// a producer-consumer multi-output fusion.
bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
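Finally, a minimal sketch of how a fusion pass might chain the new predicates. MaybeFuseProducerConsumer is a hypothetical helper written for illustration; only the three predicates it calls come from this change:

#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"

namespace xla {
namespace gpu {

// Hypothetical helper, not part of this PR: returns true if fusing
// `producer` into `consumer` passes the new fusibility checks.
bool MaybeFuseProducerConsumer(const HloInstruction& producer,
                               const HloInstruction& consumer) {
  // Never build a fusion whose operand + output count exceeds
  // kMaxOperandsAndOutputsPerFusion.
  if (FusionWouldBeTooLarge(producer, consumer)) {
    return false;
  }
  // Prefer plain producer-consumer fusion; otherwise see whether a
  // multi-output fusion of the pair would be legal.
  return IsProducerConsumerFusible(producer, consumer) ||
         IsProducerConsumerMultiOutputFusible(producer, consumer);
}

}  // namespace gpu
}  // namespace xla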