diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 4103605df99450..5e7c03bc2cee56 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -195,20 +195,12 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
     return Status::OK();
   }
 
-  // Skip multiple output fusion. It's not yet supported.
-  if (fusion->IsMultiOutputFusion()) {
-    VLOG(3) << "Not merging " << fusion->name() << ": Is multi-output fusion.";
-    ++num_fail_not_loop_fusion_;
-    return Status::OK();
-  }
   // Skip 'fusion' instruction if we cannot merge into all of its users.
   // Merging into all users enables the removal of 'fusion' from the
   // computation.
   if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
-               (user->IsLoopFusion() ||
-                (IsReduceInputFusion(*user) &&
-                 LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
+               IsProducerConsumerFusible(*fusion, *user);
       })) {
     VLOG(3) << "Not merging " << fusion->name()
             << ": Some of its users are not loop/input fusion kernels.";
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index d5b351f69e3dea..896abd4080ab8a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -178,7 +178,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
          instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
          (instr.opcode() == HloOpcode::kFusion &&
           instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
-         instr.opcode() == HloOpcode::kGather ||
+         instr.opcode() == HloOpcode::kGather ||
          instr.opcode() == HloOpcode::kIota ||
          instr.opcode() == HloOpcode::kPad ||
          (instr.opcode() == HloOpcode::kReduce &&
@@ -187,6 +187,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
          instr.opcode() == HloOpcode::kReshape ||
          instr.opcode() == HloOpcode::kReverse ||
          instr.opcode() == HloOpcode::kSlice ||
+         instr.opcode() == HloOpcode::kConstant ||
          instr.opcode() == HloOpcode::kTranspose);
 }
 
@@ -194,6 +195,127 @@ bool IsFusible(const HloInstruction& instr) {
   return IsInputFusible(instr) || IsLoopFusible(instr);
 }
 
+bool IsProducerConsumerFusible(const HloInstruction& producer,
+                               const HloInstruction& consumer) {
+  if (!IsLoopFusible(producer) || !IsFusible(consumer)) {
+    return false;
+  }
+
+  // Skip multiple output fusion. It's not yet supported.
+  if (producer.IsMultiOutputFusion()) {
+    return false;
+  }
+
+  // Do not fuse into reduce input fusions if the resulting kernel would suffer
+  // from poor data locality (due to unfriendly input layouts).
+  if (IsInputFusibleReduction(consumer) &&
+      !LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+    return false;
+  }
+
+  // We can't fuse library calls, so if a user of such an op could become a
+  // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
+  // further rationale.
+  if (producer.CouldBeBitcast() &&
+      ImplementedAsLibraryCall(*producer.operand(0))) {
+    return false;
+  }
+
+  // Fuse scalar constants into loop fusion nodes. This reduces the number of
+  // parameters and makes matching scalar broadcasts easier.
+  //
+  // Don't fuse other constants: Unfused constants in GPU land can be
+  // represented as an external constant (i.e. not emitted in LLVM IR / PTX),
+  // but fused constants are handled by shared CPU/GPU code and always emitted
+  // in the IR/PTX. The external constant representation makes for faster
+  // compiles and significantly smaller assembly code.
+  if (producer.opcode() == HloOpcode::kConstant) {
+    return ShapeUtil::IsEffectiveScalar(producer.shape()) &&
+           consumer.opcode() == HloOpcode::kFusion;
+  }
+
+  return true;
+}
+
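A quick illustration of what the kConstant clause above accepts: ShapeUtil::IsEffectiveScalar is true not only for rank-0 shapes but for any array shape whose dimensions are all 1. This is a standalone sketch, not part of the patch; the shapes are made up:

  // Which constants the scalar-constant rule lets fuse into a consumer fusion.
  Shape s0 = ShapeUtil::MakeShape(F32, {});      // f32[]    -> effective scalar
  Shape s1 = ShapeUtil::MakeShape(F32, {1, 1});  // f32[1,1] -> effective scalar
  Shape s2 = ShapeUtil::MakeShape(F32, {16});    // f32[16]  -> not a scalar
  CHECK(ShapeUtil::IsEffectiveScalar(s0));
  CHECK(ShapeUtil::IsEffectiveScalar(s1));
  CHECK(!ShapeUtil::IsEffectiveScalar(s2));

An f32[16] constant therefore stays unfused and keeps the cheaper external-constant representation, while f32[] and f32[1,1] constants may be folded into a consuming fusion node.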
+bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
+                                          const HloInstruction& consumer) {
+  if (!IsFusibleAsMultiOutputFusionRoot(producer) ||
+      !IsFusibleAsMultiOutputFusionRoot(consumer)) {
+    return false;
+  }
+
+  if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
+    return false;
+  }
+
+  if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
+    return false;
+  }
+
+  return true;
+}
+
+// This function limits the maximum number of operands to a fusion.
+//
+// There's a cap on how many parameters we can pass to a CUDA kernel, but
+// exactly what that limit is hazy, as it depends on (among other things) how
+// much GPU constant memory is in use for other purposes.
+//
+// Moreover, we don't even know at the point that we're running fusion how many
+// arguments the CUDA kernel for a fusion node will have: It depends on buffer
+// assignment, where we will decide which of the fusion's operands live in
+// XLA's big temp buffer versus in other allocations.
+//
+// As a heuristic, we simply cap the number of fusion operands plus outputs at
+// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
+// parameters to the kernel, working around the correctness problem.
+//
+// This limit is also often good for performance. In a fusion with many
+// operands, each GPU thread likely has to do a lot of work, and so possibly
+// uses a lot of registers, thus limiting occupancy.
+bool FusionWouldBeTooLarge(const HloInstruction& instr1,
+                           const HloInstruction& instr2) {
+  // Compute the number of outputs of the (possibly multi-output) fusion node
+  // we're considering creating.
+  //
+  // This isn't precise; we may be off by one if
+  //  - We're creating a multi-output fusion out of two non-MOFs. Creating a
+  //    MOF adds a new buffer, namely, the tuple buffer.
+  //  - We're merging two MOFs. In this case, we should count the tuple buffer
+  //    only once.
+  //  - WLOG there's an edge from `instr1` to `instr2` and `instr2` is the only
+  //    consumer of `instr1`. In this case the result of `instr1` is not part
+  //    of the output of the fusion.
+  //
+  // But because this is a heuristic and our limit
+  // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
+  // big difference), we ignore this small inaccuracy in favor of simplicity.
+  int64 num_output_buffers = ShapeUtil::SubshapeCount(instr1.shape()) +
+                             ShapeUtil::SubshapeCount(instr2.shape());
+
+  // The new fusion will have no more operands and outputs than
+  //   producer_operands + consumer_operands - 1 + num_output_buffers
+  // (minus one because we may be fusing a producer->consumer edge between
+  // `instr1` and `instr2`).
+  //
+  // This fact may be enough to let us avoid having to compute the true total
+  // number of operands, which can be expensive.
+  if (instr1.operand_count() + instr2.operand_count() - 1 +
+          num_output_buffers <=
+      kMaxOperandsAndOutputsPerFusion) {
+    return false;
+  }
+
+  // Compute the precise number of operands to the new fusion.
+  absl::flat_hash_set<const HloInstruction*> operands(
+      instr1.operands().begin(), instr1.operands().end());
+  operands.insert(instr2.operands().begin(), instr2.operands().end());
+  // If there's an edge between `instr1` and `instr2`, don't count it: We're
+  // fusing that producer -> consumer relationship.
+  operands.erase(&instr1);
+  operands.erase(&instr2);
+  return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
+}
+
 bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr) {
   // We can fuse reduces and loop fusions. Elementwise instructions can be fused
   // with any other instruction.
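To make the heuristic above concrete, here is a hypothetical worked example (all numbers invented):

  // instr1: loop fusion, 40 operands, one array output -> SubshapeCount = 1
  // instr2: loop fusion, 30 operands, one array output -> SubshapeCount = 1
  // num_output_buffers = 1 + 1 = 2
  // Cheap bound: 40 + 30 - 1 + 2 = 71 > 64, so compute the exact operand set.
  // If the two fusions share 10 operands and instr1 feeds instr2, the set has
  // 40 + 30 - 10 - 1 = 59 entries, and 59 + 2 = 61 <= 64: not too large.

Had instr2 instead produced a two-element tuple, ShapeUtil::SubshapeCount would contribute 3 for it (one tuple buffer plus two leaf buffers), which is exactly the tuple-buffer effect described in the off-by-one comment.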
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index a4501fd31dc5e4..073896530008b3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -24,6 +24,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
+
 // Whether 'instr' can occur inside fusions, i.e. whether it is a candidate
 // for being fused. Note that further restrictions apply, e.g. Scatter must
 // be the root of an input fusion.
@@ -59,6 +61,11 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
 // is either an unfused scatter op or a scatter input fusion.
 bool IsInputFusibleScatter(const HloInstruction& instr);
 
+// Determines whether the combination of `instr1` and `instr2` into a (possibly
+// multi-output) fusion would be "too large" -- i.e., have more operands and
+// outputs than is allowed.
+bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2);
+
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
 // This function works for both, sibling and producer-consumer multi-output
@@ -69,6 +76,17 @@ bool IsInputFusibleScatter(const HloInstruction& instr);
 bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
                                           const HloInstruction& instr2);
 
+// Whether the instructions are compatible for producer-consumer fusion,
+// i.e. whether the producer and the consumer are loop/input fusible and
+// neither is a library call.
+bool IsProducerConsumerFusible(const HloInstruction& producer,
+                               const HloInstruction& consumer);
+
+// Whether the instructions are producer-consumer fusible with multiple
+// outputs. That is, the root tuple of the multi-output fusion will contain
+// the results of both the producer and the consumer.
+bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
+                                          const HloInstruction& consumer);
 // Whether `instr` is a candidate for sibling fusion or as a consumer in
 // a producer-consumer multi-output fusion.
 bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
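A sketch of how a fusion pass might combine the predicates declared above; the caller code and the producer/consumer pointers are illustrative assumptions, not part of this patch:

  // Prefer regular producer-consumer fusion, fall back to multi-output
  // fusion, and respect the operand/output cap in both cases.
  if (FusionWouldBeTooLarge(*producer, *consumer)) {
    // Too many parameters and outputs for a single kernel; leave unfused.
  } else if (IsProducerConsumerFusible(*producer, *consumer)) {
    // Merge `producer` into `consumer` as a regular loop/input fusion.
  } else if (IsProducerConsumerMultiOutputFusible(*producer, *consumer)) {
    // Emit a multi-output fusion whose root tuple carries both results.
  }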
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index cee678e290253b..ea60f641969bda 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -643,5 +643,264 @@ TEST_F(GpuFusibleTest,
   EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
 }
 
+TEST_F(GpuFusibleTest, IsFusibleAsMultiOutputFusionRoot) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_TRUE(IsFusibleAsMultiOutputFusionRoot(*root));
+}
+
+TEST_F(GpuFusibleTest, ScatterIsNotFusibleAsMultiOutputFusionRoot) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY Scatter {
+      p0 = s32[3,3] parameter(0)
+      operand = s32[3,3] add(p0, p0)
+      p1 = s32[2] parameter(1)
+      indices = s32[2] add(p1, p1)
+      p2 = s32[2,3] parameter(2)
+      updates = s32[2,3] add(p2, p2)
+      ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=add,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* scatter_inst =
+      module->entry_computation()->root_instruction();
+  EXPECT_FALSE(IsFusibleAsMultiOutputFusionRoot(*scatter_inst));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionElementwiseAndReduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      exp = f32[32,32,32]{2,1,0} exponential(p0)
+      reduce = f32[32,32]{1,0} reduce(exp, c0), dimensions={2},
+        to_apply=scalar_add
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionLoopFusionAndReduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_add {
+      p0.1 = f32[32,32,32]{2,1,0} parameter(0)
+      p1.1 = f32[32,32,32]{2,1,0} parameter(1)
+      ROOT add = f32[32,32,32]{2,1,0} add(p0.1, p1.1)
+    }
+
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      p1 = f32[32,32,32]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      add = f32[32,32,32]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_add
+      reduce = f32[32,32]{1,0} reduce(add, c0), dimensions={2},
+        to_apply=scalar_add
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(reduce, add)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionLoopFusionAndReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_select {
+      p1.1 = f32[32,32,32]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      broadcast = f32[32,32,32]{2,1,0} broadcast(f32[] c0), dimensions={}
+      greater-than = pred[32,32,32]{2,1,0} compare(f32[32,32,32]{2,1,0} p1.1,
+        f32[32,32,32]{2,1,0} broadcast), direction=GT
+      p0.1 = f32[32,32,32]{2,1,0} parameter(0)
+      ROOT select = f32[32,32,32]{2,1,0} select(pred[32,32,32]{2,1,0}
+        greater-than, f32[32,32,32]{2,1,0} p0.1, f32[32,32,32]{2,1,0} broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f32[32,32,32]{2,1,0} parameter(0)
+      c1 = f32[] constant(0)
+      r1 = f32[32,32]{1,0} reduce(p0.2, c1), dimensions={2},
+        to_apply=scalar_add
+      mul = f32[32,32,32]{2,1,0} multiply(p0.2, p0.2)
+      r2 = f32[32,32]{1,0} reduce(mul, c1), dimensions={2},
+        to_apply=scalar_add
+      ROOT tuple = (f32[32,32]{1,0}, f32[32,32]{1,0}) tuple(r1, r2)
+    }
+
+    ENTRY reduce {
+      p0 = f32[32,32,32]{2,1,0} parameter(0)
+      p1 = f32[32,32,32]{2,1,0} parameter(1)
+      select = f32[32,32,32]{2,1,0} fusion(p0, p1), kind=kLoop,
+        calls=fused_select
+      fusion = (f32[32,32]{1,0}, f32[32,32]{1,0}) fusion(select), kind=kInput,
+        calls=fused_reduce
+      ROOT root = (f32[32,32]{1,0}, f32[32,32,32]{2,1,0}) tuple(fusion, select)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_TRUE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionDoNotFuseLoopReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_element_wise {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      ROOT root = f32[2,2,2]{2,1,0} add(p0.1, p1.1)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2,
+        f32[2,2,2]{2,1,0} p0.2)
+      broadcast = f32[2,2,2,2]{3,2,1,0} broadcast(mul), dimensions={3,2,1}
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2,2]{3,2,1,0} broadcast,
+        f32[] c1), dimensions={1,3}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop,
+        calls=fused_element_wise
+      fusion = f32[2,2]{1,0} fusion(element_wise), kind=kLoop,
+        calls=fused_reduce
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  // The consumer is a kLoop fusion with a reduce root, so it is not fusible
+  // as a multi-output fusion root.
+  EXPECT_FALSE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, ProducerConsumerFusionReduceUnfriendlyLoopFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    mixed_input_layouts_computation {
+      p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1)
+      copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1)
+      c0 = f16[] constant(0)
+      broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={}
+      greater-than = pred[128,1024,32,32]{1,3,2,0} compare(copy, broadcast),
+        direction=GT
+      ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1,
+        broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2)
+      c0.2 = f32[] constant(0)
+      ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3},
+        to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0)
+      p1 = f16[128,1024,32,32]{1,3,2,0} parameter(1)
+      loop_fusion = f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop,
+        calls=mixed_input_layouts_computation
+      reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput,
+        calls=fused_reduce
+      ROOT root = (f32[1024]{0}, f16[128,1024,32,32]{1,3,2,0})
+        tuple(reduce_fusion, loop_fusion)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  EXPECT_FALSE(IsProducerConsumerMultiOutputFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, NonscalarConstantsNotFused) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY BroadcastIntoReduce {
+      constant = f32[16] constant({0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})
+      broadcast = f32[16,16,16,16]{3,2,1,0} broadcast(constant), dimensions={0}
+      constant.1 = f32[] constant(0)
+      reduce = f32[] reduce(broadcast, constant.1), dimensions={0,1,2,3},
+        to_apply=add
+      ROOT root = (f32[], f32[], f32[16,16,16,16], f32[16])
+        tuple(reduce, constant.1, broadcast, constant)
+    })")
+                    .ValueOrDie();
+  // Do not fuse if the producer is a non-scalar constant or if the consumer
+  // is not a fusion node.
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* consumer = root->operand(0);
+  const HloInstruction* producer = root->operand(1);
+  const HloInstruction* consumer2 = root->operand(2);
+  const HloInstruction* producer2 = root->operand(3);
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer, *consumer));
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer2, *consumer2));
+}
+
+TEST_F(GpuFusibleTest, DoNotFuseLayoutChangingOpWithReduce) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY entry {
+      p0 = f32[16,16,16,16]{3,2,1,0} parameter(0)
+      copy = f32[16,16,16,16]{0,1,2,3} copy(p0)
+      constant.1 = f32[] constant(0)
+      ROOT reduce = f32[16] reduce(copy, constant.1), dimensions={0,1,2},
+        to_apply=add
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* consumer =
+      module->entry_computation()->root_instruction();
+  const HloInstruction* producer = consumer->operand(0);
+  EXPECT_FALSE(IsProducerConsumerFusible(*producer, *consumer));
+}
+
+TEST_F(GpuFusibleTest, FuseLayoutChangingOpWithElementwise) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+    ENTRY entry {
+      p0 = f32[16,16,16,16]{3,2,1,0} parameter(0)
+      copy = f32[16,16,16,16]{0,1,2,3} copy(p0)
+      ROOT add = f32[16,16,16,16]{0,1,2,3} add(copy, copy)
+    })")
+                    .ValueOrDie();
+
+  const HloInstruction* consumer =
+      module->entry_computation()->root_instruction();
+  const HloInstruction* producer = consumer->operand(0);
+  EXPECT_TRUE(IsProducerConsumerFusible(*producer, *consumer));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 54cab21ab4c240..0083e3e87a6d8b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -53,67 +53,6 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   }
 }
 
-// This function limits the maximum number of operands to a fusion.
-//
-// There's a cap on how many parameters we can pass to a CUDA kernel, but
-// exactly what that limit is hazy, as it depends on (among other things) how
-// much GPU constant memory is in use for other purposes.
-//
-// Moreover, we don't even know at the point that we're running fusion how many
-// arguments the CUDA kernel for a fusion node will have: It depends on buffer
-// assignment, where we will decide which of the fusion's operands live in XLA's
-// big temp buffer versus in other allocations.
-//
-// As a heuristic, we simply cap the number of fusion operands plus outputs at
-// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
-// parameters to the kernel, working around the correctness problem.
-//
-// This limit is also often good for performance. In a fusion with many
-// operands, each GPU thread likely has to do a lot of work, and so possibly
-// uses a lot of registers, thus limiting occupancy.
-/*static*/ bool GpuInstructionFusion::FusionWouldBeTooLarge(
-    const HloInstruction* a, const HloInstruction* b) {
-  // Compute the number of outputs of the (possibly multi-output) fusion node
-  // we're considering creating.
-  //
-  // This isn't precise; we may be off by one if
-  //  - We're creating a multi-output fusion out of two non-MOFs. Creating a
-  //    MOF adds a new buffer, namely, the tuple buffer.
-  //  - We're merging two MOFs. In this case, we should count the tuple buffer
-  //    only once.
-  //  - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
-  //    `a`. In this case the result of `a` is not part of the output of the
-  //    fusion.
-  //
-  // But because this is a heuristic and our limit
-  // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
-  // big difference), we ignore this small inaccuracy in favor of simplicity.
-  int64 num_output_buffers = ShapeUtil::SubshapeCount(a->shape()) +
-                             ShapeUtil::SubshapeCount(b->shape());
-
-  // The new fusion will have no more operands and outputs than
-  //   producer_operands + consumer_operands - 1 + num_output_buffers
-  // (minus one because we may be fusing a producer->consumer edge between `a`
-  // and `b`).
-  //
-  // This fact may be enough to let us avoid having to compute the true total
-  // number of operands, which can be expensive.
-  if (a->operand_count() + b->operand_count() - 1 + num_output_buffers <=
-      kMaxOperandsAndOutputsPerFusion) {
-    return false;
-  }
-
-  // Compute the precise number of operands to the new fusion.
-  absl::flat_hash_set<const HloInstruction*> operands(a->operands().begin(),
-                                                      a->operands().end());
-  operands.insert(b->operands().begin(), b->operands().end());
-  // If there's an edge between `a` and `b`, don't count it: We're fusing that
-  // producer -> consumer relationship.
-  operands.erase(a);
-  operands.erase(b);
-  return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
-}
-
 bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
                                                        int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
@@ -184,42 +123,6 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
     return false;
   }
 
-  // Other output fusions are not currently supported on GPUs.
-  if (producer->opcode() == HloOpcode::kFusion) {
-    return false;
-  }
-
-  // RNG operations are not currently parallel-friendly on GPU.
-  if (producer->opcode() == HloOpcode::kRng) {
-    return false;
-  }
-
-  // Do not fuse to-vector reduction into other consumers. They should be
-  // unfused or the root of a kInput fusion.
-  if (IsReductionFromOrToContiguousDimensions(*producer)) {
-    return false;
-  }
-
-  // Scatter is only supported at the root of a kInput fusion.
-  if (producer->opcode() == HloOpcode::kScatter) {
-    return false;
-  }
-
-  // Do not fuse into reduce input fusions if the resulting kernel would suffer
-  // from poor data locality (due to unfriendly input layouts).
-  if (IsInputFusibleReduction(*consumer) &&
-      !LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) {
-    return false;
-  }
-
-  // We can't fuse library calls, so if a user of such an op could become a
-  // bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
-  // further rationale.
-  if (producer->CouldBeBitcast() &&
-      ImplementedAsLibraryCall(*producer->operand(0))) {
-    return false;
-  }
-
   // Cost condition: not fuse (simple, expensive producers) and (consumers who
   // reuse operand elements).
   if (producer->opcode() != HloOpcode::kFusion &&
@@ -228,20 +131,7 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer,
     return false;
   }
 
-  // Fuse scalar constants into loop fusion nodes. This reduces the number of
-  // parameters and makes matching scalar broadcasts easier.
-  //
-  // Don't fuse other constants: Unfused constants in GPU land can be
-  // represented as an external constant (i.e. not emitted in LLVM IR / PTX),
-  // but fused constants are handled by shrared CPU/GPU code and always emitted
-  // in the IR/PTX. The external constant representation makes for faster
-  // compiles and significantly smaller assembly code.
-  if (producer->opcode() == HloOpcode::kConstant) {
-    return ShapeUtil::IsEffectiveScalar(producer->shape()) &&
-           consumer->opcode() == HloOpcode::kFusion;
-  }
-
-  if (!IsFusible(*producer) || !IsFusible(*consumer) ||
+  if (!IsProducerConsumerFusible(*producer, *consumer) ||
       !InstructionFusion::ShouldFuse(consumer, operand_index)) {
     return false;
   }
@@ -259,8 +149,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
   if (consumer->opcode() == HloOpcode::kReduce && consumer->shape().IsTuple()) {
     return false;
   }
-  // The following checks are potentially expensive.
-  if (FusionWouldBeTooLarge(consumer, producer)) {
+  // The following checks are potentially expensive.
+  if (FusionWouldBeTooLarge(*consumer, *producer)) {
     return false;
   }
   // Also check that our emitter can handle the fusion node. We currently can
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index 2f8f40b4b5ef4f..482b7e8f42aedc 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -27,19 +27,6 @@ class GpuInstructionFusion : public InstructionFusion {
   explicit GpuInstructionFusion(bool may_duplicate)
       : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {}
 
-  // Maximum number of operands plus outputs allowed on a single fusion node.
-  // Exposed publicly mainly for tests.
-  static constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
-
-  // Determines whether the combination of `a` and `b` into a (possibly
-  // multi-output) fusion would be "too large" -- i.e., have more operands and
-  // outputs than is allowed.
-  //
-  // `ShouldFuse` and `ShouldFuseIntoMultiOutput` call this; it's public so that
-  // other fusion passes (e.g. GPU multi-output fusion) can also call this.
-  static bool FusionWouldBeTooLarge(const HloInstruction* a,
-                                    const HloInstruction* b);
-
   static bool IsExpensive(const HloInstruction& instruction);
 
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index edb6ecf6247734..dd5bdc3a245580 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
@@ -560,7 +561,7 @@ TEST_F(InstructionFusionTest, FuseScalarConstant) {
 // Check that we limit the number of operands to fusions we create.
 TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
   constexpr int64 kNumParams = 200;
-  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+  ASSERT_GT(kNumParams, kMaxOperandsAndOutputsPerFusion);
 
   // Compute p0 + p1 + ... + pN.
   HloComputation::Builder b(TestName());
@@ -582,7 +583,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
   SCOPED_TRACE(module->ToString());
   for (const HloInstruction* instr : computation->instructions()) {
     EXPECT_LE(instr->operand_count(),
-              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+              kMaxOperandsAndOutputsPerFusion)
         << instr->ToString();
   }
 }
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index a00900fabab7b9..5018c87de0926f 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -102,7 +102,7 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
   }
 
   // Do this check last, as it may be expensive.
-  return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
+  return !FusionWouldBeTooLarge(*instr1, *instr2);
 }
 
 bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
@@ -127,38 +127,24 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
       VLOG(3) << consumer->name() << " has no users.";
       continue;
     }
-    if (!IsInputFusibleReduction(*consumer)) {
-      VLOG(3) << consumer->name() << " is not an input-fusible reduction.";
-      continue;
-    }
     VLOG(3) << consumer->name()
             << " is a fusion candidate. Looking for fuseable operands.";
    auto consumer_operands = consumer->operands();
     for (size_t i = 0; i < consumer_operands.size(); ++i) {
       HloInstruction* producer = consumer_operands[i];
-      if (!producer->IsFusible()) {
-        VLOG(3) << producer->name() << " is not fusible.";
+      if (!IsProducerConsumerMultiOutputFusible(*producer, *consumer)) {
+        VLOG(3) << producer->name() << " and " << consumer->name()
+                << " are not fusible.";
         continue;
       }
+
       // Never multi-output fuse constants. To the extent that we want to fuse
       // constants, that should be handled by the regular fusion pass.
       if (producer->opcode() == HloOpcode::kConstant) {
         VLOG(3) << producer->name() << " is a constant.";
         continue;
       }
-      if (!producer->IsElementwise() && !producer->IsLoopFusion()) {
-        VLOG(3) << producer->name() << " is not a loop fusion.";
-        continue;
-      }
-      if (!ShapesCompatibleForMultiOutputFusion(*producer, *consumer)) {
-        VLOG(3) << producer->name() << " has an incompatible shape.";
-        continue;
-      }
-      if (!LayoutsAreReduceInputFusionFriendly(*producer, *consumer)) {
-        VLOG(3) << producer->name() << " has inputs with mixed layouts.";
-        continue;
-      }
+
       // If we have already decided to fuse this producer, skip it.
       if (ContainsKey(to_fuse, producer)) {
         VLOG(3) << producer->name() << " will be fused with another consumer.";
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 2aa61b8951ad4a..99e2511cb6e0c0 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"
 
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
@@ -601,7 +602,7 @@ TEST_F(MultiOutputFusionTest,
 // Check that we limit the number of operands to fusions we create.
 TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   constexpr int64 kNumParams = 200;
-  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+  ASSERT_GT(kNumParams, kMaxOperandsAndOutputsPerFusion);
 
   // Compute
   //   p0 * p1,
@@ -646,7 +647,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   SCOPED_TRACE(module->ToString());
   for (const HloInstruction* instr : computation->instructions()) {
     EXPECT_LE(instr->operand_count() + ShapeUtil::SubshapeCount(instr->shape()),
-              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+              kMaxOperandsAndOutputsPerFusion)
         << instr->ToString();
   }
 }