gpu_fusible refactoring change: Create methods to check if two instructions are fusible #28399

Merged
merged 16 commits on Jun 13, 2019
10 changes: 1 addition & 9 deletions tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -195,20 +195,12 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
return Status::OK();
}

- // Skip multiple output fusion. It's not yet supported.
- if (fusion->IsMultiOutputFusion()) {
- VLOG(3) << "Not merging " << fusion->name() << ": Is multi-output fusion.";
- ++num_fail_not_loop_fusion_;
- return Status::OK();
- }
// Skip 'fusion' instruction if we cannot merge into all of its users.
// Merging into all users enables the removal of 'fusion' from the
// computation.
if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
return user->opcode() == HloOpcode::kFusion &&
- (user->IsLoopFusion() ||
- (IsReduceInputFusion(*user) &&
- LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
+ IsProducerConsumerFusible(*fusion, *user);
})) {
VLOG(3) << "Not merging " << fusion->name()
<< ": Some of its users are not loop/input fusion kernels.";
124 changes: 123 additions & 1 deletion tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -178,7 +178,7 @@ bool IsLoopFusible(const HloInstruction& instr) {
instr.opcode() == HloOpcode::kDynamicUpdateSlice ||
(instr.opcode() == HloOpcode::kFusion &&
instr.fusion_kind() == HloInstruction::FusionKind::kLoop) ||
instr.opcode() == HloOpcode::kGather ||
instr.opcode() == HloOpcode::kIota ||
instr.opcode() == HloOpcode::kPad ||
(instr.opcode() == HloOpcode::kReduce &&
@@ -187,13 +187,135 @@ bool IsLoopFusible(const HloInstruction& instr) {
instr.opcode() == HloOpcode::kReshape ||
instr.opcode() == HloOpcode::kReverse ||
instr.opcode() == HloOpcode::kSlice ||
instr.opcode() == HloOpcode::kConstant ||
instr.opcode() == HloOpcode::kTranspose);
}

bool IsFusible(const HloInstruction& instr) {
return IsInputFusible(instr) || IsLoopFusible(instr);
}

bool IsProducerConsumerFusible(const HloInstruction& producer,
const HloInstruction& consumer) {
if (!IsLoopFusible(producer) || !IsFusible(consumer)) {
return false;
}

// Skip multiple output fusion. It's not yet supported.
if (producer.IsMultiOutputFusion()) {
return false;
}

// Do not fuse into reduce input fusions if the resulting kernel would suffer
// from poor data locality (due to unfriendly input layouts).
if (IsInputFusibleReduction(consumer) &&
!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
return false;
}

// We can't fuse library calls, so if a user of such an op could become a
// bitcast, leave it unfused. See `xla::InstructionFusion::ShouldFuse` for
// further rationale.
if (producer.CouldBeBitcast() &&
ImplementedAsLibraryCall(*producer.operand(0))) {
return false;
}

// Fuse scalar constants into loop fusion nodes. This reduces the number of
// parameters and makes matching scalar broadcasts easier.
//
// Don't fuse other constants: unfused constants in GPU land can be
// represented as external constants (i.e. not emitted in LLVM IR / PTX),
// but fused constants are handled by shared CPU/GPU code and always emitted
// in the IR/PTX. The external constant representation makes for faster
// compiles and significantly smaller assembly code.
if (producer.opcode() == HloOpcode::kConstant) {
return ShapeUtil::IsEffectiveScalar(producer.shape()) &&
consumer.opcode() == HloOpcode::kFusion;
}

return true;
}

bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
const HloInstruction& consumer) {
if (!IsFusibleAsMultiOutputFusionRoot(producer) ||
!IsFusibleAsMultiOutputFusionRoot(consumer)) {
return false;
}

if (!ShapesCompatibleForMultiOutputFusion(producer, consumer)) {
return false;
}

if (!LayoutsAreReduceInputFusionFriendly(producer, consumer)) {
return false;
}

return true;
}

// This function limits the maximum number of operands to a fusion.
//
// There's a cap on how many parameters we can pass to a CUDA kernel, but
// exactly what that limit is remains hazy, as it depends on (among other
// things) how much GPU constant memory is in use for other purposes.
//
// Moreover, we don't even know at the point that we're running fusion how many
// arguments the CUDA kernel for a fusion node will have: It depends on buffer
// assignment, where we will decide which of the fusion's operands live in XLA's
// big temp buffer versus in other allocations.
//
// As a heuristic, we simply cap the number of fusion operands plus outputs at
// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
// parameters to the kernel, working around the correctness problem.
//
// This limit is also often good for performance. In a fusion with many
// operands, each GPU thread likely has to do a lot of work, and so possibly
// uses a lot of registers, thus limiting occupancy.
bool FusionWouldBeTooLarge(const HloInstruction& instr1,
const HloInstruction& instr2) {
// Compute the number of outputs of the (possibly multi-output) fusion node
// we're considering creating.
//
// This isn't precise; we may be off by one if
// - We're creating a multi-output fusion out of two non-MOFs. Creating a
// MOF adds a new buffer, namely, the tuple buffer.
// - We're merging two MOFs. In this case, we should count the tuple buffer
// only once.
// - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
// `a`. In this case the result of `a` is not part of the output of the
// fusion.
//
// But because this is a heuristic and our limit
// kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
// big difference), we ignore this small inaccuracy in favor of simplicity.
int64 num_output_buffers = ShapeUtil::SubshapeCount(instr1.shape()) +
ShapeUtil::SubshapeCount(instr2.shape());

// The new fusion will have no more operands and outputs than
// producer_operands + consumer_operands - 1 + num_output_buffers
// (minus one because we may be fusing a producer->consumer edge between `a`
// and `b`).
//
// This fact may be enough to let us avoid having to compute the true total
// number of operands, which can be expensive.
if (instr1.operand_count() + instr2.operand_count() - 1 + num_output_buffers <=
kMaxOperandsAndOutputsPerFusion) {
return false;
}

// Compute the precise number of operands to the new fusion.
absl::flat_hash_set<const HloInstruction*> operands(instr1.operands().begin(),
instr1.operands().end());
operands.insert(instr2.operands().begin(), instr2.operands().end());
// If there's an edge between `a` and `b`, don't count it: We're fusing that
// producer -> consumer relationship.
operands.erase(&instr1);
operands.erase(&instr2);
return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
}

bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr) {
// We can fuse reduces and loop fusions. Elementwise instructions can be fused
// with any other instruction.
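To make the operand-cap heuristic concrete, here is a hedged walk-through of the arithmetic in FusionWouldBeTooLarge; all operand counts below are invented for illustration and do not come from the PR:

// Illustrative numbers only, not taken from the PR.
// Suppose instr1 has 40 operands, instr2 has 30, and each produces a
// non-tuple result, so ShapeUtil::SubshapeCount() is 1 for each shape:
//   num_output_buffers = 1 + 1 = 2
// Cheap upper bound on operands + outputs of the merged fusion:
//   40 + 30 - 1 + 2 = 71 > kMaxOperandsAndOutputsPerFusion (64)
// so the early-exit check fails and the precise count is computed.
// If the two instructions share 10 operands and instr1 feeds instr2,
// the deduplicated `operands` set holds 40 + 30 - 10 - 1 = 59 entries,
// and 59 + 2 = 61 <= 64, so the fusion is not rejected after all.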
18 changes: 18 additions & 0 deletions tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -24,6 +24,8 @@ limitations under the License.
namespace xla {
namespace gpu {

constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;

// Whether 'instr' can occur inside fusions, i.e. whether it is a candidate
// for being fused. Note that further restrictions apply, e.g. Scatter must
// be the root of an input fusion.
@@ -59,6 +61,11 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
// is either an unfused scatter op or a scatter input fusion.
bool IsInputFusibleScatter(const HloInstruction& instr);

// Determines whether the combination of `instr1` and `instr2` into a (possibly
// multi-output) fusion would be "too large" -- i.e., have more operands and
// outputs than is allowed.
bool FusionWouldBeTooLarge(const HloInstruction& instr1,
const HloInstruction& instr2);

// Whether instruction shapes are compatible for multi-output fusion, i.e.
// whether the emitters support lowering the resulting fusion.
// This function works for both sibling and producer-consumer multi-output
@@ -69,6 +76,17 @@ bool IsInputFusibleScatter(const HloInstruction& instr);
bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
const HloInstruction& instr2);

// Whether the instructions are compatible for producer-consumer fusion,
// i.e. whether the producer and consumer are loop/input fusible and
// neither is a library call.
bool IsProducerConsumerFusible(const HloInstruction& producer,
const HloInstruction& consumer);

// Whether the instructions are producer-consumer fusible with multiple
// outputs. That is, the root tuple of the multi-output fusion will contain
// the results of both the producer and the consumer.
bool IsProducerConsumerMultiOutputFusible(const HloInstruction& producer,
const HloInstruction& consumer);

// Whether `instr` is a candidate for sibling fusion or as a consumer in
// a producer-consumer multi-output fusion.
bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr);
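Finally, a minimal sketch of how a fusion pass might chain the new predicates. MaybeFuseProducerConsumer is a hypothetical helper written for illustration; only the three predicates it calls come from this change:

#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h"

namespace xla {
namespace gpu {

// Hypothetical helper, not part of this PR: returns true if fusing
// `producer` into `consumer` passes the new fusibility checks.
bool MaybeFuseProducerConsumer(const HloInstruction& producer,
                               const HloInstruction& consumer) {
  // Never build a fusion whose operand + output count exceeds
  // kMaxOperandsAndOutputsPerFusion.
  if (FusionWouldBeTooLarge(producer, consumer)) {
    return false;
  }
  // Prefer plain producer-consumer fusion; otherwise see whether a
  // multi-output fusion of the pair would be legal.
  return IsProducerConsumerFusible(producer, consumer) ||
         IsProducerConsumerMultiOutputFusible(producer, consumer);
}

}  // namespace gpu
}  // namespace xla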