From e3e2849ca3092a4d6ff9c3f19d30cd8e42991fd7 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Tue, 26 Jan 2021 17:02:37 -0800
Subject: [PATCH 1/2] Fix a SimplifyCFG typo that leads to unbounded optimization

Fixes rdar://73357726 ([SR-14068]: Compiling with optimisation runs
indefinitely for grpc-swift)

The root cause of this problem is that SimplifyCFG::tryJumpThreading jump
threads into loops, effectively peeling loops. This is not the right way to
implement loop peeling. That belongs in a loop optimization pass. There is
simply no sane way to control jump threading if it is allowed across loop
boundaries, both from the standpoint of requiring optimizations to terminate
and from the standpoint of reducing senseless code bloat.

SimplifyCFG does have a mechanism to avoid jump-threading into loops in most
cases. That mechanism would actually prevent the infinite loop peeling in this
particular case if it were implemented correctly. But the original
implementation circa 2014 appears to have a typo. This commit fixes that
obvious bug. I do not think it is sufficient to ensure we never see the bad
behavior. I will file separate bugs for the broader issue.

This bad behavior was exposed incidentally by splitting critical edges.
Without edge splitting, SimplifyCFG::simplifyBlocks only performs "jump
threading" once, creating a critical edge to the loop header. Because
simplifyBlocks works under the assumption that there are no critical edges,
it never attempts to perform jump threading again. In other words, the
presence of the critical edge "breaks" the optimization, preventing it from
continuing as intended.

With edge splitting, the simplifyBlocks worklist performs "jump threading"
followed by "jump to trampoline" removal, which creates a new loop-back edge
to the original loop header. This is fine. However, simplifyBlocks
iteratively attempts all optimizations up to a fix point and it does not stop
at loop headers! So, splitting the critical edge causes simplifyBlocks to
work as intended, which leads to infinite loop peeling. The end result is an
infinite sequence of nested loops. Each peeled iteration is actually within
the parent loop.

(cherry picked from commit 8948f7565a2800e0378c2925085bf0a723089973)
---
 lib/SILOptimizer/Transforms/SimplifyCFG.cpp |  2 +-
 test/SILOptimizer/simplify_cfg_simple.sil   | 66 +++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
index 49ab8b662bfd3..6977ec1dbbd51 100644
--- a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
+++ b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
@@ -1401,7 +1401,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
     // Eliminating the trampoline can expose opportunities to improve the
     // new block we branch to.
     if (LoopHeaders.count(DestBB))
-      LoopHeaders.insert(BB);
+      LoopHeaders.insert(trampolineDest.destBB);
 
     addToWorklist(trampolineDest.destBB);
     BI->eraseFromParent();
diff --git a/test/SILOptimizer/simplify_cfg_simple.sil b/test/SILOptimizer/simplify_cfg_simple.sil
index 68834ac2a04a2..6a15f33959544 100644
--- a/test/SILOptimizer/simplify_cfg_simple.sil
+++ b/test/SILOptimizer/simplify_cfg_simple.sil
@@ -7,6 +7,11 @@ sil_stage canonical
 import Builtin
 import Swift
 
+internal enum Enum {
+  case one
+  case two
+}
+
 // CHECK-LABEL: sil @simple_test : $@convention(thin) () -> () {
 // CHECK: bb0:
 // CHECK-NEXT: tuple
@@ -27,3 +32,64 @@ bb3:
   %9999 = tuple ()
   return %9999 : $()
 }
+
+// Test that SimplifyCFG::simplifyBlocks, tryJumpThreading does not
+// perform unbounded loop peeling.
+//
+// rdar://73357726 ([SR-14068]: Compiling with optimisation runs indefinitely for grpc-swift)
+// CHECK-LABEL: sil @testInfinitePeeling : $@convention(method) (Builtin.Int64, Enum) -> () {
+//
+// There is only one switch_enum block, and it is no longer in a loop.
+// CHECK: bb0(%0 : $Builtin.Int64, %1 : $Enum):
+// CHECK:   switch_enum %1 : $Enum, case #Enum.one!enumelt: bb3, case #Enum.two!enumelt: bb4
+// CHECK: bb1:
+// CHECK:   br bb8
+// CHECK: bb2:
+// CHECK:   br bb5(%{{.*}} : $Enum)
+//
+// This is the original cond_br block.
+// CHECK: bb3:
+// CHECK:   cond_br %{{.*}}, bb2, bb1
+// CHECK: bb4:
+// CHECK:   br bb5(%1 : $Enum)
+//
+// This is the cond_br block after jump-threading.
+// CHECK: bb5(%{{.*}} : $Enum):
+// CHECK:   cond_br %{{.*}}, bb6, bb7
+// CHECK: bb6:
+// CHECK:   br bb5(%{{.*}} : $Enum)
+// CHECK: bb7:
+// CHECK:   br bb8
+// CHECK: bb8:
+// CHECK:   return %19 : $()
+// CHECK-LABEL: } // end sil function 'testInfinitePeeling'
+sil @testInfinitePeeling : $@convention(method) (Builtin.Int64, Enum) -> () {
+bb0(%0 : $Builtin.Int64, %1 : $Enum):
+  %2 = integer_literal $Builtin.Int64, 99999999
+  br bb1(%0 : $Builtin.Int64, %1 : $Enum)
+
+bb1(%4 : $Builtin.Int64, %5 : $Enum):
+  switch_enum %5 : $Enum, case #Enum.one!enumelt: bb4, default bb5
+
+bb2(%7 : $Builtin.Int64, %8 : $Enum):
+  %9 = builtin "cmp_slt_Int64"(%2 : $Builtin.Int64, %7 : $Builtin.Int64) : $Builtin.Int1
+  cond_br %9, bb3, bb6
+
+bb3:
+  br bb1(%7 : $Builtin.Int64, %8 : $Enum)
+
+bb4:
+  %12 = integer_literal $Builtin.Int64, 1
+  %13 = integer_literal $Builtin.Int1, -1
+  %14 = builtin "sadd_with_overflow_Int64"(%4 : $Builtin.Int64, %12 : $Builtin.Int64, %13 : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+  %15 = tuple_extract %14 : $(Builtin.Int64, Builtin.Int1), 0
+  %16 = enum $Enum, #Enum.two!enumelt
+  br bb2(%15 : $Builtin.Int64, %16 : $Enum)
+
+bb5:
+  br bb2(%2 : $Builtin.Int64, %5 : $Enum)
+
+bb6:
+  %19 = tuple ()
+  return %19 : $()
+}

From c6df438145aa1f8f782605c48b5858a926ed0cc9 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Tue, 26 Jan 2021 19:40:37 -0800
Subject: [PATCH 2/2] Add a safeguard to SimplifyCFG tryJumpThreading to avoid infinite loop peeling

rdar://73644659 (Add a safeguard to SimplifyCFG tryJumpThreading to avoid
infinite loop peeling)

A case of infinite loop peeling was exposed recently:
([SR-14068]: Compiling with optimisation runs indefinitely for grpc-swift)

It was trivially fixed here:
---
commit 8948f7565a2800e0378c2925085bf0a723089973 (HEAD -> fix-simplifycfg-tramp, public/fix-simplifycfg-tramp)
Author: Andrew Trick
Date:   Tue Jan 26 17:02:37 2021

    Fix a SimplifyCFG typo that leads to unbounded optimization
---
However, that fix isn't a strong guarantee against this behavior.
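
To see why a stronger guarantee is needed, the failure mode can be reduced to
a toy worklist driver (plain C++ for illustration only; the block ids, the
loopHeaders set, and the iteration cap are made up and are not SILOptimizer
API). Whenever the driver is allowed to jump-thread into a loop header it
clones the header, the clone keeps the back edge and is therefore itself a
loop header, and the worklist never drains:

    #include <cstdio>
    #include <deque>
    #include <set>

    int main() {
      std::set<int> loopHeaders = {1};  // bb1 is the original loop header
      std::deque<int> worklist = {1};
      int nextBlockId = 2;
      int peeled = 0;

      // The cap exists only so this demo terminates; the pass itself has no
      // such bound, which is exactly the problem being described here.
      while (!worklist.empty() && peeled < 8) {
        int bb = worklist.front();
        worklist.pop_front();
        if (!loopHeaders.count(bb))
          continue;
        int clone = nextBlockId++;   // "peel" one iteration of the loop
        loopHeaders.insert(clone);   // the clone is a loop header too
        worklist.push_back(clone);   // ...and will be threaded again
        ++peeled;
        std::printf("peeled iteration %d: new header bb%d\n", peeled, clone);
      }
      return 0;
    }
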
The obvious complete fix is that jump-threading should not affect loop
structure. But changing that requires a performance investigation. In the
meantime this change introduces a simple mechanism that guarantees that a
loop header is not repeatedly cloned.

This safeguard is worthwhile because jump-threading across loop boundaries is
kicking in more frequently now that critical edges are being split within
SimplifyCFG. Note that it is both necessary and desirable to split critical
edges between transformations so that SIL remains in a valid state. That
allows other code in SimplifyCFG to call arbitrary SIL utilities, allows
verifying SimplifyCFG by running verification between transformations, and
simplifies the patterns that SimplifyCFG itself needs to consider.

(cherry picked from commit 8b2098445e64f4c78ab2d2047730ab374164d5df)
---
 lib/SILOptimizer/Transforms/SimplifyCFG.cpp | 45 +++++++++++++++------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
index 6977ec1dbbd51..bb8aaa7b4af06 100644
--- a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
+++ b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
@@ -78,6 +78,10 @@ class SimplifyCFG {
   llvm::SmallDenseMap<SILBasicBlock *, unsigned> WorklistMap;
   // Keep track of loop headers - we don't want to jump-thread through them.
   SmallPtrSet<SILBasicBlock *, 32> LoopHeaders;
+  // The set of cloned loop headers to avoid infinite loop peeling. Blocks in
+  // this set may or may not still be LoopHeaders.
+  // (ultimately this can be used to eliminate findLoopHeaders)
+  SmallPtrSet<SILBasicBlock *, 4> ClonedLoopHeaders;
   // The cost (~ number of copied instructions) of jump threading per basic
   // block. Used to prevent infinite jump threading loops.
   llvm::SmallDenseMap<SILBasicBlock *, int> JumpThreadingCost;
@@ -125,6 +129,16 @@ class SimplifyCFG {
   }
 
 private:
+  // Called when \p newBlock inherits the former predecessors of \p
+  // oldBlock. e.g. if \p oldBlock was a loop header, then newBlock is now a
+  // loop header.
+  void substitutedBlockPreds(SILBasicBlock *oldBlock, SILBasicBlock *newBlock) {
+    if (LoopHeaders.count(oldBlock))
+      LoopHeaders.insert(newBlock);
+    if (ClonedLoopHeaders.count(oldBlock))
+      ClonedLoopHeaders.insert(newBlock);
+  }
+
   void clearWorklist() {
     WorklistMap.clear();
     WorklistList.clear();
@@ -170,8 +184,10 @@ class SimplifyCFG {
     // Remove it from the map as well.
     WorklistMap.erase(It);
 
-    if (LoopHeaders.count(BB))
+    if (LoopHeaders.count(BB)) {
       LoopHeaders.erase(BB);
+      ClonedLoopHeaders.erase(BB);
+    }
   }
 
   bool simplifyBlocks();
@@ -1082,10 +1098,13 @@ bool SimplifyCFG::tryJumpThreading(BranchInst *BI) {
     return false;
 
   // Don't jump thread through a potential header - this can produce irreducible
-  // control flow. Still, we make an exception for switch_enum.
+  // control flow and lead to infinite loop peeling.
   bool DestIsLoopHeader = (LoopHeaders.count(DestBB) != 0);
   if (DestIsLoopHeader) {
-    if (!isa<SwitchEnumInst>(destTerminator))
+    // Make an exception for switch_enum, but only if its block was not already
+    // peeled out of its original loop. In that case, further jump threading
+    // can accomplish nothing, and the loop will be infinitely peeled.
+    if (!isa<SwitchEnumInst>(destTerminator) || ClonedLoopHeaders.count(DestBB))
       return false;
   }
 
@@ -1127,8 +1146,14 @@ bool SimplifyCFG::tryJumpThreading(BranchInst *BI) {
 
   // If we jump-thread a switch_enum in the loop header, we have to recalculate
   // the loop header info.
-  if (DestIsLoopHeader)
+  //
+  // FIXME: findLoopHeaders should not be called repeatedly during simplify-cfg
+  // iteration. It is a whole-function analysis! It also does nothing to help
+  // avoid infinite loop peeling.
+  if (DestIsLoopHeader) {
+    ClonedLoopHeaders.insert(Cloner.getNewBB());
     findLoopHeaders();
+  }
 
   ++NumJumpThreads;
   return true;
@@ -1375,8 +1400,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
     for (auto &Succ : remainingBlock->getSuccessors())
       addToWorklist(Succ);
 
-    if (LoopHeaders.count(deletedBlock))
-      LoopHeaders.insert(remainingBlock);
+    substitutedBlockPreds(deletedBlock, remainingBlock);
 
     auto Iter = JumpThreadingCost.find(deletedBlock);
     if (Iter != JumpThreadingCost.end()) {
@@ -1400,8 +1424,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
                                          trampolineDest.newSourceBranchArgs);
     // Eliminating the trampoline can expose opportunities to improve the
     // new block we branch to.
-    if (LoopHeaders.count(DestBB))
-      LoopHeaders.insert(trampolineDest.destBB);
+    substitutedBlockPreds(DestBB, trampolineDest.destBB);
 
     addToWorklist(trampolineDest.destBB);
     BI->eraseFromParent();
@@ -1586,8 +1609,7 @@ bool SimplifyCFG::simplifyCondBrBlock(CondBranchInst *BI) {
                                  BI->getTrueBBCount(), BI->getFalseBBCount());
     BI->eraseFromParent();
 
-    if (LoopHeaders.count(TrueSide))
-      LoopHeaders.insert(ThisBB);
+    substitutedBlockPreds(TrueSide, ThisBB);
     removeIfDead(TrueSide);
     addToWorklist(ThisBB);
     return true;
@@ -1605,8 +1627,7 @@ bool SimplifyCFG::simplifyCondBrBlock(CondBranchInst *BI) {
         falseTrampolineDest.destBB, falseTrampolineDest.newSourceBranchArgs,
         BI->getTrueBBCount(), BI->getFalseBBCount());
     BI->eraseFromParent();
-    if (LoopHeaders.count(FalseSide))
-      LoopHeaders.insert(ThisBB);
+    substitutedBlockPreds(FalseSide, ThisBB);
     removeIfDead(FalseSide);
     addToWorklist(ThisBB);
     return true;
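
The effect of the safeguard above can be restated as a self-contained sketch
(plain C++; integer ids stand in for SILBasicBlocks, and PeelGuard,
mayThreadInto, and recordClonedHeader are illustrative names rather than
SILOptimizer API). A loop header may only be jump-threaded via the
switch_enum exception, and only until it has been cloned once;
substitutedBlockPreds keeps both properties attached to whichever block
inherits the header's predecessors:

    #include <cassert>
    #include <set>

    struct PeelGuard {
      std::set<int> loopHeaders;        // models LoopHeaders
      std::set<int> clonedLoopHeaders;  // models ClonedLoopHeaders

      // Mirrors the check added to tryJumpThreading: threading into a header
      // is allowed only for the switch_enum exception, and only if the header
      // was not itself produced by an earlier clone (peel).
      bool mayThreadInto(int dest, bool endsInSwitchEnum) const {
        if (!loopHeaders.count(dest))
          return true;
        return endsInSwitchEnum && !clonedLoopHeaders.count(dest);
      }

      // Mirrors substitutedBlockPreds: when newBlock takes over oldBlock's
      // predecessors it inherits both properties.
      void substitutedBlockPreds(int oldBlock, int newBlock) {
        if (loopHeaders.count(oldBlock))
          loopHeaders.insert(newBlock);
        if (clonedLoopHeaders.count(oldBlock))
          clonedLoopHeaders.insert(newBlock);
      }

      // Bookkeeping after a successful thread through a header: the cloned
      // block is a header and is marked so that it is never peeled again.
      void recordClonedHeader(int newHeader) {
        loopHeaders.insert(newHeader);
        clonedLoopHeaders.insert(newHeader);
      }
    };

    int main() {
      PeelGuard guard;
      guard.loopHeaders.insert(1);                                // bb1 is a loop header
      assert(guard.mayThreadInto(1, /*endsInSwitchEnum=*/true));  // first peel allowed
      guard.recordClonedHeader(2);                                // bb2 is the cloned header
      assert(!guard.mayThreadInto(2, /*endsInSwitchEnum=*/true)); // further peeling blocked
      return 0;
    }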