From e3e2849ca3092a4d6ff9c3f19d30cd8e42991fd7 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Tue, 26 Jan 2021 17:02:37 -0800
Subject: [PATCH 1/2] Fix a SimplifyCFG typo that leads to unbounded optimization

Fixes rdar://73357726 ([SR-14068]: Compiling with optimisation runs
indefinitely for grpc-swift)

The root cause of this problem is that SimplifyCFG::tryJumpThreading jump
threads into loops, effectively peeling loops. This is not the right way to
implement loop peeling. That belongs in a loop optimization pass. There is
simply no sane way to control jump threading if it is allowed across loop
boundaries, both from the standpoint of requiring optimizations to terminate
and from the standpoint of reducing senseless code bloat.

SimplifyCFG does have a mechanism to avoid jump-threading into loops in most
cases. That mechanism would actually prevent the infinite loop peeling in this
particular case if it were implemented correctly. But the original
implementation circa 2014 appears to have a typo. This commit fixes that
obvious bug. I do not think it is sufficient to ensure we never see the bad
behavior. I will file separate bugs for the broader issue.

This bad behavior was exposed incidentally by splitting critical edges.
Without edge splitting, SimplifyCFG::simplifyBlocks only performs "jump
threading" once, creating a critical edge to the loop header. Because
simplifyBlocks works under the assumption that there are no critical edges,
it never attempts to perform jump threading again. In other words, the
presence of the critical edge "breaks" the optimization, preventing it from
continuing as intended.

With edge splitting, the simplifyBlocks worklist performs "jump threading"
followed by "jump to trampoline" removal, which creates a new loop-back edge
to the original loop header. This is fine. However, simplifyBlocks
iteratively attempts all optimizations up to a fix point and it does not stop
at loop headers! So, splitting the critical edge causes simplifyBlocks to
work as intended, which leads to infinite loop peeling. The end result is an
infinite sequence of nested loops. Each peeled iteration is actually within
the parent loop.

(cherry picked from commit 8948f7565a2800e0378c2925085bf0a723089973)
---
 lib/SILOptimizer/Transforms/SimplifyCFG.cpp |  2 +-
 test/SILOptimizer/simplify_cfg_simple.sil   | 66 +++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
index 49ab8b662bfd3..6977ec1dbbd51 100644
--- a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
+++ b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
@@ -1401,7 +1401,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
     // Eliminating the trampoline can expose opportunities to improve the
     // new block we branch to.
     if (LoopHeaders.count(DestBB))
-      LoopHeaders.insert(BB);
+      LoopHeaders.insert(trampolineDest.destBB);
 
     addToWorklist(trampolineDest.destBB);
     BI->eraseFromParent();
diff --git a/test/SILOptimizer/simplify_cfg_simple.sil b/test/SILOptimizer/simplify_cfg_simple.sil
index 68834ac2a04a2..6a15f33959544 100644
--- a/test/SILOptimizer/simplify_cfg_simple.sil
+++ b/test/SILOptimizer/simplify_cfg_simple.sil
@@ -7,6 +7,11 @@ sil_stage canonical
 import Builtin
 import Swift
 
+internal enum Enum {
+  case one
+  case two
+}
+
 // CHECK-LABEL: sil @simple_test : $@convention(thin) () -> () {
 // CHECK: bb0:
 // CHECK-NEXT: tuple
@@ -27,3 +32,64 @@ bb3:
   %9999 = tuple ()
   return %9999 : $()
 }
+
+// Test that SimplifyCFG::simplifyBlocks, tryJumpThreading does not
+// perform unbounded loop peeling.
+//
+// rdar://73357726 ([SR-14068]: Compiling with optimisation runs indefinitely for grpc-swift)
+// CHECK-LABEL: sil @testInfinitePeeling : $@convention(method) (Builtin.Int64, Enum) -> () {
+//
+// There is only one switch_enum block, and it is no longer in a loop.
+// CHECK: bb0(%0 : $Builtin.Int64, %1 : $Enum):
+// CHECK:   switch_enum %1 : $Enum, case #Enum.one!enumelt: bb3, case #Enum.two!enumelt: bb4
+// CHECK: bb1:
+// CHECK:   br bb8
+// CHECK: bb2:
+// CHECK:   br bb5(%{{.*}} : $Enum)
+//
+// This is the original cond_br block.
+// CHECK: bb3:
+// CHECK:   cond_br %{{.*}}, bb2, bb1
+// CHECK: bb4:
+// CHECK:   br bb5(%1 : $Enum)
+//
+// This is the cond_br block after jump-threading.
+// CHECK: bb5(%{{.*}} : $Enum):
+// CHECK:   cond_br %{{.*}}, bb6, bb7
+// CHECK: bb6:
+// CHECK:   br bb5(%{{.*}} : $Enum)
+// CHECK: bb7:
+// CHECK:   br bb8
+// CHECK: bb8:
+// CHECK:   return %19 : $()
+// CHECK-LABEL: } // end sil function 'testInfinitePeeling'
+sil @testInfinitePeeling : $@convention(method) (Builtin.Int64, Enum) -> () {
+bb0(%0 : $Builtin.Int64, %1 : $Enum):
+  %2 = integer_literal $Builtin.Int64, 99999999
+  br bb1(%0 : $Builtin.Int64, %1 : $Enum)
+
+bb1(%4 : $Builtin.Int64, %5 : $Enum):
+  switch_enum %5 : $Enum, case #Enum.one!enumelt: bb4, default bb5
+
+bb2(%7 : $Builtin.Int64, %8 : $Enum):
+  %9 = builtin "cmp_slt_Int64"(%2 : $Builtin.Int64, %7 : $Builtin.Int64) : $Builtin.Int1
+  cond_br %9, bb3, bb6
+
+bb3:
+  br bb1(%7 : $Builtin.Int64, %8 : $Enum)
+
+bb4:
+  %12 = integer_literal $Builtin.Int64, 1
+  %13 = integer_literal $Builtin.Int1, -1
+  %14 = builtin "sadd_with_overflow_Int64"(%4 : $Builtin.Int64, %12 : $Builtin.Int64, %13 : $Builtin.Int1) : $(Builtin.Int64, Builtin.Int1)
+  %15 = tuple_extract %14 : $(Builtin.Int64, Builtin.Int1), 0
+  %16 = enum $Enum, #Enum.two!enumelt
+  br bb2(%15 : $Builtin.Int64, %16 : $Enum)
+
+bb5:
+  br bb2(%2 : $Builtin.Int64, %5 : $Enum)
+
+bb6:
+  %19 = tuple ()
+  return %19 : $()
+}

From c6df438145aa1f8f782605c48b5858a926ed0cc9 Mon Sep 17 00:00:00 2001
From: Andrew Trick
Date: Tue, 26 Jan 2021 19:40:37 -0800
Subject: [PATCH 2/2] Add a safeguard to SimplifyCFG tryJumpThreading to avoid infinite loop peeling

rdar://73644659 (Add a safeguard to SimplifyCFG tryJumpThreading to avoid
infinite loop peeling)

A case of infinite loop peeling was exposed recently:
([SR-14068]: Compiling with optimisation runs indefinitely for grpc-swift)

It was trivially fixed here:
---
commit 8948f7565a2800e0378c2925085bf0a723089973 (HEAD -> fix-simplifycfg-tramp, public/fix-simplifycfg-tramp)
Author: Andrew Trick
Date:   Tue Jan 26 17:02:37 2021

    Fix a SimplifyCFG typo that leads to unbounded optimization
---
However, that fix isn't a strong guarantee against this behavior.
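
To see why a stronger guarantee is needed, the failure mode can be reduced to
a toy worklist driver (plain C++ for illustration only; the block ids, the
loopHeaders set, and the iteration cap are made up and are not SILOptimizer
API). Whenever the driver is allowed to jump-thread into a loop header it
clones the header, the clone keeps the back edge and is therefore itself a
loop header, and the worklist never drains:

    #include <cstdio>
    #include <deque>
    #include <set>

    int main() {
      std::set<int> loopHeaders = {1};  // bb1 is the original loop header
      std::deque<int> worklist = {1};
      int nextBlockId = 2;
      int peeled = 0;

      // The cap exists only so this demo terminates; the pass itself has no
      // such bound, which is exactly the problem being described here.
      while (!worklist.empty() && peeled < 8) {
        int bb = worklist.front();
        worklist.pop_front();
        if (!loopHeaders.count(bb))
          continue;
        int clone = nextBlockId++;   // "peel" one iteration of the loop
        loopHeaders.insert(clone);   // the clone is a loop header too
        worklist.push_back(clone);   // ...and will be threaded again
        ++peeled;
        std::printf("peeled iteration %d: new header bb%d\n", peeled, clone);
      }
      return 0;
    }
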
The obvious complete fix is that jump-threading should not affect loop
structure. But changing that requires a performance investigation. In the
meantime this change introduces a simple mechanism that guarantees that a
loop header is not repeatedly cloned.

This safeguard is worthwhile because jump-threading across loop boundaries is
kicking in more frequently now that critical edges are being split within
SimplifyCFG. Note that it is both necessary and desirable to split critical
edges between transformations so that SIL remains in a valid state. That
allows other code in SimplifyCFG to call arbitrary SIL utilities, allows
verifying SimplifyCFG by running verification between transformations, and
simplifies the patterns that SimplifyCFG itself needs to consider.

(cherry picked from commit 8b2098445e64f4c78ab2d2047730ab374164d5df)
---
 lib/SILOptimizer/Transforms/SimplifyCFG.cpp | 45 +++++++++++++++------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
index 6977ec1dbbd51..bb8aaa7b4af06 100644
--- a/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
+++ b/lib/SILOptimizer/Transforms/SimplifyCFG.cpp
@@ -78,6 +78,10 @@ class SimplifyCFG {
   llvm::SmallDenseMap<SILBasicBlock *, unsigned> WorklistMap;
   // Keep track of loop headers - we don't want to jump-thread through them.
   SmallPtrSet<SILBasicBlock *, 32> LoopHeaders;
+  // The set of cloned loop headers to avoid infinite loop peeling. Blocks in
+  // this set may or may not still be LoopHeaders.
+  // (ultimately this can be used to eliminate findLoopHeaders)
+  SmallPtrSet<SILBasicBlock *, 4> ClonedLoopHeaders;
   // The cost (~ number of copied instructions) of jump threading per basic
   // block. Used to prevent infinite jump threading loops.
   llvm::SmallDenseMap<SILBasicBlock *, int> JumpThreadingCost;
@@ -125,6 +129,16 @@ class SimplifyCFG {
   }
 
 private:
+  // Called when \p newBlock inherits the former predecessors of \p
+  // oldBlock. e.g. if \p oldBlock was a loop header, then newBlock is now a
+  // loop header.
+  void substitutedBlockPreds(SILBasicBlock *oldBlock, SILBasicBlock *newBlock) {
+    if (LoopHeaders.count(oldBlock))
+      LoopHeaders.insert(newBlock);
+    if (ClonedLoopHeaders.count(oldBlock))
+      ClonedLoopHeaders.insert(newBlock);
+  }
+
   void clearWorklist() {
     WorklistMap.clear();
     WorklistList.clear();
@@ -170,8 +184,10 @@ class SimplifyCFG {
     // Remove it from the map as well.
     WorklistMap.erase(It);
 
-    if (LoopHeaders.count(BB))
+    if (LoopHeaders.count(BB)) {
       LoopHeaders.erase(BB);
+      ClonedLoopHeaders.erase(BB);
+    }
   }
 
   bool simplifyBlocks();
@@ -1082,10 +1098,13 @@ bool SimplifyCFG::tryJumpThreading(BranchInst *BI) {
     return false;
 
   // Don't jump thread through a potential header - this can produce irreducible
-  // control flow. Still, we make an exception for switch_enum.
+  // control flow and lead to infinite loop peeling.
   bool DestIsLoopHeader = (LoopHeaders.count(DestBB) != 0);
   if (DestIsLoopHeader) {
-    if (!isa<SwitchEnumInst>(destTerminator))
+    // Make an exception for switch_enum, but only if its block was not already
+    // peeled out of its original loop. In that case, further jump threading
+    // can accomplish nothing, and the loop will be infinitely peeled.
+    if (!isa<SwitchEnumInst>(destTerminator) || ClonedLoopHeaders.count(DestBB))
       return false;
   }
 
@@ -1127,8 +1146,14 @@ bool SimplifyCFG::tryJumpThreading(BranchInst *BI) {
 
   // If we jump-thread a switch_enum in the loop header, we have to recalculate
   // the loop header info.
-  if (DestIsLoopHeader)
+  //
+  // FIXME: findLoopHeaders should not be called repeatedly during simplify-cfg
+  // iteration. It is a whole-function analysis! It also does nothing to help
+  // avoid infinite loop peeling.
+  if (DestIsLoopHeader) {
+    ClonedLoopHeaders.insert(Cloner.getNewBB());
     findLoopHeaders();
+  }
 
   ++NumJumpThreads;
   return true;
@@ -1375,8 +1400,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
     for (auto &Succ : remainingBlock->getSuccessors())
       addToWorklist(Succ);
 
-    if (LoopHeaders.count(deletedBlock))
-      LoopHeaders.insert(remainingBlock);
+    substitutedBlockPreds(deletedBlock, remainingBlock);
 
     auto Iter = JumpThreadingCost.find(deletedBlock);
     if (Iter != JumpThreadingCost.end()) {
@@ -1400,8 +1424,7 @@ bool SimplifyCFG::simplifyBranchBlock(BranchInst *BI) {
                                          trampolineDest.newSourceBranchArgs);
     // Eliminating the trampoline can expose opportunities to improve the
     // new block we branch to.
-    if (LoopHeaders.count(DestBB))
-      LoopHeaders.insert(trampolineDest.destBB);
+    substitutedBlockPreds(DestBB, trampolineDest.destBB);
 
     addToWorklist(trampolineDest.destBB);
     BI->eraseFromParent();
@@ -1586,8 +1609,7 @@ bool SimplifyCFG::simplifyCondBrBlock(CondBranchInst *BI) {
                                  BI->getTrueBBCount(), BI->getFalseBBCount());
     BI->eraseFromParent();
 
-    if (LoopHeaders.count(TrueSide))
-      LoopHeaders.insert(ThisBB);
+    substitutedBlockPreds(TrueSide, ThisBB);
     removeIfDead(TrueSide);
     addToWorklist(ThisBB);
     return true;
@@ -1605,8 +1627,7 @@ bool SimplifyCFG::simplifyCondBrBlock(CondBranchInst *BI) {
         falseTrampolineDest.destBB, falseTrampolineDest.newSourceBranchArgs,
         BI->getTrueBBCount(), BI->getFalseBBCount());
     BI->eraseFromParent();
-    if (LoopHeaders.count(FalseSide))
-      LoopHeaders.insert(ThisBB);
+    substitutedBlockPreds(FalseSide, ThisBB);
     removeIfDead(FalseSide);
     addToWorklist(ThisBB);
     return true;
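
The effect of the safeguard above can be restated as a self-contained sketch
(plain C++; integer ids stand in for SILBasicBlocks, and PeelGuard,
mayThreadInto, and recordClonedHeader are illustrative names rather than
SILOptimizer API). A loop header may only be jump-threaded via the
switch_enum exception, and only until it has been cloned once;
substitutedBlockPreds keeps both properties attached to whichever block
inherits the header's predecessors:

    #include <cassert>
    #include <set>

    struct PeelGuard {
      std::set<int> loopHeaders;        // models LoopHeaders
      std::set<int> clonedLoopHeaders;  // models ClonedLoopHeaders

      // Mirrors the check added to tryJumpThreading: threading into a header
      // is allowed only for the switch_enum exception, and only if the header
      // was not itself produced by an earlier clone (peel).
      bool mayThreadInto(int dest, bool endsInSwitchEnum) const {
        if (!loopHeaders.count(dest))
          return true;
        return endsInSwitchEnum && !clonedLoopHeaders.count(dest);
      }

      // Mirrors substitutedBlockPreds: when newBlock takes over oldBlock's
      // predecessors it inherits both properties.
      void substitutedBlockPreds(int oldBlock, int newBlock) {
        if (loopHeaders.count(oldBlock))
          loopHeaders.insert(newBlock);
        if (clonedLoopHeaders.count(oldBlock))
          clonedLoopHeaders.insert(newBlock);
      }

      // Bookkeeping after a successful thread through a header: the cloned
      // block is a header and is marked so that it is never peeled again.
      void recordClonedHeader(int newHeader) {
        loopHeaders.insert(newHeader);
        clonedLoopHeaders.insert(newHeader);
      }
    };

    int main() {
      PeelGuard guard;
      guard.loopHeaders.insert(1);                                // bb1 is a loop header
      assert(guard.mayThreadInto(1, /*endsInSwitchEnum=*/true));  // first peel allowed
      guard.recordClonedHeader(2);                                // bb2 is the cloned header
      assert(!guard.mayThreadInto(2, /*endsInSwitchEnum=*/true)); // further peeling blocked
      return 0;
    }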