diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e5d496737d462..98034cfcd8413 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -389,6 +389,8 @@ Non-comprehensive list of changes in this release this build without optimizations (i.e. use `-O0` or use the `optnone` function attribute) or use the `fno-sanitize-merge=` flag in optimized builds. +- ``__builtin_assume_dereferenceable`` now accepts non-constant size operands. + New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 0b3ba89dfceb3..f63fc0f43ea7e 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -854,7 +854,7 @@ def BuiltinAssumeAligned : Builtin { def BuiltinAssumeDereferenceable : Builtin { let Spellings = ["__builtin_assume_dereferenceable"]; let Attributes = [NoThrow, Const]; - let Prototype = "void(void const*, _Constant size_t)"; + let Prototype = "void(void const*, size_t)"; } def BuiltinFree : Builtin { diff --git a/clang/test/CodeGen/builtin-assume-dereferenceable.c b/clang/test/CodeGen/builtin-assume-dereferenceable.c index cadffd4a84c26..0dc4ba089ee3a 100644 --- a/clang/test/CodeGen/builtin-assume-dereferenceable.c +++ b/clang/test/CodeGen/builtin-assume-dereferenceable.c @@ -32,3 +32,62 @@ int test2(int *a) { __builtin_assume_dereferenceable(a, 32ull); return a[0]; } + +// CHECK-LABEL: @test3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store i32 [[N:%.*]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP0]], i64 [[CONV]]) ] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test3(int *a, int n) { + __builtin_assume_dereferenceable(a, n); + return a[0]; +} + +// CHECK-LABEL: @test4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store i64 [[N:%.*]], ptr [[N_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[N_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP0]], i64 [[TMP1]]) ] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test4(int *a, unsigned long long n) { + __builtin_assume_dereferenceable(a, n); + return a[0]; +} + +// CHECK-LABEL: @test5( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca float, align 4 +// 
CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store float [[N:%.*]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = fptoui float [[TMP1]] to i64 +// CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP0]], i64 [[CONV]]) ] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +int test5(int *a, float n) { + __builtin_assume_dereferenceable(a, n); + return a[0]; +} diff --git a/clang/test/SemaCXX/builtin-assume-dereferenceable.cpp b/clang/test/SemaCXX/builtin-assume-dereferenceable.cpp index b79b7c059567e..6380a9a7debf7 100644 --- a/clang/test/SemaCXX/builtin-assume-dereferenceable.cpp +++ b/clang/test/SemaCXX/builtin-assume-dereferenceable.cpp @@ -18,12 +18,12 @@ int test3(int *a) { } int test4(int *a, unsigned size) { - a = __builtin_assume_dereferenceable(a, size); // expected-error {{argument to '__builtin_assume_dereferenceable' must be a constant integer}} + __builtin_assume_dereferenceable(a, size); return a[0]; } int test5(int *a, unsigned long long size) { - a = __builtin_assume_dereferenceable(a, size); // expected-error {{argument to '__builtin_assume_dereferenceable' must be a constant integer}} + __builtin_assume_dereferenceable(a, size); return a[0]; } @@ -53,3 +53,8 @@ constexpr void *l = __builtin_assume_dereferenceable(p, 4); // expected-error {{ void *foo() { return l; } + +int test10(int *a) { + __builtin_assume_dereferenceable(a, a); // expected-error {{cannot initialize a parameter of type 'unsigned long' with an lvalue of type 'int *'}} + return a[0]; +} diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 73bfe1aabb4e0..52ab38583d5de 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -180,11 +180,15 @@ class MemoryDepChecker { const SmallVectorImpl &Instrs) const; }; - MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L, + MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC, + DominatorTree *DT, const Loop *L, const DenseMap &SymbolicStrides, - unsigned MaxTargetVectorWidthInBits) - : PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides), - MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {} + unsigned MaxTargetVectorWidthInBits, + std::optional &LoopGuards) + : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L), + SymbolicStrides(SymbolicStrides), + MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits), + LoopGuards(LoopGuards) {} /// Register the location (instructions are given increasing numbers) /// of a write access. @@ -236,8 +240,8 @@ class MemoryDepChecker { /// In same cases when the dependency check fails we can still /// vectorize the loop with a dynamic array access check. 
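 /// For example (illustrative): accesses A[i] and A[i + N], where N is not a
 /// compile-time constant, have a non-constant dependence distance, but a
 /// runtime check that the two address ranges do not overlap can still allow
 /// vectorization.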
- bool shouldRetryWithRuntimeCheck() const { - return FoundNonConstantDistanceDependence && + bool shouldRetryWithRuntimeChecks() const { + return ShouldRetryWithRuntimeChecks && Status == VectorizationSafetyStatus::PossiblySafeWithRtChecks; } @@ -288,6 +292,15 @@ class MemoryDepChecker { return PointerBounds; } + DominatorTree *getDT() const { + assert(DT && "requested DT, but it is not available"); + return DT; + } + AssumptionCache *getAC() const { + assert(AC && "requested AC, but it is not available"); + return AC; + } + private: /// A wrapper around ScalarEvolution, used to add runtime SCEV checks, and /// applies dynamic knowledge to simplify SCEV expressions and convert them @@ -296,6 +309,10 @@ class MemoryDepChecker { /// example we might assume a unit stride for a pointer in order to prove /// that a memory access is strided and doesn't wrap. PredicatedScalarEvolution &PSE; + + AssumptionCache *AC; + DominatorTree *DT; + const Loop *InnermostLoop; /// Reference to map of pointer values to @@ -327,9 +344,9 @@ class MemoryDepChecker { uint64_t MaxStoreLoadForwardSafeDistanceInBits = std::numeric_limits::max(); - /// If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool FoundNonConstantDistanceDependence = false; + /// Whether we should try to vectorize the loop with runtime checks, if the + /// dependencies are not safe. + bool ShouldRetryWithRuntimeChecks = false; /// Result of the dependence checks, indicating whether the checked /// dependences are safe for vectorization, require RT checks or are known to @@ -358,7 +375,7 @@ class MemoryDepChecker { PointerBounds; /// Cache for the loop guards of InnermostLoop. - std::optional LoopGuards; + std::optional &LoopGuards; /// Check whether there is a plausible dependence between the two /// accesses. @@ -516,8 +533,9 @@ class RuntimePointerChecking { AliasSetId(AliasSetId), Expr(Expr), NeedsFreeze(NeedsFreeze) {} }; - RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE) - : DC(DC), SE(SE) {} + RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE, + std::optional &LoopGuards) + : DC(DC), SE(SE), LoopGuards(LoopGuards) {} /// Reset the state of the pointer runtime information. void reset() { @@ -631,6 +649,9 @@ class RuntimePointerChecking { /// Holds a pointer to the ScalarEvolution analysis. ScalarEvolution *SE; + /// Cache for the loop guards of the loop. + std::optional &LoopGuards; + /// Set of run-time checks required to establish independence of /// otherwise may-aliasing pointers in the loop. SmallVector Checks; @@ -670,7 +691,7 @@ class LoopAccessInfo { LLVM_ABI LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, - DominatorTree *DT, LoopInfo *LI, + DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC, bool AllowPartial = false); /// Return true we can analyze the memory accesses in the loop and there are @@ -806,6 +827,9 @@ class LoopAccessInfo { Loop *TheLoop; + /// Cache for the loop guards of TheLoop. + std::optional LoopGuards; + /// Determines whether we should generate partial runtime checks when not all /// memory accesses could be analyzed. 
bool AllowPartial; @@ -922,7 +946,9 @@ LLVM_ABI std::pair getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, - std::pair> *PointerBounds); + std::pair> *PointerBounds, + DominatorTree *DT, AssumptionCache *AC, + std::optional &LoopGuards); class LoopAccessInfoManager { /// The cache. @@ -935,12 +961,13 @@ class LoopAccessInfoManager { LoopInfo &LI; TargetTransformInfo *TTI; const TargetLibraryInfo *TLI = nullptr; + AssumptionCache *AC; public: LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT, LoopInfo &LI, TargetTransformInfo *TTI, - const TargetLibraryInfo *TLI) - : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI) {} + const TargetLibraryInfo *TLI, AssumptionCache *AC) + : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {} LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false); diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index b83fc3b72abf3..ca0b7823558ff 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1000,10 +1000,14 @@ class ScalarEvolution { /// (at every loop iteration). It is, at the same time, the minimum number /// of times S is divisible by 2. For example, given {4,+,8} it returns 2. /// If S is guaranteed to be 0, it returns the bitwidth of S. - LLVM_ABI uint32_t getMinTrailingZeros(const SCEV *S); + /// If \p CtxI is not nullptr, return a constant multiple valid at \p CtxI. + LLVM_ABI uint32_t getMinTrailingZeros(const SCEV *S, + const Instruction *CtxI = nullptr); - /// Returns the max constant multiple of S. - LLVM_ABI APInt getConstantMultiple(const SCEV *S); + /// Returns the max constant multiple of S. If \p CtxI is not nullptr, return + /// a constant multiple valid at \p CtxI. + LLVM_ABI APInt getConstantMultiple(const SCEV *S, + const Instruction *CtxI = nullptr); // Returns the max constant multiple of S. If S is exactly 0, return 1. LLVM_ABI APInt getNonZeroConstantMultiple(const SCEV *S); @@ -1339,6 +1343,7 @@ class ScalarEvolution { class LoopGuards { DenseMap RewriteMap; + SmallDenseSet> NotEqual; bool PreserveNUW = false; bool PreserveNSW = false; ScalarEvolution &SE; @@ -1525,8 +1530,10 @@ class ScalarEvolution { /// Return the Value set from which the SCEV expr is generated. ArrayRef getSCEVValues(const SCEV *S); - /// Private helper method for the getConstantMultiple method. - APInt getConstantMultipleImpl(const SCEV *S); + /// Private helper method for the getConstantMultiple method. If \p CtxI is + /// not nullptr, return a constant multiple valid at \p CtxI. + APInt getConstantMultipleImpl(const SCEV *S, + const Instruction *Ctx = nullptr); /// Information about the number of times a particular loop exit may be /// reached before exiting the loop. @@ -2310,10 +2317,6 @@ class ScalarEvolution { /// an add rec on said loop. void getUsedLoops(const SCEV *S, SmallPtrSetImpl &LoopsUsed); - /// Try to match the pattern generated by getURemExpr(A, B). If successful, - /// Assign A and B to LHS and RHS, respectively. - LLVM_ABI bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS); - /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in /// `UniqueSCEVs`. Return if found, else nullptr. 
 SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 09e3945f5a8ff..24a86c59d1cdb 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -91,6 +91,14 @@ inline bind_ty<const SCEVUnknown> m_SCEVUnknown(const SCEVUnknown *&V) {
   return V;
 }
 
+inline bind_ty<const SCEVAddExpr> m_scev_Add(const SCEVAddExpr *&V) {
+  return V;
+}
+
+inline bind_ty<const SCEVMulExpr> m_scev_Mul(const SCEVMulExpr *&V) {
+  return V;
+}
+
 /// Match a specified const SCEV *.
 struct specificscev_ty {
   const SCEV *Expr;
@@ -160,6 +168,18 @@ m_scev_ZExt(const Op0_t &Op0) {
   return m_scev_Unary<SCEVZeroExtendExpr>(Op0);
 }
 
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>
+m_scev_PtrToInt(const Op0_t &Op0) {
+  return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
+m_scev_Trunc(const Op0_t &Op0) {
+  return m_scev_Unary<SCEVTruncateExpr>(Op0);
+}
+
 /// Match a binary SCEV.
 template <typename SCEVTy, typename Op0_t, typename Op1_t>
 struct SCEVBinaryExpr_match {
@@ -199,6 +219,76 @@ m_scev_UDiv(const Op0_t &Op0, const Op1_t &Op1) {
   return m_scev_Binary<SCEVUDivExpr>(Op0, Op1);
 }
 
+/// Match unsigned remainder pattern.
+/// Matches patterns generated by getURemExpr.
+template <typename Op0_t, typename Op1_t> struct SCEVURem_match {
+  Op0_t Op0;
+  Op1_t Op1;
+  ScalarEvolution &SE;
+
+  SCEVURem_match(Op0_t Op0, Op1_t Op1, ScalarEvolution &SE)
+      : Op0(Op0), Op1(Op1), SE(SE) {}
+
+  bool match(const SCEV *Expr) const {
+    if (Expr->getType()->isPointerTy())
+      return false;
+
+    // Try to match 'zext (trunc A to iB) to iY', which is used
+    // for URem with constant power-of-2 second operands. Make sure the size of
+    // the operand A matches the size of the whole expressions.
+    const SCEV *LHS;
+    if (SCEVPatternMatch::match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) {
+      Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType();
+      // Bail out if the type of the LHS is larger than the type of the
+      // expression for now.
+      if (SE.getTypeSizeInBits(LHS->getType()) >
+          SE.getTypeSizeInBits(Expr->getType()))
+        return false;
+      if (LHS->getType() != Expr->getType())
+        LHS = SE.getZeroExtendExpr(LHS, Expr->getType());
+      const SCEV *RHS =
+          SE.getConstant(APInt(SE.getTypeSizeInBits(Expr->getType()), 1)
+                         << SE.getTypeSizeInBits(TruncTy));
+      return Op0.match(LHS) && Op1.match(RHS);
+    }
+
+    const SCEV *A;
+    const SCEVMulExpr *Mul;
+    if (!SCEVPatternMatch::match(Expr, m_scev_Add(m_scev_Mul(Mul), m_SCEV(A))))
+      return false;
+
+    const auto MatchURemWithDivisor = [&](const SCEV *B) {
+      // (SomeExpr + (-(SomeExpr / B) * B)).
+      if (Expr == SE.getURemExpr(A, B))
+        return Op0.match(A) && Op1.match(B);
+      return false;
+    };
+
+    // (SomeExpr + (-1 * (SomeExpr / B) * B)).
+    if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
+      return MatchURemWithDivisor(Mul->getOperand(1)) ||
+             MatchURemWithDivisor(Mul->getOperand(2));
+
+    // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
+    if (Mul->getNumOperands() == 2)
+      return MatchURemWithDivisor(Mul->getOperand(1)) ||
+             MatchURemWithDivisor(Mul->getOperand(0)) ||
+             MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(1))) ||
+             MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(0)));
+    return false;
+  }
+};
+
+/// Match the mathematical pattern A - (A / B) * B, where A and B can be
+/// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
+/// for URem with constant power-of-2 second operands. It's not always easy, as
+/// A and B can be folded (imagine A is X / 2, and B is 4, A / B becomes X / 8).
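+/// A hypothetical usage sketch (illustrative only; Expr and SE stand for a
+/// SCEV expression and a ScalarEvolution instance):
+/// \code
+///   const SCEV *A, *B;
+///   if (SCEVPatternMatch::match(Expr, m_scev_URem(m_SCEV(A), m_SCEV(B), SE)))
+///     ; // Expr is equivalent to A urem B, with both operands bound.
+/// \endcode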
+template <typename Op0_t, typename Op1_t>
+inline SCEVURem_match<Op0_t, Op1_t> m_scev_URem(Op0_t LHS, Op1_t RHS,
+                                                ScalarEvolution &SE) {
+  return SCEVURem_match<Op0_t, Op1_t>(LHS, RHS, SE);
+}
+
 inline class_match<const Loop> m_Loop() { return class_match<const Loop>(); }
 
 /// Match an affine SCEVAddRecExpr.
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 02990a3cb44f7..3e1e718114008 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -618,6 +618,12 @@ LLVM_ABI bool isValidAssumeForContext(const Instruction *I,
                                       const DominatorTree *DT = nullptr,
                                       bool AllowEphemerals = false);
 
+/// Returns true if no instruction between \p Assume and \p CtxI may free
+/// memory and the function is marked as NoSync. The latter ensures the current
+/// function cannot arrange for another thread to free on its behalf.
+LLVM_ABI bool willNotFreeBetween(const Instruction *Assume,
+                                 const Instruction *CtxI);
+
 enum class OverflowResult {
   /// Always overflows in the direction of signed/unsigned min value.
   AlwaysOverflowsLow,
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index a101151eed7cc..39fef921a9590 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -530,6 +530,7 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> {
   bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L);
+  Value *tryToReuseLCSSAPhi(const SCEVAddRecExpr *S);
   Value *expandAddRecExprLiterally(const SCEVAddRecExpr *);
   PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
                                      const Loop *L, Type *&TruncTy,
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index fcc2cf2f7e8e7..ff0a9966551ce 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -36,17 +36,13 @@ static bool isDereferenceableAndAlignedPointerViaAssumption(
     function_ref<bool(const RetainedKnowledge &)> CheckSize,
     const DataLayout &DL, const Instruction *CtxI, AssumptionCache *AC,
     const DominatorTree *DT) {
-  // Dereferenceable information from assumptions is only valid if the value
-  // cannot be freed between the assumption and use. For now just use the
-  // information for values that cannot be freed in the function.
-  // TODO: More precisely check if the pointer can be freed between assumption
-  // and use.
-  if (!CtxI || Ptr->canBeFreed())
+  if (!CtxI)
     return false;
   /// Look through assumes to see if both dereferencability and alignment can
   /// be proven by an assume if needed.
   RetainedKnowledge AlignRK;
   RetainedKnowledge DerefRK;
+  bool PtrCanBeFreed = Ptr->canBeFreed();
   bool IsAligned = Ptr->getPointerAlignment(DL) >= Alignment;
   return getKnowledgeForValue(
       Ptr, {Attribute::Dereferenceable, Attribute::Alignment}, *AC,
@@ -55,7 +51,11 @@ static bool isDereferenceableAndAlignedPointerViaAssumption(
           return false;
         if (RK.AttrKind == Attribute::Alignment)
           AlignRK = std::max(AlignRK, RK);
-        if (RK.AttrKind == Attribute::Dereferenceable)
+
+        // Dereferenceable information from assumptions is only valid if the
+        // value cannot be freed between the assumption and use.
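+        // willNotFreeBetween (declared in ValueTracking.h above) requires
+        // that no instruction between the assume and CtxI may free memory
+        // and that the function is nosync.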
+ if ((!PtrCanBeFreed || willNotFreeBetween(Assume, CtxI)) && + RK.AttrKind == Attribute::Dereferenceable) DerefRK = std::max(DerefRK, RK); IsAligned |= AlignRK && AlignRK.ArgValue >= Alignment.value(); if (IsAligned && DerefRK && CheckSize(DerefRK)) @@ -332,17 +332,10 @@ bool llvm::isDereferenceableAndAlignedInLoop( : SE.getBackedgeTakenCount(L); if (isa(MaxBECount)) return false; - - if (isa(BECount)) { - // TODO: Support symbolic max backedge taken counts for loops without - // computable backedge taken counts. - MaxBECount = - Predicates - ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates) - : SE.getConstantMaxBackedgeTakenCount(L); - } - const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess( - L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr); + std::optional LoopGuards; + const auto &[AccessStart, AccessEnd] = + getStartAndEndForAccess(L, PtrScev, LI->getType(), BECount, MaxBECount, + &SE, nullptr, &DT, AC, LoopGuards); if (isa(AccessStart) || isa(AccessEnd)) return false; @@ -351,7 +344,13 @@ bool llvm::isDereferenceableAndAlignedInLoop( const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart); if (isa(PtrDiff)) return false; - APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff); + + if (!LoopGuards) + LoopGuards.emplace( + ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE)); + + APInt MaxPtrDiff = + SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, *LoopGuards)); Value *Base = nullptr; APInt AccessSize; @@ -391,16 +390,22 @@ bool llvm::isDereferenceableAndAlignedInLoop( } else return false; - Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt(); + Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt(); + if (BasicBlock *LoopPred = L->getLoopPredecessor()) { + if (isa(LoopPred->getTerminator())) + CtxI = LoopPred->getTerminator(); + } return isDereferenceableAndAlignedPointerViaAssumption( Base, Alignment, - [&SE, AccessSizeSCEV](const RetainedKnowledge &RK) { - return SE.isKnownPredicate(CmpInst::ICMP_ULE, AccessSizeSCEV, - SE.getSCEV(RK.IRArgValue)); + [&SE, AccessSizeSCEV, &LoopGuards](const RetainedKnowledge &RK) { + return SE.isKnownPredicate( + CmpInst::ICMP_ULE, + SE.applyLoopGuards(AccessSizeSCEV, *LoopGuards), + SE.applyLoopGuards(SE.getSCEV(RK.IRArgValue), *LoopGuards)); }, - DL, HeaderFirstNonPHI, AC, &DT) || + DL, CtxI, AC, &DT) || isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL, - HeaderFirstNonPHI, AC, &DT); + CtxI, AC, &DT); } static bool suppressSpeculativeLoadForSanitizers(const Instruction &CtxI) { diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f8f741575f87a..02a6a66362493 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -23,6 +23,8 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumeBundleQueries.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -208,28 +210,57 @@ static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B, /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at /// \p MaxBTC is guaranteed inbounds of the accessed object. 
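 /// Informally, for a non-negative step, the implementation below establishes
 /// (a sketch of the logic, not the exact code):
 ///   StartOffset + MaxBTC * |Step| + EltSize u<= max(DerefBytes, assumed bytes)
 /// with every addition and multiplication first proven not to overflow.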
-static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
-                                                 const SCEV *MaxBTC,
-                                                 const SCEV *EltSize,
-                                                 ScalarEvolution &SE,
-                                                 const DataLayout &DL) {
+static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
+    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
+    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
+    AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
   auto *PointerBase = SE.getPointerBase(AR->getStart());
   auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
   if (!StartPtr)
     return false;
+  const Loop *L = AR->getLoop();
   bool CheckForNonNull, CheckForFreed;
-  uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes(
+  Value *StartPtrV = StartPtr->getValue();
+  uint64_t DerefBytes = StartPtrV->getPointerDereferenceableBytes(
       DL, CheckForNonNull, CheckForFreed);
-  if (CheckForNonNull || CheckForFreed)
+  if (DerefBytes && (CheckForNonNull || CheckForFreed))
     return false;
 
   const SCEV *Step = AR->getStepRecurrence(SE);
+  Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
+  const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes);
+
+  // Check if we have a suitable dereferenceable assumption we can use.
+  Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
+  if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
+    if (isa<BranchInst>(LoopPred->getTerminator()))
+      CtxI = LoopPred->getTerminator();
+  }
+  RetainedKnowledge DerefRK;
+  getKnowledgeForValue(StartPtrV, {Attribute::Dereferenceable}, *AC,
+                       [&](RetainedKnowledge RK, Instruction *Assume, auto) {
+                         if (!isValidAssumeForContext(Assume, CtxI, DT))
+                           return false;
+                         if (StartPtrV->canBeFreed() &&
+                             !willNotFreeBetween(Assume, CtxI))
+                           return false;
+                         DerefRK = std::max(DerefRK, RK);
+                         return true;
+                       });
+  if (DerefRK) {
+    DerefBytesSCEV =
+        SE.getUMaxExpr(DerefBytesSCEV, SE.getSCEV(DerefRK.IRArgValue));
+  }
+
+  if (DerefBytesSCEV->isZero())
+    return false;
+
   bool IsKnownNonNegative = SE.isKnownNonNegative(Step);
   if (!IsKnownNonNegative && !SE.isKnownNegative(Step))
     return false;
 
-  Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
   Step = SE.getNoopOrSignExtend(Step, WiderTy);
   MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy);
@@ -239,10 +270,25 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
   const SCEV *StartOffset = SE.getNoopOrZeroExtend(
       SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);
 
+  if (!LoopGuards)
+    LoopGuards.emplace(ScalarEvolution::LoopGuards::collect(AR->getLoop(), SE));
+  MaxBTC = SE.applyLoopGuards(MaxBTC, *LoopGuards);
+
   const SCEV *OffsetAtLastIter =
       mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
-  if (!OffsetAtLastIter)
-    return false;
+  if (!OffsetAtLastIter) {
+    // Re-try with constant max backedge-taken count if using the symbolic one
+    // failed.
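+    // The constant max backedge-taken count may be a tighter bound than the
+    // symbolic maximum, so the overflow-checked multiply below can succeed
+    // where the symbolic attempt did not.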
+ MaxBTC = SE.getConstantMaxBackedgeTakenCount(AR->getLoop()); + if (isa(MaxBTC)) + return false; + MaxBTC = SE.getNoopOrZeroExtend( + MaxBTC, WiderTy); + OffsetAtLastIter = + mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE); + if (!OffsetAtLastIter) + return false; + } const SCEV *OffsetEndBytes = addSCEVNoOverflow( OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE); @@ -256,8 +302,9 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE); if (!EndBytes) return false; - return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, - SE.getConstant(WiderTy, DerefBytes)); + + DerefBytesSCEV = SE.applyLoopGuards(DerefBytesSCEV, *LoopGuards); + return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV); } // For negative steps check if @@ -265,15 +312,16 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR, // * StartOffset <= DerefBytes. assert(SE.isKnownNegative(Step) && "must be known negative"); return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) && - SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, - SE.getConstant(WiderTy, DerefBytes)); + SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset, DerefBytesSCEV); } std::pair llvm::getStartAndEndForAccess( const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC, const SCEV *MaxBTC, ScalarEvolution *SE, DenseMap, - std::pair> *PointerBounds) { + std::pair> *PointerBounds, + DominatorTree *DT, AssumptionCache *AC, + std::optional &LoopGuards) { std::pair *PtrBoundsPair; if (PointerBounds) { auto [Iter, Ins] = PointerBounds->insert( @@ -308,8 +356,8 @@ std::pair llvm::getStartAndEndForAccess( // sets ScEnd to the maximum unsigned value for the type. Note that LAA // separately checks that accesses cannot not wrap, so unsigned max // represents an upper bound. - if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, - DL)) { + if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL, + DT, AC, LoopGuards)) { ScEnd = AR->evaluateAtIteration(MaxBTC, *SE); } else { ScEnd = SE->getAddExpr( @@ -356,9 +404,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr, bool NeedsFreeze) { const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); const SCEV *BTC = PSE.getBackedgeTakenCount(); - const auto &[ScStart, ScEnd] = - getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, - PSE.getSE(), &DC.getPointerBounds()); + const auto &[ScStart, ScEnd] = getStartAndEndForAccess( + Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(), + &DC.getPointerBounds(), DC.getDT(), DC.getAC(), LoopGuards); assert(!isa(ScStart) && !isa(ScEnd) && "must be able to compute both start and end expressions"); @@ -589,11 +637,11 @@ void RuntimePointerChecking::groupChecks( // dependence. Not grouping the checks for a[i] and a[i + 9000] allows // us to perform an accurate check in this case. // - // The above case requires that we have an UnknownDependence between - // accesses to the same underlying object. This cannot happen unless - // FoundNonConstantDistanceDependence is set, and therefore UseDependencies - // is also false. In this case we will use the fallback path and create - // separate checking groups for all pointers. + // In the above case, we have a non-constant distance and an Unknown + // dependence between accesses to the same underlying object, and could retry + // with runtime checks. 
Therefore UseDependencies is false. In this case we + // will use the fallback path and create separate checking groups for all + // pointers. // If we don't have the dependency partitions, construct a new // checking pointer group for each pointer. This is also required @@ -819,7 +867,7 @@ class AccessAnalysis { /// perform dependency checking. /// /// Note that this can later be cleared if we retry memcheck analysis without - /// dependency checking (i.e. FoundNonConstantDistanceDependence). + /// dependency checking (i.e. ShouldRetryWithRuntimeChecks). bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); } /// We decided that no dependence analysis would be used. Reset the state. @@ -896,7 +944,7 @@ class AccessAnalysis { /// /// Note that, this is different from isDependencyCheckNeeded. When we retry /// memcheck analysis without dependency checking - /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is + /// (i.e. ShouldRetryWithRuntimeChecks), isDependencyCheckNeeded is /// cleared while this remains set if we have potentially dependent accesses. bool IsRTCheckAnalysisNeeded = false; @@ -1961,13 +2009,15 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src, const SCEV *BTC = PSE.getBackedgeTakenCount(); const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount(); ScalarEvolution &SE = *PSE.getSE(); - const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess( - InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + const auto &[SrcStart_, SrcEnd_] = + getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, + &SE, &PointerBounds, DT, AC, LoopGuards); if (isa(SrcStart_) || isa(SrcEnd_)) return false; - const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess( - InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds); + const auto &[SinkStart_, SinkEnd_] = + getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, + &SE, &PointerBounds, DT, AC, LoopGuards); if (isa(SinkStart_) || isa(SinkEnd_)) return false; @@ -2079,11 +2129,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (StrideAScaled == StrideBScaled) CommonStride = StrideAScaled; - // TODO: FoundNonConstantDistanceDependence is used as a necessary condition - // to consider retrying with runtime checks. Historically, we did not set it - // when (unscaled) strides were different but there is no inherent reason to. + // TODO: Historically, we didn't retry with runtime checks when (unscaled) + // strides were different but there is no inherent reason to. if (!isa(Dist)) - FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt; + ShouldRetryWithRuntimeChecks |= StrideAPtrInt == StrideBPtrInt; return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite); @@ -2713,7 +2762,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, DepsAreSafe = DepChecker->areDepsSafe(DepCands, Accesses.getDependenciesToCheck()); - if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) { + if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeChecks()) { LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n"); // Clear the dependency checks. We assume they are not needed. 
@@ -3004,7 +3053,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetTransformInfo *TTI, const TargetLibraryInfo *TLI, AAResults *AA, DominatorTree *DT, LoopInfo *LI, - bool AllowPartial) + AssumptionCache *AC, bool AllowPartial) : PSE(std::make_unique(*SE, *L)), PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) { unsigned MaxTargetVectorWidthInBits = std::numeric_limits::max(); @@ -3014,9 +3063,10 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, MaxTargetVectorWidthInBits = TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2; - DepChecker = std::make_unique(*PSE, L, SymbolicStrides, - MaxTargetVectorWidthInBits); - PtrRtChecking = std::make_unique(*DepChecker, SE); + DepChecker = std::make_unique( + *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards); + PtrRtChecking = + std::make_unique(*DepChecker, SE, LoopGuards); if (canAnalyzeLoop()) CanVecMem = analyzeLoop(AA, LI, TLI, DT); } @@ -3084,7 +3134,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L, // or if it was created with a different value of AllowPartial. if (Inserted || It->second->hasAllowPartial() != AllowPartial) It->second = std::make_unique(&L, &SE, TTI, TLI, &AA, &DT, - &LI, AllowPartial); + &LI, AC, AllowPartial); return *It->second; } @@ -3127,7 +3177,8 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F, auto &LI = FAM.getResult(F); auto &TTI = FAM.getResult(F); auto &TLI = FAM.getResult(F); - return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI); + auto &AC = FAM.getResult(F); + return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC); } AnalysisKey LoopAccessAnalysis::Key; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 221468a2d1a84..239849e670350 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -1774,7 +1774,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty, { const SCEV *LHS; const SCEV *RHS; - if (matchURem(Op, LHS, RHS)) + if (match(Op, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), *this))) return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1), getZeroExtendExpr(RHS, Ty, Depth + 1)); } @@ -2682,21 +2682,30 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, return getAddExpr(NewOps, PreservedFlags); } } - } - // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y) - if (Ops.size() == 2) { - const SCEVMulExpr *Mul = dyn_cast(Ops[0]); - if (Mul && Mul->getNumOperands() == 2 && - Mul->getOperand(0)->isAllOnesValue()) { - const SCEV *X; - const SCEV *Y; - if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) { - return getMulExpr(Y, getUDivExpr(X, Y)); + // Try to push the constant operand into a ZExt: A + zext (-A + B) -> zext + // (B), if trunc (A) + -A + B does not unsigned-wrap. 
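+  // E.g. with an i8 inner add: 16 + (zext i8 (-16 + %n) to i64) becomes
+  // (zext i8 %n to i64), provided (-16 + %n) + 16 is known not to
+  // unsigned-wrap in i8 (illustrative values).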
+  const SCEVAddExpr *InnerAdd;
+  if (match(B, m_scev_ZExt(m_scev_Add(InnerAdd)))) {
+    const SCEV *NarrowA = getTruncateExpr(A, InnerAdd->getType());
+    if (NarrowA == getNegativeSCEV(InnerAdd->getOperand(0)) &&
+        getZeroExtendExpr(NarrowA, B->getType()) == A &&
+        hasFlags(StrengthenNoWrapFlags(this, scAddExpr, {NarrowA, InnerAdd},
+                                       SCEV::FlagAnyWrap),
+                 SCEV::FlagNUW)) {
+      return getZeroExtendExpr(getAddExpr(NarrowA, InnerAdd), B->getType());
+    }
+  }
   }
 
+  // Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
+  const SCEV *Y;
+  if (Ops.size() == 2 &&
+      match(Ops[0],
+            m_scev_Mul(m_scev_AllOnes(),
+                       m_scev_URem(m_scev_Specific(Ops[1]), m_SCEV(Y), *this))))
+    return getMulExpr(Y, getUDivExpr(Ops[1], Y));
+
   // Skip past any other cast SCEVs.
   while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
     ++Idx;
@@ -3186,6 +3195,46 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                           AddRec->getNoWrapFlags(FlagsMask));
     }
   }
+
+  // Try to push the constant operand into a ZExt: C * zext (A + B) ->
+  // zext (C*A + C*B) if trunc (C) * (A + B) does not unsigned-wrap.
+  const SCEVAddExpr *InnerAdd;
+  if (match(Ops[1], m_scev_ZExt(m_scev_Add(InnerAdd)))) {
+    const SCEV *NarrowC = getTruncateExpr(LHSC, InnerAdd->getType());
+    if (isa<SCEVConstant>(InnerAdd->getOperand(0)) &&
+        getZeroExtendExpr(NarrowC, Ops[1]->getType()) == LHSC &&
+        hasFlags(StrengthenNoWrapFlags(this, scMulExpr, {NarrowC, InnerAdd},
+                                       SCEV::FlagAnyWrap),
+                 SCEV::FlagNUW)) {
+      auto *Res = getMulExpr(NarrowC, InnerAdd, SCEV::FlagNUW, Depth + 1);
+      return getZeroExtendExpr(Res, Ops[1]->getType(), Depth + 1);
+    };
+  }
+
+  // Try to fold (C1 * D /u C2) -> C1/C2 * D, if C1 and C2 are powers-of-2,
+  // D is a multiple of C2, and C1 is a multiple of C2. If C2 is a multiple
+  // of C1, fold to (D /u (C2 /u C1)).
+  const SCEV *D;
+  APInt C1V = LHSC->getAPInt();
+  // (C1 * D /u C2) == -1 * -C1 * D /u C2 when C1 != INT_MIN. Don't treat -1
+  // as -1 * 1, as it won't enable additional folds.
+  if (C1V.isNegative() && !C1V.isMinSignedValue() && !C1V.isAllOnes())
+    C1V = C1V.abs();
+  const SCEVConstant *C2;
+  if (C1V.isPowerOf2() &&
+      match(Ops[1], m_scev_UDiv(m_SCEV(D), m_SCEVConstant(C2))) &&
+      C2->getAPInt().isPowerOf2() &&
+      C1V.logBase2() <= getMinTrailingZeros(D)) {
+    const SCEV *NewMul = nullptr;
+    if (C1V.uge(C2->getAPInt())) {
+      NewMul = getMulExpr(getUDivExpr(getConstant(C1V), C2), D);
+    } else if (C2->getAPInt().logBase2() <= getMinTrailingZeros(D)) {
+      assert(C1V.ugt(1) && "C1 <= 1 should have been folded earlier");
+      NewMul = getUDivExpr(D, getUDivExpr(C2, getConstant(C1V)));
+    }
+    if (NewMul)
+      return C1V == LHSC->getAPInt() ? NewMul : getNegativeSCEV(NewMul);
+  }
     }
   }
@@ -4567,17 +4616,11 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V,
 
 /// If Expr computes ~A, return A else return nullptr
 static const SCEV *MatchNotExpr(const SCEV *Expr) {
-  const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr);
-  if (!Add || Add->getNumOperands() != 2 ||
-      !Add->getOperand(0)->isAllOnesValue())
-    return nullptr;
-
-  const SCEVMulExpr *AddRHS = dyn_cast<SCEVMulExpr>(Add->getOperand(1));
-  if (!AddRHS || AddRHS->getNumOperands() != 2 ||
-      !AddRHS->getOperand(0)->isAllOnesValue())
-    return nullptr;
-
-  return AddRHS->getOperand(1);
+  const SCEV *MulOp;
+  if (match(Expr, m_scev_Add(m_scev_AllOnes(),
+                             m_scev_Mul(m_scev_AllOnes(), m_SCEV(MulOp)))))
+    return MulOp;
+  return nullptr;
 }
 
 /// Return a SCEV corresponding to ~V = -1-V
@@ -5358,20 +5401,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
   if (SourceBits != NewBits)
     return nullptr;
 
-  const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
-  const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
-  if (!SExt && !ZExt)
-    return nullptr;
-  const SCEVTruncateExpr *Trunc =
-      SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
-           : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
-  if (!Trunc)
-    return nullptr;
-  const SCEV *X = Trunc->getOperand();
-  if (X != SymbolicPHI)
-    return nullptr;
-  Signed = SExt != nullptr;
-  return Trunc->getType();
+  if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = true;
+    return cast<SCEVSignExtendExpr>(Op)->getOperand()->getType();
+  }
+  if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+    Signed = false;
+    return cast<SCEVZeroExtendExpr>(Op)->getOperand()->getType();
+  }
+  return nullptr;
 }
 
 static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -6290,19 +6328,20 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
   return getGEPExpr(GEP, IndexExprs);
 }
 
-APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
+APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S,
+                                               const Instruction *CtxI) {
   uint64_t BitWidth = getTypeSizeInBits(S->getType());
   auto GetShiftedByZeros = [BitWidth](uint32_t TrailingZeros) {
     return TrailingZeros >= BitWidth
                ? APInt::getZero(BitWidth)
               : APInt::getOneBitSet(BitWidth, TrailingZeros);
  };
-  auto GetGCDMultiple = [this](const SCEVNAryExpr *N) {
+  auto GetGCDMultiple = [this, CtxI](const SCEVNAryExpr *N) {
     // The result is GCD of all operands results.
-    APInt Res = getConstantMultiple(N->getOperand(0));
+    APInt Res = getConstantMultiple(N->getOperand(0), CtxI);
     for (unsigned I = 1, E = N->getNumOperands(); I < E && Res != 1; ++I)
       Res = APIntOps::GreatestCommonDivisor(
-          Res, getConstantMultiple(N->getOperand(I)));
+          Res, getConstantMultiple(N->getOperand(I), CtxI));
     return Res;
   };
@@ -6310,33 +6349,33 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
   case scConstant:
     return cast<SCEVConstant>(S)->getAPInt();
   case scPtrToInt:
-    return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand());
+    return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand(), CtxI);
   case scUDivExpr:
   case scVScale:
     return APInt(BitWidth, 1);
   case scTruncate: {
     // Only multiples that are a power of 2 will hold after truncation.
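    // E.g. a value that is a multiple of 24 (= 8 * 3) is, after truncation,
    // only known to be a multiple of 8: in i16, 264 is a multiple of 24, but
    // truncated to i8 it becomes 8, which is not (illustrative).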
     const SCEVTruncateExpr *T = cast<SCEVTruncateExpr>(S);
-    uint32_t TZ = getMinTrailingZeros(T->getOperand());
+    uint32_t TZ = getMinTrailingZeros(T->getOperand(), CtxI);
     return GetShiftedByZeros(TZ);
   }
   case scZeroExtend: {
     const SCEVZeroExtendExpr *Z = cast<SCEVZeroExtendExpr>(S);
-    return getConstantMultiple(Z->getOperand()).zext(BitWidth);
+    return getConstantMultiple(Z->getOperand(), CtxI).zext(BitWidth);
   }
   case scSignExtend: {
     // Only multiples that are a power of 2 will hold after sext.
     const SCEVSignExtendExpr *E = cast<SCEVSignExtendExpr>(S);
-    uint32_t TZ = getMinTrailingZeros(E->getOperand());
+    uint32_t TZ = getMinTrailingZeros(E->getOperand(), CtxI);
     return GetShiftedByZeros(TZ);
   }
   case scMulExpr: {
     const SCEVMulExpr *M = cast<SCEVMulExpr>(S);
     if (M->hasNoUnsignedWrap()) {
       // The result is the product of all operand results.
-      APInt Res = getConstantMultiple(M->getOperand(0));
+      APInt Res = getConstantMultiple(M->getOperand(0), CtxI);
       for (const SCEV *Operand : M->operands().drop_front())
-        Res = Res * getConstantMultiple(Operand);
+        Res = Res * getConstantMultiple(Operand, CtxI);
       return Res;
     }
@@ -6344,7 +6383,7 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
     // sum of trailing zeros for all its operands.
     uint32_t TZ = 0;
     for (const SCEV *Operand : M->operands())
-      TZ += getMinTrailingZeros(Operand);
+      TZ += getMinTrailingZeros(Operand, CtxI);
     return GetShiftedByZeros(TZ);
   }
   case scAddExpr:
@@ -6353,9 +6392,9 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
     if (N->hasNoUnsignedWrap())
       return GetGCDMultiple(N);
     // Find the trailing bits, which is the minimum of its operands.
-    uint32_t TZ = getMinTrailingZeros(N->getOperand(0));
+    uint32_t TZ = getMinTrailingZeros(N->getOperand(0), CtxI);
     for (const SCEV *Operand : N->operands().drop_front())
-      TZ = std::min(TZ, getMinTrailingZeros(Operand));
+      TZ = std::min(TZ, getMinTrailingZeros(Operand, CtxI));
     return GetShiftedByZeros(TZ);
   }
   case scUMaxExpr:
@@ -6365,10 +6404,20 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
   case scSequentialUMinExpr:
     return GetGCDMultiple(cast<SCEVNAryExpr>(S));
   case scUnknown: {
-    // ask ValueTracking for known bits
+    // Ask ValueTracking for known bits. SCEVUnknowns only become available at
+    // the point where their underlying IR instructions have been defined. If
+    // CtxI was not provided, use:
+    // * the first instruction in the entry block if it is an argument
+    // * the instruction itself otherwise.
    const SCEVUnknown *U = cast<SCEVUnknown>(S);
+    if (!CtxI) {
+      if (isa<Argument>(U->getValue()))
+        CtxI = &*F.getEntryBlock().begin();
+      else if (auto *I = dyn_cast<Instruction>(U->getValue()))
+        CtxI = I;
+    }
    unsigned Known =
-        computeKnownBits(U->getValue(), getDataLayout(), &AC, nullptr, &DT)
+        computeKnownBits(U->getValue(), getDataLayout(), &AC, CtxI, &DT)
            .countMinTrailingZeros();
    return GetShiftedByZeros(Known);
  }
@@ -6378,12 +6427,18 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
   llvm_unreachable("Unknown SCEV kind!");
 }
 
-APInt ScalarEvolution::getConstantMultiple(const SCEV *S) {
+APInt ScalarEvolution::getConstantMultiple(const SCEV *S,
+                                           const Instruction *CtxI) {
+  // Skip looking up and updating the cache if there is a context instruction,
+  // as the result will only be valid in the specified context.
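+  // E.g. an llvm.assume dominating CtxI may make a value a known multiple of
+  // 8 at CtxI only; caching that result would be incorrect at other program
+  // points (illustrative).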
+  if (CtxI)
+    return getConstantMultipleImpl(S, CtxI);
+
   auto I = ConstantMultipleCache.find(S);
   if (I != ConstantMultipleCache.end())
     return I->second;
 
-  APInt Result = getConstantMultipleImpl(S);
+  APInt Result = getConstantMultipleImpl(S, CtxI);
   auto InsertPair = ConstantMultipleCache.insert({S, Result});
   assert(InsertPair.second && "Should insert a new key");
   return InsertPair.first->second;
 }
@@ -6394,8 +6449,9 @@ APInt ScalarEvolution::getNonZeroConstantMultiple(const SCEV *S) {
   return Multiple == 0 ? APInt(Multiple.getBitWidth(), 1) : Multiple;
 }
 
-uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S) {
-  return std::min(getConstantMultiple(S).countTrailingZeros(),
+uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S,
+                                              const Instruction *CtxI) {
+  return std::min(getConstantMultiple(S, CtxI).countTrailingZeros(),
                   (unsigned)getTypeSizeInBits(S->getType()));
 }
 
@@ -10174,8 +10230,7 @@ const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const {
 static const SCEV *
 SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
                              SmallVectorImpl<const SCEVPredicate *> *Predicates,
-
-                             ScalarEvolution &SE) {
+                             ScalarEvolution &SE, const Loop *L) {
   uint32_t BW = A.getBitWidth();
   assert(BW == SE.getTypeSizeInBits(B->getType()));
   assert(A != 0 && "A must be non-zero.");
@@ -10191,7 +10246,12 @@ SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
   //
   // B is divisible by D if and only if the multiplicity of prime factor 2 for B
   // is not less than multiplicity of this prime factor for D.
-  if (SE.getMinTrailingZeros(B) < Mult2) {
+  unsigned MinTZ = SE.getMinTrailingZeros(B);
+  // Try again with the terminator of the loop predecessor for a
+  // context-specific result, if MinTZ is too small.
+  if (MinTZ < Mult2 && L->getLoopPredecessor())
+    MinTZ = SE.getMinTrailingZeros(B, L->getLoopPredecessor()->getTerminator());
+  if (MinTZ < Mult2) {
     // Check if we can prove there's no remainder using URem.
     const SCEV *URem =
         SE.getURemExpr(B, SE.getConstant(APInt::getOneBitSet(BW, Mult2)));
@@ -10639,7 +10699,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
     return getCouldNotCompute();
   const SCEV *E = SolveLinEquationWithOverflow(
       StepC->getAPInt(), getNegativeSCEV(Start),
-      AllowPredicates ? &Predicates : nullptr, *this);
+      AllowPredicates ? &Predicates : nullptr, *this, L);
 
   const SCEV *M = E;
   if (E != getCouldNotCompute()) {
@@ -11418,8 +11478,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
     XNonConstOp = X;
     XFlagsPresent = ExpectedFlags;
   }
-  if (!isa<SCEVConstant>(XConstOp) ||
-      (XFlagsPresent & ExpectedFlags) != ExpectedFlags)
+  if (!isa<SCEVConstant>(XConstOp))
     return false;
 
   if (!splitBinaryAdd(Y, YConstOp, YNonConstOp, YFlagsPresent)) {
@@ -11428,13 +11487,21 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred,
     YFlagsPresent = ExpectedFlags;
   }
 
-  if (!isa<SCEVConstant>(YConstOp) ||
-      (YFlagsPresent & ExpectedFlags) != ExpectedFlags)
+  if (YNonConstOp != XNonConstOp)
     return false;
 
-  if (YNonConstOp != XNonConstOp)
+  if (!isa<SCEVConstant>(YConstOp))
     return false;
 
+  // When matching ADDs with NUW flags (and unsigned predicates), only the
+  // second ADD (with the larger constant) requires NUW.
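+  // E.g. (X + 1) u<= (X +nuw 3): because X +nuw 3 does not unsigned-wrap and
+  // 1 u<= 3, X + 1 cannot wrap either (illustrative).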
+ if ((YFlagsPresent & ExpectedFlags) != ExpectedFlags) + return false; + if (ExpectedFlags != SCEV::FlagNUW && + (XFlagsPresent & ExpectedFlags) != ExpectedFlags) { + return false; + } + OutC1 = cast(XConstOp)->getAPInt(); OutC2 = cast(YConstOp)->getAPInt(); @@ -11472,7 +11539,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred, std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULE: - // (X + C1) u<= (X + C2) for C1 u<= C2. + // (X + C1) u<= (X + C2) for C1 u<= C2. if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ule(C2)) return true; @@ -11482,7 +11549,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(CmpPredicate Pred, std::swap(LHS, RHS); [[fallthrough]]; case ICmpInst::ICMP_ULT: - // (X + C1) u< (X + C2) if C1 u< C2. + // (X + C1) u< (X + C2) if C1 u< C2. if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNUW) && C1.ult(C2)) return true; break; @@ -12113,12 +12180,11 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) { // Try to match a common constant multiply. auto MatchConstMul = [](const SCEV *S) -> std::optional> { - auto *M = dyn_cast(S); - if (!M || M->getNumOperands() != 2 || - !isa(M->getOperand(0))) - return std::nullopt; - return { - {M->getOperand(1), cast(M->getOperand(0))->getAPInt()}}; + const APInt *C; + const SCEV *Op; + if (match(S, m_scev_Mul(m_scev_APInt(C), m_SCEV(Op)))) + return {{Op, *C}}; + return std::nullopt; }; if (auto MatchedMore = MatchConstMul(More)) { if (auto MatchedLess = MatchConstMul(Less)) { @@ -15285,67 +15351,6 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const { } } -// Match the mathematical pattern A - (A / B) * B, where A and B can be -// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used -// for URem with constant power-of-2 second operands. -// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is -// 4, A / B becomes X / 8). -bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, - const SCEV *&RHS) { - if (Expr->getType()->isPointerTy()) - return false; - - // Try to match 'zext (trunc A to iB) to iY', which is used - // for URem with constant power-of-2 second operands. Make sure the size of - // the operand A matches the size of the whole expressions. - if (const auto *ZExt = dyn_cast(Expr)) - if (const auto *Trunc = dyn_cast(ZExt->getOperand(0))) { - LHS = Trunc->getOperand(); - // Bail out if the type of the LHS is larger than the type of the - // expression for now. - if (getTypeSizeInBits(LHS->getType()) > - getTypeSizeInBits(Expr->getType())) - return false; - if (LHS->getType() != Expr->getType()) - LHS = getZeroExtendExpr(LHS, Expr->getType()); - RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1) - << getTypeSizeInBits(Trunc->getType())); - return true; - } - const auto *Add = dyn_cast(Expr); - if (Add == nullptr || Add->getNumOperands() != 2) - return false; - - const SCEV *A = Add->getOperand(1); - const auto *Mul = dyn_cast(Add->getOperand(0)); - - if (Mul == nullptr) - return false; - - const auto MatchURemWithDivisor = [&](const SCEV *B) { - // (SomeExpr + (-(SomeExpr / B) * B)). - if (Expr == getURemExpr(A, B)) { - LHS = A; - RHS = B; - return true; - } - return false; - }; - - // (SomeExpr + (-1 * (SomeExpr / B) * B)). 
-  if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
-    return MatchURemWithDivisor(Mul->getOperand(1)) ||
-           MatchURemWithDivisor(Mul->getOperand(2));
-
-  // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
-  if (Mul->getNumOperands() == 2)
-    return MatchURemWithDivisor(Mul->getOperand(1)) ||
-           MatchURemWithDivisor(Mul->getOperand(0)) ||
-           MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) ||
-           MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0)));
-  return false;
-}
-
 ScalarEvolution::LoopGuards
 ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
   BasicBlock *Header = L->getHeader();
@@ -15421,6 +15426,110 @@ void ScalarEvolution::LoopGuards::collectFromPHI(
   }
 }
 
+// Return a new SCEV that modifies \p Expr to the closest value that is
+// divisible by \p Divisor and less than or equal to \p Expr. For now, only
+// handle constant Expr.
+static const SCEV *getPreviousSCEVDivisibleByDivisor(const SCEV *Expr,
+                                                     const APInt &DivisorVal,
+                                                     ScalarEvolution &SE) {
+  const APInt *ExprVal;
+  if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() ||
+      DivisorVal.isNonPositive())
+    return Expr;
+  APInt Rem = ExprVal->urem(DivisorVal);
+  // return the SCEV: Expr - Expr % Divisor
+  return SE.getConstant(*ExprVal - Rem);
+}
+
+// Return a new SCEV that modifies \p Expr to the closest value that is
+// divisible by \p Divisor and greater than or equal to \p Expr. For now, only
+// handle constant Expr.
+static const SCEV *getNextSCEVDivisibleByDivisor(const SCEV *Expr,
+                                                 const APInt &DivisorVal,
+                                                 ScalarEvolution &SE) {
+  const APInt *ExprVal;
+  if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() ||
+      DivisorVal.isNonPositive())
+    return Expr;
+  APInt Rem = ExprVal->urem(DivisorVal);
+  if (Rem.isZero())
+    return Expr;
+  // return the SCEV: Expr + Divisor - Expr % Divisor
+  return SE.getConstant(*ExprVal + DivisorVal - Rem);
+}
+
+static bool collectDivisibilityInformation(
+    ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS,
+    DenseMap<const SCEV *, const SCEV *> &DivInfo,
+    DenseMap<const SCEV *, APInt> &Multiples, ScalarEvolution &SE) {
+  // If we have LHS == 0, check if LHS is computing a property of some unknown
+  // SCEV %v which we can rewrite %v to express explicitly.
+  if (Predicate != CmpInst::ICMP_EQ || !match(RHS, m_scev_Zero()))
+    return false;
+  // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+  // explicitly express that.
+  const SCEVUnknown *URemLHS = nullptr;
+  const SCEV *URemRHS = nullptr;
+  if (!match(LHS, m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE)))
+    return false;
+
+  const SCEV *Multiple =
+      SE.getMulExpr(SE.getUDivExpr(URemLHS, URemRHS), URemRHS);
+  DivInfo[URemLHS] = Multiple;
+  if (auto *C = dyn_cast<SCEVConstant>(URemRHS))
+    Multiples[URemLHS] = C->getAPInt();
+  return true;
+}
+
+// Check if the condition is a divisibility guard (A % B == 0).
+static bool isDivisibilityGuard(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE) {
+  const SCEV *X, *Y;
+  return match(LHS, m_scev_URem(m_SCEV(X), m_SCEV(Y), SE)) && RHS->isZero();
+}
+
+// Apply divisibility by \p Divisor on MinMaxExpr with constant values,
+// recursively. This is done by aligning up/down the constant value to the
+// Divisor.
+static const SCEV *applyDivisibilityOnMinMaxExpr(const SCEV *MinMaxExpr,
+                                                 APInt Divisor,
+                                                 ScalarEvolution &SE) {
+  // Return true if \p Expr is a MinMax SCEV expression with a non-negative
+  // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS
+  // the non-constant operand and in \p LHS the constant operand.
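+  // E.g. umin(16, %n) yields SCTy = scUMinExpr, LHS = 16 and RHS = %n
+  // (illustrative).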
+ auto IsMinMaxSCEVWithNonNegativeConstant = + [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, + const SCEV *&RHS) { + if (auto *MinMax = dyn_cast(Expr)) { + if (MinMax->getNumOperands() != 2) + return false; + if (auto *C = dyn_cast(MinMax->getOperand(0))) { + if (C->getAPInt().isNegative()) + return false; + SCTy = MinMax->getSCEVType(); + LHS = MinMax->getOperand(0); + RHS = MinMax->getOperand(1); + return true; + } + } + return false; + }; + + const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; + SCEVTypes SCTy; + if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, + MinMaxRHS)) + return MinMaxExpr; + auto IsMin = isa(MinMaxExpr) || isa(MinMaxExpr); + assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); + auto *DivisibleExpr = + IsMin ? getPreviousSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE) + : getNextSCEVDivisibleByDivisor(MinMaxLHS, Divisor, SE); + SmallVector Ops = { + applyDivisibilityOnMinMaxExpr(MinMaxRHS, Divisor, SE), DivisibleExpr}; + return SE.getMinMaxExpr(SCTy, Ops); +} + void ScalarEvolution::LoopGuards::collectFromBlock( ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, const BasicBlock *Block, const BasicBlock *Pred, @@ -15428,19 +15537,13 @@ void ScalarEvolution::LoopGuards::collectFromBlock( SmallVector ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, - DenseMap - &RewriteMap) { + DenseMap &RewriteMap, + const LoopGuards &DivGuards) { // WARNING: It is generally unsound to apply any wrap flags to the proposed // replacement SCEV which isn't directly implied by the structure of that // SCEV. In particular, using contextual facts to imply flags is *NOT* // legal. See the scoping rules for flags in the header to understand why. - // If LHS is a constant, apply information to the other expression. - if (isa(LHS)) { - std::swap(LHS, RHS); - Predicate = CmpInst::getSwappedPredicate(Predicate); - } - // Check for a condition of the form (-C1 + X < C2). InstCombine will // create this form when combining two checks of the form (X u< C2 + C1) and // (X >=u C1). @@ -15473,115 +15576,6 @@ void ScalarEvolution::LoopGuards::collectFromBlock( if (MatchRangeCheckIdiom()) return; - // Return true if \p Expr is a MinMax SCEV expression with a non-negative - // constant operand. If so, return in \p SCTy the SCEV type and in \p RHS - // the non-constant operand and in \p LHS the constant operand. - auto IsMinMaxSCEVWithNonNegativeConstant = - [&](const SCEV *Expr, SCEVTypes &SCTy, const SCEV *&LHS, - const SCEV *&RHS) { - if (auto *MinMax = dyn_cast(Expr)) { - if (MinMax->getNumOperands() != 2) - return false; - if (auto *C = dyn_cast(MinMax->getOperand(0))) { - if (C->getAPInt().isNegative()) - return false; - SCTy = MinMax->getSCEVType(); - LHS = MinMax->getOperand(0); - RHS = MinMax->getOperand(1); - return true; - } - } - return false; - }; - - // Checks whether Expr is a non-negative constant, and Divisor is a positive - // constant, and returns their APInt in ExprVal and in DivisorVal. 
-    auto GetNonNegExprAndPosDivisor = [&](const SCEV *Expr, const SCEV *Divisor,
-                                          APInt &ExprVal, APInt &DivisorVal) {
-      auto *ConstExpr = dyn_cast<SCEVConstant>(Expr);
-      auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor);
-      if (!ConstExpr || !ConstDivisor)
-        return false;
-      ExprVal = ConstExpr->getAPInt();
-      DivisorVal = ConstDivisor->getAPInt();
-      return ExprVal.isNonNegative() && !DivisorVal.isNonPositive();
-    };
-
-    // Return a new SCEV that modifies \p Expr to the closest number divides by
-    // \p Divisor and greater or equal than Expr.
-    // For now, only handle constant Expr and Divisor.
-    auto GetNextSCEVDividesByDivisor = [&](const SCEV *Expr,
-                                           const SCEV *Divisor) {
-      APInt ExprVal;
-      APInt DivisorVal;
-      if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal))
-        return Expr;
-      APInt Rem = ExprVal.urem(DivisorVal);
-      if (!Rem.isZero())
-        // return the SCEV: Expr + Divisor - Expr % Divisor
-        return SE.getConstant(ExprVal + DivisorVal - Rem);
-      return Expr;
-    };
-
-    // Return a new SCEV that modifies \p Expr to the closest number divides by
-    // \p Divisor and less or equal than Expr.
-    // For now, only handle constant Expr and Divisor.
-    auto GetPreviousSCEVDividesByDivisor = [&](const SCEV *Expr,
-                                               const SCEV *Divisor) {
-      APInt ExprVal;
-      APInt DivisorVal;
-      if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal))
-        return Expr;
-      APInt Rem = ExprVal.urem(DivisorVal);
-      // return the SCEV: Expr - Expr % Divisor
-      return SE.getConstant(ExprVal - Rem);
-    };
-
-    // Apply divisibilty by \p Divisor on MinMaxExpr with constant values,
-    // recursively. This is done by aligning up/down the constant value to the
-    // Divisor.
-    std::function<const SCEV *(const SCEV *, const SCEV *)>
-        ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr,
-                                           const SCEV *Divisor) {
-          const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr;
-          SCEVTypes SCTy;
-          if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS,
-                                                   MinMaxRHS))
-            return MinMaxExpr;
-          auto IsMin =
-              isa<SCEVSMinExpr>(MinMaxExpr) || isa<SCEVUMinExpr>(MinMaxExpr);
-          assert(SE.isKnownNonNegative(MinMaxLHS) &&
-                 "Expected non-negative operand!");
-          auto *DivisibleExpr =
-              IsMin ? GetPreviousSCEVDividesByDivisor(MinMaxLHS, Divisor)
                    : GetNextSCEVDividesByDivisor(MinMaxLHS, Divisor);
-          SmallVector<const SCEV *> Ops = {
-              ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr};
-          return SE.getMinMaxExpr(SCTy, Ops);
-        };
-
-    // If we have LHS == 0, check if LHS is computing a property of some unknown
-    // SCEV %v which we can rewrite %v to express explicitly.
-    if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
-      // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
-      // explicitly express that.
-      const SCEV *URemLHS = nullptr;
-      const SCEV *URemRHS = nullptr;
-      if (SE.matchURem(LHS, URemLHS, URemRHS)) {
-        if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
-          auto I = RewriteMap.find(LHSUnknown);
-          const SCEV *RewrittenLHS =
-              I != RewriteMap.end() ? I->second : LHSUnknown;
-          RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
-          const auto *Multiple =
-              SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
-          RewriteMap[LHSUnknown] = Multiple;
-          ExprsToRewrite.push_back(LHSUnknown);
-          return;
-        }
-      }
-    }
-
     // Do not apply information for constants or if RHS contains an AddRec.
     if (isa<SCEVConstant>(LHS) || SE.containsAddRecurrence(RHS))
       return;
@@ -15610,51 +15604,10 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
       return RewriteMap.lookup_or(S, S);
     };
 
-    // Check for the SCEV expression (A /u B) * B while B is a constant, inside
-    // \p Expr. The check is done recuresively on \p Expr, which is assumed to
-    // be a composition of Min/Max SCEVs. Return whether the SCEV expression (A
-    // /u B) * B was found, and return the divisor B in \p DividesBy. For
-    // example, if Expr = umin (umax ((A /u 8) * 8, 16), 64), return true since
-    // (A /u 8) * 8 matched the pattern, and return the constant SCEV 8 in \p
-    // DividesBy.
-    std::function<bool(const SCEV *, const SCEV *&)> HasDivisibiltyInfo =
-        [&](const SCEV *Expr, const SCEV *&DividesBy) {
-          if (auto *Mul = dyn_cast<SCEVMulExpr>(Expr)) {
-            if (Mul->getNumOperands() != 2)
-              return false;
-            auto *MulLHS = Mul->getOperand(0);
-            auto *MulRHS = Mul->getOperand(1);
-            if (isa<SCEVConstant>(MulLHS))
-              std::swap(MulLHS, MulRHS);
-            if (auto *Div = dyn_cast<SCEVUDivExpr>(MulLHS))
-              if (Div->getOperand(1) == MulRHS) {
-                DividesBy = MulRHS;
-                return true;
-              }
-          }
-          if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
-            return HasDivisibiltyInfo(MinMax->getOperand(0), DividesBy) ||
-                   HasDivisibiltyInfo(MinMax->getOperand(1), DividesBy);
-          return false;
-        };
-
-    // Return true if Expr known to divide by \p DividesBy.
-    std::function<bool(const SCEV *, const SCEV *)> IsKnownToDivideBy =
-        [&](const SCEV *Expr, const SCEV *DividesBy) {
-          if (SE.getURemExpr(Expr, DividesBy)->isZero())
-            return true;
-          if (auto *MinMax = dyn_cast<SCEVMinMaxExpr>(Expr))
-            return IsKnownToDivideBy(MinMax->getOperand(0), DividesBy) &&
-                   IsKnownToDivideBy(MinMax->getOperand(1), DividesBy);
-          return false;
-        };
-
     const SCEV *RewrittenLHS = GetMaybeRewritten(LHS);
-    const SCEV *DividesBy = nullptr;
-    if (HasDivisibiltyInfo(RewrittenLHS, DividesBy))
-      // Check that the whole expression is divided by DividesBy
-      DividesBy =
-          IsKnownToDivideBy(RewrittenLHS, DividesBy) ? DividesBy : nullptr;
+
+    // Apply divisibility information when computing the constant multiple.
+    const APInt &DividesBy =
+        SE.getConstantMultiple(DivGuards.rewrite(RewrittenLHS));
 
     // Collect rewrites for LHS and its transitive operands based on the
     // condition.
@@ -15669,31 +15622,31 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
       // predicate.
       const SCEV *One = SE.getOne(RHS->getType());
       switch (Predicate) {
-      case CmpInst::ICMP_ULT:
-        if (RHS->getType()->isPointerTy())
-          return;
-        RHS = SE.getUMaxExpr(RHS, One);
-        [[fallthrough]];
-      case CmpInst::ICMP_SLT: {
-        RHS = SE.getMinusSCEV(RHS, One);
-        RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS;
-        break;
-      }
-      case CmpInst::ICMP_UGT:
-      case CmpInst::ICMP_SGT:
-        RHS = SE.getAddExpr(RHS, One);
-        RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS;
-        break;
-      case CmpInst::ICMP_ULE:
-      case CmpInst::ICMP_SLE:
-        RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS;
-        break;
-      case CmpInst::ICMP_UGE:
-      case CmpInst::ICMP_SGE:
-        RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS;
-        break;
-      default:
-        break;
+      case CmpInst::ICMP_ULT:
+        if (RHS->getType()->isPointerTy())
+          return;
+        RHS = SE.getUMaxExpr(RHS, One);
+        [[fallthrough]];
+      case CmpInst::ICMP_SLT: {
+        RHS = SE.getMinusSCEV(RHS, One);
+        RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+        break;
+      }
+      case CmpInst::ICMP_UGT:
+      case CmpInst::ICMP_SGT:
+        RHS = SE.getAddExpr(RHS, One);
+        RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+        break;
+      case CmpInst::ICMP_ULE:
+      case CmpInst::ICMP_SLE:
+        RHS = getPreviousSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+        break;
+      case CmpInst::ICMP_UGE:
+      case CmpInst::ICMP_SGE:
+        RHS = getNextSCEVDivisibleByDivisor(RHS, DividesBy, SE);
+        break;
+      default:
+        break;
       }
 
       SmallVector<const SCEV *> Worklist(1, LHS);
@@ -15744,8 +15697,30 @@
       case CmpInst::ICMP_NE:
         if (match(RHS, m_scev_Zero())) {
           const SCEV *OneAlignedUp =
-              DividesBy ? GetNextSCEVDividesByDivisor(One, DividesBy) : One;
+              getNextSCEVDivisibleByDivisor(One, DividesBy, SE);
           To = SE.getUMaxExpr(FromRewritten, OneAlignedUp);
+        } else {
+          // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS),
+          // but creating the subtraction eagerly is expensive. Track the
+          // inequalities in a separate set, and materialize the rewrite
+          // lazily when encountering a suitable subtraction while rewriting.
+          if (LHS->getType()->isPointerTy()) {
+            LHS = SE.getLosslessPtrToIntExpr(LHS);
+            RHS = SE.getLosslessPtrToIntExpr(RHS);
+            if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS))
+              break;
+          }
+          const SCEVConstant *C;
+          const SCEV *A, *B;
+          if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) &&
+              match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) {
+            RHS = A;
+            LHS = B;
+          }
+          if (LHS > RHS)
+            std::swap(LHS, RHS);
+          Guards.NotEqual.insert({LHS, RHS});
+          continue;
         }
         break;
       default:
@@ -15817,8 +15792,11 @@
 
   // Now apply the information from the collected conditions to
   // Guards.RewriteMap. Conditions are processed in reverse order, so the
-  // earliest conditions is processed first. This ensures the SCEVs with the
+  // earliest condition is processed first, except guards with divisibility
+  // information, which are moved to the back. This ensures the SCEVs with the
   // shortest dependency chains are constructed first.
+  SmallVector<std::tuple<ICmpInst::Predicate, const SCEV *, const SCEV *>>
+      GuardsToProcess;
   for (auto [Term, EnterIfTrue] : reverse(Terms)) {
     SmallVector<Value *, 8> Worklist;
     SmallPtrSet<Value *, 8> Visited;
@@ -15833,7 +15811,14 @@
           EnterIfTrue ? Cmp->getPredicate() : Cmp->getInversePredicate();
       const auto *LHS = SE.getSCEV(Cmp->getOperand(0));
      const auto *RHS = SE.getSCEV(Cmp->getOperand(1));
-      CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap);
+      // If LHS is a constant, apply information to the other expression.
+      // TODO: If LHS is not a constant, check if using CompareSCEVComplexity
+      // can improve results.
+      if (isa<SCEVConstant>(LHS)) {
+        std::swap(LHS, RHS);
+        Predicate = CmpInst::getSwappedPredicate(Predicate);
+      }
+      GuardsToProcess.emplace_back(Predicate, LHS, RHS);
       continue;
     }
 
@@ -15846,6 +15831,31 @@
     }
   }
 
+  // Process divisibility guards in reverse order to populate DivGuards early.
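+  // For example (illustrative): a guard (%n urem 8) == 0 records the rewrite
+  // %n -> (%n /u 8) * 8 in DivGuards and the constant multiple 8 in
+  // Multiples, so range guards on %n can later be aligned to a multiple of 8.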
+  DenseMap<const SCEV *, APInt> Multiples;
+  LoopGuards DivGuards(SE);
+  for (const auto &[Predicate, LHS, RHS] : GuardsToProcess) {
+    if (!isDivisibilityGuard(LHS, RHS, SE))
+      continue;
+    collectDivisibilityInformation(Predicate, LHS, RHS, DivGuards.RewriteMap,
+                                   Multiples, SE);
+  }
+
+  for (const auto &[Predicate, LHS, RHS] : GuardsToProcess)
+    CollectCondition(Predicate, LHS, RHS, Guards.RewriteMap, DivGuards);
+
+  // Apply divisibility information last. This ensures it is applied to the
+  // outermost expression after other rewrites for the given value.
+  for (const auto &[K, Divisor] : Multiples) {
+    const SCEV *DivisorSCEV = SE.getConstant(Divisor);
+    Guards.RewriteMap[K] =
+        SE.getMulExpr(SE.getUDivExpr(applyDivisibilityOnMinMaxExpr(
+                                         Guards.rewrite(K), Divisor, SE),
+                                     DivisorSCEV),
+                      DivisorSCEV);
+    ExprsToRewrite.push_back(K);
+  }
+
   // Let the rewriter preserve NUW/NSW flags if the unsigned/signed ranges of
   // the replacement expressions are contained in the ranges of the replaced
   // expressions.
@@ -15878,13 +15888,15 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
   class SCEVLoopGuardRewriter
      : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
     const DenseMap<const SCEV *, const SCEV *> &Map;
+    const SmallDenseSet<std::pair<const SCEV *, const SCEV *>> &NotEqual;
 
     SCEV::NoWrapFlags FlagMask = SCEV::FlagAnyWrap;
 
   public:
     SCEVLoopGuardRewriter(ScalarEvolution &SE,
                          const ScalarEvolution::LoopGuards &Guards)
-        : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap) {
+        : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap),
+          NotEqual(Guards.NotEqual) {
       if (Guards.PreserveNUW)
         FlagMask = ScalarEvolution::setFlags(FlagMask, SCEV::FlagNUW);
       if (Guards.PreserveNSW)
@@ -15939,6 +15951,41 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
     }
 
     const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+      // Helper to check if S is a subtraction (A - B) where A != B, and if
+      // so, return UMax(S, 1).
+      auto RewriteSubtraction = [&](const SCEV *S) -> const SCEV * {
+        const SCEV *LHS, *RHS;
+        if (MatchBinarySub(S, LHS, RHS)) {
+          if (LHS > RHS)
+            std::swap(LHS, RHS);
+          if (NotEqual.contains({LHS, RHS})) {
+            const SCEV *OneAlignedUp = getNextSCEVDivisibleByDivisor(
+                SE.getOne(S->getType()), SE.getConstantMultiple(S), SE);
+            return SE.getUMaxExpr(OneAlignedUp, S);
+          }
+        }
+        return nullptr;
+      };
+
+      // Check if Expr itself is a subtraction pattern with guard info.
+      if (const SCEV *Rewritten = RewriteSubtraction(Expr))
+        return Rewritten;
+
+      // Trip count expressions sometimes consist of adding 3 operands, i.e.
+      // (Const + A + B). There may be guard info for A + B, and if so, apply
+      // it.
+      // TODO: Could more generally apply guards to Add sub-expressions.
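+      // For example (illustrative): with a guard %x != %y, the expression
+      // (1 + (-1 * %x) + %y) contains the sub-expression ((-1 * %x) + %y),
+      // i.e. %y - %x, which RewriteSubtraction turns into
+      // (1 umax ((-1 * %x) + %y)).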
+      if (isa<SCEVConstant>(Expr->getOperand(0)) &&
+          Expr->getNumOperands() == 3) {
+        const SCEV *Add =
+            SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2));
+        if (const SCEV *Rewritten = RewriteSubtraction(Add))
+          return SE.getAddExpr(
+              Expr->getOperand(0), Rewritten,
+              ScalarEvolution::maskFlags(Expr->getNoWrapFlags(), FlagMask));
+        if (const SCEV *S = Map.lookup(Add))
+          return SE.getAddExpr(Expr->getOperand(0), S);
+      }
       SmallVector<const SCEV *, 2> Operands;
       bool Changed = false;
       for (const auto *Op : Expr->operands()) {
@@ -15971,7 +16018,7 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
     }
   };
 
-  if (RewriteMap.empty())
+  if (RewriteMap.empty() && NotEqual.empty())
     return Expr;
 
   SCEVLoopGuardRewriter Rewriter(SE, *this);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 21f844c4d2f45..161ad2adba85e 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -90,6 +90,9 @@ using namespace llvm::PatternMatch;
 
 static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
                                               cl::Hidden, cl::init(20));
+/// Maximum number of instructions to check between assume and context
+/// instruction.
+static constexpr unsigned MaxInstrsToCheckForFree = 16;
 
 /// Returns the bitwidth of the given scalar or pointer type. For vector types,
 /// returns the element type's bitwidth.
@@ -552,6 +555,29 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
   return false;
 }
 
+bool llvm::willNotFreeBetween(const Instruction *Assume,
+                              const Instruction *CtxI) {
+  if (CtxI->getParent() != Assume->getParent() || !Assume->comesBefore(CtxI))
+    return false;
+  // Make sure the current function cannot arrange for another thread to free on
+  // its behalf.
+  if (!CtxI->getFunction()->hasNoSync())
+    return false;
+
+  // Check if there are any calls between the assume and CtxI that may
+  // free memory.
+  for (const auto &[Idx, I] :
+       enumerate(make_range(Assume->getIterator(), CtxI->getIterator()))) {
+    // Limit number of instructions to walk.
+    if (Idx > MaxInstrsToCheckForFree)
+      return false;
+    if (const auto *CB = dyn_cast<CallBase>(&I))
+      if (!CB->hasFnAttr(Attribute::NoFree))
+        return false;
+  }
+  return true;
+}
+
 // TODO: cmpExcludesZero misses many cases where `RHS` is non-constant but
 // we still have enough information about `RHS` to conclude non-zero. For
 // example Pred=EQ, RHS=isKnownNonZero. cmpExcludesZero is called in loops
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 033ef8be700eb..a98ce93017a7c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2069,6 +2069,27 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
       return new IntToPtrInst(P, CI.getType());
   }
 
+  // Replace (inttoptr (add (ptrtoint %Base), %Offset)) with
+  // (getelementptr i8, %Base, %Offset) if the pointer is only used as an
+  // integer value.
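+  // Illustrative IR, with hypothetical values %base and %off:
+  //   %i = ptrtoint ptr %base to i64
+  //   %a = add i64 %i, %off
+  //   %p = inttoptr i64 %a to ptr   ; all users of %p are ptrtoint
+  // becomes
+  //   %p = getelementptr i8, ptr %base, i64 %off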
+  Value *Base;
+  Value *Offset;
+  auto UsesPointerAsInt = [](User *U) {
+    if (isa<PtrToIntInst>(U))
+      return true;
+    if (auto *P = dyn_cast<PHINode>(U))
+      return P->hasOneUse() && isa<PtrToIntInst>(*P->user_begin());
+    return false;
+  };
+  if (match(CI.getOperand(0),
+            m_OneUse(m_c_Add(m_PtrToIntSameSize(DL, m_Value(Base)),
+                             m_Value(Offset)))) &&
+      CI.getType()->getPointerAddressSpace() ==
+          Base->getType()->getPointerAddressSpace() &&
+      all_of(CI.users(), UsesPointerAsInt)) {
+    return GetElementPtrInst::Create(Builder.getInt8Ty(), Base, Offset);
+  }
+
   if (Instruction *I = commonCastTransforms(CI))
     return I;
 
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index f3e992c039178..04039b885f3c5 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1009,7 +1009,8 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
   // in simplified form, and also needs LCSSA. Running
   // this pass will simplify all loops that contain inner loops,
   // regardless of whether anything ends up being flattened.
-  LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr);
+  LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr,
+                             &AR.AC);
   for (Loop *InnerLoop : LN.getLoops()) {
     auto *OuterLoop = InnerLoop->getParentLoop();
     if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 4f2bfb073bafa..448dc2b8b52b0 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -551,7 +551,7 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
 
   const Function *F = L.getHeader()->getParent();
   OptimizationRemarkEmitter ORE(F);
-  LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr);
+  LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC);
   if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index bf457194bfd8e..aff0b901bd031 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
@@ -42,6 +43,7 @@ cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
                  "controls the budget that is considered cheap (default = 4)"));
 
 using namespace PatternMatch;
+using namespace SCEVPatternMatch;
 
 PoisonFlags::PoisonFlags(const Instruction *I) {
   NUW = false;
@@ -504,7 +506,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
 
   // Recognize the canonical representation of an unsimplifed urem.
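+  // Illustrative note: per the matchURem patterns removed above, the
+  // canonical shape is an add such as (SomeExpr + ((-SomeExpr / B) * B)) or
+  // (SomeExpr + ((SomeExpr / B) * -B)), which m_scev_URem recognizes.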
   const SCEV *URemLHS = nullptr;
   const SCEV *URemRHS = nullptr;
-  if (SE.matchURem(S, URemLHS, URemRHS)) {
+  if (match(S, m_scev_URem(m_SCEV(URemLHS), m_SCEV(URemRHS), SE))) {
     Value *LHS = expand(URemLHS);
     Value *RHS = expand(URemRHS);
     return InsertBinop(Instruction::URem, LHS, RHS, SCEV::FlagAnyWrap,
@@ -1223,6 +1225,54 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
   return Result;
 }
 
+Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) {
+  Type *STy = S->getType();
+  const Loop *L = S->getLoop();
+  BasicBlock *EB = L->getExitBlock();
+  if (!EB || !EB->getSinglePredecessor() ||
+      !SE.DT.dominates(EB, Builder.GetInsertBlock()))
+    return nullptr;
+
+  for (auto &PN : EB->phis()) {
+    if (!SE.isSCEVable(PN.getType()))
+      continue;
+    auto *ExitSCEV = SE.getSCEV(&PN);
+    if (!isa<SCEVAddRecExpr>(ExitSCEV))
+      continue;
+    Type *PhiTy = PN.getType();
+    if (STy->isIntegerTy() && PhiTy->isPointerTy()) {
+      ExitSCEV = SE.getPtrToIntExpr(ExitSCEV, STy);
+      if (isa<SCEVCouldNotCompute>(ExitSCEV))
+        continue;
+    } else if (S->getType() != PN.getType()) {
+      continue;
+    }
+
+    // Check if we can re-use the existing PN by adjusting it with an expanded
+    // offset, if the offset is simpler.
+    const SCEV *Diff = SE.getMinusSCEV(S, ExitSCEV);
+    const SCEV *Op = Diff;
+    match(Op, m_scev_Add(m_SCEVConstant(), m_SCEV(Op)));
+    match(Op, m_scev_Mul(m_scev_AllOnes(), m_SCEV(Op)));
+    match(Op, m_scev_PtrToInt(m_SCEV(Op)));
+    if (!isa<SCEVConstant, SCEVUnknown>(Op))
+      continue;
+
+    assert(Diff->getType()->isIntegerTy() &&
+           "difference must be of integer type");
+    Value *DiffV = expand(Diff);
+    Value *BaseV = fixupLCSSAFormFor(&PN);
+    if (PhiTy->isPointerTy()) {
+      if (STy->isPointerTy())
+        return Builder.CreatePtrAdd(BaseV, DiffV);
+      BaseV = Builder.CreatePtrToInt(BaseV, DiffV->getType());
+    }
+    return Builder.CreateAdd(BaseV, DiffV);
+  }
+
+  return nullptr;
+}
+
 Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
   // In canonical mode we compute the addrec as an expression of a canonical IV
   // using evaluateAtIteration and expand the resulting SCEV expression. This
@@ -1262,6 +1312,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
       return V;
   }
 
+  // If S is expanded outside the defining loop, check if there is a
+  // matching LCSSA phi node for it.
+  if (Value *V = tryToReuseLCSSAPhi(S))
+    return V;
+
   // {X,+,F} --> X + {0,+,F}
   if (!S->getStart()->isZero()) {
    if (isa<PointerType>(S->getType())) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 703cfe969577d..b9cd3635047ac 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -994,7 +994,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     // Returns a scalar boolean value, which is true if any lane of its
     // (boolean) vector operands is true. It produces the reduced value across
     // all unrolled iterations. Unrolling will add all copies of its original
-    // operand as additional operands.
+    // operand as additional operands. AnyOf is poison-safe as all operands
+    // will be frozen.
     AnyOf,
     // Calculates the first active lane index of the vector predicate operands.
     // It produces the lane index across all unrolled iterations.
Unrolling will diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 57b713d3dfcb9..8c424e10a74a5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -858,9 +858,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::AnyOf: { - Value *Res = State.get(getOperand(0)); + Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, State.get(Op)); + Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::FirstActiveLane: { diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll index 0d1b0829c09da..311de84993001 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/different-strides-safe-dep-due-to-backedge-taken-count.ll @@ -106,10 +106,43 @@ exit: ret void } +define void @backward_dep_known_safe_due_to_backedge_taken_count(ptr %A) { +; CHECK-LABEL: 'backward_dep_known_safe_due_to_backedge_taken_count' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %A.510 = getelementptr inbounds i32, ptr %A, i64 510 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.mul.2 = shl nuw nsw i64 %iv, 1 + %gep = getelementptr inbounds i32, ptr %A, i64 %iv + %l = load i32, ptr %gep, align 4 + %add = add nsw i32 %l, 5 + %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2 + store i32 %add, ptr %gep.mul.2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 256 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @backward_dep_known_distance_less_than_btc(ptr %A) { ; CHECK-LABEL: 'backward_dep_known_distance_less_than_btc' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 8160 bits +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 4064 bits ; CHECK-NEXT: Dependences: ; CHECK-NEXT: BackwardVectorizable: ; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> @@ -130,10 +163,10 @@ entry: loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] %iv.mul.2 = shl nuw nsw i64 %iv, 1 - %gep = getelementptr inbounds i32, ptr %A, i64 %iv + %gep = getelementptr inbounds i32, ptr %A, i64 %iv.mul.2 %l = load i32, ptr %gep, align 4 %add = add nsw i32 %l, 5 - %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv.mul.2 + %gep.mul.2 = getelementptr inbounds i32, ptr %A.510, i64 %iv store i32 %add, ptr %gep.mul.2, align 4 %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, 256 diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll index 1dc8d4a7e73f8..b7f2ae0529d32 100644 --- 
a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll @@ -505,7 +505,7 @@ e.1: ret i32 1 } -define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption(ptr %A, ptr %B) { +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption(ptr %A, ptr %B) nosync nofree { ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption' ; CHECK-NEXT: loop.header: ; CHECK-NEXT: Memory dependences are safe with run-time checks @@ -518,10 +518,10 @@ define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_kno ; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group GRP0: -; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: (Low: %B High: (2000 + %B)) ; CHECK-NEXT: Member: {%B,+,4}<%loop.header> ; CHECK-NEXT: Group GRP1: -; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: (Low: %A High: (2000 + %A)) ; CHECK-NEXT: Member: {%A,+,4}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. @@ -565,7 +565,7 @@ e.2: ret void } -define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small(ptr %A, ptr %B) { +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small(ptr %A, ptr %B) nosync nofree { ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small' ; CHECK-NEXT: loop.header: ; CHECK-NEXT: Memory dependences are safe with run-time checks @@ -624,3 +624,129 @@ e.1: e.2: ret void } + +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context(ptr %A, ptr %B) nosync { +; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_nofree_via_context' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %B High: (2000 + %B)) +; CHECK-NEXT: Member: {%B,+,4}<%loop.header> +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: %A High: (2000 + %A)) +; CHECK-NEXT: Member: {%A,+,4}<%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} + +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors(ptr %A, ptr %B, i1 %c) nosync { +; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_missing_nofree_multiple_predecessors' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%B,+,4}<%loop.header> +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: %A High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%A,+,4}<%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br i1 %c, label %then, label %else + +then: + br label %loop.header + +else: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll index 1a6e25859f085..468b22568277e 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll @@ -8,21 +8,10 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define void @test_distance_positive_independent_via_trip_count(ptr %A) { ; CHECK-LABEL: 'test_distance_positive_independent_via_trip_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Check 0: -; CHECK-NEXT: Comparing group GRP0: -; CHECK-NEXT: %gep.A.400 = getelementptr inbounds i32, ptr %A.400, i64 %iv -; CHECK-NEXT: Against group GRP1: -; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv ; CHECK-NEXT: Grouped accesses: -; CHECK-NEXT: Group GRP0: -; CHECK-NEXT: (Low: (400 + %A) High: (804 + %A)) -; CHECK-NEXT: Member: {(400 + %A),+,4}<%loop> -; CHECK-NEXT: Group GRP1: -; CHECK-NEXT: (Low: %A High: (101 + %A)) -; CHECK-NEXT: Member: {%A,+,1}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll new file mode 100644 index 0000000000000..6b2c78cebc44a --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-apply-to-adds.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +define void @ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr(ptr %start, ptr %end) { +; CHECK-LABEL: 'ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr' +; CHECK-NEXT: Determining loop execution counts for: @ptrtoint_based_trip_count_known_via_guards_applied_to_add_subexpr +; CHECK-NEXT: Loop %loop: backedge-taken count is i64 0 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 0 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is i64 0 +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %end.i = ptrtoint ptr %end to i64 + %start.i = ptrtoint ptr %start to i64 + %sub = sub i64 %end.i, %start.i + %pre.1 = icmp eq i64 %sub, 4 + call void @llvm.assume(i1 %pre.1) + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store i32 0, ptr %iv + %iv.next = getelementptr inbounds nuw i8, ptr %iv, i64 4 + %ec = icmp eq ptr %iv.next, %end + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +declare void @clobber() + +define void @test_add_sub_1_guard(ptr %src, i32 %n) { +; CHECK-LABEL: 'test_add_sub_1_guard' +; CHECK-NEXT: Determining loop execution counts for: @test_add_sub_1_guard +; CHECK-NEXT: Loop %loop: backedge-taken count is (zext i32 (-1 + (%n /u 2)) to i64) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (zext i32 (-1 + (%n /u 2)) to i64) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %shr = lshr i32 %n, 1 + %sub.1 = add i32 %shr, -1 + %sub.ext = zext i32 %sub.1 to i64 + %pre = icmp eq i32 %shr, 1 + %end = getelementptr i8, ptr %src, i64 %sub.ext + br i1 %pre, label %loop, label %exit + +loop: + %iv = phi ptr [ %src, %entry ], [ %iv.next, %loop ] + call void @clobber() + %iv.next = getelementptr i8, ptr %iv, i64 1 + %ec = icmp eq ptr %iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +declare void @use(ptr) + +define i32 @test_3_op_add(i32 %x, i32 %y, ptr %A) { +; CHECK-LABEL: 'test_3_op_add' +; CHECK-NEXT: Determining loop execution counts for: @test_3_op_add +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (zext i32 (1 + (-1 * %x) + %y) to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2147483647 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (zext i32 (1 + (-1 * %x) + %y) to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %pre.0 = icmp ugt i32 %x, 0 + br i1 %pre.0, label %then, label %exit + +then: + %y.sub.x = sub i32 %y, %x + %pre.1 = icmp slt i32 %y.sub.x, 0 + %add.1 = add i32 %y.sub.x, 1 + %add.ext = zext i32 %add.1 to i64 + br i1 %pre.1, label %exit, label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %then ] + %and = and i64 %iv, 1 + %gep = getelementptr i8, ptr %A, i64 %and + call void @use(ptr %gep) + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %add.ext 
+ br i1 %ec, label %exit, label %loop + +exit: + ret i32 0 +} diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll index 3c3748a6a5f02..1964fca603e23 100644 --- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll @@ -104,3 +104,56 @@ exit: } declare void @clobber() + + +declare void @clobber.i32(i32) + +define void @test_guards_across_loops(i32 %N) { +; CHECK-LABEL: 'test_guards_across_loops' +; CHECK-NEXT: Classifying expressions for: @test_guards_across_loops +; CHECK-NEXT: %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ] +; CHECK-NEXT: --> {0,+,1}<%loop.1> U: full-set S: full-set Exits: <> LoopDispositions: { %loop.1: Computable } +; CHECK-NEXT: %iv.1.next = add i32 %iv.1, 1 +; CHECK-NEXT: --> {1,+,1}<%loop.1> U: full-set S: full-set Exits: <> LoopDispositions: { %loop.1: Computable } +; CHECK-NEXT: %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ] +; CHECK-NEXT: --> {0,+,1}<%loop.2> U: full-set S: full-set Exits: (1 + %N) LoopDispositions: { %loop.2: Computable } +; CHECK-NEXT: %iv.2.next = add i32 %iv.2, 1 +; CHECK-NEXT: --> {1,+,1}<%loop.2> U: full-set S: full-set Exits: (2 + %N) LoopDispositions: { %loop.2: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guards_across_loops +; CHECK-NEXT: Loop %loop.2: backedge-taken count is (1 + (zext i32 %N to i64)) +; CHECK-NEXT: Loop %loop.2: constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Loop %loop.2: symbolic max backedge-taken count is (1 + (zext i32 %N to i64)) +; CHECK-NEXT: Loop %loop.2: Trip multiple is 1 +; CHECK-NEXT: Loop %loop.1: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop.1: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop.1: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop.1: Predicated backedge-taken count is (1 + (zext i32 %N to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: +; CHECK-NEXT: Loop %loop.1: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: +; CHECK-NEXT: Loop %loop.1: Predicated symbolic max backedge-taken count is (1 + (zext i32 %N to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: +; +entry: + br label %loop.1 + +loop.1: + %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ] + call void @clobber.i32(i32 %iv.1) + %ec.1 = icmp ugt i32 %iv.1, %N + %iv.1.next = add i32 %iv.1, 1 + br i1 %ec.1, label %loop.2, label %loop.1 + +loop.2: + %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ] + call void @clobber.i32(i32 %iv.2) + %ec.2 = icmp ugt i32 %iv.2, %N + %iv.2.next = add i32 %iv.2, 1 + br i1 %ec.2, label %exit, label %loop.2 + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll new file mode 100644 index 0000000000000..951b07272dd4b --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-apply-to-adds.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -passes='print' -scalar-evolution-classify-expressions=0 -disable-output %s 2>&1 | FileCheck %s + +define void @max_btc_improved_by_applying_guards_to_add_subexpr(i32 %low, i32 %high) { +; CHECK-LABEL: 'max_btc_improved_by_applying_guards_to_add_subexpr' +; CHECK-NEXT: Determining loop execution counts for: @max_btc_improved_by_applying_guards_to_add_subexpr +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (zext i32 (1 + (-1 * %low) + %high) to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 7 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (zext i32 (1 + (-1 * %low) + %high) to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %sub = sub i32 %high, %low + %pre.1 = icmp slt i32 %sub, 8 + br i1 %pre.1, label %if.then, label %exit + +if.then: + %pre.2 = icmp slt i32 %sub, 0 + br i1 %pre.2, label %exit, label %ph + +ph: + %add.1 = add i32 %sub, 1 + %wide.trip.count = zext i32 %add.1 to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %wide.trip.count + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll index 8c77d704eac6a..4e5033b7a2f7f 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info-rewrite-expressions.ll @@ -12,9 +12,9 @@ define void @rewrite_zext(i32 %n) { ; CHECK-NEXT: %n.vec = and i64 %ext, -8 ; CHECK-NEXT: --> (8 * ((zext i32 %n to i64) /u 8)) U: [0,4294967289) S: [0,4294967289) ; CHECK-NEXT: %index = phi i64 [ 0, %check ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,17) S: [0,17) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,17) S: 
[0,17) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw nsw i64 %index, 8 -; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,25) S: [8,25) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,25) S: [8,25) Exits: (8 * ((zext i32 %n to i64) /u 8)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2 @@ -52,11 +52,11 @@ define i32 @rewrite_zext_min_max(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * ((16 umin (zext i32 %N to i64)) /u 4)) U: [0,17) S: [0,17) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_min_max ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -98,11 +98,11 @@ define i32 @rewrite_min_max_zext(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %umin, 28 ; CHECK-NEXT: --> (4 * ((16 umin (zext i32 %N to i64)) /u 4)) U: [0,17) S: [0,17) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_min_max_zext ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4) ; CHECK-NEXT: Loop %loop: constant 
max backedge-taken count is i64 3 @@ -144,11 +144,11 @@ define i32 @rewrite_sext_min_max(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nsw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_sext_min_max ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -190,11 +190,11 @@ define i32 @rewrite_min_max_sext(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %smin, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nsw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_min_max_sext ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -288,9 +288,9 @@ define i32 @rewrite_zext_no_icmp_ne(i32 %N) { ; CHECK-NEXT: %n.vec = and i64 %n.rnd.up, 8589934588 
; CHECK-NEXT: --> (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4)) U: [4,4294967297) S: [4,4294967297) ; CHECK-NEXT: %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,4294967293) S: [0,4294967293) Exits: (4 * ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,4294967293) S: [0,4294967293) Exits: (-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %iv.next = add i64 %iv, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,4294967297) S: [4,4294967297) Exits: (4 + (4 * ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,4294967297) S: [4,4294967297) Exits: (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_no_icmp_ne ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * ((4 + (zext i32 (-1 + (zext i2 (trunc i32 %N to i2) to i32)) to i64)) /u 4))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 1073741823 @@ -328,9 +328,9 @@ define void @rewrite_zext_and_base_1(i32 %n) { ; CHECK-NEXT: %n.vec = and i64 %ext, -8 ; CHECK-NEXT: --> (8 * ((zext i32 %n to i64) /u 8)) U: [0,4294967289) S: [0,4294967289) ; CHECK-NEXT: %index = phi i64 [ 0, %check ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,25) S: [0,25) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,25) S: [0,25) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw nsw i64 %index, 8 -; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,33) S: [8,33) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,33) S: [8,33) Exits: (8 * ((zext i32 %n to i64) /u 8)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_and_base_1 ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -371,9 +371,9 @@ define void @rewrite_zext_and_base_2(i32 %n) { ; CHECK-NEXT: %n.vec = and i64 %ext, -8 ; CHECK-NEXT: --> (8 * ((zext i32 %n to i64) /u 8)) U: [0,4294967289) S: [0,4294967289) ; CHECK-NEXT: %index = phi i64 [ 0, %check ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,25) S: [0,25) Exits: (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,8}<%loop> U: [0,25) S: [0,25) Exits: (-8 + (8 * ((zext i32 %n to i64) /u 8))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw nsw i64 %index, 8 -; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,33) S: [8,33) Exits: (8 + (8 * ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {8,+,8}<%loop> U: [8,33) S: [8,33) Exits: (8 * ((zext i32 %n to i64) /u 8)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_and_base_2 ; CHECK-NEXT: 
Loop %loop: backedge-taken count is ((-8 + (8 * ((zext i32 %n to i64) /u 8))) /u 8) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -412,9 +412,9 @@ define void @guard_pessimizes_analysis_step2(i1 %c, i32 %N) { ; CHECK-NEXT: %init = phi i64 [ 2, %entry ], [ 4, %bb1 ] ; CHECK-NEXT: --> %init U: [2,5) S: [2,5) ; CHECK-NEXT: %iv = phi i64 [ %iv.next, %loop ], [ %init, %loop.ph ] -; CHECK-NEXT: --> {%init,+,2}<%loop> U: [2,17) S: [2,17) Exits: ((2 * ((14 + (-1 * %init)) /u 2)) + %init) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {%init,+,2}<%loop> U: [2,17) S: [2,17) Exits: 14 LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %iv.next = add i64 %iv, 2 -; CHECK-NEXT: --> {(2 + %init),+,2}<%loop> U: [4,19) S: [4,19) Exits: (2 + (2 * ((14 + (-1 * %init)) /u 2)) + %init) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {(2 + %init),+,2}<%loop> U: [4,19) S: [4,19) Exits: 16 LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @guard_pessimizes_analysis_step2 ; CHECK-NEXT: Loop %loop: backedge-taken count is ((14 + (-1 * %init)) /u 2) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 6 @@ -456,11 +456,11 @@ define i32 @rewrite_sext_slt_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_sext_slt_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -500,11 +500,11 @@ define i32 @rewrite_zext_ult_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) 
to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_ult_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -544,11 +544,11 @@ define i32 @rewrite_zext_ule_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_ule_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((4 umax (zext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -588,11 +588,11 @@ define i32 @rewrite_zext_sle_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; 
CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_sle_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((zext i32 (4 smax %N) to i64) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -632,11 +632,11 @@ define i32 @rewrite_zext_uge_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * ((16 umin (zext i32 %N to i64)) /u 4)) U: [0,17) S: [0,17) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_uge_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -676,11 +676,11 @@ define i32 @rewrite_sext_sge_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 
((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_sext_sge_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -720,11 +720,11 @@ define i32 @rewrite_zext_ugt_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * ((16 umin (zext i32 %N to i64)) /u 4)) U: [0,17) S: [0,17) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * ((16 umin (zext i32 %N to i64)) /u 4)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_zext_ugt_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * ((16 umin (zext i32 %N to i64)) /u 4))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -764,11 +764,11 @@ define i32 @rewrite_sext_sgt_narrow_check(i32 %N, ptr %arr) { ; CHECK-NEXT: %n.vec = and i64 %ext, 28 ; CHECK-NEXT: --> (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) U: [0,29) S: [0,29) ; CHECK-NEXT: %index = phi i64 [ 0, %loop.ph ], [ %index.next, %loop ] -; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,13) S: [0,13) Exits: (-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i32, ptr %arr, i64 %index ; CHECK-NEXT: --> {%arr,+,16}<%loop> U: full-set S: full-set Exits: ((16 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4)) + %arr) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %index.next = add nuw i64 %index, 4 -; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 + (4 * ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {4,+,4}<%loop> U: [4,17) S: [4,17) Exits: (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64)) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: 
@rewrite_sext_sgt_narrow_check ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (4 * (zext i3 (trunc i64 ((16 smin (sext i32 %N to i64)) /u 4) to i3) to i64))) /u 4) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 3 @@ -808,9 +808,9 @@ define void @rewrite_add_rec() { ; CHECK-NEXT: %n.vec = and i64 %sub, -2 ; CHECK-NEXT: --> (2 * ({9,+,-1}<%outer.header> /u 2)) U: [0,9) S: [0,9) Exits: 0 LoopDispositions: { %outer.header: Computable, %inner: Invariant } ; CHECK-NEXT: %inner.iv = phi i64 [ 0, %inner.ph ], [ %inner.iv.next, %inner ] -; CHECK-NEXT: --> {0,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ((-2 + (2 * ({9,+,-1}<%outer.header> /u 2))) /u 2)) LoopDispositions: { %inner: Computable, %outer.header: Variant } +; CHECK-NEXT: --> {0,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (-2 + (2 * ({9,+,-1}<%outer.header> /u 2))) LoopDispositions: { %inner: Computable, %outer.header: Variant } ; CHECK-NEXT: %inner.iv.next = add i64 %inner.iv, 2 -; CHECK-NEXT: --> {2,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 + (2 * ((-2 + (2 * ({9,+,-1}<%outer.header> /u 2))) /u 2))) LoopDispositions: { %inner: Computable, %outer.header: Variant } +; CHECK-NEXT: --> {2,+,2}<%inner> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ({9,+,-1}<%outer.header> /u 2)) LoopDispositions: { %inner: Computable, %outer.header: Variant } ; CHECK-NEXT: %iv.next = add i64 %iv, 1 ; CHECK-NEXT: --> {1,+,1}<%outer.header> U: [1,11) S: [1,11) Exits: 10 LoopDispositions: { %outer.header: Computable, %inner: Invariant } ; CHECK-NEXT: Determining loop execution counts for: @rewrite_add_rec diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll index 9bf2427eddb9c..d5a2181e9bc5e 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -1231,7 +1231,7 @@ define void @optimized_range_check_unsigned3(ptr %pred, i1 %c) { ; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] ; CHECK-NEXT: --> {0,+,1}<%loop> U: [0,3) S: [0,3) Exits: (-1 + %N) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %gep = getelementptr inbounds i16, ptr %pred, i32 %iv -; CHECK-NEXT: --> {%pred,+,2}<%loop> U: full-set S: full-set Exits: ((2 * (zext i32 (-1 + %N) to i64)) + %pred) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {%pred,+,2}<%loop> U: full-set S: full-set Exits: ((zext i32 (-2 + (2 * %N)) to i64) + %pred) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %iv.next = add nuw nsw i32 %iv, 1 ; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,4) S: [1,4) Exits: %N LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @optimized_range_check_unsigned3 @@ -1390,9 +1390,9 @@ define void @ptr_induction_eq_2(ptr %a, i64 %n) { ; CHECK-NEXT: %b = getelementptr inbounds ptr, ptr %a, i64 %n ; CHECK-NEXT: --> ((8 * %n) + %a) U: full-set S: full-set ; CHECK-NEXT: %ptr.iv = phi ptr [ %ptr.iv.next, %loop ], [ %a, %entry ] -; CHECK-NEXT: --> {%a,+,8}<%loop> U: full-set S: full-set Exits: ((8 * ((-8 + (8 * %n)) /u 8)) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {%a,+,8}<%loop> U: full-set S: full-set Exits: (-8 + (8 * %n) + %a) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 
8 -; CHECK-NEXT: --> {(8 + %a),+,8}<%loop> U: full-set S: full-set Exits: (8 + (8 * ((-8 + (8 * %n)) /u 8)) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: --> {(8 + %a),+,8}<%loop> U: full-set S: full-set Exits: ((8 * %n) + %a) LoopDispositions: { %loop: Computable } ; CHECK-NEXT: Determining loop execution counts for: @ptr_induction_eq_2 ; CHECK-NEXT: Loop %loop: backedge-taken count is ((-8 + (8 * %n)) /u 8) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 @@ -1431,7 +1431,7 @@ define void @ptr_induction_early_exit_eq_1_with_align_on_load(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) @@ -1470,7 +1470,7 @@ define void @ptr_induction_early_exit_eq_1_with_align_on_arguments(ptr align 8 % ; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) @@ -1511,7 +1511,7 @@ define void @ptr_induction_early_exit_eq_1_align_assumption_1(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) @@ -1556,7 +1556,7 @@ define void @ptr_induction_early_exit_eq_1_align_assumption_2(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. 
; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) diff --git a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll new file mode 100644 index 0000000000000..1e21fbf08a92f --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print<scalar-evolution>' -disable-output %s 2>&1 | FileCheck %s + +declare void @use(ptr) + +define void @udiv4_and_udiv2(i1 %c, ptr %A) { +; CHECK-LABEL: 'udiv4_and_udiv2' +; CHECK-NEXT: Classifying expressions for: @udiv4_and_udiv2 +; CHECK-NEXT: %start = select i1 %c, i32 512, i32 0 +; CHECK-NEXT: --> %start U: [0,513) S: [0,513) +; CHECK-NEXT: %div.2 = lshr i32 %start, 1 +; CHECK-NEXT: --> (%start /u 2) U: [0,257) S: [0,257) +; CHECK-NEXT: %div.4 = lshr i32 %start, 2 +; CHECK-NEXT: --> (%start /u 4) U: [0,129) S: [0,129) +; CHECK-NEXT: %iv.start = zext i32 %div.4 to i64 +; CHECK-NEXT: --> ((zext i32 %start to i64) /u 4) U: [0,129) S: [0,129) +; CHECK-NEXT: %wide.trip.count = zext i32 %div.2 to i64 +; CHECK-NEXT: --> ((zext i32 %start to i64) /u 2) U: [0,257) S: [0,257) +; CHECK-NEXT: %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {((zext i32 %start to i64) /u 4),+,1}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) /u 2) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.8 = getelementptr i8, ptr %A, i64 %iv +; CHECK-NEXT: --> {(((zext i32 %start to i64) /u 4) + %A),+,1}<%loop> U: full-set S: full-set Exits: (((zext i32 %start to i64) /u 2) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.16 = getelementptr i16, ptr %A, i64 %iv +; CHECK-NEXT: --> {(((zext i32 %start to i64) /u 2) + %A),+,2}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.32 = getelementptr i32, ptr %A, i64 %iv +; CHECK-NEXT: --> {((zext i32 %start to i64) + %A),+,4}<%loop> U: full-set S: full-set Exits: ((2 * (zext i32 %start to i64)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.40 = getelementptr <{ i32, i8 }>, ptr %A, i64 %iv +; CHECK-NEXT: --> {((5 * ((zext i32 %start to i64) /u 4)) + %A),+,5}<%loop> U: full-set S: full-set Exits: ((5 * ((zext i32 %start to i64) /u 2)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.48 = getelementptr <{ i32, i16 }>, ptr %A, i64 %iv +; CHECK-NEXT: --> {((6 * ((zext i32 %start to i64) /u 4)) + %A),+,6}<%loop> U: full-set S: full-set Exits: ((6 * ((zext i32 %start to i64) /u 2)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = add i64 %iv, 1 +; CHECK-NEXT: --> {(1 + ((zext i32 %start to i64) /u 4)),+,1}<%loop> U: full-set S: full-set Exits: (1 + ((zext i32 %start to i64) /u 2)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT:
Determining loop execution counts for: @udiv4_and_udiv2 +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 4)) + ((zext i32 %start to i64) /u 2)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 4)) + ((zext i32 %start to i64) /u 2)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %start = select i1 %c, i32 512, i32 0 + %div.2 = lshr i32 %start, 1 + %div.4 = lshr i32 %start, 2 + %iv.start = zext i32 %div.4 to i64 + %wide.trip.count = zext i32 %div.2 to i64 + br label %loop + +loop: + %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] + %gep.8 = getelementptr i8, ptr %A, i64 %iv + call void @use(ptr %gep.8) + %gep.16 = getelementptr i16, ptr %A, i64 %iv + call void @use(ptr %gep.16) + %gep.32 = getelementptr i32, ptr %A, i64 %iv + call void @use(ptr %gep.32) + %gep.40 = getelementptr <{i32, i8}>, ptr %A, i64 %iv + call void @use(ptr %gep.40) + %gep.48 = getelementptr <{i32, i16}>, ptr %A, i64 %iv + call void @use(ptr %gep.48) + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %wide.trip.count + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +define void @udiv3_and_udiv5_mul_4(i1 %c, ptr %A) { +; CHECK-LABEL: 'udiv3_and_udiv5_mul_4' +; CHECK-NEXT: Classifying expressions for: @udiv3_and_udiv5_mul_4 +; CHECK-NEXT: %start = select i1 %c, i32 512, i32 0 +; CHECK-NEXT: --> %start U: [0,513) S: [0,513) +; CHECK-NEXT: %div.3 = udiv i32 %start, 3 +; CHECK-NEXT: --> (%start /u 3) U: [0,171) S: [0,171) +; CHECK-NEXT: %div.5 = udiv i32 %start, 5 +; CHECK-NEXT: --> (%start /u 5) U: [0,103) S: [0,103) +; CHECK-NEXT: %iv.start = zext i32 %div.5 to i64 +; CHECK-NEXT: --> ((zext i32 %start to i64) /u 5) U: [0,103) S: [0,103) +; CHECK-NEXT: %wide.trip.count = zext i32 %div.3 to i64 +; CHECK-NEXT: --> ((zext i32 %start to i64) /u 3) U: [0,171) S: [0,171) +; CHECK-NEXT: %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {((zext i32 %start to i64) /u 5),+,1}<%loop> U: full-set S: full-set Exits: ((zext i32 %start to i64) /u 3) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.8 = getelementptr i8, ptr %A, i64 %iv +; CHECK-NEXT: --> {(((zext i32 %start to i64) /u 5) + %A),+,1}<%loop> U: full-set S: full-set Exits: (((zext i32 %start to i64) /u 3) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.16 = getelementptr i16, ptr %A, i64 %iv +; CHECK-NEXT: --> {((2 * ((zext i32 %start to i64) /u 5)) + %A),+,2}<%loop> U: full-set S: full-set Exits: ((2 * ((zext i32 %start to i64) /u 3)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.32 = getelementptr i32, ptr %A, i64 %iv +; CHECK-NEXT: --> {((4 * ((zext i32 %start to i64) /u 5)) + %A),+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((zext i32 %start to i64) /u 3)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.40 = getelementptr <{ i32, i8 }>, ptr %A, i64 %iv +; CHECK-NEXT: --> {((5 * ((zext i32 %start to i64) /u 5)) + %A),+,5}<%loop> U: full-set S: full-set Exits: ((5 * ((zext i32 %start to i64) /u 3)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.48 = getelementptr <{ i32, i16 }>, ptr %A, i64 %iv +; CHECK-NEXT: --> {((6 * ((zext i32 %start to i64) /u 5)) + %A),+,6}<%loop> U: full-set S: full-set Exits: ((6 * ((zext i32 %start to i64) /u 3)) + %A) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = add i64 %iv, 1 +; CHECK-NEXT: --> {(1 + ((zext i32 %start to 
i64) /u 5)),+,1}<%loop> U: full-set S: full-set Exits: (1 + ((zext i32 %start to i64) /u 3)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @udiv3_and_udiv5_mul_4 +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 5)) + ((zext i32 %start to i64) /u 3)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * ((zext i32 %start to i64) /u 5)) + ((zext i32 %start to i64) /u 3)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %start = select i1 %c, i32 512, i32 0 + %div.3 = udiv i32 %start, 3 + %div.5 = udiv i32 %start, 5 + %iv.start = zext i32 %div.5 to i64 + %wide.trip.count = zext i32 %div.3 to i64 + br label %loop + +loop: + %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] + %gep.8 = getelementptr i8, ptr %A, i64 %iv + call void @use(ptr %gep.8) + %gep.16 = getelementptr i16, ptr %A, i64 %iv + call void @use(ptr %gep.16) + %gep.32 = getelementptr i32, ptr %A, i64 %iv + call void @use(ptr %gep.32) + %gep.40 = getelementptr <{i32, i8}>, ptr %A, i64 %iv + call void @use(ptr %gep.40) + %gep.48 = getelementptr <{i32, i16}>, ptr %A, i64 %iv + call void @use(ptr %gep.48) + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %wide.trip.count + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +declare void @use.i64(i64) + +define void @dividend_not_known_multiple_of_divisor(i64 %x) { +; CHECK-LABEL: 'dividend_not_known_multiple_of_divisor' +; CHECK-NEXT: Classifying expressions for: @dividend_not_known_multiple_of_divisor +; CHECK-NEXT: %mul.2 = shl i64 %x, 1 +; CHECK-NEXT: --> (2 * %x) U: [0,-1) S: [-9223372036854775808,9223372036854775807) +; CHECK-NEXT: %div.16 = lshr exact i64 %mul.2, 4 +; CHECK-NEXT: --> ((2 * %x) /u 16) U: [0,1152921504606846976) S: [0,1152921504606846976) +; CHECK-NEXT: %m2 = and i64 %div.16, 1152921504606846974 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 32)) U: [0,1152921504606846975) S: [0,1152921504606846975) +; CHECK-NEXT: %m3 = mul i64 %div.16, 2 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 16)) U: [0,2305843009213693951) S: [0,2305843009213693951) +; CHECK-NEXT: %m4 = udiv i64 %m3, 4 +; CHECK-NEXT: --> ((2 * ((2 * %x) /u 16)) /u 4) U: [0,576460752303423488) S: [0,576460752303423488) +; CHECK-NEXT: Determining loop execution counts for: @dividend_not_known_multiple_of_divisor +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %m2 = and i64 %div.16, 1152921504606846974 + call void @use.i64(i64 %m2) + + %m3 = mul i64 %div.16, 2 + %m4 = udiv i64 %m3, 4 + call void @use.i64(i64 %m4) + ret void +} + +define void @btc_depends_on_div_mul(i64 %x) { +; CHECK-LABEL: 'btc_depends_on_div_mul' +; CHECK-NEXT: Classifying expressions for: @btc_depends_on_div_mul +; CHECK-NEXT: %mul.2 = shl i64 %x, 1 +; CHECK-NEXT: --> (2 * %x) U: [0,-1) S: [-9223372036854775808,9223372036854775807) +; CHECK-NEXT: %div.16 = lshr exact i64 %mul.2, 4 +; CHECK-NEXT: --> ((2 * %x) /u 16) U: [0,1152921504606846976) S: [0,1152921504606846976) +; CHECK-NEXT: %masked = and i64 %div.16, 1152921504606846974 +; CHECK-NEXT: --> (2 * ((2 * %x) /u 32)) U: [0,1152921504606846975) S: [0,1152921504606846975) +; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {0,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (-2 + (2 * ((2 * %x) /u 32))) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = add i64 %iv, 2 +; CHECK-NEXT: --> {2,+,2}<%loop> U: 
[0,-1) S: [-9223372036854775808,9223372036854775807) Exits: (2 * ((2 * %x) /u 32)) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @btc_depends_on_div_mul +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2 + (2 * ((2 * %x) /u 32))) /u 2) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 9223372036854775807 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2 + (2 * ((2 * %x) /u 32))) /u 2) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %masked = and i64 %div.16, 1152921504606846974 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + call void @use.i64(i64 %iv) + %iv.next = add i64 %iv, 2 + %ec = icmp eq i64 %iv.next, %masked + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll new file mode 100644 index 0000000000000..220c5a1deb1a0 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/ne-guard-multiple-trip-count.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print<scalar-evolution>' -disable-output %s 2>&1 | FileCheck %s + +declare void @foo() + +; Tests with multiple guards for the same value and different values. + +define void @test_guard_order_b_then_c_and_d(ptr %a, ptr %b, ptr %c, ptr %d) { +; CHECK-LABEL: 'test_guard_order_b_then_c_and_d' +; CHECK-NEXT: Classifying expressions for: @test_guard_order_b_then_c_and_d +; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 1 +; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_b_then_c_and_d +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %cmp.eq.b = icmp ne ptr %a, %b + %cmp.eq.c = icmp ne ptr %a, %c + %cmp.eq.d = icmp ne ptr %b, %d + call void @llvm.assume(i1 %cmp.eq.b) + call void @llvm.assume(i1 %cmp.eq.c) + call void @llvm.assume(i1 %cmp.eq.d) + br label %loop + +loop: + %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] + %iv.next = getelementptr i8, ptr %iv, i64 1 + call void @foo() + %ec = icmp eq ptr %iv.next, %b + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_guard_order_d_then_c_and_b(ptr %a, ptr %b, ptr %c, ptr %d) { +; CHECK-LABEL: 'test_guard_order_d_then_c_and_b' +; CHECK-NEXT: Classifying expressions for: @test_guard_order_d_then_c_and_b +; CHECK-NEXT: %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%a,+,1}<%loop> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8,
ptr %iv, i64 1 +; CHECK-NEXT: --> {(1 + %a),+,1}<%loop> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64) + %a) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guard_order_d_then_c_and_b +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + %cmp.eq.b = icmp ne ptr %a, %b + %cmp.eq.c = icmp ne ptr %a, %c + %cmp.eq.d = icmp ne ptr %b, %d + call void @llvm.assume(i1 %cmp.eq.d) + call void @llvm.assume(i1 %cmp.eq.c) + call void @llvm.assume(i1 %cmp.eq.b) + br label %loop + +loop: + %iv = phi ptr [ %a, %entry ], [ %iv.next, %loop ] + %iv.next = getelementptr i8, ptr %iv, i64 1 + call void @foo() + %ec = icmp eq ptr %iv.next, %b + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll b/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll index c79befac2fb1d..1c108bd7318e9 100644 --- a/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll +++ b/llvm/test/Analysis/ScalarEvolution/pr58402-large-number-of-zext-exprs.ll @@ -17,67 +17,67 @@ define i32 @pr58402_large_number_of_zext(ptr %dst) { ; CHECK-NEXT: %add7 = add i32 %i, 4 ; CHECK-NEXT: --> (4 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i1 = and i32 %add7, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2)) U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (4 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [4,5) S: [4,5) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.1 = add i32 %i1, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (8 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i2 = and i32 %add7.1, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2)) U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (8 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [8,9) S: [8,9) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.2 = add i32 %i2, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (12 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i3 = and i32 %add7.2, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2)) U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (12 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [12,13) S: [12,13) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.3 = add i32 %i3, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: {
%header: Variant } +; CHECK-NEXT: --> (16 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i4 = and i32 %add7.3, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (16 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [16,17) S: [16,17) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.4 = add i32 %i4, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (20 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i5 = and i32 %add7.4, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (20 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [20,21) S: [20,21) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.5 = add i32 %i5, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (24 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i6 = and i32 %add7.5, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (24 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [24,25) S: [24,25) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.6 = add i32 %i6, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (28 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i7 = and i32 %add7.6, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (28 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [28,29) S: [28,29) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.7 = add i32 %i7, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (32 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i8 = and i32 %add7.7, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to
i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (32 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [32,33) S: [32,33) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.8 = add i32 %i8, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (36 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i9 = and i32 %add7.8, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (36 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [36,37) S: [36,37) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.9 = add i32 %i9, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (40 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i10 = and i32 %add7.9, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (40 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [40,41) S: [40,41) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.10 = add i32 %i10, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (44 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i11 = and i32 %add7.10, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (44 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [44,45) S: [44,45) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.11 = add i32 %i11, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +;
CHECK-NEXT: --> (48 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i12 = and i32 %add7.11, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (48 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [48,49) S: [48,49) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.12 = add i32 %i12, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (52 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i13 = and i32 %add7.12, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (52 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [52,53) S: [52,53) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.13 = add i32 %i13, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (56 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i14 = and i32 %add7.13, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (56 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [56,57) S: [56,57) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.14 = add i32 %i14, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (60 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i15 = and i32 %add7.14, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 *
((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (60 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [60,61) S: [60,61) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %add7.15 = add i32 %i15, 4 -; CHECK-NEXT: --> (4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (64 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: %i16 = and i32 %add7.15, -2 -; CHECK-NEXT: --> (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((4 + (2 * ((zext i1 %cmp to i32) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2))) /u 2)) U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant } +; CHECK-NEXT: --> (64 + (2 * ((zext i1 %cmp to i32) /u 2))) U: [64,65) S: [64,65) Exits: <<Unknown>> LoopDispositions: { %header: Variant } ; CHECK-NEXT: Determining loop execution counts for: @pr58402_large_number_of_zext ; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count.
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll index e784d25385980..0c1f37bf58601 100644 --- a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll +++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll @@ -382,7 +382,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) { ; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64 ; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable } ; X64-NEXT: %i10 = sub i64 %i9, %i4 -; X64-NEXT: --> {0,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable } +; X64-NEXT: --> {0,+,1}<%bb6> U: [0,-1) S: [0,-1) Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable } ; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 ; X64-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable } ; X64-NEXT: %i12 = load i8, ptr %i11, align 1 @@ -393,7 +393,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) { ; X64-NEXT: --> {(1 + %arg),+,1}<%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable } ; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char ; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) -; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -1 +; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -2 ; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) ; X64-NEXT: Loop %bb6: Trip multiple is 1 ; @@ -406,9 +406,9 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) { ; X32-NEXT: %i8 = load i8, ptr %i7, align 1 ; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } ; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64 -; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<%bb6> U: [0,8589934591) S: [0,8589934591) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable } +; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<%bb6> U: [0,8589934590) S: [0,8589934590) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable } ; X32-NEXT: %i10 = sub i64 %i9, %i4 -; X32-NEXT: --> {0,+,1}<%bb6> U: [0,4294967296) S: [0,4294967296) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable } +; X32-NEXT: --> {0,+,1}<%bb6> U: [0,4294967295) S: [0,4294967295) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable } ; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 ; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable } ; X32-NEXT: %i12 = load i8, ptr %i11,
align 1 @@ -419,7 +419,7 @@ define void @pr46786_c26_char(ptr %arg, ptr %arg1, ptr %arg2) { ; X32-NEXT: --> {(1 + %arg),+,1}<%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable } ; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char ; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) -; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -1 +; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -2 ; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) ; X32-NEXT: Loop %bb6: Trip multiple is 1 ; @@ -447,6 +447,84 @@ bb5: ret void } +define void @pr46786_c26_char_cmp_ops_swapped(ptr %arg, ptr %arg1, ptr %arg2) { +; X64-LABEL: 'pr46786_c26_char_cmp_ops_swapped' +; X64-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped +; X64-NEXT: %i4 = ptrtoint ptr %arg to i64 +; X64-NEXT: --> (ptrtoint ptr %arg to i64) U: full-set S: full-set +; X64-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ] +; X64-NEXT: --> {%arg,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable } +; X64-NEXT: %i8 = load i8, ptr %i7, align 1 +; X64-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64 +; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable } +; X64-NEXT: %i10 = sub i64 %i9, %i4 +; X64-NEXT: --> {0,+,1}<%bb6> U: [0,-1) S: [0,-1) Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable } +; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 +; X64-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable } +; X64-NEXT: %i12 = load i8, ptr %i11, align 1 +; X64-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X64-NEXT: %i13 = add i8 %i12, %i8 +; X64-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X64-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1 +; X64-NEXT: --> {(1 + %arg),+,1}<%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable } +; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped +; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) +; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -2 +; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) +; X64-NEXT: Loop %bb6: Trip multiple is 1 +; +; X32-LABEL: 'pr46786_c26_char_cmp_ops_swapped' +; X32-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped +; X32-NEXT: %i4 = ptrtoint ptr %arg to i64 +; X32-NEXT: --> (zext i32 (ptrtoint ptr %arg to i32) to i64) U: [0,4294967296) S: [0,4294967296) +; X32-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ] +; X32-NEXT: --> {%arg,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr
%arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable } +; X32-NEXT: %i8 = load i8, ptr %i7, align 1 +; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64 +; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<%bb6> U: [0,8589934590) S: [0,8589934590) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable } +; X32-NEXT: %i10 = sub i64 %i9, %i4 +; X32-NEXT: --> {0,+,1}<%bb6> U: [0,4294967295) S: [0,4294967295) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable } +; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 +; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable } +; X32-NEXT: %i12 = load i8, ptr %i11, align 1 +; X32-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X32-NEXT: %i13 = add i8 %i12, %i8 +; X32-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant } +; X32-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1 +; X32-NEXT: --> {(1 + %arg),+,1}<%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable } +; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped +; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) +; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -2 +; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) +; X32-NEXT: Loop %bb6: Trip multiple is 1 +; + %i = icmp eq ptr %arg1, %arg + br i1 %i, label %bb5, label %bb3 + +bb3: + %i4 = ptrtoint ptr %arg to i64 + br label %bb6 + +bb6: + %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ] + %i8 = load i8, ptr %i7 + %i9 = ptrtoint ptr %i7 to i64 + %i10 = sub i64 %i9, %i4 + %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 + %i12 = load i8, ptr %i11 + %i13 = add i8 %i12, %i8 + store i8 %i13, ptr %i11 + %i14 = getelementptr inbounds i8, ptr %i7, i64 1 + %i15 = icmp eq ptr %i14, %arg1 + br i1 %i15, label %bb5, label %bb6 + +bb5: + ret void +} + + ; void pr46786_c26_int(int* start, int *end, int *other) { ; for (int* cur = start; cur != end; ++cur) ; other[cur - start] += *cur; diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll index 8d091a00ed4b9..d38010403dad7 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-minmax.ll @@ -61,7 +61,7 @@ define void @umin(i32 noundef %a, i32 noundef %b) { ; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a) umin (4 * %b))) ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646 ; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a) umin (4 * %b))) -; CHECK-NEXT: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Trip multiple is 2 ; ; void umin(unsigned a, unsigned b) { ; a *= 2; @@ -157,7 +157,7 @@ define void @smin(i32 noundef %a, i32 noundef %b) {
; CHECK-NEXT: Loop %for.body: backedge-taken count is (-1 + ((2 * %a) smin (4 * %b))) ; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 2147483646 ; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is (-1 + ((2 * %a) smin (4 * %b))) -; CHECK-NEXT: Loop %for.body: Trip multiple is 1 +; CHECK-NEXT: Loop %for.body: Trip multiple is 2 ; ; void smin(signed a, signed b) { ; a *= 2; diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll index bf140c7fa216a..f35c48b3e6fc5 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll @@ -574,5 +574,189 @@ exit: ret void } +define void @test_ptr_aligned_by_2_and_4_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptr_aligned_by_2_and_4_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptr_aligned_by_2_and_4_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_2_and_4_via_assumption +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_ptrs_aligned_by_4_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptrs_aligned_by_4_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_4_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-3) S:
[-9223372036854775808,9223372036854775805) Exits: ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_4_via_assumption +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 4) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_ptrs_aligned_by_8_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptrs_aligned_by_8_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_8_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64) + %start) LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_8_via_assumption +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +declare i1 @cond() + +define void @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors' +; CHECK-NEXT: Classifying expressions for: @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors +; CHECK-NEXT: %c = call i1 @cond() +; CHECK-NEXT: --> %c U: full-set S: full-set +; CHECK-NEXT: %iv = phi ptr [ %start, %then ], [ %start, %else ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <> LoopDispositions: { 
%loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2)) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + %c = call i1 @cond() + br i1 %c, label %then, label %else + +then: + br label %loop + +else: + br label %loop + +loop: + %iv = phi ptr [ %start, %then] , [ %start, %else ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_urem_non_constant(ptr %dst, i32 %a, i32 %b) { +; CHECK-LABEL: 'test_urem_non_constant' +; CHECK-NEXT: Classifying expressions for: @test_urem_non_constant +; CHECK-NEXT: %rem = urem i32 %a, %b +; CHECK-NEXT: --> ((-1 * (%a /u %b) * %b) + %a) U: full-set S: full-set +; CHECK-NEXT: %and.0 = and i1 %pre.0, %pre.1 +; CHECK-NEXT: --> (%pre.1 umin %pre.0) U: full-set S: full-set +; CHECK-NEXT: %and.1 = and i1 %and.0, %pre.2 +; CHECK-NEXT: --> (%pre.1 umin %pre.2 umin %pre.0) U: full-set S: full-set +; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {0,+,%b}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b +; CHECK-NEXT: --> ((sext i32 %b to i64) + %dst) U: full-set S: full-set Exits: ((sext i32 %b to i64) + %dst) LoopDispositions: { %loop: Invariant } +; CHECK-NEXT: %iv.next = add i32 %iv, %b +; CHECK-NEXT: --> {%b,+,%b}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_urem_non_constant +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; +entry: + %rem = urem i32 %a, %b + %pre.0 = icmp eq i32 %rem, 0 + %pre.1 = icmp ne i32 %a, 0 + %pre.2 = icmp ne i32 %b, 0 + %and.0 = and i1 %pre.0, %pre.1 + %and.1 = and i1 %and.0, %pre.2 + br i1 %and.1, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry], [ %iv.next, %loop ] + %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b + store i8 0, ptr %gep.dst + %iv.next = add i32 %iv, %b + %ec = icmp ne i32 %iv.next, %a + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + declare void @llvm.assume(i1) declare void @llvm.experimental.guard(i1, ...) diff --git a/llvm/test/Analysis/ScalarEvolution/zext-add.ll b/llvm/test/Analysis/ScalarEvolution/zext-add.ll new file mode 100644 index 0000000000000..a08feef7098ea --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/zext-add.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +declare i1 @cond() + +define void @test_push_constant_into_zext(ptr %dst, ptr %src, i32 %n, i64 %offset) { +; CHECK-LABEL: 'test_push_constant_into_zext' +; CHECK-NEXT: Classifying expressions for: @test_push_constant_into_zext +; CHECK-NEXT: %outer.ptr = phi ptr [ %src, %entry ], [ %ptr.iv.next, %inner.loop ] +; CHECK-NEXT: --> %outer.ptr U: full-set S: full-set Exits: <> LoopDispositions: { %outer.loop: Variant, %inner.loop: Invariant } +; CHECK-NEXT: %c = call i1 @cond() +; CHECK-NEXT: --> %c U: full-set S: full-set Exits: <> LoopDispositions: { %outer.loop: Variant, %inner.loop: Invariant } +; CHECK-NEXT: %iv = phi i32 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] +; CHECK-NEXT: --> {0,+,1}<%inner.loop> U: [0,2147483647) S: [0,2147483647) Exits: (-1 + (1 smax %n)) LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant } +; CHECK-NEXT: %ptr.iv = phi ptr [ %src, %outer.loop ], [ %ptr.iv.next, %inner.loop ] +; CHECK-NEXT: --> {%src,+,%offset}<%inner.loop> U: full-set S: full-set Exits: (((zext i32 (-1 + (1 smax %n)) to i64) * %offset) + %src) LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant } +; CHECK-NEXT: %l = load i8, ptr %outer.ptr, align 1 +; CHECK-NEXT: --> %l U: full-set S: full-set Exits: <> LoopDispositions: { %inner.loop: Variant, %outer.loop: Variant } +; CHECK-NEXT: %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 %offset +; CHECK-NEXT: --> {(%offset + %src),+,%offset}<%inner.loop> U: full-set S: full-set Exits: (((zext i32 (1 smax %n) to i64) * %offset) + %src) LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant } +; CHECK-NEXT: %iv.next = add i32 %iv, 1 +; CHECK-NEXT: --> {1,+,1}<%inner.loop> U: [1,-2147483648) S: [1,-2147483648) Exits: (1 smax %n) LoopDispositions: { %inner.loop: Computable, %outer.loop: Variant } +; CHECK-NEXT: Determining loop execution counts for: @test_push_constant_into_zext +; CHECK-NEXT: Loop %inner.loop: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %inner.loop: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %inner.loop: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %inner.loop: Trip multiple is 1 +; CHECK-NEXT: Loop %outer.loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %outer.loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %outer.loop: Unpredictable symbolic max backedge-taken count. 
+; +entry: + br label %outer.loop + +outer.loop: + %outer.ptr = phi ptr [ %src, %entry ], [ %ptr.iv.next, %inner.loop ] + %c = call i1 @cond() + br i1 %c, label %inner.loop, label %exit + +inner.loop: + %iv = phi i32 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] + %ptr.iv = phi ptr [ %src, %outer.loop ], [ %ptr.iv.next, %inner.loop ] + %l = load i8, ptr %outer.ptr, align 1 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 %offset + store i8 %l, ptr %dst, align 2 + %iv.next = add i32 %iv, 1 + %ec = icmp slt i32 %iv.next, %n + br i1 %ec, label %inner.loop, label %outer.loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll b/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll deleted file mode 100644 index 899d31d266e51..0000000000000 --- a/llvm/test/Analysis/ScalarEvolution/zext-signed-addrec.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: opt -loop-reduce -S < %s | FileCheck %s -; PR18000 - -target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@a = global i32 0, align 4 -@b = common global i32 0, align 4 -@e = common global i8 0, align 1 -@d = common global i32 0, align 4 -@c = common global i32 0, align 4 -@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 - -; Function Attrs: nounwind optsize uwtable -; CHECK-LABEL: foo -define i32 @foo() { -entry: - %.pr = load i32, ptr @b, align 4 - %cmp10 = icmp slt i32 %.pr, 1 - br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %entry.for.end9_crit_edge - -entry.for.end9_crit_edge: ; preds = %entry - %.pre = load i32, ptr @c, align 4 - br label %for.end9 - -for.cond1.preheader.lr.ph: ; preds = %entry - %0 = load i32, ptr @a, align 4 - %tobool = icmp eq i32 %0, 0 - br i1 %tobool, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge, label %return.loopexit.split - -for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader.lr.ph, %for.inc8 - %1 = phi i32 [ %inc, %for.inc8 ], [ %.pr, %for.cond1.preheader.lr.ph ] - br label %if.end - -; CHECK-LABEL: if.end -if.end: ; preds = %if.end, %for.cond1.preheader.for.cond1.preheader.split_crit_edge - -; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %if.end ], [ 258, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ] - %indvars.iv = phi i32 [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %indvars.iv.next, %if.end ] - - %2 = phi i8 [ 1, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %dec, %if.end ] - %conv7 = mul i32 %indvars.iv, 258 - %shl = and i32 %conv7, 510 - store i32 %shl, ptr @c, align 4 - -; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, -258 - %dec = add i8 %2, -1 - - %cmp2 = icmp sgt i8 %dec, -1 - %indvars.iv.next = add i32 %indvars.iv, -1 - br i1 %cmp2, label %if.end, label %for.inc8 - -for.inc8: ; preds = %if.end - store i32 0, ptr @d, align 4 - %inc = add nsw i32 %1, 1 - store i32 %inc, ptr @b, align 4 - %cmp = icmp slt i32 %1, 0 - br i1 %cmp, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge, label %for.cond.for.end9_crit_edge - -for.cond.for.end9_crit_edge: ; preds = %for.inc8 - store i8 %dec, ptr @e, align 1 - br label %for.end9 - -for.end9: ; preds = %entry.for.end9_crit_edge, %for.cond.for.end9_crit_edge - %3 = phi i32 [ %.pre, %entry.for.end9_crit_edge ], [ %shl, %for.cond.for.end9_crit_edge ] - %call = tail call i32 (ptr, ...) 
@printf(ptr @.str, i32 %3) #2 - br label %return - -return.loopexit.split: ; preds = %for.cond1.preheader.lr.ph - store i8 1, ptr @e, align 1 - store i32 0, ptr @d, align 4 - br label %return - -return: ; preds = %return.loopexit.split, %for.end9 - %retval.0 = phi i32 [ 0, %for.end9 ], [ 1, %return.loopexit.split ] - ret i32 %retval.0 -} - -; Function Attrs: nounwind optsize -declare i32 @printf(ptr nocapture readonly, ...) - diff --git a/llvm/test/CodeGen/PowerPC/common-chain.ll b/llvm/test/CodeGen/PowerPC/common-chain.ll index b71a360d1be12..8283e7bac3457 100644 --- a/llvm/test/CodeGen/PowerPC/common-chain.ll +++ b/llvm/test/CodeGen/PowerPC/common-chain.ll @@ -721,6 +721,13 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 ; CHECK-LABEL: spill_reduce_succ: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpdi r6, 0 +; CHECK-NEXT: ble cr0, .LBB7_9 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: sldi r6, r6, 2 +; CHECK-NEXT: li r11, 1 +; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; CHECK-NEXT: mr r26, r10 +; CHECK-NEXT: cmpdi r6, 1 ; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill @@ -733,231 +740,232 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 ; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill -; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill -; CHECK-NEXT: ble cr0, .LBB7_7 -; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: sldi r6, r6, 2 -; CHECK-NEXT: li r7, 1 -; CHECK-NEXT: mr r30, r10 -; CHECK-NEXT: cmpdi r6, 1 -; CHECK-NEXT: iselgt r7, r6, r7 -; CHECK-NEXT: addi r8, r7, -1 -; CHECK-NEXT: clrldi r6, r7, 63 -; CHECK-NEXT: cmpldi r8, 3 -; CHECK-NEXT: blt cr0, .LBB7_4 +; CHECK-NEXT: iselgt r11, r6, r11 +; CHECK-NEXT: addi r12, r11, -1 +; CHECK-NEXT: cmpldi r12, 3 +; CHECK-NEXT: clrldi r6, r11, 63 +; CHECK-NEXT: blt cr0, .LBB7_5 ; CHECK-NEXT: # %bb.2: # %for.body.preheader.new -; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload -; CHECK-NEXT: mulli r24, r30, 24 -; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload -; CHECK-NEXT: rldicl r0, r7, 62, 2 -; CHECK-NEXT: sldi r11, r30, 5 -; CHECK-NEXT: sldi r19, r30, 4 -; CHECK-NEXT: sldi r7, r14, 3 -; CHECK-NEXT: add r14, r30, r14 -; CHECK-NEXT: sldi r10, r16, 3 -; CHECK-NEXT: sldi r12, r15, 3 -; CHECK-NEXT: add r16, r30, r16 -; CHECK-NEXT: add r15, r30, r15 -; CHECK-NEXT: add r27, r11, r7 -; CHECK-NEXT: add r22, r24, r7 -; CHECK-NEXT: add r17, r19, r7 -; CHECK-NEXT: sldi r2, r14, 3 -; CHECK-NEXT: add r26, r24, r10 -; CHECK-NEXT: add r25, r24, r12 -; CHECK-NEXT: add r21, r19, r10 -; CHECK-NEXT: add r20, r19, r12 -; CHECK-NEXT: add r8, r11, r10 -; CHECK-NEXT: sldi r16, r16, 3 -; CHECK-NEXT: add 
r29, r5, r27 -; CHECK-NEXT: add r28, r4, r27 -; CHECK-NEXT: add r27, r3, r27 -; CHECK-NEXT: add r24, r5, r22 -; CHECK-NEXT: add r23, r4, r22 -; CHECK-NEXT: add r22, r3, r22 -; CHECK-NEXT: add r19, r5, r17 -; CHECK-NEXT: add r18, r4, r17 -; CHECK-NEXT: add r17, r3, r17 -; CHECK-NEXT: add r14, r5, r2 -; CHECK-NEXT: add r31, r4, r2 -; CHECK-NEXT: add r2, r3, r2 -; CHECK-NEXT: add r9, r5, r8 -; CHECK-NEXT: add r8, r11, r12 +; CHECK-NEXT: rldicl r11, r11, 62, 2 +; CHECK-NEXT: sldi r20, r8, 3 +; CHECK-NEXT: mr r14, r7 +; CHECK-NEXT: sldi r7, r7, 3 +; CHECK-NEXT: sldi r21, r9, 3 +; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r9, -208(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r8, -184(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r5, -200(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r4, -168(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r11, -192(r1) # 8-byte Folded Spill +; CHECK-NEXT: sldi r11, r10, 5 +; CHECK-NEXT: add r0, r11, r20 +; CHECK-NEXT: add r12, r11, r21 +; CHECK-NEXT: add r30, r5, r0 +; CHECK-NEXT: add r0, r11, r7 +; CHECK-NEXT: std r21, -216(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r20, -224(r1) # 8-byte Folded Spill +; CHECK-NEXT: add r12, r5, r12 +; CHECK-NEXT: add r29, r5, r0 +; CHECK-NEXT: add r28, r4, r0 +; CHECK-NEXT: add r27, r3, r0 +; CHECK-NEXT: mulli r0, r10, 24 +; CHECK-NEXT: std r14, -176(r1) # 8-byte Folded Spill +; CHECK-NEXT: add r26, r0, r21 +; CHECK-NEXT: add r25, r0, r20 +; CHECK-NEXT: add r0, r0, r7 +; CHECK-NEXT: add r24, r5, r0 +; CHECK-NEXT: add r23, r4, r0 +; CHECK-NEXT: add r22, r3, r0 +; CHECK-NEXT: sldi r0, r10, 4 ; CHECK-NEXT: add r26, r5, r26 ; CHECK-NEXT: add r25, r5, r25 +; CHECK-NEXT: add r21, r0, r21 +; CHECK-NEXT: add r20, r0, r20 +; CHECK-NEXT: add r0, r0, r7 +; CHECK-NEXT: add r19, r5, r0 +; CHECK-NEXT: add r18, r4, r0 +; CHECK-NEXT: add r17, r3, r0 +; CHECK-NEXT: add r0, r10, r9 ; CHECK-NEXT: add r21, r5, r21 ; CHECK-NEXT: add r20, r5, r20 -; CHECK-NEXT: add r16, r5, r16 -; CHECK-NEXT: add r8, r5, r8 -; CHECK-NEXT: rldicl r3, r0, 2, 1 -; CHECK-NEXT: addi r3, r3, -4 -; CHECK-NEXT: sub r0, r12, r7 -; CHECK-NEXT: sub r12, r10, r7 -; CHECK-NEXT: li r7, 0 -; CHECK-NEXT: mr r10, r30 -; CHECK-NEXT: sldi r15, r15, 3 -; CHECK-NEXT: add r15, r5, r15 -; CHECK-NEXT: rldicl r3, r3, 62, 2 -; CHECK-NEXT: addi r3, r3, 1 -; CHECK-NEXT: mtctr r3 +; CHECK-NEXT: sldi r0, r0, 3 +; CHECK-NEXT: add r16, r5, r0 +; CHECK-NEXT: add r0, r10, r8 +; CHECK-NEXT: sldi r0, r0, 3 +; CHECK-NEXT: add r15, r5, r0 +; CHECK-NEXT: add r0, r10, r14 +; CHECK-NEXT: sldi r0, r0, 3 +; CHECK-NEXT: add r2, r3, r0 +; CHECK-NEXT: ld r3, -224(r1) # 8-byte Folded Reload +; CHECK-NEXT: add r14, r5, r0 +; CHECK-NEXT: add r31, r4, r0 +; CHECK-NEXT: sub r0, r3, r7 +; CHECK-NEXT: ld r3, -192(r1) # 8-byte Folded Reload +; CHECK-NEXT: rldicl r9, r3, 2, 1 +; CHECK-NEXT: ld r3, -216(r1) # 8-byte Folded Reload +; CHECK-NEXT: addi r8, r9, -4 +; CHECK-NEXT: rldicl r8, r8, 62, 2 +; CHECK-NEXT: sub r7, r3, r7 +; CHECK-NEXT: ori r3, r9, 1 +; CHECK-NEXT: addi r8, r8, 1 +; CHECK-NEXT: mulld r3, r10, r3 +; CHECK-NEXT: mtctr r8 +; CHECK-NEXT: li r8, 0 +; CHECK-NEXT: std r10, -192(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r3, -216(r1) # 8-byte Folded Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB7_3: # %for.body ; CHECK-NEXT: # ; CHECK-NEXT: lfd f0, 0(r2) ; CHECK-NEXT: lfd f1, 0(r31) -; CHECK-NEXT: add r3, r10, r30 -; CHECK-NEXT: add r3, r3, r30 ; CHECK-NEXT: xsmuldp f0, f0, f1 ; CHECK-NEXT: lfd f1, 0(r14) -; CHECK-NEXT: add r3, r3, r30 -; CHECK-NEXT: add r10, r3, r30 ; 
CHECK-NEXT: xsadddp f0, f1, f0 ; CHECK-NEXT: stfd f0, 0(r14) ; CHECK-NEXT: add r14, r14, r11 ; CHECK-NEXT: lfdx f0, r2, r0 ; CHECK-NEXT: lfdx f1, r31, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r15, r7 +; CHECK-NEXT: lfdx f1, r15, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r15, r7 -; CHECK-NEXT: lfdx f0, r2, r12 -; CHECK-NEXT: lfdx f1, r31, r12 +; CHECK-NEXT: stfdx f0, r15, r8 +; CHECK-NEXT: lfdx f0, r2, r7 +; CHECK-NEXT: lfdx f1, r31, r7 ; CHECK-NEXT: add r2, r2, r11 ; CHECK-NEXT: add r31, r31, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r16, r7 +; CHECK-NEXT: lfdx f1, r16, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r16, r7 +; CHECK-NEXT: stfdx f0, r16, r8 ; CHECK-NEXT: lfd f0, 0(r17) ; CHECK-NEXT: lfd f1, 0(r18) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r19, r7 +; CHECK-NEXT: lfdx f1, r19, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r19, r7 +; CHECK-NEXT: stfdx f0, r19, r8 ; CHECK-NEXT: lfdx f0, r17, r0 ; CHECK-NEXT: lfdx f1, r18, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r20, r7 +; CHECK-NEXT: lfdx f1, r20, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r20, r7 -; CHECK-NEXT: lfdx f0, r17, r12 -; CHECK-NEXT: lfdx f1, r18, r12 +; CHECK-NEXT: stfdx f0, r20, r8 +; CHECK-NEXT: lfdx f0, r17, r7 +; CHECK-NEXT: lfdx f1, r18, r7 ; CHECK-NEXT: add r17, r17, r11 ; CHECK-NEXT: add r18, r18, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r21, r7 +; CHECK-NEXT: lfdx f1, r21, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r21, r7 +; CHECK-NEXT: stfdx f0, r21, r8 ; CHECK-NEXT: lfd f0, 0(r22) ; CHECK-NEXT: lfd f1, 0(r23) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r24, r7 +; CHECK-NEXT: lfdx f1, r24, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r24, r7 +; CHECK-NEXT: stfdx f0, r24, r8 ; CHECK-NEXT: lfdx f0, r22, r0 ; CHECK-NEXT: lfdx f1, r23, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r25, r7 +; CHECK-NEXT: lfdx f1, r25, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r25, r7 -; CHECK-NEXT: lfdx f0, r22, r12 -; CHECK-NEXT: lfdx f1, r23, r12 +; CHECK-NEXT: stfdx f0, r25, r8 +; CHECK-NEXT: lfdx f0, r22, r7 +; CHECK-NEXT: lfdx f1, r23, r7 ; CHECK-NEXT: add r22, r22, r11 ; CHECK-NEXT: add r23, r23, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r26, r7 +; CHECK-NEXT: lfdx f1, r26, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r26, r7 +; CHECK-NEXT: stfdx f0, r26, r8 ; CHECK-NEXT: lfd f0, 0(r27) ; CHECK-NEXT: lfd f1, 0(r28) ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r29, r7 +; CHECK-NEXT: lfdx f1, r29, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r29, r7 +; CHECK-NEXT: stfdx f0, r29, r8 ; CHECK-NEXT: lfdx f0, r27, r0 ; CHECK-NEXT: lfdx f1, r28, r0 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r8, r7 +; CHECK-NEXT: lfdx f1, r30, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r8, r7 -; CHECK-NEXT: lfdx f0, r27, r12 -; CHECK-NEXT: lfdx f1, r28, r12 +; CHECK-NEXT: stfdx f0, r30, r8 +; CHECK-NEXT: lfdx f0, r27, r7 +; CHECK-NEXT: lfdx f1, r28, r7 ; CHECK-NEXT: add r27, r27, r11 ; CHECK-NEXT: add r28, r28, r11 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r9, r7 +; CHECK-NEXT: lfdx f1, r12, r8 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r9, r7 -; CHECK-NEXT: add r7, r7, r11 +; CHECK-NEXT: stfdx f0, r12, r8 +; CHECK-NEXT: add r8, r8, r11 ; CHECK-NEXT: bdnz .LBB7_3 -; CHECK-NEXT: .LBB7_4: # 
%for.cond.cleanup.loopexit.unr-lcssa +; CHECK-NEXT: # %bb.4: +; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r4, -168(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r7, -176(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r8, -184(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r10, -192(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r5, -200(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r9, -208(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r26, -216(r1) # 8-byte Folded Reload +; CHECK-NEXT: .LBB7_5: # %for.cond.cleanup.loopexit.unr-lcssa ; CHECK-NEXT: cmpldi r6, 0 -; CHECK-NEXT: beq cr0, .LBB7_7 -; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader -; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload -; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload -; CHECK-NEXT: sldi r8, r30, 3 -; CHECK-NEXT: add r3, r10, r3 -; CHECK-NEXT: sldi r3, r3, 3 -; CHECK-NEXT: add r7, r5, r3 -; CHECK-NEXT: add r9, r4, r3 -; CHECK-NEXT: add r11, r0, r3 -; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r3, r10, r3 -; CHECK-NEXT: sldi r3, r3, 3 -; CHECK-NEXT: add r12, r5, r3 -; CHECK-NEXT: add r30, r4, r3 -; CHECK-NEXT: add r29, r0, r3 -; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload -; CHECK-NEXT: add r3, r10, r3 -; CHECK-NEXT: li r10, 0 -; CHECK-NEXT: sldi r3, r3, 3 -; CHECK-NEXT: add r5, r5, r3 -; CHECK-NEXT: add r4, r4, r3 -; CHECK-NEXT: add r3, r0, r3 +; CHECK-NEXT: beq cr0, .LBB7_8 +; CHECK-NEXT: # %bb.6: # %for.body.epil.preheader +; CHECK-NEXT: add r11, r26, r9 +; CHECK-NEXT: add r12, r26, r8 +; CHECK-NEXT: add r9, r26, r7 +; CHECK-NEXT: sldi r27, r10, 3 +; CHECK-NEXT: sldi r11, r11, 3 +; CHECK-NEXT: sldi r0, r12, 3 +; CHECK-NEXT: sldi r9, r9, 3 +; CHECK-NEXT: add r28, r5, r11 +; CHECK-NEXT: add r10, r4, r11 +; CHECK-NEXT: add r11, r3, r11 +; CHECK-NEXT: add r12, r5, r0 +; CHECK-NEXT: add r30, r4, r0 +; CHECK-NEXT: add r29, r3, r0 +; CHECK-NEXT: add r5, r5, r9 +; CHECK-NEXT: add r4, r4, r9 +; CHECK-NEXT: add r3, r3, r9 +; CHECK-NEXT: li r9, 0 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB7_6: # %for.body.epil +; CHECK-NEXT: .LBB7_7: # %for.body.epil ; CHECK-NEXT: # -; CHECK-NEXT: lfdx f0, r3, r10 -; CHECK-NEXT: lfdx f1, r4, r10 +; CHECK-NEXT: lfdx f0, r3, r9 +; CHECK-NEXT: lfdx f1, r4, r9 ; CHECK-NEXT: addi r6, r6, -1 ; CHECK-NEXT: cmpldi r6, 0 ; CHECK-NEXT: xsmuldp f0, f0, f1 ; CHECK-NEXT: lfd f1, 0(r5) ; CHECK-NEXT: xsadddp f0, f1, f0 ; CHECK-NEXT: stfd f0, 0(r5) -; CHECK-NEXT: add r5, r5, r8 -; CHECK-NEXT: lfdx f0, r29, r10 -; CHECK-NEXT: lfdx f1, r30, r10 +; CHECK-NEXT: add r5, r5, r27 +; CHECK-NEXT: lfdx f0, r29, r9 +; CHECK-NEXT: lfdx f1, r30, r9 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r12, r10 +; CHECK-NEXT: lfdx f1, r12, r9 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r12, r10 -; CHECK-NEXT: lfdx f0, r11, r10 -; CHECK-NEXT: lfdx f1, r9, r10 +; CHECK-NEXT: stfdx f0, r12, r9 +; CHECK-NEXT: lfdx f0, r11, r9 +; CHECK-NEXT: lfdx f1, r10, r9 ; CHECK-NEXT: xsmuldp f0, f0, f1 -; CHECK-NEXT: lfdx f1, r7, r10 +; CHECK-NEXT: lfdx f1, r28, r9 ; CHECK-NEXT: xsadddp f0, f1, f0 -; CHECK-NEXT: stfdx f0, r7, r10 -; CHECK-NEXT: add r10, r10, r8 -; CHECK-NEXT: bne cr0, .LBB7_6 -; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup +; CHECK-NEXT: stfdx f0, r28, r9 +; CHECK-NEXT: add r9, r9, r27 +; CHECK-NEXT: bne cr0, .LBB7_7 +; CHECK-NEXT: .LBB7_8: ; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload -; 
CHECK-NEXT: li r3, 0 ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload @@ -973,6 +981,8 @@ define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 ; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload ; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload +; CHECK-NEXT: .LBB7_9: # %for.cond.cleanup +; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: blr entry: %cmp49 = icmp sgt i64 %m, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 07aa05f609c40..6f4f03caf8f37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -912,7 +912,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt ; CHECK-NEXT: addi a6, a4, 1 ; CHECK-NEXT: andi a7, a6, -32 ; CHECK-NEXT: add a4, a7, a2 -; CHECK-NEXT: add a2, a4, a0 +; CHECK-NEXT: add a2, a0, a4 ; CHECK-NEXT: li t1, 5 ; CHECK-NEXT: vsetvli zero, t2, e8, m1, ta, ma ; CHECK-NEXT: .LBB14_3: # %bb15 @@ -1019,10 +1019,7 @@ define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: beqz a2, .LBB15_3 ; CHECK-NEXT: # %bb.1: # %bb2 -; CHECK-NEXT: addi a2, a2, -16 -; CHECK-NEXT: andi a2, a2, -16 -; CHECK-NEXT: add a2, a2, a0 -; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: li a3, 5 ; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma ; CHECK-NEXT: .LBB15_2: # %bb4 diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/fold-ext-add.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/fold-ext-add.ll new file mode 100644 index 0000000000000..640c910be4a82 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/AArch64/fold-ext-add.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p indvars -S %s | FileCheck %s + +target triple = "arm64-apple-macosx15.0.0" + +declare i1 @cond() + +define void @pred_mip_12(ptr %dst, ptr %src, i32 %n, i64 %offset) { +; CHECK-LABEL: define void @pred_mip_12( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 1) +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[SMAX]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[OFFSET]], [[TMP0]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: [[OUTER_PTR:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ], [ [[SCEVGEP]], %[[OUTER_LOOP_LOOPEXIT]] ] +; CHECK-NEXT: [[C:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[INNER_LOOP_PREHEADER]]: +; CHECK-NEXT: br label %[[INNER_LOOP:.*]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[OUTER_PTR]], align 1 +; CHECK-NEXT: store i8 [[L]], ptr [[DST]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], [[SMAX]] +; CHECK-NEXT: br 
i1 [[EXITCOND]], label %[[INNER_LOOP]], label %[[OUTER_LOOP_LOOPEXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %outer.loop + +outer.loop: + %outer.ptr = phi ptr [ %src, %entry ], [ %ptr.iv.next, %inner.loop ] + %c = call i1 @cond() + br i1 %c, label %inner.loop, label %exit + +inner.loop: + %iv = phi i32 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] + %ptr.iv = phi ptr [ %src, %outer.loop ], [ %ptr.iv.next, %inner.loop ] + %l = load i8, ptr %outer.ptr, align 1 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 %offset + store i8 %l, ptr %dst, align 2 + %iv.next = add i32 %iv, 1 + %ec = icmp slt i32 %iv.next, %n + br i1 %ec, label %inner.loop, label %outer.loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll b/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll index 99baa6105655d..4b52479fc6c4d 100644 --- a/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll +++ b/llvm/test/Transforms/IndVarSimplify/canonicalize-cmp.ll @@ -334,4 +334,88 @@ out_of_bounds: ret i32 -1 } +define void @slt_no_smax_needed(i64 %n, ptr %dst) { +; CHECK-LABEL: @slt_no_smax_needed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_TRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[N_TRUNC]], 1 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD_1]], 1 +; CHECK-NEXT: [[PRE:%.*]] = icmp ult i32 [[ADD_1]], 8 +; CHECK-NEXT: br i1 [[PRE]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SHR]], i32 1) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i32 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], [[SMAX]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %n.trunc = trunc i64 %n to i32 + %add.1 = add i32 %n.trunc, 1 + %shr = lshr i32 %add.1, 1 + %pre = icmp ult i32 %add.1, 8 + br i1 %pre, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr inbounds i8, ptr %dst, i32 %iv + store i8 0, ptr %gep, align 1 + %iv.next = add i32 %iv, 1 + %ec = icmp slt i32 %iv.next, %shr + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @ult_no_umax_needed(i64 %n, ptr %dst) { +; CHECK-LABEL: @ult_no_umax_needed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_TRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[N_TRUNC]], 1 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD_1]], 1 +; CHECK-NEXT: [[PRE:%.*]] = icmp ult i32 [[ADD_1]], 8 +; CHECK-NEXT: br i1 [[PRE]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[SHR]], i32 1) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i32 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], [[UMAX]] +; CHECK-NEXT: br i1 [[EXITCOND]], label 
[[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %n.trunc = trunc i64 %n to i32 + %add.1 = add i32 %n.trunc, 1 + %shr = lshr i32 %add.1, 1 + %pre = icmp ult i32 %add.1, 8 + br i1 %pre, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr inbounds i8, ptr %dst, i32 %iv + store i8 0, ptr %gep, align 1 + %iv.next = add i32 %iv, 1 + %ec = icmp ult i32 %iv.next, %shr + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + !0 = !{i32 1, i32 2147483648} diff --git a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll index b946bbf747083..69713e006ebb9 100644 --- a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll @@ -96,3 +96,201 @@ loop: exit: ret void } + +define i32 @urem_order1(i32 %n) { +; CHECK-LABEL: define i32 @urem_order1( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[UREM:%.*]] = urem i32 [[N]], 3 +; CHECK-NEXT: [[UREM_ZERO:%.*]] = icmp eq i32 [[UREM]], 0 +; CHECK-NEXT: br i1 [[UREM_ZERO]], label %[[PH:.*]], label %[[EXIT:.*]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[N_NON_ZERO:%.*]] = icmp ne i32 [[N]], 0 +; CHECK-NEXT: br i1 [[N_NON_ZERO]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 3 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ 2, %[[PH]] ], [ 3, %[[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %urem = urem i32 %n, 3 + %urem.zero = icmp eq i32 %urem, 0 + br i1 %urem.zero, label %ph, label %exit + +ph: + %n.non.zero = icmp ne i32 %n, 0 + br i1 %n.non.zero, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %ph ], [ %iv.next, %loop ] + call void @foo() + %iv.next = add i32 %iv, 3 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ 1, %entry ], [ 2, %ph ], [ 3, %loop ] + ret i32 %res +} + +define i32 @urem_order2(i32 %n) { +; CHECK-LABEL: define i32 @urem_order2( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[N_NON_ZERO:%.*]] = icmp ne i32 [[N]], 0 +; CHECK-NEXT: br i1 [[N_NON_ZERO]], label %[[PH:.*]], label %[[EXIT:.*]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[UREM:%.*]] = urem i32 [[N]], 3 +; CHECK-NEXT: [[UREM_ZERO:%.*]] = icmp eq i32 [[UREM]], 0 +; CHECK-NEXT: br i1 [[UREM_ZERO]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 3 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ 2, %[[PH]] ], [ 3, %[[EXIT_LOOPEXIT]] ] 
+; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %n.non.zero = icmp ne i32 %n, 0 + br i1 %n.non.zero, label %ph, label %exit + +ph: + %urem = urem i32 %n, 3 + %urem.zero = icmp eq i32 %urem, 0 + br i1 %urem.zero, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %ph ], [ %iv.next, %loop ] + call void @foo() + %iv.next = add i32 %iv, 3 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ 1, %entry ], [ 2, %ph ], [ 3, %loop ] + ret i32 %res +} + +define i64 @test_loop_with_div_order_1(i64 %n) { +; CHECK-LABEL: define i64 @test_loop_with_div_order_1( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IS_ZERO:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-NEXT: br i1 [[IS_ZERO]], label %[[EXIT:.*]], label %[[CHECK_BOUNDS:.*]] +; CHECK: [[CHECK_BOUNDS]]: +; CHECK-NEXT: [[N_PLUS_63:%.*]] = add i64 [[N]], 63 +; CHECK-NEXT: [[UPPER_BOUND:%.*]] = lshr i64 [[N_PLUS_63]], 6 +; CHECK-NEXT: [[BOUNDS_CHECK:%.*]] = icmp ult i64 [[N_PLUS_63]], 64 +; CHECK-NEXT: br i1 [[BOUNDS_CHECK]], label %[[EXIT]], label %[[CHECK_PARITY:.*]] +; CHECK: [[CHECK_PARITY]]: +; CHECK-NEXT: [[IS_ODD:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0 +; CHECK-NEXT: br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[DUMMY:%.*]] = load volatile i64, ptr null, align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[UPPER_BOUND]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %is_zero = icmp eq i64 %n, 0 + br i1 %is_zero, label %exit, label %check_bounds + +check_bounds: + %n_plus_63 = add i64 %n, 63 + %upper_bound = lshr i64 %n_plus_63, 6 + %bounds_check = icmp ult i64 %n_plus_63, 64 + br i1 %bounds_check, label %exit, label %check_parity + +check_parity: + %is_odd = and i64 %n, 1 + %parity_check = icmp eq i64 %is_odd, 0 + br i1 %parity_check, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv_next, %loop ], [ 0, %check_parity ] + %dummy = load volatile i64, ptr null, align 8 + %iv_next = add i64 %iv, 1 + %exit_cond = icmp ult i64 %iv_next, %upper_bound + br i1 %exit_cond, label %loop, label %exit + +exit: + ret i64 0 +} + +define i64 @test_loop_with_div_order_2(i64 %n) { +; CHECK-LABEL: define i64 @test_loop_with_div_order_2( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[N_PLUS_63:%.*]] = add i64 [[N]], 63 +; CHECK-NEXT: [[UPPER_BOUND:%.*]] = lshr i64 [[N_PLUS_63]], 6 +; CHECK-NEXT: [[BOUNDS_CHECK:%.*]] = icmp ult i64 [[N_PLUS_63]], 64 +; CHECK-NEXT: br i1 [[BOUNDS_CHECK]], label %[[EXIT:.*]], label %[[CHECK_BOUNDS:.*]] +; CHECK: [[CHECK_BOUNDS]]: +; CHECK-NEXT: [[IS_ZERO:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-NEXT: br i1 [[IS_ZERO]], label %[[EXIT]], label %[[CHECK_PARITY:.*]] +; CHECK: [[CHECK_PARITY]]: +; CHECK-NEXT: [[IS_ODD:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0 +; CHECK-NEXT: br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, 
%[[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[DUMMY:%.*]] = load volatile i64, ptr null, align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[UPPER_BOUND]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 0 +; +entry: + %n_plus_63 = add i64 %n, 63 + %upper_bound = lshr i64 %n_plus_63, 6 + %bounds_check = icmp ult i64 %n_plus_63, 64 + br i1 %bounds_check, label %exit, label %check_bounds + +check_bounds: + %is_zero = icmp eq i64 %n, 0 + br i1 %is_zero, label %exit, label %check_parity + +check_parity: + %is_odd = and i64 %n, 1 + %parity_check = icmp eq i64 %is_odd, 0 + br i1 %parity_check, label %loop, label %exit + +loop: + %iv = phi i64 [ %iv_next, %loop ], [ 0, %check_parity ] + %dummy = load volatile i64, ptr null, align 8 + %iv_next = add i64 %iv, 1 + %exit_cond = icmp ult i64 %iv_next, %upper_bound + br i1 %exit_cond, label %loop, label %exit + +exit: + ret i64 0 +} diff --git a/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll new file mode 100644 index 0000000000000..3bf1c2ef81cb9 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/pointer-loop-guards.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p indvars -S %s | FileCheck %s + +declare i1 @cond() + +define i64 @test_ptr_compare_guard(ptr %start, ptr %end) { +; CHECK-LABEL: define i64 @test_ptr_compare_guard( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[C_0:%.*]] = icmp eq ptr [[START]], [[END]] +; CHECK-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[I64_IV:%.*]] = phi i64 [ [[I64_IV_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[I64_IV_NEXT]] = add nuw i64 [[I64_IV]], 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ [[I64_IV]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RES_PH]], %[[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %c.0 = icmp eq ptr %start, %end + br i1 %c.0, label %exit, label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] + %i64.iv = phi i64 [ 0, %entry ], [ %i64.iv.next, %loop.latch ] + %c.1 = call i1 @cond() + br i1 %c.1, label %loop.latch, label %exit + +loop.latch: + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %i64.iv.next = add i64 %i64.iv, 1 + %c.2 = icmp eq ptr %ptr.iv.next, %end + br i1 %c.2, label %exit, label %loop.header + +exit: + %res = phi i64 [ 0, %entry ], [ %i64.iv, %loop.latch ], [ 0, %loop.header ] + 
ret i64 %res +} + +define void @test_sub_cmp(ptr align 8 %start, ptr %end) { +; CHECK-LABEL: define void @test_sub_cmp( +; CHECK-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]] +; CHECK-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-NEXT: [[CMP_LATCH:%.*]] = icmp ult i64 [[IV_NEXT]], [[PTR_DIFF]] +; CHECK-NEXT: br i1 [[CMP_LATCH]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_EARLY]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; N32-LABEL: define void @test_sub_cmp( +; N32-SAME: ptr align 8 [[START:%.*]], ptr [[END:%.*]]) { +; N32-NEXT: [[ENTRY:.*:]] +; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; N32-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[START]], [[END]] +; N32-NEXT: br i1 [[CMP_ENTRY]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; N32: [[LOOP_HEADER_PREHEADER]]: +; N32-NEXT: br label %[[LOOP_HEADER:.*]] +; N32: [[LOOP_HEADER]]: +; N32-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; N32-NEXT: [[C_1:%.*]] = call i1 @cond() +; N32-NEXT: br i1 [[C_1]], label %[[EXIT_EARLY:.*]], label %[[LOOP_LATCH]] +; N32: [[LOOP_LATCH]]: +; N32-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[PTR_DIFF]] +; N32-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT_LOOPEXIT:.*]] +; N32: [[EXIT_EARLY]]: +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT_LOOPEXIT]]: +; N32-NEXT: br label %[[EXIT]] +; N32: [[EXIT]]: +; N32-NEXT: ret void +; +entry: + %start.int = ptrtoint ptr %start to i64 + %end.int = ptrtoint ptr %end to i64 + %ptr.diff = sub i64 %start.int, %end.int + %cmp.entry = icmp eq ptr %start, %end + br i1 %cmp.entry, label %exit, label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %c.1 = call i1 @cond() + br i1 %c.1, label %exit.early, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %cmp.latch = icmp ult i64 %iv.next, %ptr.diff + br i1 %cmp.latch, label %loop.header, label %exit + +exit.early: + br label %exit + +exit: + ret void +} + + +define void @test_ptr_diff_with_assume(ptr align 8 %start, ptr align 8 %end, ptr %P) { +; CHECK-LABEL: define void @test_ptr_diff_with_assume( +; CHECK-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]] +; CHECK-NEXT: 
[[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; CHECK-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; CHECK-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]]
+; CHECK-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; CHECK-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; CHECK: [[LOOP_BODY_PREHEADER]]:
+; CHECK-NEXT: br label %[[LOOP_BODY:.*]]
+; CHECK: [[LOOP_BODY]]:
+; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; CHECK-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1
+; CHECK-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]]
+; CHECK-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; CHECK: [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+; N32-LABEL: define void @test_ptr_diff_with_assume(
+; N32-SAME: ptr align 8 [[START:%.*]], ptr align 8 [[END:%.*]], ptr [[P:%.*]]) {
+; N32-NEXT: [[ENTRY:.*:]]
+; N32-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64
+; N32-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64
+; N32-NEXT: [[PTR_DIFF:%.*]] = sub i64 [[START_INT]], [[END_INT]]
+; N32-NEXT: [[DIFF_CMP:%.*]] = icmp ult i64 [[PTR_DIFF]], 2
+; N32-NEXT: call void @llvm.assume(i1 [[DIFF_CMP]])
+; N32-NEXT: [[COMPUTED_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[PTR_DIFF]]
+; N32-NEXT: [[ENTRY_CMP:%.*]] = icmp eq ptr [[START]], [[END]]
+; N32-NEXT: br i1 [[ENTRY_CMP]], label %[[EXIT:.*]], label %[[LOOP_BODY_PREHEADER:.*]]
+; N32: [[LOOP_BODY_PREHEADER]]:
+; N32-NEXT: br label %[[LOOP_BODY:.*]]
+; N32: [[LOOP_BODY]]:
+; N32-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ]
+; N32-NEXT: [[TMP0:%.*]] = call i1 @cond()
+; N32-NEXT: [[IV_NEXT]] = getelementptr i8, ptr [[IV]], i64 1
+; N32-NEXT: [[LOOP_CMP:%.*]] = icmp eq ptr [[IV_NEXT]], [[COMPUTED_END]]
+; N32-NEXT: br i1 [[LOOP_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]]
+; N32: [[EXIT_LOOPEXIT]]:
+; N32-NEXT: br label %[[EXIT]]
+; N32: [[EXIT]]:
+; N32-NEXT: ret void
+;
+entry:
+ %start.int = ptrtoint ptr %start to i64
+ %end.int = ptrtoint ptr %end to i64
+ %ptr.diff = sub i64 %start.int, %end.int
+ %diff.cmp = icmp ult i64 %ptr.diff, 2
+ call void @llvm.assume(i1 %diff.cmp)
+ %computed.end = getelementptr i8, ptr %start, i64 %ptr.diff
+ %entry.cmp = icmp eq ptr %start, %end
+ br i1 %entry.cmp, label %exit, label %loop.body
+
+loop.body:
+ %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop.body ]
+ call i1 @cond()
+ %iv.next = getelementptr i8, ptr %iv, i64 1
+ %loop.cmp = icmp eq ptr %iv.next, %computed.end
+ br i1 %loop.cmp, label %exit, label %loop.body
+
+exit:
+ ret void
+} diff --git a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll index b1043b48d40ac..84ae79d53e25e 100644 --- a/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll +++ b/llvm/test/Transforms/IndVarSimplify/rewrite-loop-exit-values-phi.ll @@ -68,3 +68,44 @@ latch: ; preds = %inner_exit
 end: ; preds = %header
 ret void
 }
+
+
+declare void @foo()
+
+define i64 @narrow_canonical_iv_wide_multiplied_iv(i32 %x, i64 %y, ptr %0) {
+; CHECK-LABEL: @narrow_canonical_iv_wide_multiplied_iv(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SMAX:%.*]] = tail call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 1)
+; CHECK-NEXT: br label
[[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[SMAX]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[SMAX]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: ret i64 [[TMP6]] +; +entry: + %smax = tail call i32 @llvm.smax.i32(i32 %x, i32 1) + %mul.y = shl i64 %y, 1 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.mul = phi i64 [ 1, %entry ], [ %iv.mul.next, %loop ] + %iv.mul.next = add i64 %iv.mul, %mul.y + call void @foo() + %iv.next = add i32 %iv, 1 + %ec = icmp ult i32 %iv.next, %smax + br i1 %ec, label %loop, label %exit + +exit: + ret i64 %iv.mul.next +} + +declare i32 @llvm.smax.i32(i32, i32) diff --git a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll index d24f9a4e40e38..17921afc5ff06 100644 --- a/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll +++ b/llvm/test/Transforms/IndVarSimplify/zext-nuw.ll @@ -15,11 +15,9 @@ define void @_Z3fn1v() { ; CHECK-NEXT: [[J_SROA_0_0_COPYLOAD:%.*]] = load i8, ptr [[X5]], align 1 ; CHECK-NEXT: br label [[DOTPREHEADER4_LR_PH:%.*]] ; CHECK: .preheader4.lr.ph: -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[X4]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = sext i8 [[J_SROA_0_0_COPYLOAD]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = zext nneg i32 [[X4]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP2]] ; CHECK-NEXT: br label [[DOTPREHEADER4:%.*]] ; CHECK: .preheader4: ; CHECK-NEXT: [[K_09:%.*]] = phi ptr [ undef, [[DOTPREHEADER4_LR_PH]] ], [ [[X25:%.*]], [[X22:%.*]] ] diff --git a/llvm/test/Transforms/InstCombine/fold-bin-operand.ll b/llvm/test/Transforms/InstCombine/fold-bin-operand.ll index f28262b2a77e0..0023a10312451 100644 --- a/llvm/test/Transforms/InstCombine/fold-bin-operand.ll +++ b/llvm/test/Transforms/InstCombine/fold-bin-operand.ll @@ -30,6 +30,226 @@ define i32 @g(i32 %x) { ret i32 %b } +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp(ptr %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp( +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 10 +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_operands_swapped(ptr %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_operands_swapped( +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 10 +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 10, %i + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_constant_offset(ptr %src, i64 %off, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_constant_offset( +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr 
[[SRC:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, %off + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_constant_offset_operands_swapped(ptr %src, i64 %off, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_constant_offset_operands_swapped( +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[OFF:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %off, %i + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_different_src_address_spaces(ptr addrspace(1) %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_different_src_address_spaces( +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr addrspace(1) [[SRC:%.*]] to i64 +; CHECK-NEXT: [[A:%.*]] = add i64 [[I]], 10 +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[A]] to ptr +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2:%.*]], [[P]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr addrspace(1) %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_different_dst_address_spaces(ptr %src, ptr addrspace(1) %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_different_dst_address_spaces( +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 +; CHECK-NEXT: [[A:%.*]] = add i64 [[I]], 10 +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr addrspace(1) [[P2:%.*]], [[P]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr addrspace(1) + %c = icmp eq ptr addrspace(1) %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_int_type_does_not_match_ptr_ty(ptr %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_int_type_does_not_match_ptr_ty( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 +; CHECK-NEXT: [[I:%.*]] = trunc i64 [[TMP1]] to i8 +; CHECK-NEXT: [[A:%.*]] = add i8 [[I]], 10 +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[A]] to i64 +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2:%.*]], [[P]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i8 + %a = add i8 %i, 10 + %p = inttoptr i8 %a to ptr + %c = icmp eq ptr %p, %p2 + ret i1 %c +} + +define i1 @inttoptr_add_multiple_users_ptrtoint_used_by_single_icmp(ptr %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_multiple_users_ptrtoint_used_by_single_icmp( +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 +; CHECK-NEXT: [[A:%.*]] = add i64 [[I]], 10 +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[A]] to ptr +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2:%.*]], [[P]] +; CHECK-NEXT: call void @bar(i64 [[A]]) +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %c = icmp eq ptr %p, %p2 + call void @bar(i64 %a) + ret i1 %c +} + +define i1 @multiple_inttoptr_add_ptrtoint_used_by_single_icmp(ptr %src) { +; CHECK-LABEL: @multiple_inttoptr_add_ptrtoint_used_by_single_icmp( +; CHECK-NEXT: ret i1 false +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %a.2 = add i64 %i, 11 + 
%p.2 = inttoptr i64 %a.2 to ptr + %c = icmp eq ptr %p, %p.2 + ret i1 %c +} + +define i1 @multiple_inttoptr_add_ptrtoint_used_by_single_icmp_non_constant_offset(ptr %src, i64 %off.1) { +; CHECK-LABEL: @multiple_inttoptr_add_ptrtoint_used_by_single_icmp_non_constant_offset( +; CHECK-NEXT: ret i1 false +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %a.2 = add i64 %i, 11 + %p.2 = inttoptr i64 %a.2 to ptr + %c = icmp eq ptr %p, %p.2 + ret i1 %c +} + +define i1 @multiple_inttoptr_add_ptrtoint_used_by_single_icmp_multiple_non_constant_offset(ptr %src, i64 %off.1, i64 %off.2) { +; CHECK-LABEL: @multiple_inttoptr_add_ptrtoint_used_by_single_icmp_multiple_non_constant_offset( +; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[OFF_1:%.*]], [[OFF_2:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, %off.1 + %p = inttoptr i64 %a to ptr + %a.2 = add i64 %i, %off.2 + %p.2 = inttoptr i64 %a.2 to ptr + %c = icmp eq ptr %p, %p.2 + ret i1 %c +} + +define i1 @inttoptr_add_ptrtoint_used_by_single_icmp_in_different_bb(i1 %bc, ptr %src, ptr %p2) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_single_icmp_in_different_bb( +; CHECK-NEXT: br i1 [[BC:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 10 +; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: ret i1 [[C]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + br i1 %bc, label %then, label %else + +then: + %c = icmp eq ptr %p, %p2 + ret i1 %c + +else: + ret i1 false +} + +define i1 @inttoptr_add_ptrtoint_used_by_multiple_icmps(ptr %src, ptr %p2, ptr %p3) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_multiple_icmps( +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 10 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq ptr [[P2]], [[P:%.*]] +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[P2]], [[P3:%.*]] +; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[C_1]], [[C_2]] +; CHECK-NEXT: ret i1 [[XOR]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %c.1 = icmp eq ptr %p, %p2 + %c.2 = icmp eq ptr %p, %p3 + %xor = xor i1 %c.1, %c.2 + ret i1 %xor +} + +declare void @foo(ptr) +declare void @bar(i64) + +define i1 @inttoptr_add_ptrtoint_used_by_multiple_icmps_and_other_user(ptr %src, ptr %p2, ptr %p3) { +; CHECK-LABEL: @inttoptr_add_ptrtoint_used_by_multiple_icmps_and_other_user( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 10 +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[C_1:%.*]] = icmp eq ptr [[P2:%.*]], [[P]] +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[P3:%.*]], [[P]] +; CHECK-NEXT: [[XOR:%.*]] = xor i1 [[C_1]], [[C_2]] +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: ret i1 [[XOR]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + %c.1 = icmp eq ptr %p, %p2 + %c.2 = icmp eq ptr %p, %p3 + %xor = xor i1 %c.1, %c.2 + call void @foo(ptr %p) + ret i1 %xor +} + define i32 @h(i1 %A, i32 %B) { ; CHECK-LABEL: @h( ; CHECK-NEXT: EntryBlock: diff --git a/llvm/test/Transforms/InstCombine/inttoptr-add-phi.ll b/llvm/test/Transforms/InstCombine/inttoptr-add-phi.ll new file mode 100644 index 0000000000000..acceead23a5e5 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/inttoptr-add-phi.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine -S %s | FileCheck %s + +define i64 @inttoptr_used_by_phi_with_ptrtoint(i1 %c, ptr %src, ptr %p2) { +; CHECK-LABEL: define i64 @inttoptr_used_by_phi_with_ptrtoint( +; CHECK-SAME: i1 [[C:%.*]], ptr [[SRC:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[P:%.*]] = getelementptr i8, ptr [[SRC]], i64 10 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[TMP1]], %[[THEN]] ], [ 0, %[[ELSE]] ] +; CHECK-NEXT: ret i64 [[PHI]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + br i1 %c, label %then, label %else + +then: + br label %exit + +else: + br label %exit + +exit: + %phi = phi ptr [ %p, %then ], [ null, %else ] + %i.2 = ptrtoint ptr %phi to i64 + ret i64 %i.2 +} + +declare void @foo(ptr) + +define i64 @inttoptr_used_by_phi_with_other_users(i1 %c, ptr %src, ptr %p2) { +; CHECK-LABEL: define i64 @inttoptr_used_by_phi_with_other_users( +; CHECK-SAME: i1 [[C:%.*]], ptr [[SRC:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[A:%.*]] = add i64 [[I]], 10 +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[A]], %[[THEN]] ], [ 0, %[[ELSE]] ] +; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[A]] to ptr +; CHECK-NEXT: call void @foo(ptr [[P]]) +; CHECK-NEXT: ret i64 [[PHI]] +; + %i = ptrtoint ptr %src to i64 + %a = add i64 %i, 10 + %p = inttoptr i64 %a to ptr + br i1 %c, label %then, label %else + +then: + br label %exit + +else: + br label %exit + +exit: + %phi = phi ptr [ %p, %then ], [ null, %else ] + call void @foo(ptr %p) + %i.2 = ptrtoint ptr %phi to i64 + ret i64 %i.2 +} diff --git a/llvm/test/Transforms/LoopIdiom/X86/memset-size-compute.ll b/llvm/test/Transforms/LoopIdiom/X86/memset-size-compute.ll index ea2cfe74be264..0123f15334281 100644 --- a/llvm/test/Transforms/LoopIdiom/X86/memset-size-compute.ll +++ b/llvm/test/Transforms/LoopIdiom/X86/memset-size-compute.ll @@ -16,10 +16,10 @@ define void @test(ptr %ptr) { ; CHECK-NEXT: [[LIM_0:%.*]] = phi i32 [ 65, [[ENTRY:%.*]] ], [ 1, [[DEAD:%.*]] ] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 8 ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[LIM_0]], i32 2) -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[UMAX]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[UMAX]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[TMP2]], -8 ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SCEVGEP]], i8 0, i64 [[TMP2]], i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[SCEVGEP]], i8 0, i64 [[TMP1]], i1 false) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_PREHEADER]] ] diff --git a/llvm/test/Transforms/LoopIdiom/add-nsw-zext-fold.ll b/llvm/test/Transforms/LoopIdiom/add-nsw-zext-fold.ll index df32e60d5065a..bc1543d8361a7 100644 --- a/llvm/test/Transforms/LoopIdiom/add-nsw-zext-fold.ll +++ 
b/llvm/test/Transforms/LoopIdiom/add-nsw-zext-fold.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -p loop-idiom -S %s | FileCheck %s define void @fold_add_zext_to_sext(ptr %dst, i1 %start) { @@ -9,10 +9,10 @@ define void @fold_add_zext_to_sext(ptr %dst, i1 %start) { ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[START]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = sub i32 25, [[START_EXT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i32 [[START_EXT]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 100, [[TMP4]] ; CHECK-NEXT: [[TMP3:%.*]] = zext nneg i32 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[SCEVGEP]], i8 0, i64 [[TMP4]], i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[SCEVGEP]], i8 0, i64 [[TMP3]], i1 false) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START_EXT]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -40,3 +40,56 @@ loop: exit: ret void } + +declare i16 @get() + +define void @test_memset_size_can_use_info_from_guards(i32 %x, ptr %dst) { +; CHECK-LABEL: define void @test_memset_size_can_use_info_from_guards( +; CHECK-SAME: i32 [[X:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[LOOP1_BACKEDGE:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[X]], %[[LOOP1_BACKEDGE]] ] +; CHECK-NEXT: [[L:%.*]] = call i16 @get() +; CHECK-NEXT: [[L_EXT:%.*]] = zext i16 [[L]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[L_EXT]], [[P]] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[SUB]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[SUB]], 2 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP1_BACKEDGE]], label %[[LOOP2_PREHEADER:.*]] +; CHECK: [[LOOP1_BACKEDGE]]: +; CHECK-NEXT: br label %[[LOOP1]] +; CHECK: [[LOOP2_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[SUB]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[DST]], i8 0, i64 [[UMAX]], i1 false) +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], %[[LOOP2]] ], [ 0, %[[LOOP2_PREHEADER]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i32 [[IV_2]] +; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 +; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i32 [[IV_2_NEXT]], [[SHR]] +; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP2]], label %[[LOOP1_LOOPEXIT]] +; +entry: + br label %loop1 + +loop1: + %p = phi i32 [ 0, %entry ], [ %x, %loop1 ], [ %x, %loop2 ] + %l = call i16 @get() + %l.ext = zext i16 %l to i32 + %sub = sub i32 %l.ext, %p + %shr = lshr i32 %sub, 1 + %ec = icmp ult i32 %sub, 2 + br i1 %ec, label %loop1, label %loop2 + +loop2: + %iv.2 = phi i32 [ 0, %loop1 ], [ %iv.2.next, %loop2 ] + %gep.dst = getelementptr i8, ptr %dst, i32 %iv.2 + store i8 0, ptr %gep.dst, align 1 + %iv.2.next = add i32 %iv.2, 1 + %ec.2 = icmp ult i32 %iv.2.next, %shr + br i1 %ec.2, label %loop2, label %loop1 +} diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll 
b/llvm/test/Transforms/LoopIdiom/basic.ll index 8fdaac3fdffe3..e8ea912246728 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -682,10 +682,7 @@ define void @PR14241(ptr %s, i64 %size) { ; CHECK-NEXT: [[END_PTR:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i64 [[END_IDX]] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[S]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[SIZE]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], -8 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -4 ; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 4 [[S]], ptr align 4 [[SCEVGEP]], i64 [[TMP4]], i1 false) ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: diff --git a/llvm/test/Transforms/LoopIdiom/expand-scev-expand-simplifications.ll b/llvm/test/Transforms/LoopIdiom/expand-scev-expand-simplifications.ll new file mode 100644 index 0000000000000..9a59e5a8ccabb --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/expand-scev-expand-simplifications.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-idiom -S %s | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + +define void @test_simplify_scev_during_expansion_flags(i64 %start) { +; CHECK-LABEL: define void @test_simplify_scev_during_expansion_flags( +; CHECK-SAME: i64 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START_NEG:%.*]] = sub i64 0, [[START]] +; CHECK-NEXT: [[START_MUL:%.*]] = ashr exact i64 [[START_NEG]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[START_MUL]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 404, [[TMP0]] +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[START_MUL]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ null, %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %start.neg = sub i64 0, %start + %start.mul = ashr exact i64 %start.neg, 2 + br label %loop + +loop: + %iv = phi i64 [ %start.mul, %entry ], [ %iv.next, %loop ] + %ptr.iv = phi ptr [ null, %entry ], [ %ptr.iv.next, %loop ] + store i32 0, ptr %ptr.iv, align 4 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 4 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll new file mode 100644 index 0000000000000..a15db620e0082 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-idiom -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" + +declare void @foo() +declare void @bar() + +define 
void @scev_expand_ptrtoint(i8 %x, ptr %start) { +; CHECK-LABEL: define void @scev_expand_ptrtoint( +; CHECK-SAME: i8 [[X:%.*]], ptr [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START1:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1_LATCH:.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp ule i8 [[X]], 1 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_1_LATCH]], label %[[MIDDLE:.*]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1 +; CHECK-NEXT: br label %[[LOOP_1_HEADER]] +; CHECK: [[MIDDLE]]: +; CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1_HEADER]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: [[CMP_EXT:%.*]] = zext i1 [[CMP]] to i64 +; CHECK-NEXT: [[GEP_START:%.*]] = getelementptr i8, ptr [[PTR_IV_1_LCSSA]], i64 [[CMP_EXT]] +; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] +; CHECK: [[LOOP_2_HEADER]]: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], %[[LOOP_2_LATCH:.*]] ], [ 0, %[[MIDDLE]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_START]], %[[MIDDLE]] ], [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ] +; CHECK-NEXT: switch i8 [[X]], label %[[LOOP_2_LATCH]] [ +; CHECK-NEXT: i8 1, label %[[LOOP_3_PREHEADER:.*]] +; CHECK-NEXT: i8 4, label %[[LOOP_3_PREHEADER]] +; CHECK-NEXT: ] +; CHECK: [[LOOP_3_PREHEADER]]: +; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_2_HEADER]] ], [ [[INDVAR]], %[[LOOP_2_HEADER]] ] +; CHECK-NEXT: [[PTR_IV_2_LCSSA:%.*]] = phi ptr [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ], [ [[PTR_IV_2]], %[[LOOP_2_HEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1, [[START1]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[CMP_EXT]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDVAR_LCSSA]], [[TMP4]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; CHECK-NEXT: [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]]) +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_2_LATCH]]: +; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: br label %[[LOOP_2_HEADER]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[IV_3:%.*]] = phi i64 [ [[IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ 1, %[[LOOP_3_PREHEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR_IV_2_LCSSA]], i64 [[IV_3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[TMP6]], 0 +; CHECK-NEXT: [[IV_3_NEXT]] = add i64 [[IV_3]], 1 +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_3]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1.header + +loop.1.header: + %ptr.iv.1 = phi ptr [ %start, %entry ], [ %ptr.iv.1.next, %loop.1.latch ] + %c = icmp ule i8 %x, 1 + br i1 %c, label %loop.1.latch, label %middle + +loop.1.latch: + %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 1 + br label %loop.1.header + +middle: + %cmp = icmp eq i8 %x, 0 + %cmp.ext = zext i1 %cmp to i64 + %gep.start = getelementptr i8, ptr %ptr.iv.1, i64 %cmp.ext + br label %loop.2.header + +loop.2.header: + %ptr.iv.2 = phi ptr [ %gep.start, %middle ], [ %ptr.iv.2.next, %loop.2.latch ] + switch i8 %x, label %loop.2.latch [ + i8 1, label %loop.3 + i8 4, 
label %loop.3 + ] + +loop.2.latch: + %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 1 + br label %loop.2.header + +loop.3: + %iv.3 = phi i64 [ 1, %loop.2.header ], [ 1, %loop.2.header ], [ %iv.3.next, %loop.3 ] + %gep = getelementptr i8, ptr %ptr.iv.2, i64 %iv.3 + %1 = load i8, ptr %gep, align 1 + %ec = icmp eq i8 %1, 0 + %iv.3.next = add i64 %iv.3, 1 + br i1 %ec, label %exit, label %loop.3 + +exit: + ret void +} + +declare i1 @cond() + +define ptr @test_lcssa_reuse_preserve_lcssa() { +; CHECK-LABEL: define ptr @test_lcssa_reuse_preserve_lcssa() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_0_HEADER:.*]] +; CHECK: [[LOOP_0_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ null, %[[LOOP_0_HEADER]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr i8, ptr [[IV_1]], i64 1 +; CHECK-NEXT: [[EC_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[EC_1]], label %[[THEN:.*]], label %[[LOOP_1]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[IV_1_LCSSA1:%.*]] = phi ptr [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_0_LATCH:.*]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ] +; CHECK-NEXT: [[IV_1_LCSSA_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ] +; CHECK-NEXT: [[STRLEN:%.*]] = call i64 @strlen(ptr null) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[IV_1_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[STRLEN]] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[RES:%.*]], %[[LOOP_2]] ], [ [[IV_1_LCSSA_LCSSA]], %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[RES]] = getelementptr i8, ptr [[IV_2]], i64 1 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[IV_1_LCSSA_LCSSA]], align 1 +; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_2]] +; CHECK: [[LOOP_0_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_0_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret ptr [[SCEVGEP]] +; +entry: + br label %loop.0.header + +loop.0.header: + br label %loop.1 + +loop.1: + %iv.1 = phi ptr [ null, %loop.0.header ], [ %iv.1.next, %loop.1 ] + %iv.1.next = getelementptr i8, ptr %iv.1, i64 1 + %ec.1 = call i1 @cond() + br i1 %ec.1, label %then, label %loop.1 + +then: + %c.2 = call i1 @cond() + br i1 %c.2, label %loop.2, label %loop.0.latch + +loop.2: + %iv.2 = phi ptr [ %res, %loop.2 ], [ %iv.1, %then ] + %res = getelementptr i8, ptr %iv.2, i64 1 + %l = load i8, ptr %iv.1, align 1 + %ec.2 = icmp eq i8 %l, 0 + br i1 %ec.2, label %exit, label %loop.2 + +loop.0.latch: + br label %loop.0.header + +exit: + ret ptr %res +} + +; Test case for https://github.com/llvm/llvm-project/issues/155287. +; Make sure we don't crash when trying to construct a PtrToInt SCEV expression +; that results in SCEVCouldNotCompute. 
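+; (Here the pointer phi is in addrspace(1), which the datalayout above marks as +; non-integral via ni:1, so a PtrToInt SCEV cannot be formed for it.)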
+define void @phi_ptr_addressspace_ptrtoint_fail(ptr addrspace(1) %arg) { +; CHECK-LABEL: define void @phi_ptr_addressspace_ptrtoint_fail( +; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[ADD5:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[ADD5]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i16, ptr addrspace(1) [[ARG]], i64 [[ADD5]] +; CHECK-NEXT: br i1 false, label %[[LOOP_1]], label %[[LOOP_2_PH:.*]] +; CHECK: [[LOOP_2_PH]]: +; CHECK-NEXT: [[IV_1_LCSSA1:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(1) [ [[GETELEMENTPTR]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[IV_1_LCSSA1]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[ARG]], i64 [[TMP0]] +; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[SCEVGEP]], i8 0, i64 8, i1 false) +; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] +; CHECK: [[LOOP_2_HEADER]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_1_LCSSA]], %[[LOOP_2_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2_LATCH:.*]] ] +; CHECK-NEXT: [[GREP_ARG:%.*]] = getelementptr i32, ptr addrspace(1) [[ARG]], i64 [[IV_2]] +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV_2]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_2_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_2_LATCH]]: +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1 +; CHECK-NEXT: br label %[[LOOP_2_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +bb: + br label %loop.1 + +loop.1: + %iv.1 = phi i64 [ 0, %bb ], [ %add5, %loop.1] + %add5 = add i64 %iv.1, 1 + %getelementptr = getelementptr i16, ptr addrspace(1) %arg, i64 %add5 + br i1 false, label %loop.1, label %loop.2.ph + +loop.2.ph: + %phi = phi ptr addrspace(1) [ %getelementptr, %loop.1] + br label %loop.2.header + +loop.2.header: + %iv.2 = phi i64 [ %iv.1, %loop.2.ph ], [ %iv.2.next, %loop.2.latch ] + %grep.arg = getelementptr i32, ptr addrspace(1) %arg, i64 %iv.2 + store i32 0, ptr addrspace(1) %grep.arg, align 4 + %ec = icmp ult i64 %iv.2, 1 + br i1 %ec, label %loop.2.latch, label %exit + +loop.2.latch: + %iv.2.next = add i64 %iv.2, 1 + br label %loop.2.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll b/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll index 10e10653a431d..037851f364e24 100644 --- a/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll +++ b/llvm/test/Transforms/LoopLoadElim/invalidate-laa-after-versioning.ll @@ -59,19 +59,14 @@ define void @test(ptr %arg, i64 %arg1) { ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_1]], i64 1 ; CHECK-NEXT: br label [[INNER_2:%.*]] ; CHECK: inner.2: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[INNER_2]] ], [ 0, [[INNER_1_EXIT]] ] ; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_5]], [[INNER_1_EXIT]] ], [ [[PTR_IV_2_NEXT:%.*]], [[INNER_2]] ] ; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds double, ptr [[PTR_IV_2]], i64 1 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br i1 false, label [[INNER_3_LVER_CHECK:%.*]], label [[INNER_2]] ; CHECK: inner.3.lver.check: -; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], [[INNER_2]] ] ; CHECK-NEXT: [[LCSSA_PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2]], [[INNER_2]] ] ; CHECK-NEXT: 
[[GEP_6:%.*]] = getelementptr inbounds double, ptr [[PTR_PHI]], i64 1 ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_2]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR_LCSSA]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 24 -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_2]], i64 16 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[GEP_7]], [[GEP_1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR_PHI]], [[SCEVGEP3]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] @@ -104,7 +99,7 @@ define void @test(ptr %arg, i64 %arg1) { ; CHECK-NEXT: br i1 [[C_2]], label [[OUTER_LATCH_LOOPEXIT4:%.*]], label [[INNER_3]] ; CHECK: outer.latch.loopexit: ; CHECK-NEXT: br label [[OUTER_LATCH]] -; CHECK: outer.latch.loopexit4: +; CHECK: outer.latch.loopexit3: ; CHECK-NEXT: br label [[OUTER_LATCH]] ; CHECK: outer.latch: ; CHECK-NEXT: br label [[INNER_1_LVER_CHECK]] diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/zext-signed-addrec.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/zext-signed-addrec.ll new file mode 100644 index 0000000000000..f2fa771ac6f29 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/zext-signed-addrec.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -loop-reduce -S %s | FileCheck %s +; PR18000 + +target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global i32 0, align 4 +@b = common global i32 0, align 4 +@e = common global i8 0, align 1 +@d = common global i32 0, align 4 +@c = common global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 + +define i32 @foo() { +; CHECK-LABEL: define i32 @foo() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOTPR:%.*]] = load i32, ptr @b, align 4 +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[DOTPR]], 1 +; CHECK-NEXT: br i1 [[CMP10]], label %[[OUTER_PH:.*]], label %[[ENTRY_ELSE:.*]] +; CHECK: [[ENTRY_ELSE]]: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, ptr @c, align 4 +; CHECK-NEXT: br label %[[MERGE:.*]] +; CHECK: [[OUTER_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @a, align 4 +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[OUTER_HEADER_PREHEADER:.*]], label %[[P_ELSE:.*]] +; CHECK: [[OUTER_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC:%.*]], %[[OUTER_LATCH:.*]] ], [ [[DOTPR]], %[[OUTER_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[INNER_LOOP:.*]] +; CHECK: [[INNER_LOOP]]: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 516, %[[OUTER_HEADER]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i8 [ 1, %[[OUTER_HEADER]] ], [ [[DEC:%.*]], %[[INNER_LOOP]] ] +; CHECK-NEXT: [[SHL:%.*]] = add i32 [[LSR_IV]], -258 +; CHECK-NEXT: store i32 [[SHL]], ptr @c, align 4 +; CHECK-NEXT: [[DEC]] = add i8 [[TMP2]], -1 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], -258 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i8 [[DEC]], -1 +; CHECK-NEXT: br i1 [[CMP2]], label %[[INNER_LOOP]], label %[[OUTER_LATCH]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[LSR_IV_NEXT_LCSSA:%.*]] = phi i32 [ [[LSR_IV_NEXT]], %[[INNER_LOOP]] ] +; CHECK-NEXT: store i32 0, ptr @d, align 4 +; CHECK-NEXT: [[INC]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 
[[INC]], ptr @b, align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[OUTER_HEADER]], label %[[OUTER_EXIT:.*]] +; CHECK: [[OUTER_EXIT]]: +; CHECK-NEXT: [[LSR_IV_NEXT_LCSSA_LCSSA:%.*]] = phi i32 [ [[LSR_IV_NEXT_LCSSA]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: store i8 [[DEC]], ptr @e, align 1 +; CHECK-NEXT: br label %[[MERGE]] +; CHECK: [[MERGE]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[DOTPRE]], %[[ENTRY_ELSE]] ], [ [[LSR_IV_NEXT_LCSSA_LCSSA]], %[[OUTER_EXIT]] ] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @bar(i32 [[TMP3]]) +; CHECK-NEXT: br label %[[RETURN:.*]] +; CHECK: [[P_ELSE]]: +; CHECK-NEXT: store i8 1, ptr @e, align 1 +; CHECK-NEXT: store i32 0, ptr @d, align 4 +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, %[[MERGE]] ], [ 1, %[[P_ELSE]] ] +; CHECK-NEXT: ret i32 [[RETVAL_0]] +; +entry: + %.pr = load i32, ptr @b, align 4 + %cmp10 = icmp slt i32 %.pr, 1 + br i1 %cmp10, label %outer.ph, label %entry.else + +entry.else: + %.pre = load i32, ptr @c, align 4 + br label %merge + +outer.ph: + %0 = load i32, ptr @a, align 4 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %outer.header, label %p.else + +outer.header: + %1 = phi i32 [ %.pr, %outer.ph ], [ %inc, %outer.latch ] + br label %inner.loop + +inner.loop: + %iv = phi i32 [ 1, %outer.header ], [ %iv.next, %inner.loop ] + %2 = phi i8 [ 1, %outer.header ], [ %dec, %inner.loop ] + %conv7 = mul i32 %iv, 258 + %shl = and i32 %conv7, 510 + store i32 %shl, ptr @c, align 4 + %dec = add i8 %2, -1 + %cmp2 = icmp sgt i8 %dec, -1 + %iv.next = add i32 %iv, -1 + br i1 %cmp2, label %inner.loop, label %outer.latch + +outer.latch: + store i32 0, ptr @d, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, ptr @b, align 4 + %cmp = icmp slt i32 %1, 0 + br i1 %cmp, label %outer.header, label %outer.exit + +outer.exit: + store i8 %dec, ptr @e, align 1 + br label %merge + +merge: + %3 = phi i32 [ %.pre, %entry.else ], [ %shl, %outer.exit ] + %call = tail call i32 @bar(i32 %3) + br label %return + +p.else: + store i8 1, ptr @e, align 1 + store i32 0, ptr @d, align 4 + br label %return + +return: + %retval.0 = phi i32 [ 0, %merge ], [ 1, %p.else ] + ret i32 %retval.0 +} + +declare i32 @bar(i32) + diff --git a/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll b/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll index cee8c8abdb450..43389b5df8f00 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/duplicated-phis.ll @@ -18,8 +18,7 @@ define i64 @test_duplicated_phis(i64 noundef %N) { ; CHECK: [[FOR_BODY_PREHEADER_NEW]]: ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[MUL]], -4 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UNROLL_ITER]], -4 -; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP4]], 1 ; CHECK-NEXT: [[LSR_IV_NEXT:%.*]] = sub i64 -3, [[TMP3]] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: @@ -84,3 +83,41 @@ for.end: %res.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.end.loopexit.unr-lcssa ] ret i64 %res.0.lcssa } + +define i64 @duplicated_phis_compare_uses_mul_udiv(i64 %x) { +; CHECK-LABEL: define i64 @duplicated_phis_compare_uses_mul_udiv( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MUL_2:%.*]] = shl i64 [[X]], 1 +; CHECK-NEXT: [[DIV_16:%.*]] = lshr i64 [[MUL_2]], 4 +; CHECK-NEXT: [[MASKED:%.*]] = and i64 [[DIV_16]], 
1152921504606846974 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void @clobber() +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[MASKED]], [[IV_1_NEXT]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret i64 [[IV_1_NEXT]] +; +entry: + %mul.2 = shl i64 %x, 1 + %div.16 = lshr exact i64 %mul.2, 4 + %masked = and i64 %div.16, 1152921504606846974 + br label %loop + +loop: + %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i64 [ 0, %entry ], [ %iv.2.next, %loop ] + call void @clobber() + %iv.1.next = add i64 %iv.1, 2 + %iv.2.next = add i64 %iv.2, 2 + %ec = icmp eq i64 %iv.2.next, %masked + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %iv.1.next +} + +declare void @clobber() diff --git a/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll new file mode 100644 index 0000000000000..f74fb14e397f3 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -S %s | FileCheck %s + +target triple = "hexagon-unknown-linux" + +declare void @foo() + +define void @preserve_lcssa_when_reusing_existing_phi() { +; CHECK-LABEL: define void @preserve_lcssa_when_reusing_existing_phi() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] +; CHECK: [[LOOP_2_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ 0, %[[LOOP_2_HEADER]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1 +; CHECK-NEXT: br i1 false, label %[[PH:.*]], label %[[LOOP_3]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[IV_3_LCSSA:%.*]] = phi i32 [ [[IV_3]], %[[LOOP_3]] ] +; CHECK-NEXT: br i1 true, label %[[LOOP_2_LATCH:.*]], label %[[LOOP_4_PREHEADER:.*]] +; CHECK: [[LOOP_4_PREHEADER]]: +; CHECK-NEXT: [[IV_3_LCSSA_LCSSA1:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ] +; CHECK-NEXT: [[IV_3_LCSSA_LCSSA:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_3_LCSSA_LCSSA1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7 +; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_1_LATCH_UNR_LCSSA:.*]], label %[[LOOP_4_PREHEADER_NEW:.*]] +; CHECK: [[LOOP_4_PREHEADER_NEW]]: +; CHECK-NEXT: br label %[[LOOP_4:.*]] +; CHECK: [[LOOP_2_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_2_HEADER]] +; CHECK: [[LOOP_4]]: +; CHECK-NEXT: [[IV_4:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[INC_I_7:%.*]], %[[LOOP_4]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP_4]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[INC_I_7]] = add nuw nsw i32 [[IV_4]], 8 +; CHECK-NEXT: 
[[NITER_NEXT_7]] = add nuw nsw i32 [[NITER]], 8 +; CHECK-NEXT: br i1 true, label %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_4]] +; CHECK: [[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[IV_4_UNR_PH:%.*]] = phi i32 [ [[INC_I_7]], %[[LOOP_4]] ] +; CHECK-NEXT: br label %[[LOOP_1_LATCH_UNR_LCSSA]] +; CHECK: [[LOOP_1_LATCH_UNR_LCSSA]]: +; CHECK-NEXT: [[IV_4_UNR:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER]] ], [ [[IV_4_UNR_PH]], %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_4_EPIL_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]] +; CHECK: [[LOOP_4_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_4_EPIL:.*]] +; CHECK: [[LOOP_4_EPIL]]: +; CHECK-NEXT: [[IV_4_EPIL:%.*]] = phi i32 [ [[INC_I_EPIL:%.*]], %[[LOOP_4_EPIL]] ], [ [[IV_4_UNR]], %[[LOOP_4_EPIL_PREHEADER]] ] +; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i32 [ 0, %[[LOOP_4_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_4_EPIL]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[INC_I_EPIL]] = add i32 [[IV_4_EPIL]], 1 +; CHECK-NEXT: [[EC_EPIL:%.*]] = icmp eq i32 [[IV_4_EPIL]], [[IV_3_LCSSA_LCSSA]] +; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i32 [[EPIL_ITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_4_EPIL]], label %[[LOOP_1_LATCH_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[LOOP_1_LATCH_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[LOOP_1_LATCH]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_1_HEADER]] +; +entry: + br label %loop.1.header + +loop.1.header: + br label %loop.2.header + +loop.2.header: + br label %loop.3 + +loop.3: + %iv.3 = phi i32 [ %iv.3.next, %loop.3 ], [ 0, %loop.2.header ] + call void @foo() + %iv.3.next = add i32 %iv.3, 1 + br i1 false, label %ph, label %loop.3 + +ph: + br i1 true, label %loop.2.latch, label %loop.4 + +loop.2.latch: + br label %loop.2.header + +loop.4: + %iv.4 = phi i32 [ 0, %ph ], [ %inc.i, %loop.4 ] + call void @foo() + %inc.i = add i32 %iv.4, 1 + %ec = icmp eq i32 %iv.4, %iv.3 + br i1 %ec, label %loop.1.latch, label %loop.4 + +loop.1.latch: + br label %loop.1.header +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +;. 
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-guards.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-guards.ll index 824e23fcf3e6e..c1d8395965577 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-guards.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-guards.ll @@ -201,6 +201,78 @@ loop.latch: %ec = icmp eq i32 %iv.next, %n br i1 %ec, label %exit, label %loop.header +exit: + ret void +} + +define void @test_peel_guard_sub_1_btc(i32 %n) { +; CHECK-LABEL: define void @test_peel_guard_sub_1_btc( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[PRE:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-NEXT: br i1 [[PRE]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_HEADER_PREHEADER_SPLIT:.*]], label %[[EXIT_LOOPEXIT_PEEL_BEGIN:.*]] +; CHECK: [[LOOP_HEADER_PREHEADER_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 1, %[[LOOP_HEADER_PREHEADER_SPLIT]] ] +; CHECK-NEXT: br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[CALL136:%.*]] = load volatile ptr, ptr null, align 4294967296 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[N]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[TMP2]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT_PEEL_BEGIN_LOOPEXIT:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT_LOOPEXIT_PEEL_BEGIN_LOOPEXIT]]: +; CHECK-NEXT: [[DOTPH:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT_PEEL_BEGIN]] +; CHECK: [[EXIT_LOOPEXIT_PEEL_BEGIN]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 1, %[[LOOP_HEADER_PREHEADER]] ], [ [[DOTPH]], %[[EXIT_LOOPEXIT_PEEL_BEGIN_LOOPEXIT]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER_PEEL:.*]] +; CHECK: [[LOOP_HEADER_PEEL]]: +; CHECK-NEXT: [[CMP115_PEEL:%.*]] = icmp eq i32 [[TMP3]], [[SUB]] +; CHECK-NEXT: br i1 [[CMP115_PEEL]], label %[[LOOP_LATCH_PEEL:.*]], label %[[THEN_PEEL:.*]] +; CHECK: [[THEN_PEEL]]: +; CHECK-NEXT: [[CALL136_PEEL:%.*]] = load volatile ptr, ptr null, align 4294967296 +; CHECK-NEXT: br label %[[LOOP_LATCH_PEEL]] +; CHECK: [[LOOP_LATCH_PEEL]]: +; CHECK-NEXT: [[IV_NEXT_PEEL:%.*]] = add nuw i32 [[TMP3]], 1 +; CHECK-NEXT: [[EC_PEEL:%.*]] = icmp eq i32 [[IV_NEXT_PEEL]], [[N]] +; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[EXIT_LOOPEXIT_PEEL_NEXT:.*]], label %[[EXIT_LOOPEXIT_PEEL_NEXT]] +; CHECK: [[EXIT_LOOPEXIT_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[LOOP_HEADER_PEEL_NEXT:.*]] +; CHECK: [[LOOP_HEADER_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %sub = add i32 %n, -1 + %pre = icmp eq i32 %sub, 0 + br i1 %pre, label %exit, label %loop.header + +loop.header: ; preds = %loop.latch, %entry + %iv = phi i32 [ %iv.next, %loop.latch ], [ 1, %entry ] + %cmp115 = icmp eq i32 %iv, %sub + br i1 %cmp115, label %loop.latch, label %then + +then: + %call136 = load volatile ptr, ptr null, align 4294967296 + br label %loop.latch + +loop.latch: + %iv.next = add nuw i32 %iv, 1 + 
%ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop.header + exit: ret void } @@ -208,4 +280,5 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1} ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopUnroll/scevunroll.ll b/llvm/test/Transforms/LoopUnroll/scevunroll.ll index b6b14e365cc1d..995eb0c52ff36 100644 --- a/llvm/test/Transforms/LoopUnroll/scevunroll.ll +++ b/llvm/test/Transforms/LoopUnroll/scevunroll.ll @@ -435,3 +435,61 @@ exit: } declare void @fn(i32) + +define void @peel_int_eq_condition(i32 %start) { +; CHECK-LABEL: @peel_int_eq_condition( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_PEEL_BEGIN:%.*]] +; CHECK: loop.peel.begin: +; CHECK-NEXT: br label [[LOOP_PEEL:%.*]] +; CHECK: loop.peel: +; CHECK-NEXT: [[C_0_PEEL:%.*]] = icmp eq i32 [[START:%.*]], [[START]] +; CHECK-NEXT: br i1 [[C_0_PEEL]], label [[IF_THEN_PEEL:%.*]], label [[LOOP_LATCH_PEEL:%.*]] +; CHECK: if.then.peel: +; CHECK-NEXT: call void @fn(i32 [[START]]) +; CHECK-NEXT: br label [[LOOP_LATCH_PEEL]] +; CHECK: loop.latch.peel: +; CHECK-NEXT: [[IV_NEXT_PEEL:%.*]] = add i32 [[START]], 1 +; CHECK-NEXT: [[EXITCOND_PEEL:%.*]] = icmp slt i32 [[START]], 100 +; CHECK-NEXT: br i1 [[EXITCOND_PEEL]], label [[LOOP_PEEL_NEXT:%.*]], label [[EXIT:%.*]] +; CHECK: loop.peel.next: +; CHECK-NEXT: br label [[LOOP_PEEL_NEXT1:%.*]] +; CHECK: loop.peel.next1: +; CHECK-NEXT: br label [[ENTRY_PEEL_NEWPH:%.*]] +; CHECK: entry.peel.newph: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_PEEL]], [[ENTRY_PEEL_NEWPH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: call void @fn(i32 [[IV]]) +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop.latch ] + %c.0 = icmp eq i32 %iv, %start + br i1 %c.0, label %if.then, label %loop.latch + +if.then: + call void @fn(i32 %iv) + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %ec = icmp slt i32 %iv, 100 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 427a05cc1c843..2b60480afa476 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -63,7 +63,9 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]] ; CHECK-NEXT: 
[[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index 1a8e5940d88e7..3c091ceb17144 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -63,7 +63,9 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 3bb20e2a81d83..bd84494d6c6c5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -37,7 +37,8 @@ define i64 @same_exit_block_pre_inc_use1() #1 { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP16]]) +; CHECK-NEXT: [[TMP8:%.*]] = freeze [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP8]]) ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] ; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -118,7 +119,8 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <2 x i64> [[VEC_IND]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <2 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -194,7 +196,8 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: 
[[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -281,7 +284,8 @@ define i64 @loop_contains_safe_div() #1 { ; CHECK-NEXT: [[TMP13:%.*]] = udiv [[WIDE_LOAD]], splat (i32 20000) ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne [[TMP13]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX2]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP15]]) +; CHECK-NEXT: [[TMP9:%.*]] = freeze [[TMP15]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]] ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -363,7 +367,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -464,7 +469,8 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) +; CHECK-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]]) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll index 61ef3cef603fa..21cecea735ad5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll @@ -29,24 +29,25 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP8]], align 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] -; 
CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = icmp ne [[WIDE_LOAD4]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP29]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = icmp ne [[WIDE_LOAD4]], [[WIDE_LOAD8]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP32]]) -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] -; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP34:%.*]] = freeze [[TMP59]] +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP34]]) +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] +; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP12]], [[TMP35]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: br i1 [[TMP12]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_INC:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP59]], i1 true) ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]] ; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]] ; CHECK-NEXT: br label [[LOOP_END]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index c4fc60908c7e0..132e77951abd6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -597,7 +597,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-LABEL: @test_gather_not_profitable_pr48429( ; AVX512-NEXT: entry: ; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64 -; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]] +; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[IDX_EXT]] ; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0 ; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]] ; AVX512: iter.check: @@ -616,17 +616,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6 ; AVX512-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8 ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]] -; AVX512-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2 -; AVX512-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4 -; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]] ; AVX512-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]] -; 
AVX512-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]] -; AVX512-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]] ; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]] ; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]] +; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[PTR]] ; AVX512-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]] ; AVX512-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] @@ -651,9 +646,9 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP18]], align 4, !alias.scope [[META8:![0-9]+]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[TMP16]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]] ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD6]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX512-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024 ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -687,16 +682,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX21]] ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]] ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]] +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META8]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD14]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]] -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]] +; AVX512-NEXT: [[WIDE_LOAD15:%.*]] 
= load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META15]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD15]], <8 x ptr> [[TMP27]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8 ; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512 -; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]] -; AVX512-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; AVX512-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]] +; AVX512-NEXT: br i1 [[TMP32]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] ; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] @@ -715,15 +710,15 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: store float [[TMP35]], ptr [[ARRAYIDX5]], align 4 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16 -; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]] -; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[SCEVGEP1]] +; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @test_gather_not_profitable_pr48429( ; FVW2-NEXT: entry: ; FVW2-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64 -; FVW2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]] +; FVW2-NEXT: [[SCEVGEP1:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[IDX_EXT]] ; FVW2-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0 ; FVW2-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] ; FVW2: for.body.lr.ph: @@ -742,17 +737,12 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6 ; FVW2-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8 ; FVW2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]] -; FVW2-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2 -; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4 -; FVW2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]] ; FVW2-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4 ; FVW2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]] -; FVW2-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]] -; FVW2-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]] ; FVW2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]] ; FVW2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]] ; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; FVW2-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]] +; FVW2-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[PTR]] ; FVW2-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]] ; FVW2-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] ; FVW2-NEXT: 
[[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] @@ -810,7 +800,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: store float [[TMP32]], ptr [[ARRAYIDX5]], align 4 ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16 -; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]] +; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[SCEVGEP1]] ; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size-needs-loop-guards.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size-needs-loop-guards.ll new file mode 100644 index 0000000000000..37f0d5418b427 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size-needs-loop-guards.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck %s + +define void @loop_guard_on_assume_needed_to_prove_dereferenceable_ptr_arg_noundef(i64 %x, ptr noalias noundef %A, ptr noalias %B, ptr noalias %C) nofree nosync { +; CHECK-LABEL: define void @loop_guard_on_assume_needed_to_prove_dereferenceable_ptr_arg_noundef( +; CHECK-SAME: i64 [[X:%.*]], ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[X]]) ] +; CHECK-NEXT: [[C_X:%.*]] = icmp uge i64 [[X]], 128 +; CHECK-NEXT: br i1 [[C_X]], label %[[LOOP_HEADER_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; 
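+; The dominating guard (icmp uge i64 %x, 128) is what makes this test
+; interesting: the assume only marks %A dereferenceable for %x bytes, while
+; the loop reads 32 i32 elements (128 bytes) from %A. Only with the guard
+; can the conditional load of %A be widened to the unpredicated
+; [[WIDE_LOAD1]] above. %A must also be known noundef, via the parameter
+; attribute in this test and via a "noundef" bundle in the following one.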
+entry: + call void @llvm.assume(i1 true) [ "align"(ptr %A, i64 4), "dereferenceable"(ptr %A, i64 %x) ] + %c.x = icmp uge i64 %x, 128 + br i1 %c.x, label %loop.header, label %exit + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr i32, ptr %B, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp eq i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %gep.a = getelementptr i32, ptr %A, i64 %iv + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 32 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @loop_guard_on_assume_needed_to_prove_dereferenceable(i64 %x, ptr noalias %A, ptr noalias %B, ptr noalias %C) nofree nosync { +; CHECK-LABEL: define void @loop_guard_on_assume_needed_to_prove_dereferenceable( +; CHECK-SAME: i64 [[X:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "noundef"(ptr [[A]]), "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 [[X]]) ] +; CHECK-NEXT: [[C_X:%.*]] = icmp uge i64 [[X]], 128 +; CHECK-NEXT: br i1 [[C_X]], label %[[LOOP_HEADER_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> 
[[TMP16]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + call void @llvm.assume(i1 true) [ "noundef"(ptr %A), "align"(ptr %A, i64 4), "dereferenceable"(ptr %A, i64 %x) ] + %c.x = icmp uge i64 %x, 128 + br i1 %c.x, label %loop.header, label %exit + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr i32, ptr %B, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp eq i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %gep.a = getelementptr i32, ptr %A, i64 %iv + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 32 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @loop_guard_on_trip_count_needed_to_prove_dereferenceable(i32 %x, ptr noalias dereferenceable(128) align 4 %A, ptr noalias %B, ptr noalias %C) { +; CHECK-LABEL: define void @loop_guard_on_trip_count_needed_to_prove_dereferenceable( +; CHECK-SAME: i32 [[X:%.*]], ptr noalias align 4 dereferenceable(128) [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[C_X:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[C_X]], label %[[PH:.*]], [[EXIT:label %.*]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[N:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 31) +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %c.x = icmp sgt i32 %x, 0 + br i1 %c.x, label %ph, label %exit + +ph: + %n = tail call i32 @llvm.smin.i32(i32 %x, i32 31) + %n.ext = zext i32 %n to i64 + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr i32, ptr %B, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp eq i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %gep.a = getelementptr i32, ptr %A, i64 %iv + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n.ext + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + + + +declare i32 @llvm.smin.i32(i32, i32) diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 7a54519c7cdf8..27f956144aabd 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1587,6 +1587,385 @@ exit: declare ptr @get_ptr() declare void @may_free() +define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD1]], <2 x i32> [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT1:.*]], label %[[SCALAR_PH1]] +; CHECK: [[SCALAR_PH1]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH1]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[SCALAR_PH]] ], [ [[L_B]], %[[EXIT]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT1]], label %[[EXIT]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_may_free( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement 
<2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP20]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT1:.*]], label %[[SCALAR_PH1]] +; CHECK: [[SCALAR_PH1]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH1]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[SCALAR_PH]] ], [ [[L_B]], %[[EXIT]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT1]], label %[[EXIT]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b 
= load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP40:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT1:.*]], label %[[SCALAR_PH1]] +; CHECK: [[SCALAR_PH1]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH1]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[SCALAR_PH]] ], [ [[L_B]], %[[EXIT]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT1]], label %[[EXIT]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i1 %pre) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i1 [[PRE:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br i1 [[PRE]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: store i32 0, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: store i32 0, ptr [[B]], align 4 +; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH1:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP17]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT1:.*]], label %[[SCALAR_PH1]] +; CHECK: [[SCALAR_PH1]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH1]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[SCALAR_PH]] ], [ [[L_B]], %[[EXIT]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT1]], label %[[EXIT]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: [[EXIT1]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), 
"dereferenceable"(ptr %a, i64 4000) ] + br i1 %pre, label %then, label %else + +then: + store i32 0, ptr %a + br label %loop.header + +else: + store i32 0, ptr %b + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1624,4 +2003,12 @@ declare void @may_free() ; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} ; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} ; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} +; CHECK: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]} +; CHECK: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]} +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]} +; CHECK: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]} +; CHECK: [[LOOP40]] = distinct !{[[LOOP40]], [[META1]], [[META2]]} +; CHECK: [[LOOP41]] = distinct !{[[LOOP41]], [[META2]], [[META1]]} +; CHECK: [[LOOP42]] = distinct !{[[LOOP42]], [[META1]], [[META2]]} +; CHECK: [[LOOP43]] = distinct !{[[LOOP43]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index a2eddad179216..8b95df4b96c48 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -63,7 +63,9 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 1ca5586942d7c..74ea22e0f8d6f 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -208,7 +208,8 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] @@ -276,7 +277,8 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] @@ -346,7 +348,8 @@ define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: 
[[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] @@ -417,7 +420,8 @@ define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll index 68bc8d0640a3f..6b1e411c3ca68 100644 --- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -208,7 +208,8 @@ define float @fminnum_1(ptr %src, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] @@ -276,7 +277,8 @@ define float @fminnum_2(ptr %src, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] diff --git a/llvm/test/Transforms/LoopVectorize/miniters.ll b/llvm/test/Transforms/LoopVectorize/miniters.ll index 0b4c002045186..6d06a03d0d018 100644 --- a/llvm/test/Transforms/LoopVectorize/miniters.ll +++ b/llvm/test/Transforms/LoopVectorize/miniters.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "vector.ph:" --version 5 ; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s 
; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s -check-prefix=UNROLL @@ -8,37 +9,89 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @a = common global [1000 x i32] zeroinitializer, align 16 ; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than VF * UF. -; CHECK-LABEL: foo( -; CHECK: %min.iters.check = icmp ult i64 %N, 4 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph -; UNROLL-LABEL: foo( -; UNROLL: %min.iters.check = icmp ult i64 %N, 8 -; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph - define void @foo(i64 %N) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[C:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_PREHEADER:.*]], [[EXIT:label %.*]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; +; UNROLL-LABEL: define void @foo( +; UNROLL-SAME: i64 [[N:%.*]]) { +; UNROLL-NEXT: [[ENTRY:.*:]] +; UNROLL-NEXT: [[C:%.*]] = icmp sgt i64 [[N]], 0 +; UNROLL-NEXT: br i1 [[C]], label %[[LOOP_PREHEADER:.*]], [[EXIT:label %.*]] +; UNROLL: [[LOOP_PREHEADER]]: +; UNROLL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; UNROLL-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; UNROLL: [[VECTOR_PH]]: +; entry: - %cmp.8 = icmp sgt i64 %N, 0 - br i1 %cmp.8, label %for.body.preheader, label %for.end - -for.body.preheader: ; preds = %entry - br label %for.body + %c = icmp sgt i64 %N, 0 + br i1 %c, label %loop, label %exit -for.body: ; preds = %for.body, %for.body.preheader - %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds [1000 x i32], ptr @b, i64 0, i64 %i.09 +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %arrayidx = getelementptr inbounds [1000 x i32], ptr @b, i64 0, i64 %iv %tmp = load i32, ptr %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds [1000 x i32], ptr @c, i64 0, i64 %i.09 + %arrayidx1 = getelementptr inbounds [1000 x i32], ptr @c, i64 0, i64 %iv %tmp1 = load i32, ptr %arrayidx1, align 4 %add = add nsw i32 %tmp1, %tmp - %arrayidx2 = getelementptr inbounds [1000 x i32], ptr @a, i64 0, i64 %i.09 + %arrayidx2 = getelementptr inbounds [1000 x i32], ptr @a, i64 0, i64 %iv store i32 %add, ptr %arrayidx2, align 4 - %inc = add nuw nsw i64 %i.09, 1 - %exitcond = icmp eq i64 %inc, %N - br i1 %exitcond, label %for.end.loopexit, label %for.body + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @min_iters_known_via_loop_guards_add(i32 %start, i32 %end, ptr %src) { +; CHECK-LABEL: define void @min_iters_known_via_loop_guards_add( +; CHECK-SAME: i32 [[START:%.*]], i32 [[END:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[END]], [[START]] +; CHECK-NEXT: [[PRE:%.*]] = icmp sgt i32 [[SUB]], 100 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE]]) +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[SUB]], 1 +; CHECK-NEXT: [[IV_START:%.*]] = zext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 101, [[IV_START]] +; CHECK-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; +; UNROLL-LABEL: define void 
@min_iters_known_via_loop_guards_add( +; UNROLL-SAME: i32 [[START:%.*]], i32 [[END:%.*]], ptr [[SRC:%.*]]) { +; UNROLL-NEXT: [[ENTRY:.*:]] +; UNROLL-NEXT: [[SUB:%.*]] = sub i32 [[END]], [[START]] +; UNROLL-NEXT: [[PRE:%.*]] = icmp sgt i32 [[SUB]], 100 +; UNROLL-NEXT: call void @llvm.assume(i1 [[PRE]]) +; UNROLL-NEXT: [[ADD_1:%.*]] = add i32 [[SUB]], 1 +; UNROLL-NEXT: [[IV_START:%.*]] = zext i32 [[ADD_1]] to i64 +; UNROLL-NEXT: [[TMP0:%.*]] = sub i64 101, [[IV_START]] +; UNROLL-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] +; UNROLL: [[VECTOR_PH]]: +; +entry: + %sub = sub i32 %end, %start + %pre = icmp sgt i32 %sub, 100 + call void @llvm.assume(i1 %pre) + %add.1 = add i32 %sub, 1 + %iv.start = zext i32 %add.1 to i64 + br label %loop -for.end.loopexit: ; preds = %for.body - br label %for.end +loop: + %iv = phi i64 [ %iv.start, %entry ], [ %iv.next, %loop ] + %gep = getelementptr inbounds i64, ptr %src, i64 %iv + store i64 %iv, ptr %gep + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop -for.end: ; preds = %for.end.loopexit, %entry +exit: ret void } diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 2747895f06a7b..3eac7678bbc95 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -18,11 +18,9 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) { ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: br i1 [[EC_1]], label %[[PH:.*]], label %[[LOOP_1]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_1]] ] ; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP_1]] ] -; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SRC_2:%.*]] = tail call noalias noundef dereferenceable_or_null(8) ptr @calloc(i64 1, i64 8) -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_2_LCSSA]], 1 ; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 @@ -106,27 +104,23 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-LABEL: define void @runtime_checks_ptr_inductions( ; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], i1 [[C:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[DST_11:%.*]] = ptrtoint ptr [[DST_1]] to i64 ; CHECK-NEXT: br label %[[LOOP_1:.*]] ; CHECK: [[LOOP_1]]: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST_1]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i32 @val() ; CHECK-NEXT: [[SEL_DST:%.*]] = select i1 [[C]], ptr [[DST_1]], ptr [[DST_2]] ; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1 ; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i32 [[CALL]], 0 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_HEADER_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_HEADER_PREHEADER]]: -; CHECK-NEXT: [[SEL_DST_LCSSA2:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] -; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], %[[LOOP_1]] ] +; CHECK-NEXT: [[SEL_DST_LCSSA1:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] ; 
CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SEL_DST_LCSSA:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] -; CHECK-NEXT: [[SEL_DST_LCSSA23:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA2]] to i64 +; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA1]] to i64 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVAR_LCSSA]], [[DST_11]] -; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SEL_DST_LCSSA23]] +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SEL_DST_LCSSA12]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2 ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -148,13 +142,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1023, %[[MIDDLE_BLOCK]] ], [ 1, %[[LOOP_2_HEADER_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[LOOP_2_HEADER_PREHEADER]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] ; CHECK: [[LOOP_2_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[DEC7:%.*]], %[[LOOP_2_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i32 [[IV]], 1024 ; CHECK-NEXT: br i1 [[EC_2]], label %[[EXIT:.*]], label %[[LOOP_2_LATCH]] ; CHECK: [[LOOP_2_LATCH]]: @@ -196,3 +190,288 @@ loop.2.latch: exit: ret void } + + +declare void @foo() +declare void @bar() + +define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprogress { +; CHECK-LABEL: define void @expand_diff_scev_unknown( +; CHECK-SAME: ptr [[DST:%.*]], i1 [[INVAR_C:%.*]], i32 [[STEP:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], %[[LOOP_1]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[STEP]], %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label 
%[[LOOP_1]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: [[INDVAR_LCSSA1:%.*]] = phi i32 [ [[INDVAR]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]] +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2 +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR_LCSSA1]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]] +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1) +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[UMIN]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMIN]] +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], 1 +; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[IV_1_LCSSA]], [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IV_1_LCSSA]], [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[IV_1_LCSSA]], %[[LOOP_2_PREHEADER]] ], [ [[IV_1_LCSSA]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2]], [[STEP]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV_2]] +; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[EC_2:%.*]] = icmp slt i32 [[IV_2_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_2]], label %[[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1 + +loop.1: + %iv.1 = phi i32 [ %step, %entry ], [ %iv.1.next, %loop.1 ] + call void @foo() + %iv.1.next = add i32 %iv.1, 1 + br i1 %invar.c, label %loop.2, label %loop.1 + +loop.2: + %iv.2 = phi i32 [ %iv.1, %loop.1 ], [ %iv.2.next, %loop.2 ] + %iv.2.next = add nsw i32 %iv.2, %step + %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv.2 + store i32 0, ptr %gep.dst + %ec.2 = icmp slt i32 
%iv.2.next, 0 + br i1 %ec.2, label %loop.2, label %exit + +exit: + ret void +} + +define void @expand_diff_neg_ptrtoint_expr(ptr %src, ptr %start) { +; CHECK-LABEL: define void @expand_diff_neg_ptrtoint_expr( +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT]], 32 +; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: [[PTR_IV_1_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1_NEXT]], %[[LOOP_1]] ] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[IV_NEXT_1:%.*]], %[[LOOP_2]] ], [ 1, %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 8 +; CHECK-NEXT: [[IV_NEXT_1]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i64 [[IV_NEXT_1]], 32 +; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_3_PREHEADER:.*]], label %[[LOOP_2]] +; CHECK: [[LOOP_3_PREHEADER]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[PTR_IV_2_NEXT]], %[[LOOP_2]] ] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[SRC2]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP5]], [[TMP0]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -16 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[OFFSET_IDX5]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP10]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], -2 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 1, %[[LOOP_3_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[TMP1]], %[[LOOP_3_PREHEADER]] ], [ [[TMP1]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_NEXT_2:%.*]], %[[LOOP_3]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[IV_2]], -1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP12]] +; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[PTR_IV_3_NEXT]] = getelementptr i8, ptr [[PTR_IV_3]], i64 8 +; CHECK-NEXT: store i64 [[L]], ptr [[PTR_IV_3]], align 8 +; CHECK-NEXT: [[IV_NEXT_2]] = add i64 [[IV_2]], 1 +; CHECK-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_2]], 0 +; CHECK-NEXT: br i1 [[EC_3]], label %[[EXIT]], label %[[LOOP_3]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1 + +loop.1: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.1 ] + %ptr.iv.1 = phi ptr [ %start, %entry ], [ %ptr.iv.1.next, %loop.1 ] + %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 8 + call void @foo() + %iv.next = add i64 %iv, 1 + %ec.1 = icmp eq i64 %iv.next, 32 + br i1 %ec.1, label %loop.2, label %loop.1 + +loop.2: + %iv.1 = phi i64 [ 1, %loop.1 ], [ %iv.next.1, %loop.2 ] + %ptr.iv.2 = phi ptr [ %ptr.iv.1.next, %loop.1 ], [ %ptr.iv.2.next, %loop.2 ] + call void @bar() + %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 8 + %iv.next.1 = add i64 %iv.1, 1 + %ec.2 = icmp eq i64 %iv.next.1, 32 + br i1 %ec.2, label %loop.3, label %loop.2 + +loop.3: + %iv.2 = phi i64 [ 1, %loop.2 ], [ %iv.next.2, %loop.3 ] + %ptr.iv.3 = phi ptr [ %ptr.iv.2.next, %loop.2 ], [ %ptr.iv.3.next, %loop.3 ] + %6 = add i64 %iv.2, -1 + %gep.src = getelementptr double, ptr %src, i64 %6 + %l = load i64, ptr %gep.src, align 8 + %ptr.iv.3.next = getelementptr i8, ptr %ptr.iv.3, i64 8 + store i64 %l, ptr %ptr.iv.3, align 8 + %iv.next.2 = add i64 %iv.2, 1 + %ec.3 = icmp eq i64 %iv.next.2, 0 + br i1 %ec.3, label %exit, label %loop.3 + +exit: + ret void +} + + +declare i1 @cond() + +define void @scev_exp_reuse_const_add(ptr %dst, ptr %src) { +; CHECK-LABEL: define void @scev_exp_reuse_const_add( +; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 2 +; CHECK-NEXT: [[C:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_2_PH:.*]], label %[[LOOP_1]] +; CHECK: [[LOOP_2_PH]]: +; CHECK-NEXT: [[PTR_IV_1_NEXT_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1_NEXT]], %[[LOOP_1]] ] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -2, [[SRC2]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR_IV_1_NEXT_LCSSA]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_IV_1_NEXT_LCSSA]], i64 80 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: 
[[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_IV_1_NEXT_LCSSA]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store <2 x i16> [[WIDE_LOAD]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 40 +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 40, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_2_PH]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[LOOP_2_PH]] ], [ [[PTR_IV_1_NEXT_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2]] ] +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV_2_NEXT]] +; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC_1]], align 2 +; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr i8, ptr [[PTR_IV_2]], i64 2 +; CHECK-NEXT: store i16 [[L]], ptr [[PTR_IV_2]], align 2 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], 40 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_2]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1 + +loop.1: + %ptr.iv.1 = phi ptr [ %dst, %entry ], [ %ptr.iv.1.next, %loop.1 ] + %ptr.iv.1.next = getelementptr i8, ptr %ptr.iv.1, i64 2 + %c = call i1 @cond() + br i1 %c, label %loop.2.ph, label %loop.1 + +loop.2.ph: + br label %loop.2 + +loop.2: + %iv.1 = phi i64 [ 0, %loop.2.ph ], [ %iv.2.next, %loop.2 ] + %ptr.iv.2 = phi ptr [ %ptr.iv.1.next, %loop.2.ph ], [ %ptr.iv.2.next, %loop.2 ] + %iv.2.next = add i64 %iv.1, 1 + %gep.src.1 = getelementptr i16, ptr %src, i64 %iv.2.next + %l = load i16, ptr %gep.src.1, align 2 + %ptr.iv.2.next = getelementptr i8, ptr %ptr.iv.2, i64 2 + store i16 %l, ptr %ptr.iv.2, align 2 + %ec = icmp eq i64 %iv.1, 40 + br i1 %ec, label %exit, label %loop.2 + +exit: + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare double @llvm.cos.f64(double) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index 22d9a5363bee6..0e9d3bdae5c6f 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -475,6 +475,105 @@ for.body: br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !12 } +declare i1 @cond() + +define void 
@test_scev_check_mul_add_expansion(ptr %out, ptr %in, i32 %len, i32 %d) { +; CHECK-LABEL: @test_scev_check_mul_add_expansion( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PRE_1:%.*]] = icmp samesign ugt i32 [[D:%.*]], 5 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[PRE_1]]) +; CHECK-NEXT: [[PRE_2:%.*]] = icmp ult i32 [[D]], 7 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[PRE_2]]) +; CHECK-NEXT: [[PRE_3:%.*]] = icmp slt i32 [[D]], [[LEN:%.*]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[PRE_3]]) +; CHECK-NEXT: [[SMAX3:%.*]] = call i32 @llvm.smax.i32(i32 [[LEN]], i32 7) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[SMAX3]], -6 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i32 [[LEN]], 10 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 12 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[LEN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[IN]], [[TMP4]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[N_VEC]], 6 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], 6 +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[OFFSET_IDX]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP6]] +; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr [[TMP7]], align 2, !alias.scope [[META37:![0-9]+]], !noalias [[META40:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[IN]], align 4, !alias.scope [[META40]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 6, [[ENTRY:%.*]] ], [ 6, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[IV]] to i64 +; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr i16, ptr [[OUT]], i64 [[TMP9]] +; CHECK-NEXT: store i16 0, ptr [[ARRAYIDX80]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: store i32 0, ptr [[IN]], align 4 +; CHECK-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +; FORCED_OPTSIZE-LABEL: @test_scev_check_mul_add_expansion( +; FORCED_OPTSIZE-NEXT: entry: +; FORCED_OPTSIZE-NEXT: [[PRE_1:%.*]] = icmp sgt i32 [[D:%.*]], 5 +; FORCED_OPTSIZE-NEXT: tail call void @llvm.assume(i1 
[[PRE_1]]) +; FORCED_OPTSIZE-NEXT: [[PRE_2:%.*]] = icmp samesign ule i32 [[D]], 6 +; FORCED_OPTSIZE-NEXT: tail call void @llvm.assume(i1 [[PRE_2]]) +; FORCED_OPTSIZE-NEXT: [[PRE_3:%.*]] = icmp slt i32 [[D]], [[LEN:%.*]] +; FORCED_OPTSIZE-NEXT: tail call void @llvm.assume(i1 [[PRE_3]]) +; FORCED_OPTSIZE-NEXT: br label [[LOOP:%.*]] +; FORCED_OPTSIZE: loop: +; FORCED_OPTSIZE-NEXT: [[IV:%.*]] = phi i32 [ 6, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; FORCED_OPTSIZE-NEXT: [[ARRAYIDX80:%.*]] = getelementptr i16, ptr [[OUT:%.*]], i32 [[IV]] +; FORCED_OPTSIZE-NEXT: store i16 0, ptr [[ARRAYIDX80]], align 2 +; FORCED_OPTSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; FORCED_OPTSIZE-NEXT: store i32 0, ptr [[IN:%.*]], align 4 +; FORCED_OPTSIZE-NEXT: [[CMP7_NOT:%.*]] = icmp sgt i32 [[LEN]], [[IV_NEXT]] +; FORCED_OPTSIZE-NEXT: br i1 [[CMP7_NOT]], label [[LOOP]], label [[EXIT:%.*]] +; FORCED_OPTSIZE: exit: +; FORCED_OPTSIZE-NEXT: ret void +; +entry: + %pre.1 = icmp sgt i32 %d, 5 + tail call void @llvm.assume(i1 %pre.1) + %pre.2 = icmp samesign ule i32 %d, 6 + tail call void @llvm.assume(i1 %pre.2) + %pre.3 = icmp slt i32 %d, %len + tail call void @llvm.assume(i1 %pre.3) + br label %loop + +loop: + %iv = phi i32 [ 6, %entry ], [ %iv.next, %loop ] + %arrayidx80 = getelementptr i16, ptr %out, i32 %iv + store i16 0, ptr %arrayidx80, align 2 + %iv.next = add nuw nsw i32 %iv, 1 + store i32 0, ptr %in, align 4 + %cmp7.not = icmp sgt i32 %len, %iv.next + br i1 %cmp7.not, label %loop, label %exit + +exit: + ret void +} + ; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}}) !llvm.module.flags = !{!0, !1} diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll index b640c1911cb0d..99a86c94b23dd 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll @@ -434,3 +434,62 @@ loop: exit: ret void } + + +define void @remove_diff_checks_via_guards(i32 %x, i32 %y, ptr %A) { +; CHECK-LABEL: define void @remove_diff_checks_via_guards( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[OFFSET:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[OFFSET]], 0 +; CHECK-NEXT: br i1 [[CMP]], [[EXIT:label %.*]], label %[[LOOP_PREHEADER:.*]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[SMAX2]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[SMAX]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[SMAX]], 4294967295 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[SMAX]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[TMP11]], [[OFFSET]] +; CHECK-NEXT: [[TMP13:%.*]] 
= icmp ugt i64 [[SMAX]], 4294967295 +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP9]], [[TMP14]] +; CHECK-NEXT: br i1 [[TMP15]], [[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[OFFSET]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i64 [[TMP16]], 2 +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP17]], 16 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH1:label %.*]] +; +entry: + %offset = sub i32 %x, %y + %cmp = icmp sge i32 %offset, 0 + br i1 %cmp, label %exit, label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.ext = sext i32 %iv to i64 + %gep.A = getelementptr i32, ptr %A, i64 %iv.ext + %l = load i32, ptr %gep.A, align 1 + %iv.offset = add i32 %iv, %offset + %iv.offset.ext = sext i32 %iv.offset to i64 + %gep.A.offset = getelementptr i32, ptr %A, i64 %iv.offset.ext + store i32 %l, ptr %gep.A.offset, align 1 + %iv.next = add i32 %iv, 1 + %ec = icmp sgt i32 %iv, %x + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll new file mode 100644 index 0000000000000..194d16e41f921 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF4IC2 %s +; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF8IC1 %s + +; Test case from https://github.com/llvm/llvm-project/issues/153946. +; %shr and thus %early.cond will be poison from %iv == 4 onwards. +; Make sure the mask being poison does not propagate across lanes in the +; OR reduction when computing the early exit condition in the vector loop. 
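+;
+; A minimal illustrative sketch (not part of the test input; @any_lane_set is
+; a hypothetical name) of the freeze-before-reduction pattern the vectorizer
+; is expected to emit. Without the freeze, a single poison lane would make
+; the reduced i1 itself poison, and branching on it would be immediate UB;
+; freeze turns each poison lane into an arbitrary but fixed i1 first.
+;
+;   define i1 @any_lane_set(<4 x i1> %mask) {
+;     %frozen = freeze <4 x i1> %mask   ; poison lanes become arbitrary i1 values
+;     %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %frozen)
+;     ret i1 %any                       ; well-defined even if %mask had poison lanes
+;   }
+;   declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)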
+define noundef i32 @f(i32 noundef %g) {
+; VF4IC2-LABEL: define noundef i32 @f(
+; VF4IC2-SAME: i32 noundef [[G:%.*]]) {
+; VF4IC2-NEXT:  [[ENTRY:.*]]:
+; VF4IC2-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF4IC2:       [[VECTOR_PH]]:
+; VF4IC2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[G]], i64 0
+; VF4IC2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; VF4IC2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4IC2:       [[VECTOR_BODY]]:
+; VF4IC2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC2-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4IC2-NEXT:    [[TMP0:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], splat (i32 3)
+; VF4IC2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[BROADCAST_SPLAT]], [[TMP0]]
+; VF4IC2-NEXT:    [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
+; VF4IC2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; VF4IC2-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP5]]
+; VF4IC2-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; VF4IC2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; VF4IC2-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP7]]
+; VF4IC2-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; VF4IC2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4IC2:       [[MIDDLE_SPLIT]]:
+; VF4IC2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; VF4IC2-NEXT:    br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF4IC2:       [[MIDDLE_BLOCK]]:
+; VF4IC2-NEXT:    br i1 true, label %[[RETURN:.*]], label %[[SCALAR_PH]]
+; VF4IC2:       [[VECTOR_EARLY_EXIT]]:
+; VF4IC2-NEXT:    [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
+; VF4IC2-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; VF4IC2-NEXT:    [[TMP10:%.*]] = add i32 [[INDEX]], [[TMP15]]
+; VF4IC2-NEXT:    br label %[[RETURN]]
+; VF4IC2:       [[SCALAR_PH]]:
+; VF4IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4IC2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; VF4IC2:       [[LOOP_HEADER]]:
+; VF4IC2-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF4IC2-NEXT:    [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3
+; VF4IC2-NEXT:    [[SHR:%.*]] = ashr i32 [[G]], [[MUL]]
+; VF4IC2-NEXT:    [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0
+; VF4IC2-NEXT:    br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]]
+; VF4IC2:       [[LOOP_LATCH]]:
+; VF4IC2-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; VF4IC2-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 8
+; VF4IC2-NEXT:    br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4IC2:       [[RETURN]]:
+; VF4IC2-NEXT:    [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VECTOR_EARLY_EXIT]] ]
+; VF4IC2-NEXT:    ret i32 [[RES]]
+;
+; VF8IC1-LABEL: define noundef i32 @f(
+; VF8IC1-SAME: i32 noundef [[G:%.*]]) {
+; VF8IC1-NEXT:  [[ENTRY:.*]]:
+; VF8IC1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; VF8IC1:       [[VECTOR_PH]]:
+; VF8IC1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[G]], i64 0
+; VF8IC1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
+; VF8IC1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8IC1:       [[VECTOR_BODY]]:
+; VF8IC1-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8IC1-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8IC1-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 3)
+; VF8IC1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[BROADCAST_SPLAT]], [[TMP0]]
+; VF8IC1-NEXT:    [[TMP2:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
+; VF8IC1-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; VF8IC1-NEXT:    [[TMP8:%.*]] = freeze <8 x i1> [[TMP2]]
+; VF8IC1-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP8]])
+; VF8IC1-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8)
+; VF8IC1-NEXT:    br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8IC1:       [[MIDDLE_SPLIT]]:
+; VF8IC1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; VF8IC1-NEXT:    br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
+; VF8IC1:       [[MIDDLE_BLOCK]]:
+; VF8IC1-NEXT:    br i1 true, label %[[RETURN:.*]], label %[[SCALAR_PH]]
+; VF8IC1:       [[VECTOR_EARLY_EXIT]]:
+; VF8IC1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true)
+; VF8IC1-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; VF8IC1-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP6]]
+; VF8IC1-NEXT:    br label %[[RETURN]]
+; VF8IC1:       [[SCALAR_PH]]:
+; VF8IC1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8IC1-NEXT:    br label %[[LOOP_HEADER:.*]]
+; VF8IC1:       [[LOOP_HEADER]]:
+; VF8IC1-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; VF8IC1-NEXT:    [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3
+; VF8IC1-NEXT:    [[SHR:%.*]] = ashr i32 [[G]], [[MUL]]
+; VF8IC1-NEXT:    [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0
+; VF8IC1-NEXT:    br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]]
+; VF8IC1:       [[LOOP_LATCH]]:
+; VF8IC1-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; VF8IC1-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 8
+; VF8IC1-NEXT:    br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8IC1:       [[RETURN]]:
+; VF8IC1-NEXT:    [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[TMP9]], %[[VECTOR_EARLY_EXIT]] ]
+; VF8IC1-NEXT:    ret i32 [[RES]]
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %mul = shl nuw nsw i32 %iv, 3
+  %shr = ashr i32 %g, %mul
+  %early.cond = icmp eq i32 %shr, 0
+  br i1 %early.cond, label %loop.latch, label %return
+
+loop.latch:
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp eq i32 %iv.next, 8
+  br i1 %ec, label %return, label %loop.header
+
+return:
+  %res = phi i32 [ %shr, %loop.latch ], [ %iv, %loop.header ]
+  ret i32 %res
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
index e79995f673fb4..d949c77bb7ece 100644
--- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
@@ -7,21 +7,49 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    call void @llvm.assume(i1 
true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] +; CHECK-NEXT: br label [[LOOP_END]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ -1, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP9]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -95,8 +123,94 @@ loop.end: ret i64 %retval } -define i64 @early_exit_alignment_and_deref_known_via_assumption(ptr noalias %p1, ptr noalias %p2, i64 %n) nofree nosync { -; CHECK-LABEL: define i64 
@early_exit_alignment_and_deref_known_via_assumption( +define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero(ptr noalias %p1, ptr noalias %p2, i64 %n) nofree nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero( +; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 [[N]]) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 [[N]]) ] +; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[N]], 0 +; CHECK-NEXT: br i1 [[C]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_END:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] +; CHECK-NEXT: br label [[LOOP_END_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END_LOOPEXIT]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: loop.end.loopexit: +; CHECK-NEXT: [[RETVAL_PH:%.*]] = phi i64 [ -1, [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: br label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RETVAL_PH]], [[LOOP_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %p1, i64 4), "dereferenceable"(ptr %p1, i64 %n) ] + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 4), "dereferenceable"(ptr %p2, i64 %n) ] + %c = icmp ne i64 %n, 0 + br i1 %c, label %loop, label %loop.end + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, %n + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %entry ], [ %index, %loop ], [ -1, %loop.inc ] + ret i64 %retval +} + +define i64 @early_exit_alignment_and_deref_known_via_assumption_n_may_be_zero(ptr noalias %p1, ptr noalias %p2, i64 %n) nofree nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_n_may_be_zero( ; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 [[N]]) ] @@ -141,3 +255,419 @@ loop.end: %retval = phi i64 [ %index, %loop ], [ -1, %loop.inc ] ret i64 %retval } + +define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16(ptr noalias %p1, ptr noalias %p2, i32 %n) nofree nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16( +; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[N_SCALED:%.*]] = shl nuw nsw i64 [[N_EXT]], 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 [[N_SCALED]]) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 [[N_SCALED]]) ] +; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[N_EXT]], 0 +; CHECK-NEXT: br i1 [[C]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_END:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END_LOOPEXIT]] +; CHECK: loop.end.loopexit: +; CHECK-NEXT: [[RETVAL_PH:%.*]] = phi i64 [ -1, [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: br label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RETVAL_PH]], [[LOOP_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %n.ext = zext i32 %n to i64 + %n.scaled = shl nuw nsw i64 %n.ext, 1 + call void @llvm.assume(i1 true) [ "align"(ptr %p1, i64 4), "dereferenceable"(ptr %p1, i64 %n.scaled) ] + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 4), "dereferenceable"(ptr %p2, i64 %n.scaled) ] + %c = icmp ne i64 %n.ext, 0 + br i1 %c, label %loop, label %loop.end + +loop: + %index = phi i64 [ 0, %entry ], [ %index.next, %loop.inc ] + %gep.p1 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %gep.p1, align 1 + %gep.p2 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %gep.p2, align 1 + %c.0 = icmp eq i8 %ld1, %ld2 + br i1 %c.0, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, %n.ext + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %entry ], [ %index, %loop ], [ -1, %loop.inc ] + ret i64 %retval +} + +define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_ptr_iv(ptr %A, i32 noundef %n) nofree nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_ptr_iv( +; CHECK-SAME: ptr [[A:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 2) ] +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[N_EXT]], 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 [[MUL]]) ] +; CHECK-NEXT: [[A_END:%.*]] = getelementptr i8, ptr [[A]], i64 [[MUL]] +; CHECK-NEXT: [[PRE:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[PRE]], label [[EXIT:%.*]], label [[LOOP_HEADER_PREHEADER:%.*]] +; CHECK: loop.header.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[MUL]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[IV_NEXT1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER1]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.split:
+; CHECK-NEXT:    br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_LATCH1:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       vector.early.exit:
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT:    br label [[EXIT_LOOPEXIT]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[IV1:%.*]] = phi ptr [ [[IV_NEXT1]], [[LOOP_LATCH1]] ], [ [[A]], [[LOOP_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[IV1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[IV]], align 2
+; CHECK-NEXT:    [[C_0:%.*]] = icmp eq i16 [[L]], 0
+; CHECK-NEXT:    br i1 [[C_0]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[IV]], i64 2
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq ptr [[IV_NEXT]], [[A_END]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[P_PH:%.*]] = phi ptr [ [[A_END]], [[LOOP_LATCH]] ], [ [[IV]], [[LOOP_HEADER]] ], [ [[A_END]], [[LOOP_LATCH1]] ], [ [[TMP13]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[P_PH]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RES:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %A, i64 2) ]
+  %n.ext = zext i32 %n to i64
+  %mul = shl nuw nsw i64 %n.ext, 1
+  call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 %mul) ]
+  %A.end = getelementptr i8, ptr %A, i64 %mul
+  %pre = icmp eq i32 %n, 0
+  br i1 %pre, label %exit, label %loop.header
+
+loop.header:
+  %iv = phi ptr [ %iv.next, %loop.latch ], [ %A, %entry ]
+  %l = load i16, ptr %iv, align 2
+  %c.0 = icmp eq i16 %l, 0
+  br i1 %c.0, label %exit, label %loop.latch
+
+loop.latch:
+  %iv.next = getelementptr inbounds nuw i8, ptr %iv, i64 2
+  %ec = icmp eq ptr %iv.next, %A.end
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  %p = phi ptr [ %A, %entry ], [ %iv, %loop.header ], [ %A.end, %loop.latch ]
+  %res = ptrtoint ptr %p to i64
+  ret i64 %res
+}
+
+; The existing assumptions are strong enough to vectorize this.
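+; Illustrative sketch (not part of the test input; shown here as a single
+; assume with combined bundles, whereas the test uses separate calls): the
+; key facts are carried by operand bundles with a *runtime* size, e.g.
+;
+;   %distance = sub i64 %last.i64, %first.i64
+;   call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2),
+;                                     "dereferenceable"(ptr %first, i64 %distance) ]
+;
+; which guarantees every aligned i16 load in [%first, %last) is safe to
+; speculate, so the early-exit loop below needs no runtime pointer checks.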
+define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %first, ptr align 2 %last) nofree nosync { +; CHECK-LABEL: define ptr @find_deref_pointer_distance_align_attribute_argument( +; CHECK-SAME: ptr align 2 [[FIRST:%.*]], ptr align 2 [[LAST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ] +; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST_I64:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[DISTANCE:%.*]] = sub i64 [[LAST_I64]], [[FIRST_I64]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST]], i64 [[DISTANCE]]) ] +; CHECK-NEXT: [[C_0:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[C_0]], label [[EXIT:%.*]], label [[LOOP_HEADER_PREHEADER:%.*]] +; CHECK: loop.header.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST_I64]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP15]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[FIRST]], [[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: 
[[L:%.*]] = load i16, ptr [[IV]], align 2
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i16 [[L]], 1
+; CHECK-NEXT:    br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[IV]], i64 2
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq ptr [[IV_NEXT]], [[LAST]]
+; CHECK-NEXT:    br i1 [[C_2]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], [[LOOP_LATCH]] ], [ [[IV]], [[LOOP_HEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[FIRST_ADDR_0_LCSSA_I:%.*]] = phi ptr [ [[FIRST]], [[ENTRY:%.*]] ], [ [[FIRST_ADDR_0_LCSSA_I_PH]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret ptr [[FIRST_ADDR_0_LCSSA_I]]
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ]
+  call void @llvm.assume(i1 true) [ "align"(ptr %last, i64 2) ]
+  %last.i64 = ptrtoint ptr %last to i64
+  %first.i64 = ptrtoint ptr %first to i64
+  %distance = sub i64 %last.i64, %first.i64
+  call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %first, i64 %distance) ]
+  %c.0 = icmp eq ptr %first, %last
+  br i1 %c.0, label %exit, label %loop.header
+
+loop.header:
+  %iv = phi ptr [ %first, %entry ], [ %iv.next, %loop.latch ]
+  %l = load i16, ptr %iv, align 2
+  %c.1 = icmp eq i16 %l, 1
+  br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+  %iv.next = getelementptr inbounds nuw i8, ptr %iv, i64 2
+  %c.2 = icmp eq ptr %iv.next, %last
+  br i1 %c.2, label %exit, label %loop.header
+
+exit:
+  %first.addr.0.lcssa.i = phi ptr [ %first, %entry ], [ %iv, %loop.header ], [ %iv.next, %loop.latch ]
+  ret ptr %first.addr.0.lcssa.i
+}
+
+; The existing assumptions are strong enough to vectorize this.
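+; A sketch of the difference from the previous test (hypothetical @find, not
+; part of the test input): there the pointers carry `align 2` parameter
+; attributes, while here alignment is known only from "align" assume bundles
+; on plain ptr arguments:
+;
+;   define ptr @find(ptr %first, ptr %last) {   ; no align attributes
+;     call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ]
+;     ...
+;
+; Both forms must be equally usable when proving the i16 loads are aligned.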
+define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last) nofree nosync { +; CHECK-LABEL: define ptr @find_deref_pointer_distance_align_assumption( +; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST_I64:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[DISTANCE:%.*]] = sub i64 [[LAST_I64]], [[FIRST_I64]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST]], i64 [[DISTANCE]]) ] +; CHECK-NEXT: [[C_0:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] +; CHECK-NEXT: br i1 [[C_0]], label [[EXIT:%.*]], label [[LOOP_HEADER_PREHEADER:%.*]] +; CHECK: loop.header.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST_I64]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP15]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[FIRST]], [[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[IV]], align 2 +; 
CHECK-NEXT: [[C_1:%.*]] = icmp eq i16 [[L]], 1 +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[IV]], i64 2 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[IV_NEXT]], [[LAST]] +; CHECK-NEXT: br i1 [[C_2]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], [[LOOP_LATCH]] ], [ [[IV]], [[LOOP_HEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I:%.*]] = phi ptr [ [[FIRST]], [[ENTRY:%.*]] ], [ [[FIRST_ADDR_0_LCSSA_I_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret ptr [[FIRST_ADDR_0_LCSSA_I]] +; +entry: + %last.i64 = ptrtoint ptr %last to i64 + %first.i64 = ptrtoint ptr %first to i64 + %distance = sub i64 %last.i64, %first.i64 + call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %last, i64 2) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %first, i64 %distance) ] + %c.0 = icmp eq ptr %first, %last + br i1 %c.0, label %exit, label %loop.header + +loop.header: + %iv = phi ptr [ %first, %entry ], [ %iv.next, %loop.latch ] + %l = load i16, ptr %iv, align 2 + %c.1 = icmp eq i16 %l, 1 + br i1 %c.1, label %exit, label %loop.latch + +loop.latch: + %iv.next = getelementptr inbounds nuw i8, ptr %iv, i64 2 + %c.2 = icmp eq ptr %iv.next, %last + br i1 %c.2, label %exit, label %loop.header + +exit: + %first.addr.0.lcssa.i = phi ptr [ %first, %entry ], [ %iv, %loop.header ], [ %iv.next, %loop.latch ] + ret ptr %first.addr.0.lcssa.i +} + +define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context(ptr noalias %p1, ptr noalias %p2) nosync { +; CHECK-LABEL: define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_size_nofree_via_context( +; CHECK-SAME: ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P1]], i64 4), "dereferenceable"(ptr [[P1]], i64 1024) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] +; CHECK-NEXT: br label [[LOOP_END]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %p1, i64 4), "dereferenceable"(ptr %p1, i64 1024) ] + call void @llvm.assume(i1 true) [ "align"(ptr %p2, i64 4), "dereferenceable"(ptr %p2, i64 1024) ] + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 1024 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ -1, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll index de8a3c5a8eaf2..9bd26841146fb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll @@ -23,7 +23,8 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; VF4IC4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 10) ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; VF4IC4-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] +; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; VF4IC4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll new file mode 100644 index 0000000000000..aeb2ad5df46fa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S %s | FileCheck %s + + +; FIXME: currently the live-outs are not handled correctly. +; Test for https://github.com/llvm/llvm-project/issues/154967. +define i8 @iv_used_in_exit_with_math(i8 noundef %g) { +; CHECK-LABEL: define i8 @iv_used_in_exit_with_math( +; CHECK-SAME: i8 noundef [[G:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[S:%.*]] = shl nuw i8 1, [[IV]] +; CHECK-NEXT: [[A:%.*]] = and i8 [[S]], [[G]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[RETURN:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 4 +; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[RES:%.*]] = add i8 [[RES_IV1]], [[RES_IV2]] +; CHECK-NEXT: ret i8 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop.latch ] + %s = shl nuw i8 1, %iv + %a = and i8 %s, %g + %c = icmp eq i8 %a, 0 + br i1 %c, label %loop.latch, label %return + +loop.latch: + %iv.next = add nuw nsw i8 %iv, 1 + %ec = icmp eq i8 %iv.next, 4 + br i1 %ec, label %return, label %loop.header + +return: + %res.iv1 = phi i8 [ 32, %loop.latch ], [ %iv, %loop.header ] + %res.iv2 = phi i8 [ 0, %loop.latch ], [ %iv, %loop.header ] + %res = add i8 %res.iv1, %res.iv2 + ret i8 %res +} + +define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) { +; CHECK-LABEL: define i32 @iv_used_in_exit_with_loads( +; CHECK-SAME: ptr align 4 dereferenceable(128) [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[RETURN:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 32 +; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES_IV1]], [[RES_IV2]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 
0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %src, i32 %iv + %l = load i32, ptr %gep + %c = icmp eq i32 %l, 0 + br i1 %c, label %loop.latch, label %return + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 32 + br i1 %ec, label %return, label %loop.header + +return: + %res.iv1 = phi i32 [ 32, %loop.latch ], [ %iv, %loop.header ] + %res.iv2 = phi i32 [ 0, %loop.latch ], [ %iv, %loop.header ] + %res = add i32 %res.iv1, %res.iv2 + ret i32 %res +} diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index 0f99ed576f1fe..01f65546e3534 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -18,7 +18,8 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD3]], splat (i32 10) ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; VF4IC4-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP8]] +; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -95,7 +96,8 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; VF4IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP13]] +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -170,7 +172,8 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; VF4IC4-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], splat (i8 72) ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; VF4IC4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP17]] +; VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) ; VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; VF4IC4-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] ; VF4IC4-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -242,7 +245,8 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; VF4IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP13]] +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -321,7 +325,8 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; VF4IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP13]] +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -407,7 +412,8 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; VF4IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP13]] +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -497,7 +503,8 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; VF4IC4-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD14]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE14]], [[REVERSE15]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]]) +; VF4IC4-NEXT: [[TMP13:%.*]] = freeze <4 x i1> [[TMP21]] +; VF4IC4-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 ; VF4IC4-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; VF4IC4-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -576,7 +583,8 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 ; VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; VF4IC4-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP13]] +; VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) ; VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; VF4IC4-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; VF4IC4-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -659,7 +667,8 @@ define i8 @same_exit_block_reverse_use_loaded_value() { ; VF4IC4-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD13]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; VF4IC4-NEXT: [[TMP21:%.*]] = icmp ne <4 x i8> [[REVERSE6]], [[REVERSE14]] ; VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 4 -; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP21]]) +; VF4IC4-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP21]] +; VF4IC4-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) ; VF4IC4-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 ; VF4IC4-NEXT: [[TMP27:%.*]] = or i1 [[TMP25]], [[TMP26]] ; VF4IC4-NEXT: br i1 [[TMP27]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 2069570d4a10f..aa7136b71fae8 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -25,7 +25,8 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -102,7 +103,8 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -200,7 +202,8 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) +; CHECK-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]]) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -283,7 +286,8 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP0]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -334,6 +338,345 @@ early.exit: for.end: ret i32 0 } + +define void @inner_loop_trip_count_depends_on_outer_iv(ptr align 8 dereferenceable(1792) %this, ptr %dst) { +; CHECK-LABEL: define void @inner_loop_trip_count_depends_on_outer_iv( +; CHECK-SAME: ptr align 8 dereferenceable(1792) [[THIS:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[THIS]], i64 1000 +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i64 [[OUTER_IV]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[THEN:%.*]], label [[INNER_HEADER_PREHEADER:%.*]] +; CHECK: inner.header.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[OUTER_IV]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[OUTER_IV]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[OUTER_IV]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr ptr, ptr [[GEP_SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x ptr> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP3]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[OUTER_IV]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[THEN_LOOPEXIT:%.*]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[INNER_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label [[INNER_HEADER:%.*]] +; CHECK: inner.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[INNER_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr ptr, ptr [[GEP_SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[GEP_IV]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: br i1 [[C_2]], label [[THEN_LOOPEXIT]], label [[INNER_LATCH]] +; CHECK: inner.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[OUTER_IV]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[OUTER_LATCH_LOOPEXIT]], label [[INNER_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: then.loopexit: +; CHECK-NEXT: br label [[THEN]] +; CHECK: then: +; CHECK-NEXT: store i32 0, ptr [[DST]], align 4 +; 
CHECK-NEXT: br label [[OUTER_LATCH]] +; CHECK: outer.latch.loopexit: +; CHECK-NEXT: br label [[OUTER_LATCH]] +; CHECK: outer.latch: +; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_EC:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[OUTER_EC]], label [[EXIT:%.*]], label [[OUTER_HEADER]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %gep.src = getelementptr i8, ptr %this, i64 1000 + br label %outer.header + +outer.header: + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ] + %c.1 = icmp eq i64 %outer.iv, 0 + br i1 %c.1, label %then, label %inner.header + +inner.header: + %iv = phi i64 [ %iv.next, %inner.latch ], [ 0, %outer.header ] + %gep.iv = getelementptr ptr, ptr %gep.src, i64 %iv + %l = load ptr, ptr %gep.iv, align 8 + %c.2 = icmp eq ptr %l, null + br i1 %c.2, label %then, label %inner.latch + +inner.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %outer.iv + br i1 %ec, label %outer.latch, label %inner.header + +then: + store i32 0, ptr %dst, align 4 + br label %outer.latch + +outer.latch: + %outer.iv.next = add i64 %outer.iv, 1 + %outer.ec = icmp eq i64 %outer.iv.next, 100 + br i1 %outer.ec, label %exit, label %outer.header + +exit: + ret void +} + +define i64 @loop_guard_needed_to_prove_dereferenceable(i32 %x, i1 %cmp2) { +; CHECK-LABEL: define i64 @loop_guard_needed_to_prove_dereferenceable( +; CHECK-SAME: i32 [[X:%.*]], i1 [[CMP2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca [32 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[A]], i64 128) +; CHECK-NEXT: [[C_X:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[C_X]], label [[PH:%.*]], label [[EXIT:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[N:%.*]] = tail call i32 @llvm.smin.i32(i32 [[X]], i32 31) +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [32 x i32], ptr [[A]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: 
[[TMP8:%.*]] = add i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr [32 x i32], ptr [[A]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX42]], align 4 +; CHECK-NEXT: [[CMP43:%.*]] = icmp eq i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[CMP43]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ -1, [[LOOP_LATCH]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %A = alloca [32 x i32], align 4 + call void @init_mem(ptr %A, i64 128) + %c.x = icmp sgt i32 %x, 0 + br i1 %c.x, label %ph, label %exit + +ph: + %n = tail call i32 @llvm.smin.i32(i32 %x, i32 31) + %n.ext = zext i32 %n to i64 + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop.latch ] + %arrayidx42 = getelementptr [32 x i32], ptr %A, i64 0, i64 %iv + %0 = load i32, ptr %arrayidx42, align 4 + %cmp43 = icmp eq i32 %0, 0 + br i1 %cmp43, label %exit, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n.ext + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i64 [ 0, %entry ], [ -1, %loop.latch ], [ %iv, %loop.header ] + ret i64 %res +} + +declare i32 @llvm.smin.i32(i32, i32) + +@A = external global [100 x {i32, i8} ] + +define ptr @btc_and_max_btc_require_predicates(ptr noalias %start, i64 %offset) { +; CHECK-LABEL: define ptr @btc_and_max_btc_require_predicates( +; CHECK-SAME: ptr noalias [[START:%.*]], i64 [[OFFSET:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END:%.*]] = getelementptr i32, ptr [[START]], i64 [[OFFSET]] +; CHECK-NEXT: [[PRE_1:%.*]] = icmp ult i64 [[OFFSET]], 100 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_1]]) +; CHECK-NEXT: [[PRE_2:%.*]] = icmp ugt i64 [[OFFSET]], 1 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_2]]) +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ @A, [[ENTRY:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[START]], [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[IV_1]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_2_NEXT]] = getelementptr i8, ptr [[IV_2]], i64 40 +; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr i8, ptr [[IV_1]], i64 40 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[IV_2]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[IV_1]], [[LOOP_HEADER]] ], [ [[IV_2]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret ptr [[RES]] +; +entry: + %end = getelementptr i32, ptr %start, i64 
%offset + %pre.1 = icmp ult i64 %offset, 100 + call void @llvm.assume(i1 %pre.1) + %pre.2 = icmp ugt i64 %offset, 1 + call void @llvm.assume(i1 %pre.2) + br label %loop.header + +loop.header: + %iv.1 = phi ptr [ @A, %entry ], [ %iv.1.next, %loop.latch ] + %iv.2 = phi ptr [ %start, %entry ], [ %iv.2.next, %loop.latch ] + %l = load i32, ptr %iv.1, align 4 + %c = icmp eq i32 %l, 0 + br i1 %c, label %loop.latch, label %exit + +loop.latch: + %iv.2.next = getelementptr i8, ptr %iv.2, i64 40 + %iv.1.next = getelementptr i8, ptr %iv.1, i64 40 + %ec = icmp eq ptr %iv.2, %end + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi ptr [ %iv.1, %loop.header ], [ %iv.2, %loop.latch ] + ret ptr %res +} + +define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr dereferenceable(1024) %src) { +; CHECK-LABEL: define i64 @loop_guards_needed_to_prove_deref_multiple( +; CHECK-SAME: i32 [[X:%.*]], i1 [[C:%.*]], ptr dereferenceable(1024) [[SRC:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_AND:%.*]] = and i32 [[X]], -2 +; CHECK-NEXT: [[PRE_0:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[PRE_0]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], i32 [[X_AND]], i32 0 +; CHECK-NEXT: [[PRE_1:%.*]] = icmp ugt i32 [[SEL]], 1024 +; CHECK-NEXT: br i1 [[PRE_1]], label [[EXIT]], label [[PH:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[PRE_2:%.*]] = icmp ne i32 [[SEL]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_2]]) +; CHECK-NEXT: [[N:%.*]] = add i32 [[SEL]], -1 +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SEL]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[LOOP_LATCH:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[IV_NEXT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], 
[[LOOP_LATCH]] ], [ 0, [[PH]] ] +; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT1:%.*]], [[LOOP_LATCH1:%.*]] ], [ [[IV]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I]], align 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT]], label [[LOOP_LATCH1]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV1]], [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH1]] ], [ 0, [[LOOP_LATCH]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %x.and = and i32 %x, -2 + %pre.0 = icmp eq i32 %x, 0 + br i1 %pre.0, label %then, label %exit + +then: + %sel = select i1 %c, i32 %x.and, i32 0 + %pre.1 = icmp ugt i32 %sel, 1024 + br i1 %pre.1, label %exit, label %ph + +ph: + %pre.2 = icmp ne i32 %sel, 0 + call void @llvm.assume(i1 %pre.2) + %n = add i32 %sel, -1 + %n.ext = zext i32 %n to i64 + br label %loop.header + +loop.header: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %ph ] + %gep.src.i = getelementptr i8, ptr %src, i64 %iv + %l = load i8, ptr %gep.src.i, align 1 + %c.1 = icmp eq i8 %l, 0 + br i1 %c.1, label %exit, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n.ext + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i64 [ -1, %entry ], [ -2, %then ], [ 0, %loop.latch ], [ %iv, %loop.header ] + ret i64 %res +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -345,4 +688,10 @@ for.end: ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index a21666a31b6a2..6050119b67ff4 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -24,7 +24,8 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -104,7 +105,8 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -187,7 +189,8 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 3) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i128 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i128 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -272,7 +275,8 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -360,7 +364,8 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() { ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD6]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX1]], 4 -; 
CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 64 ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -442,7 +447,8 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]]) +; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -518,7 +524,8 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -600,7 +607,8 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -680,7 +688,8 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -760,7 +769,8 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], 
align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -837,7 +847,8 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -916,7 +927,8 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -993,7 +1005,8 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] @@ -1069,7 +1082,8 @@ define ptr @same_exit_block_post_inc_use1_ivptr() { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], splat (i8 72) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP15]]) +; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] @@ -1143,7 +1157,8 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: 
[[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -1224,7 +1239,8 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] @@ -1311,7 +1327,8 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] @@ -1395,7 +1412,8 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] @@ -1480,7 +1498,8 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP38:![0-9]+]] @@ -1567,7 +1586,8 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] @@ -1656,7 +1676,8 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) { ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP19]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 ; CHECK-NEXT: [[TMP22:%.*]] = or i1 [[TMP20]], [[TMP21]] ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] @@ -1748,7 +1769,8 @@ define i64 @loop_contains_safe_call() { ; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] @@ -1824,7 +1846,8 @@ define i64 @loop_contains_safe_div() { ; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> [[WIDE_LOAD]], splat (i32 20000) ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP3]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] @@ -1901,7 +1924,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 
[[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] @@ -1984,7 +2008,8 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE3]] ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 1020 ; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] @@ -2111,7 +2136,8 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index fdd5e0e7958ec..69d0076497c66 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -21,7 +21,8 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer ; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) +; VF8UF1-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP3]] +; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP7]]) ; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF8UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] ; VF8UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -55,18 +56,19 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF2-NEXT: 
[[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]]) -; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P_SRC1]], i32 0 +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV1]], 8 +; VF8UF2-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP2]] +; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) +; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF8UF2: [[MIDDLE_SPLIT]]: -; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: @@ -75,13 +77,13 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] ; VF8UF2: [[LOOP_HEADER]]: -; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8UF2-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 ; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 ; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] ; VF8UF2: [[LOOP_LATCH]]: -; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 +; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 ; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: @@ -98,7 +100,8 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer -; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) +; VF16UF1-NEXT: [[TMP1:%.*]] = freeze <16 x i1> [[TMP3]] +; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP1]]) ; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF16UF1: [[MIDDLE_SPLIT]]: ; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] @@ -157,7 +160,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF1-NEXT: 
[[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer ; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) +; VF8UF1-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP3]] +; VF8UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP7]]) ; VF8UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF8UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] ; VF8UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -196,15 +200,16 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 -; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer ; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VF8UF2-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]]) -; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF8UF2-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; VF8UF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: [[TMP3:%.*]] = freeze <8 x i1> [[TMP6]] +; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) +; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP9:%.*]] = or i1 [[TMP4]], [[TMP5]] +; VF8UF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF8UF2: [[MIDDLE_SPLIT]]: -; VF8UF2-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: @@ -238,7 +243,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer -; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) +; VF16UF1-NEXT: [[TMP1:%.*]] = freeze <16 x i1> [[TMP3]] +; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP1]]) ; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF16UF1: [[MIDDLE_SPLIT]]: ; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] @@ -284,3 +290,147 @@ exit: %res = phi i64 [ %iv, %loop.header ], [ 1, %loop.latch ] ret i64 %res } + +define i8 @test_early_exit_max_vector_tc_eq_16(ptr dereferenceable(17) %A) nosync nofree { +; VF8UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF8UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; VF8UF1: [[VECTOR_PH]]: +; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF1: [[VECTOR_BODY]]: +; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; VF8UF1-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer +; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF1-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP1]] +; VF8UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF1-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; VF8UF1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF1: [[MIDDLE_SPLIT]]: +; VF8UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF1: [[MIDDLE_BLOCK]]: +; VF8UF1-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF1: [[VECTOR_EARLY_EXIT]]: +; VF8UF1-NEXT: br label %[[EXIT]] +; VF8UF1: [[SCALAR_PH]]: +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] +; VF8UF1: [[LOOP_HEADER]]: +; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF8UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF8UF1: [[LOOP_LATCH]]: +; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF1: [[EXIT]]: +; VF8UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF1-NEXT: ret i8 [[RES]] +; +; VF8UF2-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF8UF2-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF2: [[VECTOR_PH]]: +; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF2: [[VECTOR_BODY]]: +; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF2-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP3]] +; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP6]]) +; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP7:%.*]] = or i1 [[TMP4]], [[TMP5]] +; VF8UF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF2: [[MIDDLE_SPLIT]]: +; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label 
%[[MIDDLE_BLOCK:.*]] +; VF8UF2: [[MIDDLE_BLOCK]]: +; VF8UF2-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF2: [[VECTOR_EARLY_EXIT]]: +; VF8UF2-NEXT: br label %[[EXIT]] +; VF8UF2: [[SCALAR_PH]]: +; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] +; VF8UF2: [[LOOP_HEADER]]: +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF8UF2: [[LOOP_LATCH]]: +; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF2: [[EXIT]]: +; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: ret i8 [[RES]] +; +; VF16UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF16UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF16UF1: [[VECTOR_PH]]: +; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF16UF1: [[VECTOR_BODY]]: +; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF16UF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0 +; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; VF16UF1-NEXT: [[TMP1:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; VF16UF1-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP1]] +; VF16UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) +; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1: [[MIDDLE_SPLIT]]: +; VF16UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF16UF1: [[MIDDLE_BLOCK]]: +; VF16UF1-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF16UF1: [[VECTOR_EARLY_EXIT]]: +; VF16UF1-NEXT: br label %[[EXIT]] +; VF16UF1: [[SCALAR_PH]]: +; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] +; VF16UF1: [[LOOP_HEADER]]: +; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF16UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF16UF1: [[LOOP_LATCH]]: +; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; VF16UF1: [[EXIT]]: +; VF16UF1-NEXT: 
[[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF16UF1-NEXT: ret i8 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %p.src = getelementptr inbounds i8, ptr %A, i64 %iv + %l = load i8, ptr %p.src, align 1 + %c = icmp eq i8 %l, 0 + br i1 %c, label %exit, label %loop.latch + +loop.latch: + %iv.next = add nsw i64 %iv, 1 + %cmp = icmp eq i64 %iv.next, 17 + br i1 %cmp, label %exit, label %loop.header + +exit: + %res = phi i8 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i8 %res +} diff --git a/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll b/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll index 8075314a65b49..4148c3541ae7a 100644 --- a/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll +++ b/llvm/test/Transforms/LoopVersioning/invalidate-laa-after-versioning.ll @@ -56,19 +56,14 @@ define void @test(ptr %arg, i64 %arg1) { ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_1]], i64 1 ; CHECK-NEXT: br label [[INNER_2:%.*]] ; CHECK: inner.2: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[INNER_2]] ], [ 0, [[INNER_1_EXIT]] ] ; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[GEP_5]], [[INNER_1_EXIT]] ], [ [[PTR_IV_2_NEXT:%.*]], [[INNER_2]] ] ; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds double, ptr [[PTR_IV_2]], i64 1 -; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 ; CHECK-NEXT: br i1 false, label [[INNER_3_LVER_CHECK:%.*]], label [[INNER_2]] ; CHECK: inner.3.lver.check: -; CHECK-NEXT: [[INDVAR_LCSSA:%.*]] = phi i64 [ [[INDVAR]], [[INNER_2]] ] ; CHECK-NEXT: [[LCSSA_PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2]], [[INNER_2]] ] ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds double, ptr [[PTR_PHI]], i64 1 ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds double, ptr [[LCSSA_PTR_IV_2]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVAR_LCSSA]], 3 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 24 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_1]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[LCSSA_PTR_IV_2]], i64 16 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[GEP_7]], [[GEP_1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR_PHI]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] @@ -90,10 +85,10 @@ define void @test(ptr %arg, i64 %arg1) { ; CHECK: inner.3: ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ 0, [[INNER_3_PH]] ], [ [[IV_2_NEXT:%.*]], [[INNER_3]] ] ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds double, ptr [[GEP_6]], i64 [[IV_2]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_7]], align 8, !alias.scope !0, !noalias !3 -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_8]], align 8, !alias.scope !3 +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_7]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_8]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr double, ptr [[PTR_PHI]], i64 [[IV_2]] -; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[GEP_9]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr [[GEP_9]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 1 ; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[IV_2]], 1 ; CHECK-NEXT: br i1 [[C_2]], label 
[[OUTER_LATCH_LOOPEXIT3:%.*]], label [[INNER_3]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll new file mode 100644 index 0000000000000..471b1aa5dce0f --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -O3 -S %s | FileCheck %s + +target triple = "arm64-apple-macosx15.0.0" + +define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 noundef signext %s) nofree nosync { +; CHECK-LABEL: define i64 @std_find_i16_constant_offset_with_assumptions( +; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ] +; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <8 x i1> [[TMP9]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[TMP7]] +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[__FIRST_ADDR_0_LCSSA_I_I_PH:%.*]] = phi ptr [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ], [ [[COERCE_VAL_IP]], %[[MIDDLE_SPLIT]] ] +; CHECK-NEXT: [[DOTPRE:%.*]] = ptrtoint ptr [[__FIRST_ADDR_0_LCSSA_I_I_PH]] to i64 +; CHECK-NEXT: ret i64 [[DOTPRE]] +; +entry: + %first = alloca { ptr }, align 8 + %s.addr = alloca i16, align 2 + store ptr %first.coerce, ptr %first, align 8 + store i16 %s, ptr %s.addr, align 2 + %0 = load ptr, ptr %first, align 8 + call void @llvm.assume(i1 true) [ "align"(ptr %0, i64 2) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %0, i64 256) ] + %start.ptr = load ptr, ptr %first, align 8 + %1 = load i64, ptr %first, align 8 + %coerce.val.pi.i = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr + 
%cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %cmp.not6.i.i, label %return, label %loop.ph + +loop.ph: + %2 = load i16, ptr %s.addr, align 2 + br label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] + %3 = load i16, ptr %ptr.iv, align 2 + %cmp2.i.i = icmp eq i16 %3, %2 + br i1 %cmp2.i.i, label %return, label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 + %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %cmp.not.i.i, label %return, label %loop.header + +return: + %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] + %res = ptrtoint ptr %merge to i64 + ret i64 %res +} + +define i64 @std_find_i16_constant_offset_no_assumptions(ptr %first.coerce, i16 noundef signext %s) nofree nosync { +; CHECK-LABEL: define i64 @std_find_i16_constant_offset_no_assumptions( +; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256 +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[FIRST_COERCE]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[PTR_IV]], align 2 +; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp eq i16 [[TMP1]], [[S]] +; CHECK-NEXT: br i1 [[CMP2_I_I]], label %[[RETURN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 2 +; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[COERCE_VAL_IP]] +; CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[RETURN]], label %[[LOOP_HEADER]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[MERGE_PH:%.*]] = phi ptr [ [[COERCE_VAL_IP]], %[[LOOP_LATCH]] ], [ [[PTR_IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[DOTPRE:%.*]] = ptrtoint ptr [[MERGE_PH]] to i64 +; CHECK-NEXT: ret i64 [[DOTPRE]] +; +entry: + %first = alloca { ptr }, align 8 + %s.addr = alloca i16, align 2 + store ptr %first.coerce, ptr %first, align 8 + store i16 %s, ptr %s.addr, align 2 + %0 = load ptr, ptr %first, align 8 + %start.ptr = load ptr, ptr %first, align 8 + %1 = load i64, ptr %first, align 8 + %coerce.val.pi.i = add i64 %1, 256 + %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr + %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip + br i1 %cmp.not6.i.i, label %return, label %loop.ph + +loop.ph: + %2 = load i16, ptr %s.addr, align 2 + br label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ] + %3 = load i16, ptr %ptr.iv, align 2 + %cmp2.i.i = icmp eq i16 %3, %2 + br i1 %cmp2.i.i, label %return, label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2 + %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip + br i1 %cmp.not.i.i, label %return, label %loop.header + +return: + %merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ] + %res = ptrtoint ptr %merge to i64 + ret i64 %res +} + +declare void @llvm.assume(i1 noundef) +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
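The new std-find.ll PhaseOrdering test above corresponds to the IR clang produces for a std::find-style search once alignment and dereferenceability assumptions are attached to the input pointer. A minimal C++ sketch of that source pattern follows; the function name and the fixed 256-byte extent are illustrative assumptions, not code taken from the test.

  // Hedged sketch of the source pattern behind
  // std_find_i16_constant_offset_with_assumptions: the caller promises that
  // 'first' is 2-byte aligned and that at least 256 bytes starting at 'first'
  // are dereferenceable, so the search loop can be vectorized with an early
  // exit and without runtime pointer checks.
  const short *find_i16(const short *first, const short *last, short s) {
    first = static_cast<const short *>(__builtin_assume_aligned(first, 2));
    __builtin_assume_dereferenceable(first, 256);
    for (; first != last; ++first)
      if (*first == s)
        break;
    return first;
  }

As the _no_assumptions variant checks, dropping the two assumptions leaves the loop scalar, since the wide load could otherwise read past the last element known to be in bounds.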
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index 678960418d7d7..d2b506af8d398 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Constants.h" @@ -26,6 +27,8 @@ namespace llvm { +using namespace SCEVPatternMatch; + // We use this fixture to ensure that we clean up ScalarEvolution before // deleting the PassManager. class ScalarEvolutionsTest : public testing::Test { @@ -64,11 +67,6 @@ static std::optional computeConstantDifference(ScalarEvolution &SE, return SE.computeConstantDifference(LHS, RHS); } - static bool matchURem(ScalarEvolution &SE, const SCEV *Expr, const SCEV *&LHS, - const SCEV *&RHS) { - return SE.matchURem(Expr, LHS, RHS); - } - static bool isImpliedCond( ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, @@ -1524,7 +1522,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) { auto *URemI = getInstructionByName(F, N); auto *S = SE.getSCEV(URemI); const SCEV *LHS, *RHS; - EXPECT_TRUE(matchURem(SE, S, LHS, RHS)); + EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE))); EXPECT_EQ(LHS, SE.getSCEV(URemI->getOperand(0))); EXPECT_EQ(RHS, SE.getSCEV(URemI->getOperand(1))); EXPECT_EQ(LHS->getType(), S->getType()); @@ -1537,7 +1535,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) { auto *URem1 = getInstructionByName(F, "rem4"); auto *S = SE.getSCEV(Ext); const SCEV *LHS, *RHS; - EXPECT_TRUE(matchURem(SE, S, LHS, RHS)); + EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE))); EXPECT_NE(LHS, SE.getSCEV(URem1->getOperand(0))); // RHS and URem1->getOperand(1) have different widths, so compare the // integer values. diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index 118bf67320a3b..7471355603640 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -41,7 +41,8 @@ class VPlanSlpTest : public VPlanTestIRBase { AARes.reset(new AAResults(*TLI)); AARes->addAAResult(*BasicAA); PSE.reset(new PredicatedScalarEvolution(*SE, *L)); - LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI)); + LAI.reset( + new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, &*AC)); IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); IAI->analyzeInterleaving(false); return {Plan, *IAI};
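The ScalarEvolutionTest.cpp change above also illustrates the replacement for the removed matchURem test hook: destructuring a urem through ScalarEvolutionPatternMatch. A small sketch, assuming an existing ScalarEvolution &SE and a SCEV expression obtained from it; the helper name is hypothetical:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"

  using namespace llvm;
  using namespace llvm::SCEVPatternMatch;

  // Returns true and binds LHS/RHS when Expr is a SCEV expression that
  // ScalarEvolution canonicalized from a urem.
  static bool getURemOperands(ScalarEvolution &SE, const SCEV *Expr,
                              const SCEV *&LHS, const SCEV *&RHS) {
    return match(Expr, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE));
  }

Likewise, code that constructs LoopAccessInfo directly now passes an AssumptionCache, as the VPlanSlpTest fixture update above shows.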