diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index ca0b7823558ff..cd2be3373f556 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -636,8 +636,12 @@ class ScalarEvolution {
   /// \p GEP The GEP. The indices contained in the GEP itself are ignored,
   /// instead we use IndexExprs.
   /// \p IndexExprs The expressions for the indices.
-  LLVM_ABI const SCEV *
-  getGEPExpr(GEPOperator *GEP, const SmallVectorImpl<const SCEV *> &IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(GEPOperator *GEP,
+                                  ArrayRef<const SCEV *> IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(const SCEV *BaseExpr,
+                                  ArrayRef<const SCEV *> IndexExprs,
+                                  Type *SrcElementTy,
+                                  GEPNoWrapFlags NW = GEPNoWrapFlags::none());
   LLVM_ABI const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW);
   LLVM_ABI const SCEV *getMinMaxExpr(SCEVTypes Kind,
                                      SmallVectorImpl<const SCEV *> &Operands);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 98b793aace7a3..e4076084d5332 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -961,12 +961,10 @@ class TargetTransformInfo {
                             TTI::TargetCostKind CostKind,
                             bool ForPoisonSrc = true,
                             ArrayRef<Value *> VL = {}) const;

-  /// Estimate the overhead of scalarizing an instructions unique
-  /// non-constant operands. The (potentially vector) types to use for each of
-  /// argument are passes via Tys.
+  /// Estimate the overhead of scalarizing operands with the given types. The
+  /// (potentially vector) type to use for each argument is passed via Tys.
   LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
-      ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-      TTI::TargetCostKind CostKind) const;
+      ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;

   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index ddc8a5eaffa94..97c9e509f1df2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -459,8 +459,7 @@ class TargetTransformInfoImplBase {
   }

   virtual InstructionCost
-  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys,
+  getOperandsScalarizationOverhead(ArrayRef<Type *> Tys,
                                    TTI::TargetCostKind CostKind) const {
     return 0;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f6936b98bf3e4..91a2dd7eb612e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -347,6 +348,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return Cost;
   }

+  /// Filter out constant and duplicated entries in \p Ops and return a vector
+  /// containing the types from \p Tys corresponding to the remaining operands.
+  static SmallVector<Type *>
+  filterConstantAndDuplicatedOperands(ArrayRef<const Value *> Ops,
+                                      ArrayRef<Type *> Tys) {
+    SmallPtrSet<const Value *, 4> UniqueOperands;
+    SmallVector<Type *> FilteredTys;
+    for (const auto &[Op, Ty] : zip_equal(Ops, Tys)) {
+      if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second)
+        continue;
+      FilteredTys.push_back(Ty);
+    }
+    return FilteredTys;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -935,29 +951,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                     CostKind);
   }

-  /// Estimate the overhead of scalarizing an instructions unique
-  /// non-constant operands. The (potentially vector) types to use for each of
-  /// argument are passes via Tys.
+  /// Estimate the overhead of scalarizing an instruction's operands. The
+  /// (potentially vector) type to use for each argument is passed via Tys.
   InstructionCost getOperandsScalarizationOverhead(
-      ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-      TTI::TargetCostKind CostKind) const override {
-    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
-
+      ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const override {
     InstructionCost Cost = 0;
-    SmallPtrSet<const Value *, 4> UniqueOperands;
-    for (int I = 0, E = Args.size(); I != E; I++) {
+    for (Type *Ty : Tys) {
       // Disregard things like metadata arguments.
-      const Value *A = Args[I];
-      Type *Ty = Tys[I];
       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
           !Ty->isPtrOrPtrVectorTy())
         continue;

-      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
-        if (auto *VecTy = dyn_cast<VectorType>(Ty))
-          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
-                                           /*Extract*/ true, CostKind);
-      }
+      if (auto *VecTy = dyn_cast<VectorType>(Ty))
+        Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
+                                         /*Extract*/ true, CostKind);
     }

     return Cost;
@@ -974,7 +982,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost Cost = getScalarizationOverhead(
         RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
     if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
+      Cost += getOperandsScalarizationOverhead(
+          filterConstantAndDuplicatedOperands(Args, Tys), CostKind);
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
@@ -2158,8 +2167,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                                 /*Insert=*/true,
                                                 /*Extract=*/false, CostKind);
         }
       }
-      ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
+      ScalarizationCost += getOperandsScalarizationOverhead(
+          filterConstantAndDuplicatedOperands(Args, ICA.getArgTypes()),
+          CostKind);
     }

     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
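To make the filtering contract above concrete, here is a small standalone
C++ sketch of the same logic using std containers in place of the LLVM ADTs
(the enum, IDs, and unit sizes are illustrative assumptions, not part of the
patch): only the types of non-constant, first-occurrence operands survive
and reach the type-based overhead query.

    #include <cassert>
    #include <set>
    #include <utility>
    #include <vector>

    enum class Ty { Scalar, Vector };

    // Mirrors filterConstantAndDuplicatedOperands: keep Tys[I] only if
    // Ops[I] is neither a constant nor a duplicate of an earlier operand.
    std::vector<Ty> filterOps(const std::vector<std::pair<int, bool>> &Ops,
                              const std::vector<Ty> &Tys) {
      std::set<int> Unique;
      std::vector<Ty> Filtered;
      for (size_t I = 0; I < Ops.size(); ++I) {
        auto [Id, IsConstant] = Ops[I];
        if (IsConstant || !Unique.insert(Id).second)
          continue;
        Filtered.push_back(Tys[I]);
      }
      return Filtered;
    }

    int main() {
      // Operands: %a, %a again (duplicate), and a constant.
      std::vector<std::pair<int, bool>> Ops = {{1, false}, {1, false}, {2, true}};
      std::vector<Ty> Tys = {Ty::Vector, Ty::Vector, Ty::Vector};
      assert(filterOps(Ops, Tys).size() == 1); // only the first %a remains
    }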
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 239849e670350..53494e1aee1cc 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -3493,17 +3493,25 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
           /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
           // We can currently only fold X%N if X is constant.
           const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
-          if (StartC && !DivInt.urem(StepInt) &&
-              getZeroExtendExpr(AR, ExtTy) ==
-                  getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
-                                getZeroExtendExpr(Step, ExtTy),
-                                AR->getLoop(), SCEV::FlagAnyWrap)) {
+          if (StartC && !DivInt.urem(StepInt)) {
             const APInt &StartInt = StartC->getAPInt();
             const APInt &StartRem = StartInt.urem(StepInt);
-            if (StartRem != 0) {
-              const SCEV *NewLHS =
-                  getAddRecExpr(getConstant(StartInt - StartRem), Step,
-                                AR->getLoop(), SCEV::FlagNW);
+            bool NoWrap =
+                getZeroExtendExpr(AR, ExtTy) ==
+                getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+                              getZeroExtendExpr(Step, ExtTy), AR->getLoop(),
+                              SCEV::FlagAnyWrap);
+
+            // With N <= C and both N and C powers of 2, the transformation
+            // {X,+,N}/C => {(X - X%N),+,N}/C preserves division results even
+            // if wrapping occurs, as the division results remain equivalent
+            // for all offsets in [(X - X%N), X).
+            bool CanFoldWithWrap = StepInt.ule(DivInt) && // N <= C
+                                   StepInt.isPowerOf2() && DivInt.isPowerOf2();
+            if (StartRem != 0 && (NoWrap || CanFoldWithWrap)) {
+              const SCEV *NewLHS = getAddRecExpr(
+                  getConstant(StartInt - StartRem), Step, AR->getLoop(),
+                  NoWrap ? SCEV::FlagNW : SCEV::FlagAnyWrap);
               if (LHS != NewLHS) {
                 LHS = NewLHS;
@@ -3770,13 +3778,11 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   return getOrCreateAddRecExpr(Operands, L, Flags);
 }

-const SCEV *
-ScalarEvolution::getGEPExpr(GEPOperator *GEP,
-                            const SmallVectorImpl<const SCEV *> &IndexExprs) {
+const SCEV *ScalarEvolution::getGEPExpr(GEPOperator *GEP,
+                                        ArrayRef<const SCEV *> IndexExprs) {
   const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand());
   // getSCEV(Base)->getType() has the same address space as Base->getType()
   // because SCEV::getType() preserves the address space.
-  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   GEPNoWrapFlags NW = GEP->getNoWrapFlags();
   if (NW != GEPNoWrapFlags::none()) {
     // We'd like to propagate flags from the IR to the corresponding SCEV nodes,
@@ -3789,13 +3795,20 @@ ScalarEvolution::getGEPExpr,
       NW = GEPNoWrapFlags::none();
   }

+  return getGEPExpr(BaseExpr, IndexExprs, GEP->getSourceElementType(), NW);
+}
+
+const SCEV *ScalarEvolution::getGEPExpr(const SCEV *BaseExpr,
+                                        ArrayRef<const SCEV *> IndexExprs,
+                                        Type *SrcElementTy, GEPNoWrapFlags NW) {
   SCEV::NoWrapFlags OffsetWrap = SCEV::FlagAnyWrap;
   if (NW.hasNoUnsignedSignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNSW);
   if (NW.hasNoUnsignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNUW);

-  Type *CurTy = GEP->getType();
+  Type *CurTy = BaseExpr->getType();
+  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   bool FirstIter = true;
   SmallVector<const SCEV *, 4> Offsets;
   for (const SCEV *IndexExpr : IndexExprs) {
@@ -3814,7 +3827,7 @@ ScalarEvolution::getGEPExpr,
       if (FirstIter) {
         assert(isa<PointerType>(CurTy) &&
                "The first index of a GEP indexes a pointer");
-        CurTy = GEP->getSourceElementType();
+        CurTy = SrcElementTy;
         FirstIter = false;
       } else {
         CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8a470ebf85a16..ae7e738ea5a26 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -637,9 +637,8 @@ InstructionCost TargetTransformInfo::getScalarizationOverhead(
 }

 InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-    TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
+    ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
 }

 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
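To make the wrap reasoning in the ScalarEvolution comment above concrete,
here is a small standalone C++ check (illustrative values; not part of the
patch). With step N = 2 and divisor C = 4, both powers of 2 and N <= C,
{X,+,N}/C and {(X - X%N),+,N}/C agree lane by lane even when the 8-bit
induction variable wraps:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t N = 2, C = 4, X = 255; // X % N = 1; the IV will wrap
      uint8_t A = X;                       // {X,+,N}
      uint8_t B = X - X % N;               // {(X - X%N),+,N}, here 254
      for (int I = 0; I < 300; ++I) {      // crosses the 8-bit wrap
        assert(A / C == B / C);            // division results stay equal
        A += N;
        B += N;
      }
      return 0;
    }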
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b5193f65593de..699f31f744342 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1152,7 +1152,10 @@ class LoopVectorizationCostModel {
   CallWideningDecision getCallWideningDecision(CallInst *CI,
                                                ElementCount VF) const {
     assert(!VF.isScalar() && "Expected vector VF");
-    return CallWideningDecisions.at({CI, VF});
+    auto I = CallWideningDecisions.find({CI, VF});
+    if (I == CallWideningDecisions.end())
+      return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+    return I->second;
   }

   /// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1562,7 +1565,7 @@ class LoopVectorizationCostModel {
   /// A type representing the costs for instructions if they were to be
   /// scalarized rather than vectorized. The entries are Instruction-Cost
   /// pairs.
-  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
+  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

   /// A set containing all BasicBlocks that are known to present after
   /// vectorization as a predicated block.
@@ -1596,7 +1599,7 @@ class LoopVectorizationCostModel {
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
-  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
+  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

   /// Holds the instructions known to be uniform after vectorization.
   /// The data is collected per VF.
@@ -1665,7 +1668,9 @@ class LoopVectorizationCostModel {
     Instruction *I = dyn_cast<Instruction>(V);
     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
         TheLoop->isLoopInvariant(I) ||
-        getWideningDecision(I, VF) == CM_Scalarize)
+        getWideningDecision(I, VF) == CM_Scalarize ||
+        (isa<CallInst>(I) &&
+         getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
       return false;

     // Assume we can vectorize V (and hence we need extraction) if the
@@ -1680,8 +1685,16 @@ class LoopVectorizationCostModel {
   /// Returns a range containing only operands needing to be extracted.
   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                    ElementCount VF) const {
-    return SmallVector<Value *, 4>(make_filter_range(
-        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+    SmallPtrSet<Value *, 4> UniqueOperands;
+    SmallVector<Value *, 4> Res;
+    for (Value *Op : Ops) {
+      if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
+          !needsExtract(Op, VF))
+        continue;
+      Res.push_back(Op);
+    }
+    return Res;
   }

 public:
@@ -3037,10 +3050,9 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // likely.
     ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
   }

-  InstructionCost SafeDivisorCost = 0;
+  InstructionCost SafeDivisorCost = 0;
   auto *VecTy = toVectorTy(I->getType(), VF);
-
   // The cost of the select guard to ensure all lanes are well defined
   // after we speculate above any internal control flow.
   SafeDivisorCost +=
       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -3048,19 +3060,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,

-  // Certain instructions can be cheaper to vectorize if they have a constant
-  // second vector operand. One example of this are shifts on x86.
-  Value *Op2 = I->getOperand(1);
-  auto Op2Info = TTI.getOperandInfo(Op2);
-  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
-      Legal->isInvariant(Op2))
-    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-
   SmallVector<const Value *, 4> Operands(I->operand_values());
   SafeDivisorCost += TTI.getArithmeticInstrCost(
       I->getOpcode(), VecTy, CostKind,
       {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-      Op2Info, Operands, I);
+      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+      Operands, I);
   return {ScalarizationCost, SafeDivisorCost};
 }
@@ -3311,6 +3316,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
       if (!Ptr)
         continue;

+      // If the pointer can be proven to be uniform, always add it to the
+      // worklist.
+      if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
+        AddToWorklistIfAllowed(cast<Instruction>(Ptr));
+
       if (IsUniformMemOpUse(&I))
         AddToWorklistIfAllowed(&I);

@@ -4284,6 +4294,25 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
         if (!VPI)
           continue;
         switch (VPI->getOpcode()) {
+        // Selects are only modelled in the legacy cost model for safe
+        // divisors.
+        case Instruction::Select: {
+          VPValue *VPV = VPI->getVPSingleValue();
+          if (VPV->getNumUsers() == 1) {
+            if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
+              switch (WR->getOpcode()) {
+              case Instruction::UDiv:
+              case Instruction::SDiv:
+              case Instruction::URem:
+              case Instruction::SRem:
+                continue;
+              default:
+                break;
+              }
+            }
+          }
+          [[fallthrough]];
+        }
        case VPInstruction::ActiveLaneMask:
        case VPInstruction::ExplicitVectorLength:
          C += VPI->cost(VF, CostCtx);
@@ -4925,7 +4954,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
       if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
           !useEmulatedMaskMemRefHack(&I, VF) &&
           computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
-        ScalarCostsVF.insert_range(ScalarCosts);
+        for (const auto &[I, IC] : ScalarCosts)
+          ScalarCostsVF.insert({I, IC});
         // Check if we decided to scalarize a call. If so, update the widening
         // decision of the call to CM_Scalarize with the computed scalar cost.
         for (const auto &[I, Cost] : ScalarCosts) {
@@ -5564,8 +5594,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
   SmallVector<Type *> Tys;
   for (auto *V : filterExtractingOperands(Ops, VF))
     Tys.push_back(maybeVectorizeType(V->getType(), VF));
-  return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), Tys, CostKind);
+  return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind);
 }

 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -5687,9 +5716,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       // If the instructions belongs to an interleave group, the whole group
       // receives the same decision. The whole group receives the cost, but
       // the cost will actually be assigned to one instruction.
-      if (const auto *Group = getInterleavedAccessGroup(&I))
-        setWideningDecision(Group, VF, Decision, Cost);
-      else
+      if (const auto *Group = getInterleavedAccessGroup(&I)) {
+        if (Decision == CM_Scalarize) {
+          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+            if (auto *I = Group->getMember(Idx)) {
+              setWideningDecision(I, VF, Decision,
+                                  getMemInstScalarizationCost(I, VF));
+            }
+          }
+        } else {
+          setWideningDecision(Group, VF, Decision, Cost);
+        }
+      } else
         setWideningDecision(&I, VF, Decision, Cost);
     }
   }
@@ -5725,6 +5763,20 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
           Worklist.push_back(InstOp);
       }

+  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
+    // If there are direct memory op users of the newly scalarized load,
+    // their cost may have changed because there's no scalarization
+    // overhead for the operand. Update it.
+    for (User *U : LI->users()) {
+      if (!isa<LoadInst, StoreInst>(U))
+        continue;
+      if (getWideningDecision(cast<Instruction>(U), VF) != CM_Scalarize)
+        continue;
+      setWideningDecision(
+          cast<Instruction>(U), VF, CM_Scalarize,
+          getMemInstScalarizationCost(cast<Instruction>(U), VF));
+    }
+  };
   for (auto *I : AddrDefs) {
     if (isa<LoadInst>(I)) {
       // Setting the desired widening decision should ideally be handled in
@@ -5732,20 +5784,26 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       // if the loaded register is involved in an address computation, it is
       // instead changed here when we know this is the case.
       InstWidening Decision = getWideningDecision(I, VF);
-      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
-        // Scalarize a widened load of address.
+      if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
+          (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
+           Decision == CM_Scalarize)) {
+        // Scalarize a widened load of address or update the cost of a scalar
+        // load of an address.
         setWideningDecision(
             I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
-      else if (const auto *Group = getInterleavedAccessGroup(I)) {
+        UpdateMemOpUserCost(cast<LoadInst>(I));
+      } else if (const auto *Group = getInterleavedAccessGroup(I)) {
         // Scalarize an interleave group of address loads.
         for (unsigned I = 0; I < Group->getFactor(); ++I) {
-          if (Instruction *Member = Group->getMember(I))
+          if (Instruction *Member = Group->getMember(I)) {
             setWideningDecision(
                 Member, VF, CM_Scalarize,
                 (VF.getKnownMinValue() *
                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+            UpdateMemOpUserCost(cast<LoadInst>(Member));
+          }
         }
       }
     } else {
@@ -6912,6 +6970,28 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     return nullptr;
   };

+  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
+  // the select doesn't need to be considered for the vector loop cost; go with
+  // the more accurate VPlan-based cost model.
+  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
+    auto *VPI = dyn_cast<VPInstruction>(&R);
+    if (!VPI || VPI->getOpcode() != Instruction::Select ||
+        VPI->getNumUsers() != 1)
+      continue;
+
+    if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) {
+      switch (WR->getOpcode()) {
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return true;
+      default:
+        break;
+      }
+    }
+  }
+
   DenseSet<Instruction *> SeenInstrs;
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -6952,12 +7032,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       if (Instruction *UI = GetInstructionForCost(&R)) {
         // If we adjusted the predicate of the recipe, the cost in the legacy
         // cost model may be different.
-        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
-          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
-               WidenCmp->getOpcode() == Instruction::FCmp) &&
-              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
-            return true;
-        }
+        using namespace VPlanPatternMatch;
+        CmpPredicate Pred;
+        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
+            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
+                cast<CmpInst>(UI)->getPredicate())
+          return true;
         SeenInstrs.insert(UI);
       }
     }
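A short standalone illustration of the safe-divisor pattern the checks above
look for (illustrative names; not part of the patch): a select feeding a
div/rem forces inactive lanes to divide by 1, so the select's cost belongs to
the predication scheme rather than the plain divide.

    #include <cassert>

    // Scalar semantics the safe-divisor transform preserves.
    unsigned safeUDiv(unsigned X, unsigned D, bool LaneActive) {
      unsigned Divisor = LaneActive ? D : 1; // the hoistable/widened select
      return X / Divisor;                    // never divides by zero
    }

    int main() {
      assert(safeUDiv(10, 2, true) == 5);
      assert(safeUDiv(10, 0, false) == 10); // inactive lane: result unused
    }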
@@ -7217,8 +7297,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
-  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
+  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
+  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
   bool HasBranchWeights =
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
   if (HasBranchWeights) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 40a55656bfa7e..a0f3b97b4b9c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -372,6 +372,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     set(Def, VectorValue);
   } else {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+    assert(isa<VPInstruction>(Def) &&
+           "Explicit BuildVector recipes must have handled packing for "
+           "non-VPInstructions.");
     // Initialize packing with insertelements to start from poison.
     VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
@@ -1074,12 +1077,17 @@ void VPlan::execute(VPTransformState *State) {

 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   // For now only return the cost of the vector loop region, ignoring any other
-  // blocks, like the preheader or middle blocks.
+  // blocks, like the preheader or middle blocks, except for checking them for
+  // recipes with invalid costs.
   InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);

-  // If any instructions in the middle block are invalid return invalid.
-  // TODO: Remove once no VPlans with VF == vscale x 1 and first-order
-  // recurrences are created.
-  if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+  // If the cost of the loop region is invalid, or any recipe in the skeleton
+  // outside loop regions is invalid, return an invalid cost.
+  if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
+                                    vp_depth_first_shallow(getEntry())),
+                                [&VF, &Ctx](VPBasicBlock *VPBB) {
+                                  return !VPBB->cost(VF, Ctx).isValid();
+                                }))
     return InstructionCost::getInvalid();

   return Cost;
@@ -1684,3 +1692,38 @@ VPCostContext::getOperandInfo(VPValue *V) const {

   return TTI::getOperandInfo(V->getLiveInIRValue());
 }
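A toy model of the overhead sum computed by the new helper below (unit
weights and names are illustrative assumptions; real costs come from the TTI
hooks): one insert per produced result lane plus one extract per used vector
operand lane.

    #include <cassert>

    unsigned scalarizationOverhead(unsigned VF, bool ProducesValue,
                                   unsigned NumVectorOperands) {
      unsigned InsertCost = ProducesValue ? VF : 0;  // pack the result
      unsigned ExtractCost = NumVectorOperands * VF; // unpack the operands
      return InsertCost + ExtractCost;
    }

    int main() {
      // VF=4, produces a value, two vector operands: 4 inserts + 8 extracts.
      assert(scalarizationOverhead(4, true, 2) == 12);
    }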
+
+InstructionCost VPCostContext::getScalarizationOverhead(
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost ScalarizationCost = 0;
+  // Compute the cost of scalarizing the result if needed.
+  if (!ResultTy->isVoidTy()) {
+    for (Type *VectorTy :
+         to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+      ScalarizationCost += TTI.getScalarizationOverhead(
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+          /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+  }
+  // Compute the cost of scalarizing the operands, skipping ones that do not
+  // require extraction/scalarization and do not incur any overhead.
+  SmallPtrSet<const VPValue *, 4> UniqueOperands;
+  SmallVector<Type *> Tys;
+  for (auto *Op : Operands) {
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR && isa<VPReplicateRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
+        !UniqueOperands.insert(Op).second)
+      continue;
+    Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
+  }
+  return ScalarizationCost +
+         TTI.getOperandsScalarizationOverhead(Tys, CostKind);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0115c6f4331c4..9ddc040cf6c70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -806,6 +806,9 @@ class VPIRFlags {
   GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }

+  /// Returns true if the recipe has a comparison predicate.
+  bool hasPredicate() const { return OpType == OperationType::Cmp; }
+
   /// Returns true if the recipe has fast-math flags.
   bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

@@ -899,6 +902,10 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
   }

   void execute(VPTransformState &State) override = 0;
+
+  /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
+  InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF,
+                                             VPCostContext &Ctx) const;
 };

 /// Helper to access the operand that contains the unroll part for this recipe
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 7075fb1f39613..d7f74fc0df7b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -374,6 +374,15 @@ struct VPCostContext {
   /// legacy cost model for \p VF. Only used to check for additional VPlan
   /// simplifications.
   bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
+
+  /// Estimate the overhead of scalarizing a recipe with result type
+  /// \p ResultTy and \p Operands with \p VF. This is a convenience wrapper
+  /// for the type-based getScalarizationOverhead API. If
+  /// \p AlwaysIncludeReplicatingR is true, always compute the cost of
+  /// scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };

 /// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 0d206993cb043..4b940c28b158a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -461,6 +461,99 @@
 m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_BinaryOr(Op0, Op1);
 }

+/// Cmp_match is a variant of BinaryRecipe_match that also binds the comparison
+/// predicate. Opcodes must be Instruction::ICmp, Instruction::FCmp, or both.
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct Cmp_match {
+  static_assert((sizeof...(Opcodes) == 1 || sizeof...(Opcodes) == 2) &&
+                "Expected one or two opcodes");
+  static_assert(
+      ((Opcodes == Instruction::ICmp || Opcodes == Instruction::FCmp) && ...) &&
+      "Expected a compare instruction opcode");
+
+  CmpPredicate *Predicate = nullptr;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  Cmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1)
+      : Predicate(&Pred), Op0(Op0), Op1(Op1) {}
+  Cmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {}
+
+  bool match(const VPValue *V) const {
+    auto *DefR = V->getDefiningRecipe();
+    return DefR && match(DefR);
+  }
+
+  bool match(const VPRecipeBase *V) const {
+    if ((m_Binary<Opcodes>(Op0, Op1).match(V) || ...)) {
+      if (Predicate)
+        *Predicate = cast<VPRecipeWithIRFlags>(V)->getPredicate();
+      return true;
+    }
+    return false;
+  }
+};
+
+/// SpecificCmp_match is a variant of Cmp_match that matches the comparison
+/// predicate, instead of binding it.
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct SpecificCmp_match {
+  const CmpPredicate Predicate;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  SpecificCmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS)
+      : Predicate(Pred), Op0(LHS), Op1(RHS) {}
+
+  bool match(const VPValue *V) const {
+    CmpPredicate CurrentPred;
+    return Cmp_match<Op0_t, Op1_t, Opcodes...>(CurrentPred, Op0, Op1)
+               .match(V) &&
+           CmpPredicate::getMatching(CurrentPred, Predicate);
+  }
+};
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp> m_ICmp(const Op0_t &Op0,
+                                                         const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp>
+m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>
+m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>(MatchPred, Op0,
+                                                            Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(Op0,
+                                                                       Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_SpecificCmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      MatchPred, Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t>
 using GEPLikeRecipe_match =
     BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
                        VPWidenGEPRecipe, VPReplicateRecipe>;
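A hedged usage sketch for the matchers defined above, mirroring how the
LoopVectorize change earlier in this patch uses them (`R` stands for some
recipe visited during a traversal; that context is assumed, not shown here):

    // Bind the predicate of any integer/FP compare recipe:
    CmpPredicate Pred;
    if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())))
      ; // Pred now holds the recipe's comparison predicate.

    // Match only integer equality compares, without binding:
    if (match(&R, m_SpecificICmp(CmpInst::ICMP_EQ, m_VPValue(), m_VPValue())))
      ; // R is an ICmp recipe with predicate eq.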
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ ... @@
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;

 using VectorParts = SmallVector<Value *, 2>;

@@ -308,7 +309,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();

   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -465,6 +465,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::Load:
   case VPInstruction::AnyOf:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExplicitVectorLength:
@@ -901,31 +903,105 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
 }

+InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
+    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
+  Type *ScalarTy = Ctx.Types.inferScalarType(this);
+  Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
+  switch (Opcode) {
+  case Instruction::FNeg:
+    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    TargetTransformInfo::OperandValueInfo RHSInfo = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+
+    if (VF.isVector()) {
+      // Certain instructions can be cheaper to vectorize if they have a
+      // constant second vector operand. One example of this are shifts on x86.
+      VPValue *RHS = getOperand(1);
+      RHSInfo = Ctx.getOperandInfo(RHS);
+
+      if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
+          getOperand(1)->isDefinedOutsideLoopRegions())
+        RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
+    }
+
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, ResultTy, Ctx.CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        RHSInfo, Operands, CtxI, &Ctx.TLI);
+  }
+  case Instruction::Freeze:
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
+                                          Ctx.CostKind);
+  case Instruction::ExtractValue:
+    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
+                                             Ctx.CostKind);
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
+    Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    return Ctx.TTI.getCmpSelInstrCost(
+        Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
+        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
+        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
+  }
+  }
+  llvm_unreachable("called for unsupported opcode");
+}
+
 InstructionCost VPInstruction::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   if (Instruction::isBinaryOp(getOpcode())) {
-    Type *ResTy = Ctx.Types.inferScalarType(this);
-    if (!vputils::onlyFirstLaneUsed(this))
-      ResTy = toVectorTy(ResTy, VF);
-
-    if (!getUnderlyingValue()) {
-      switch (getOpcode()) {
-      case Instruction::FMul:
-        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
-      default:
-        // TODO: Compute cost for VPInstructions without underlying values once
-        // the legacy cost model has been retired.
-        return 0;
-      }
+    if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
+      // TODO: Compute cost for VPInstructions without underlying values once
+      // the legacy cost model has been retired.
+      return 0;
     }

     assert(!doesGeneratePerAllLanes() &&
            "Should only generate a vector value or single scalar, not scalars "
            "for all lanes.");
-    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
+    return getCostForRecipeWithOpcode(
+        getOpcode(),
+        vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
   }

   switch (getOpcode()) {
+  case Instruction::Select: {
+    llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
+    auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+    auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+    if (!vputils::onlyFirstLaneUsed(this)) {
+      CondTy = toVectorTy(CondTy, VF);
+      VecTy = toVectorTy(VecTy, VF);
+    }
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+                                      Ctx.CostKind);
+  }
   case Instruction::ExtractElement: {
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1542,18 +1618,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
   State.set(this, V);
 }

-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
-                                                    VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by
+/// \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+                                            ArrayRef<const VPValue *> Operands,
+                                            const VPRecipeWithIRFlags &R,
+                                            ElementCount VF,
+                                            VPCostContext &Ctx) {
   // Some backends analyze intrinsic arguments to determine cost. Use the
   // underlying value for the operand if it has one. Otherwise try to use the
   // operand of the underlying call instruction, if there is one. Otherwise
   // clear Arguments.
   // TODO: Rework TTI interface to be independent of concrete IR values.
   SmallVector<const Value *> Arguments;
-  for (const auto &[Idx, Op] : enumerate(operands())) {
+  for (const auto &[Idx, Op] : enumerate(Operands)) {
     auto *V = Op->getUnderlyingValue();
     if (!V) {
-      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+      if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
         Arguments.push_back(UI->getArgOperand(Idx));
         continue;
       }
@@ -1563,21 +1643,31 @@
     Arguments.push_back(V);
   }

-  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+  Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+  Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
   SmallVector<Type *> ParamTys;
-  for (unsigned I = 0; I != getNumOperands(); ++I)
-    ParamTys.push_back(
-        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+  for (const VPValue *Op : Operands) {
+    ParamTys.push_back(VF.isVector()
+                           ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+                           : Ctx.Types.inferScalarType(Op));
+  }

   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
-  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+  FastMathFlags FMF =
+      R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(
-      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
-      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+      ID, RetTy, Arguments, ParamTys, FMF,
+      dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
       InstructionCost::getInvalid(), &Ctx.TLI);
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }

+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+                                                    VPCostContext &Ctx) const {
+  SmallVector<const VPValue *> ArgOps(operands());
+  return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
+}
+
 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }
@@ -1741,7 +1831,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -1963,20 +2052,15 @@ void VPWidenRecipe::execute(VPTransformState &State) {

 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   switch (Opcode) {
-  case Instruction::FNeg: {
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, Ctx.CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
-  }
-
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    // More complex computation, let the legacy cost-model handle this for now.
-    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+    // If the div/rem operation isn't safe to speculate and requires
+    // predication, then the only way we can even create a vplan is to insert
+    // a select on the second input operand to ensure we use the value of 1
+    // for the inactive lanes. The select will be costed separately.
+  case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -1990,45 +2074,12 @@
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    VPValue *RHS = getOperand(1);
-    // Certain instructions can be cheaper to vectorize if they have a constant
-    // second vector operand. One example of this are shifts on x86.
-    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
-
-    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
-        getOperand(1)->isDefinedOutsideLoopRegions())
-      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-
-    SmallVector<const Value *, 4> Operands;
-    if (CtxI)
-      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, Ctx.CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        RHSInfo, Operands, CtxI, &Ctx.TLI);
-  }
-  case Instruction::Freeze: {
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
-                                          Ctx.CostKind);
-  }
-  case Instruction::ExtractValue: {
-    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
-                                             Ctx.CostKind);
-  }
+  case Instruction::Xor:
+  case Instruction::Freeze:
+  case Instruction::ExtractValue:
   case Instruction::ICmp:
-  case Instruction::FCmp: {
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getCmpSelInstrCost(
-        Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
-        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
-        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
-  }
+  case Instruction::FCmp:
+    return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -2863,6 +2914,9 @@ static void scalarizeInstruction(const Instruction *Instr,
   RepRecipe->applyFlags(*Cloned);
   RepRecipe->applyMetadata(*Cloned);

+  if (RepRecipe->hasPredicate())
+    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
+
   if (auto DL = RepRecipe->getDebugLoc())
     State.setDebugLocFrom(DL);

@@ -2931,6 +2985,83 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }

+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost. Computing SCEVs for VPValues is incomplete and returns
+/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
+/// those cases we fall back to the legacy cost model. Otherwise return
+/// nullptr.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
+  using namespace llvm::VPlanPatternMatch;
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return nullptr;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return nullptr;
+  }
+
+  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
+    // Skip blends that use V only through a compare by checking if any
+    // incoming value was already visited.
+    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
+                         [&](unsigned I) {
+                           return Seen.contains(
+                               Blend->getIncomingValue(I)->getDefiningRecipe());
+                         }))
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    // The legacy cost model only supports scalarizing loads/stores with phi
+    // addresses, if the phi is directly used as load/store address. Don't
+    // traverse further for Blends.
+    if (Blend)
+      continue;
+
+    append_range(WorkList, Cur->users());
+  }
+  return false;
+}
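The traversal above is a standard visited-set worklist walk over transitive
users. Here is the same shape in plain standalone C++ (Node is an
illustrative stand-in for VPlan recipes; not part of the patch):

    #include <cassert>
    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Users;
      bool IsAddressUse = false; // a user consuming its operand as an address
    };

    // Answers whether any transitive user consumes the value as an address.
    bool feedsAddress(Node *Start) {
      std::set<Node *> Seen;
      std::vector<Node *> WorkList = {Start};
      while (!WorkList.empty()) {
        Node *Cur = WorkList.back();
        WorkList.pop_back();
        if (!Seen.insert(Cur).second) // skip already-visited nodes
          continue;
        if (Cur->IsAddressUse)
          return true;
        WorkList.insert(WorkList.end(), Cur->Users.begin(), Cur->Users.end());
      }
      return false;
    }

    int main() {
      Node Addr{{}, true}, Mid{{&Addr}, false}, Root{{&Mid}, false};
      assert(feedsAddress(&Root));
    }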

 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
   // transform, avoid computing their cost multiple times for now.
   Ctx.SkipCostComputation.insert(UI);

-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  Type *ResultTy = Ctx.Types.inferScalarType(this);
   switch (UI->getOpcode()) {
   case Instruction::GetElementPtr:
     // We mark this instruction as zero-cost because the cost of GEPs in
     // vectorized code depends on whether the corresponding memory instruction
     // is scalarized or not. Therefore, we handle GEPs with the memory
     // instruction cost.
     return 0;
+  case Instruction::Call: {
+    auto *CalledFn =
+        cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
+
+    SmallVector<const VPValue *> ArgOps(drop_end(operands()));
+    SmallVector<Type *> Tys;
+    for (const VPValue *ArgOp : ArgOps)
+      Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+    if (CalledFn->isIntrinsic())
+      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+      switch (CalledFn->getIntrinsicID()) {
+      case Intrinsic::assume:
+      case Intrinsic::lifetime_end:
+      case Intrinsic::lifetime_start:
+      case Intrinsic::sideeffect:
+      case Intrinsic::pseudoprobe:
+      case Intrinsic::experimental_noalias_scope_decl: {
+        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                    ElementCount::getFixed(1), Ctx) == 0 &&
+               "scalarizing intrinsic should be free");
+        return InstructionCost(0);
+      }
+      default:
+        break;
+      }
+
+    Type *ResultTy = Ctx.Types.inferScalarType(this);
+    InstructionCost ScalarCallCost =
+        Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+    if (isSingleScalar()) {
+      if (CalledFn->isIntrinsic())
+        ScalarCallCost = std::min(
+            ScalarCallCost,
+            getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                 ElementCount::getFixed(1), Ctx));
+      return ScalarCallCost;
+    }
+
+    if (VF.isScalable())
+      return InstructionCost::getInvalid();
+
+    return ScalarCallCost * VF.getFixedValue() +
+           Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
+  }
   case Instruction::Add:
   case Instruction::Sub:
   case Instruction::FAdd:
@@ -2960,14 +3135,91 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    auto Op2Info = Ctx.getOperandInfo(getOperand(1));
-    SmallVector<const Value *> Operands(UI->operand_values());
-    return Ctx.TTI.getArithmeticInstrCost(
-               UI->getOpcode(), ResultTy, CostKind,
-               {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-               Op2Info, Operands, UI, &Ctx.TLI) *
+  case Instruction::Xor:
+    return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
+                                      Ctx) *
            (isSingleScalar() ? 1 : VF.getFixedValue());
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem: {
+    InstructionCost ScalarCost =
+        getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    ScalarCost = ScalarCost * VF.getFixedValue() +
+                 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
+                                              to_vector(operands()), VF);
+    // If the recipe is not predicated (i.e. not in a replicate region), return
+    // the scalar cost. Otherwise handle predicated cost.
+    if (!getParent()->getParent()->isReplicator())
+      return ScalarCost;
+
+    // Account for the phi nodes that we will create.
+    ScalarCost += VF.getFixedValue() *
+                  Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
+    // Scale the cost by the probability of executing the predicated blocks.
+    // This assumes the predicated block for each vector lane is equally
+    // likely.
+    ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
+    return ScalarCost;
+  }
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
+    // TODO: See getMemInstScalarizationCost for how to handle replicating and
+    // predicated cases.
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
+    if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    bool UsedByLoadStoreAddress =
+        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+    InstructionCost ScalarCost =
+        ScalarMemOpCost +
+        Ctx.TTI.getAddressComputationCost(
+            PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, PtrSCEV);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently
+    // we don't assign scalarization overhead in general if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    if (!UsedByLoadStoreAddress) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+  }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b612ac3b19c0f..479b9d3371169 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1296,6 +1296,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;

       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        auto *Clone = new VPReplicateRecipe(
+            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+        Clone->insertBefore(RepOrWidenR);
+        auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
+                                      {Clone->getOperand(0)});
+        Ext->insertBefore(Clone);
+        Clone->setOperand(0, Ext);
+        RepR->eraseFromParent();
+        continue;
+      }
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
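A small standalone model of the narrowing performed above (illustrative;
not part of the patch): a store to a loop-invariant address is only
observable for the final lane, so storing the last element of the stored
vector once is equivalent to storing every lane in sequence.

    #include <cassert>

    int main() {
      int Lanes[4] = {1, 2, 3, 4};
      int SlotA = 0, SlotB = 0;
      for (int L : Lanes)
        SlotA = L;      // replicated store, lane by lane
      SlotB = Lanes[3]; // single store of the last element
      assert(SlotA == SlotB);
    }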
@@ -1988,9 +2002,8 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(simplifyBlends, Plan);
-  runPass(removeDeadRecipes, Plan);
-  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
+  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(removeBranchOnConst, Plan);
@@ -3417,6 +3430,52 @@ void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
   }
 }

+void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+  if (Plan.hasScalarVFOnly())
+    return;
+
+  VPTypeAnalysis TypeInfo(Plan);
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getEntry()));
+  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(LoopRegion->getEntry()));
+  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
+  // excluding ones in replicate regions. Those are not materialized explicitly
+  // yet. Those vector users are still handled in VPReplicateRecipe::execute(),
+  // via shouldPack().
+  // TODO: materialize build vectors for replicating recipes in replicating
+  // regions.
+  // TODO: materialize build vectors for VPInstructions.
+  for (VPBasicBlock *VPBB :
+       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
+        VPRegionBlock *ParentRegion =
+            cast<VPRecipeBase>(U)->getParent()->getParent();
+        return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
+      };
+      if (!RepR || RepR->isSingleScalar() ||
+          none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
+        continue;
+
+      Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+      unsigned Opcode = ScalarTy->isStructTy()
+                            ? VPInstruction::BuildStructVector
+                            : VPInstruction::BuildVector;
+      auto *BuildVector = new VPInstruction(Opcode, {RepR});
+      BuildVector->insertAfter(RepR);
+
+      RepR->replaceUsesWithIf(
+          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
+                           VPUser &U, unsigned) {
+            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
+          });
+    }
+  }
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
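Conceptually, a BuildVector recipe just combines one scalar definition per
lane into a single vector value; replicateByVF later rewrites it to take all
lane definitions as operands. A minimal standalone sketch of that semantics
(fixed VF of 4; names are illustrative):

    #include <array>
    #include <cassert>

    // One scalar definition per lane becomes a single vector value.
    std::array<int, 4> buildVector(int L0, int L1, int L2, int L3) {
      return {L0, L1, L2, L3};
    }

    int main() { assert(buildVector(1, 2, 3, 4)[2] == 3); }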
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index dfd9fd09ff9d4..71af37b0af48b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -248,6 +248,10 @@ struct VPlanTransforms {
   static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
                                    const Loop *L);

+  /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
+  /// values into single vectors.
+  static void materializeBuildVectors(VPlan &Plan);
+
   /// Try to convert a plan with interleave groups with VF elements to a plan
   /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index b89cd21595efd..9061bcbc98f47 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -455,10 +455,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
   VPlanTransforms::removeDeadRecipes(Plan);
 }

-/// Create a single-scalar clone of \p RepR for lane \p Lane.
-static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
-                                       Type *IdxTy, VPReplicateRecipe *RepR,
-                                       VPLane Lane) {
+/// Create a single-scalar clone of \p RepR for lane \p Lane. Use
+/// \p Def2LaneDefs to look up scalar definitions for operands of \p RepR.
+static VPReplicateRecipe *
+cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
+             VPReplicateRecipe *RepR, VPLane Lane,
+             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : RepR->operands()) {
@@ -471,6 +473,14 @@
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
     }
+    // If Op is a definition that has been unrolled, directly use the clone for
+    // the corresponding lane.
+    auto LaneDefs = Def2LaneDefs.find(Op);
+    if (LaneDefs != Def2LaneDefs.end()) {
+      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
+      continue;
+    }
+
     // Look through buildvector to avoid unnecessary extracts.
     if (match(Op, m_BuildVector())) {
       NewOps.push_back(
@@ -503,6 +513,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
   auto VPBBsToUnroll =
       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+  // A mapping of current VPValue definitions to collections of new VPValues
+  // defined per lane. Serves to hook up potential users of current VPValue
+  // definitions that are replicated-per-VF later.
+  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
+  // The removal of current recipes being replaced by new ones needs to be
+  // delayed until Def2LaneDefs is no longer in use.
+  SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -511,39 +528,42 @@
       VPBuilder Builder(RepR);
       if (RepR->getNumUsers() == 0) {
-        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
-            vputils::isSingleScalar(RepR->getOperand(1))) {
-          // Stores to invariant addresses need to store the last lane only.
-          cloneForLane(Plan, Builder, IdxTy, RepR,
-                       VPLane::getLastLaneForVF(VF));
-        } else {
-          // Create single-scalar version of RepR for all lanes.
-          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
-        }
+        // Create single-scalar version of RepR for all lanes.
+        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
         RepR->eraseFromParent();
         continue;
       }

       /// Create single-scalar version of RepR for all lanes.
       SmallVector<VPValue *> LaneDefs;
       for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+        LaneDefs.push_back(
+            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
+      Def2LaneDefs[RepR] = LaneDefs;
 
       /// Users that only demand the first lane can use the definition for lane
       /// 0.
       RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
         return U.onlyFirstLaneUsed(RepR);
       });
 
-      // If needed, create a Build(Struct)Vector recipe to insert the scalar
-      // lane values into a vector.
-      Type *ResTy = RepR->getUnderlyingInstr()->getType();
-      VPValue *VecRes = Builder.createNaryOp(
-          ResTy->isStructTy() ? VPInstruction::BuildStructVector
-                              : VPInstruction::BuildVector,
-          LaneDefs);
-      RepR->replaceAllUsesWith(VecRes);
-      RepR->eraseFromParent();
+      // Update each build vector user that currently has RepR as its only
+      // operand, to have all LaneDefs as its operands.
+      for (VPUser *U : to_vector(RepR->users())) {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
+                     VPI->getOpcode() != VPInstruction::BuildStructVector))
+          continue;
+        assert(VPI->getNumOperands() == 1 &&
+               "Build(Struct)Vector must have a single operand before "
+               "replicating by VF");
+        VPI->setOperand(0, LaneDefs[0]);
+        for (VPValue *LaneDef : drop_begin(LaneDefs))
+          VPI->addOperand(LaneDef);
+      }
+      ToRemove.push_back(RepR);
     }
   }
+  for (auto *R : reverse(ToRemove))
+    R->eraseFromParent();
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index a1c71512a1ed2..1e45380db9759 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -137,8 +137,9 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
           IndexExprs.push_back(IndexExpr);
         }
 
-        auto *GEP = cast<GEPOperator>(R->getUnderlyingInstr());
-        return SE.getGEPExpr(const_cast<GEPOperator *>(GEP), IndexExprs);
+        Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+                                 ->getSourceElementType();
+        return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
       })
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
diff --git a/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll b/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll
new file mode 100644
index 0000000000000..9a9a6a7d45931
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes='print<scalar-evolution>' -disable-output 2>&1 | FileCheck %s
+
+declare void @use(i64)
+
+define void @test_step2_div4(i64 %n) {
+; CHECK-LABEL: 'test_step2_div4'
+; CHECK-NEXT: Classifying expressions for: @test_step2_div4
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({0,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({0,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({2,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {-1,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({-2,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_div4
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
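+
+; Worked example for @test_step2_div4 above: {1,+,2}<%loop> produces
+; 1, 3, 5, 7, ..., and dividing by 4 gives 0, 0, 1, 1, ..., exactly the
+; quotients of ({0,+,2}<%loop> /u 4) over 0, 2, 4, 6, ... Rounding a constant
+; addrec start down to a multiple of the step never changes the udiv result
+; when the step divides the divisor (ignoring unsigned wrap), which is why
+; %div.1 is printed as ({0,+,2}<%loop> /u 4).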
+
+define void @test_step3_div6(i64 %n) {
+; CHECK-LABEL: 'test_step3_div6'
+; CHECK-NEXT: Classifying expressions for: @test_step3_div6
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 6
+; CHECK-NEXT: --> ({0,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 6
+; CHECK-NEXT: --> ({1,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 6
+; CHECK-NEXT: --> ({2,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {-1,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 6
+; CHECK-NEXT: --> ({-1,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 3
+; CHECK-NEXT: --> {3,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step3_div6
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 6
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 6
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 6
+  call void @use(i64 %div.2)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 6
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 3
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @test_step4_div4(i64 %n) {
+; CHECK-LABEL: 'test_step4_div4'
+; CHECK-NEXT: Classifying expressions for: @test_step4_div4
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {3,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {4,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({4,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {5,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({4,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 4
+; CHECK-NEXT: --> {4,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step4_div4
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.next = add i64 %iv, 4
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_step2_start_outer_add_rec_step_16(i64 %n, i64 %m) {
+; CHECK-LABEL: 'test_step2_start_outer_add_rec_step_16'
+; CHECK-NEXT: Classifying expressions for: @test_step2_start_outer_add_rec_step_16
+; CHECK-NEXT: %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+; CHECK-NEXT: --> {0,+,16}<%outer.header> U: [0,-15) S: [-9223372036854775808,9223372036854775793) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {{\{\{}}3,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({{\{\{}}3,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {{\{\{}}-1,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({{\{\{}}-1,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.0 = udiv i64 %iv, 3
+; CHECK-NEXT: --> ({{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.1 = udiv i64 %iv.1, 3
+; CHECK-NEXT: --> ({{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.2 = udiv i64 %iv.2, 3
+; CHECK-NEXT: --> ({{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.4 = udiv i64 %iv.4, 3
+; CHECK-NEXT: --> ({{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.5 = udiv i64 %iv.5, 3
+; CHECK-NEXT: --> ({{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %outer.iv.next = add i64 %outer.iv, 16
+; CHECK-NEXT: --> {16,+,16}<%outer.header> U: [0,-15) S: [-9223372036854775808,9223372036854775793) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_start_outer_add_rec_step_16
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Predicated backedge-taken count is (%m /u 16)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated constant max backedge-taken count is i64 1152921504606846975
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated symbolic max backedge-taken count is (%m /u 16)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %div3.0 = udiv i64 %iv, 3
+  call void @use(i64 %div3.0)
+  %div3.1 = udiv i64 %iv.1, 3
+  call void @use(i64 %div3.1)
+  %div3.2 = udiv i64 %iv.2, 3
+  call void @use(i64 %div3.2)
+  %div3.4 = udiv i64 %iv.4, 3
+  call void @use(i64 %div3.4)
+  %div3.5 = udiv i64 %iv.5, 3
+  call void @use(i64 %div3.5)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %outer.latch
+
+outer.latch:
+  %outer.iv.next = add i64 %outer.iv, 16
+  %outer.ec = icmp eq i64 %outer.iv, %m
+  br i1 %outer.ec, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+define void @test_step2_div4_start_outer_add_rec_step_2(i64 %n, i64 %m) {
+; CHECK-LABEL: 'test_step2_div4_start_outer_add_rec_step_2'
+; CHECK-NEXT: Classifying expressions for: @test_step2_div4_start_outer_add_rec_step_2
+; CHECK-NEXT: %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+; CHECK-NEXT: --> {0,+,2}<%outer.header> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {{\{\{}}3,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({{\{\{}}3,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {{\{\{}}-1,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({{\{\{}}-1,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.0 = udiv i64 %iv, 3
+; CHECK-NEXT: --> ({{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.1 = udiv i64 %iv.1, 3
+; CHECK-NEXT: --> ({{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.2 = udiv i64 %iv.2, 3
+; CHECK-NEXT: --> ({{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.4 = udiv i64 %iv.4, 3
+; CHECK-NEXT: --> ({{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.5 = udiv i64 %iv.5, 3
+; CHECK-NEXT: --> ({{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %outer.iv.next = add i64 %outer.iv, 2
+; CHECK-NEXT: --> {2,+,2}<%outer.header> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_div4_start_outer_add_rec_step_2
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Predicated backedge-taken count is (%m /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated constant max backedge-taken count is i64 9223372036854775807
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated symbolic max backedge-taken count is (%m /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %div3.0 = udiv i64 %iv, 3
+  call void @use(i64 %div3.0)
+  %div3.1 = udiv i64 %iv.1, 3
+  call void @use(i64 %div3.1)
+  %div3.2 = udiv i64 %iv.2, 3
+  call void @use(i64 %div3.2)
+  %div3.4 = udiv i64 %iv.4, 3
+  call void @use(i64 %div3.4)
+  %div3.5 = udiv i64 %iv.5, 3
+  call void @use(i64 %div3.5)
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %outer.latch
+
+outer.latch:
+  %outer.iv.next = add i64 %outer.iv, 2
+  %outer.ec = icmp eq i64 %outer.iv, %m
+  br i1 %outer.ec, label %exit, label %outer.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 6029095bbe7b1..6af3b9841a64c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -1002,12 +1002,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
-; 
TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = xor i1 [[TMP7]], true -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP8]], true -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP9]], i1 false -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP10]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = xor i1 [[TMP9]], true +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = xor i1 [[TMP10]], true +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false ; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP12]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll index 1bacae764f760..e925c253439fe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -5,8 +5,8 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-COST: Checking a loop in 'fixed_width' -; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}> -; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}> +; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, vp<{{.+}}> +; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, vp<{{.+}}> ; CHECK-COST: Selecting VF: 1. 
; We should decide this loop is not worth vectorising using fixed width vectors diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll new file mode 100644 index 0000000000000..f4047c5822094 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 5 +; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s + +define void @cost_hoisted_vector_code(ptr %p, float %arg) { +; CHECK-LABEL: define void @cost_hoisted_vector_code( +; CHECK-SAME: ptr [[P:%.*]], float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[ARG]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> zeroinitializer) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = add i64 1, [[INDEX1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP8]], i32 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP4]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], -8 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %res = tail call float @llvm.minimumnum.f32(float %arg, float 0.0) + %gep.p.red = getelementptr float, ptr %p, i64 %iv + store float %res, ptr %gep.p.red, align 4 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +declare float @llvm.minimumnum.f32(float, float) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll index 019d2ee9886a6..1e46e212d44b9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll @@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux" ; Test case from https://github.com/llvm/llvm-project/issues/148431. 
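 ;
 ; (The checks below were regenerated for this patch: the vectorizer now picks
 ; VF 4 with an active-lane-mask for this loop instead of VF 16.)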
 define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 %n, i64 %off) #0 {
 ; CHECK-LABEL: define void @test_predicated_load_cast_hint(
-; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) {
+; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[N_EXT:%.*]] = sext i8 [[N]] to i32
 ; CHECK-NEXT: [[N_SUB:%.*]] = add i32 [[N_EXT]], -15
@@ -67,205 +67,67 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
 ; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 15
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP2]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE50:.*]] ]
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i8 [[DOTCAST]], 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT17]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT18]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ule <16 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i8> poison, i8 [[TMP26]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT19]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT20]] to <16 x i64>
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP25]], i32 0
-; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> 
[[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64> +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: -; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP30:%.*]] = zext i8 [[TMP29]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP30]], i64 [[OFF]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP32]], 1 -; CHECK-NEXT: store i64 [[TMP33]], ptr [[TMP31]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] -; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP25]], i32 1 -; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] -; CHECK: [[PRED_STORE_IF21]]: -; CHECK-NEXT: [[TMP35:%.*]] = add i8 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP36:%.*]] = zext i8 [[TMP35]] to i64 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP36]], i64 [[OFF]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP38]], 1 -; CHECK-NEXT: store i64 [[TMP39]], ptr [[TMP37]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] -; CHECK: [[PRED_STORE_CONTINUE22]]: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP25]], i32 2 -; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] -; CHECK: [[PRED_STORE_IF23]]: -; CHECK-NEXT: [[TMP41:%.*]] = add i8 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP42:%.*]] = zext i8 [[TMP41]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP42]], i64 [[OFF]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2 -; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP44]], 1 -; CHECK-NEXT: store i64 [[TMP45]], ptr [[TMP43]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] -; CHECK: [[PRED_STORE_CONTINUE24]]: -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i1> [[TMP25]], i32 3 -; CHECK-NEXT: br i1 [[TMP46]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] -; CHECK: [[PRED_STORE_IF25]]: -; CHECK-NEXT: [[TMP47:%.*]] = add i8 [[OFFSET_IDX]], 12 -; CHECK-NEXT: [[TMP48:%.*]] = zext i8 [[TMP47]] to i64 -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP48]], i64 [[OFF]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP51:%.*]] = or i64 [[TMP50]], 1 -; CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP49]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] -; CHECK: [[PRED_STORE_CONTINUE26]]: -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP25]], i32 4 -; CHECK-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] -; CHECK: [[PRED_STORE_IF27]]: -; CHECK-NEXT: [[TMP53:%.*]] = add i8 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP54:%.*]] = zext i8 [[TMP53]] to i64 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP54]], i64 [[OFF]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4 -; 
CHECK-NEXT: [[TMP57:%.*]] = or i64 [[TMP56]], 1 -; CHECK-NEXT: store i64 [[TMP57]], ptr [[TMP55]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] -; CHECK: [[PRED_STORE_CONTINUE28]]: -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i1> [[TMP25]], i32 5 -; CHECK-NEXT: br i1 [[TMP58]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] -; CHECK: [[PRED_STORE_IF29]]: -; CHECK-NEXT: [[TMP59:%.*]] = add i8 [[OFFSET_IDX]], 20 -; CHECK-NEXT: [[TMP60:%.*]] = zext i8 [[TMP59]] to i64 -; CHECK-NEXT: [[TMP61:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP60]], i64 [[OFF]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP63:%.*]] = or i64 [[TMP62]], 1 -; CHECK-NEXT: store i64 [[TMP63]], ptr [[TMP61]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] -; CHECK: [[PRED_STORE_CONTINUE30]]: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP25]], i32 6 -; CHECK-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] -; CHECK: [[PRED_STORE_IF31]]: -; CHECK-NEXT: [[TMP65:%.*]] = add i8 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[TMP66:%.*]] = zext i8 [[TMP65]] to i64 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP66]], i64 [[OFF]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6 -; CHECK-NEXT: [[TMP69:%.*]] = or i64 [[TMP68]], 1 -; CHECK-NEXT: store i64 [[TMP69]], ptr [[TMP67]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; CHECK: [[PRED_STORE_CONTINUE32]]: -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i1> [[TMP25]], i32 7 -; CHECK-NEXT: br i1 [[TMP70]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] -; CHECK: [[PRED_STORE_IF33]]: -; CHECK-NEXT: [[TMP71:%.*]] = add i8 [[OFFSET_IDX]], 28 -; CHECK-NEXT: [[TMP72:%.*]] = zext i8 [[TMP71]] to i64 -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP72]], i64 [[OFF]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7 -; CHECK-NEXT: [[TMP75:%.*]] = or i64 [[TMP74]], 1 -; CHECK-NEXT: store i64 [[TMP75]], ptr [[TMP73]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE34]] -; CHECK: [[PRED_STORE_CONTINUE34]]: -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i1> [[TMP25]], i32 8 -; CHECK-NEXT: br i1 [[TMP76]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]] -; CHECK: [[PRED_STORE_IF35]]: -; CHECK-NEXT: [[TMP77:%.*]] = add i8 [[OFFSET_IDX]], 32 -; CHECK-NEXT: [[TMP78:%.*]] = zext i8 [[TMP77]] to i64 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP78]], i64 [[OFF]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8 -; CHECK-NEXT: [[TMP81:%.*]] = or i64 [[TMP80]], 1 -; CHECK-NEXT: store i64 [[TMP81]], ptr [[TMP79]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE36]] -; CHECK: [[PRED_STORE_CONTINUE36]]: -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP25]], i32 9 -; CHECK-NEXT: br i1 [[TMP82]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]] -; CHECK: [[PRED_STORE_IF37]]: -; CHECK-NEXT: [[TMP83:%.*]] = add i8 [[OFFSET_IDX]], 36 -; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP83]] to i64 -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP84]], i64 [[OFF]] -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9 -; 
CHECK-NEXT: [[TMP87:%.*]] = or i64 [[TMP86]], 1 -; CHECK-NEXT: store i64 [[TMP87]], ptr [[TMP85]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE38]] -; CHECK: [[PRED_STORE_CONTINUE38]]: -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x i1> [[TMP25]], i32 10 -; CHECK-NEXT: br i1 [[TMP88]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]] -; CHECK: [[PRED_STORE_IF39]]: -; CHECK-NEXT: [[TMP89:%.*]] = add i8 [[OFFSET_IDX]], 40 -; CHECK-NEXT: [[TMP90:%.*]] = zext i8 [[TMP89]] to i64 -; CHECK-NEXT: [[TMP91:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP90]], i64 [[OFF]] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10 -; CHECK-NEXT: [[TMP93:%.*]] = or i64 [[TMP92]], 1 -; CHECK-NEXT: store i64 [[TMP93]], ptr [[TMP91]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE40]] -; CHECK: [[PRED_STORE_CONTINUE40]]: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x i1> [[TMP25]], i32 11 -; CHECK-NEXT: br i1 [[TMP94]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]] -; CHECK: [[PRED_STORE_IF41]]: -; CHECK-NEXT: [[TMP95:%.*]] = add i8 [[OFFSET_IDX]], 44 -; CHECK-NEXT: [[TMP96:%.*]] = zext i8 [[TMP95]] to i64 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP96]], i64 [[OFF]] -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11 -; CHECK-NEXT: [[TMP99:%.*]] = or i64 [[TMP98]], 1 -; CHECK-NEXT: store i64 [[TMP99]], ptr [[TMP97]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE42]] -; CHECK: [[PRED_STORE_CONTINUE42]]: -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x i1> [[TMP25]], i32 12 -; CHECK-NEXT: br i1 [[TMP100]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]] -; CHECK: [[PRED_STORE_IF43]]: -; CHECK-NEXT: [[TMP101:%.*]] = add i8 [[OFFSET_IDX]], 48 -; CHECK-NEXT: [[TMP102:%.*]] = zext i8 [[TMP101]] to i64 +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12 +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 ; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1 ; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE44]] -; CHECK: [[PRED_STORE_CONTINUE44]]: -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <16 x i1> [[TMP25]], i32 13 -; CHECK-NEXT: br i1 [[TMP106]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46:.*]] -; CHECK: [[PRED_STORE_IF45]]: -; CHECK-NEXT: [[TMP107:%.*]] = add i8 [[OFFSET_IDX]], 52 -; CHECK-NEXT: [[TMP108:%.*]] = zext i8 [[TMP107]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; CHECK: [[PRED_STORE_IF17]]: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1 ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]] -; CHECK-NEXT: [[TMP110:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1 ; CHECK-NEXT: store i64 [[TMP111]], ptr 
[[TMP109]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]] -; CHECK: [[PRED_STORE_CONTINUE46]]: -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x i1> [[TMP25]], i32 14 -; CHECK-NEXT: br i1 [[TMP112]], label %[[PRED_STORE_IF47:.*]], label %[[PRED_STORE_CONTINUE48:.*]] -; CHECK: [[PRED_STORE_IF47]]: -; CHECK-NEXT: [[TMP113:%.*]] = add i8 [[OFFSET_IDX]], 56 -; CHECK-NEXT: [[TMP114:%.*]] = zext i8 [[TMP113]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; CHECK: [[PRED_STORE_CONTINUE18]]: +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; CHECK: [[PRED_STORE_IF19]]: +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2 ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]] -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14 +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 ; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1 ; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE48]] -; CHECK: [[PRED_STORE_CONTINUE48]]: -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP25]], i32 15 -; CHECK-NEXT: br i1 [[TMP118]], label %[[PRED_STORE_IF49:.*]], label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_IF49]]: -; CHECK-NEXT: [[TMP119:%.*]] = add i8 [[OFFSET_IDX]], 60 -; CHECK-NEXT: [[TMP120:%.*]] = zext i8 [[TMP119]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; CHECK: [[PRED_STORE_CONTINUE20]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3 ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]] -; CHECK-NEXT: [[TMP122:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 ; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1 ; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_CONTINUE50]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: ; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP124:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP124]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16) +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP48]], i32 0 +; CHECK-NEXT: br i1 [[TMP49]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -310,6 +172,219 @@ exit: ret 
void } +; Check computing costs for sdiv/udiv with invariant divisor and tail folding. +; From https://github.com/llvm/llvm-project/issues/160354. +define void @srem_sdiv_with_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #0 { +; CHECK-LABEL: define void @srem_sdiv_with_tail_folding( +; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: [[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +; Check computing costs for predicated sdiv/udiv with invariant divisor without tail folding. +; From https://github.com/llvm/llvm-project/issues/160356. 
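+; The sdiv only executes under the %c guard, so the vector body below
+; scalarizes it: each lane extracts its mask bit, branches, divides the
+; extracted scalar and re-inserts the result (the PRED_SDIV_IF and
+; PRED_SDIV_CONTINUE blocks), before the predicated stores.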
+define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #1 {
+; CHECK-LABEL: define void @srem_sdiv_without_tail_folding(
+; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[END]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[END]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[END]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[D_0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -1)
+; CHECK-NEXT: [[TMP1:%.*]] = srem <4 x i32> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], splat (i32 1)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
+; CHECK: [[PRED_SDIV_IF]]:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv i32 [[TMP5]], [[D_1]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]]
+; CHECK: [[PRED_SDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]]
+; CHECK: [[PRED_SDIV_IF1]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[D_1]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]]
+; CHECK: [[PRED_SDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], %[[PRED_SDIV_IF1]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
+; CHECK: [[PRED_SDIV_IF3]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[D_1]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]]
+; CHECK: [[PRED_SDIV_CONTINUE4]]:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], %[[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]]
+; CHECK: [[PRED_SDIV_IF5]]:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 
[[TMP20]], [[D_1]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP23]], splat (i32 1) +; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; CHECK: [[PRED_STORE_IF7]]: +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: store i32 [[TMP37]], ptr [[TMP36]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[END]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: 
[[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +attributes #0 = { "target-cpu"="neoverse-v1" } +attributes #1 = { "target-cpu"="neoverse-v2" } + !0 = distinct !{!0, !1, !2, !3} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} @@ -328,4 +403,6 @@ exit: ; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META10]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..cce9596c5bb39 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -0,0 +1,622 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "arm64-apple-macosx15.0.0" + +define void @replicating_load_used_as_store_addr(ptr noalias %A) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr( +; CHECK-SAME: ptr noalias [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr ptr, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[TMP8]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.A = getelementptr ptr, ptr %A, i64 %iv + %l.p = load ptr, ptr %gep.A, align 8 + %iv.trunc = trunc i64 %iv.next to i32 + store i32 %iv.trunc, ptr %l.p, align 4 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @replicating_load_used_as_store_addr_2(ptr noalias %invar.dst, ptr noalias %invar.src, ptr noalias %src) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr_2( +; CHECK-SAME: ptr noalias [[INVAR_DST:%.*]], ptr noalias [[INVAR_SRC:%.*]], ptr noalias [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[INVAR_SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 123 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[INVAR_DST]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 
[[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %l.offset = load i32, ptr %invar.src, align 4 + %offset.ext = sext i32 %l.offset to i64 + %gep.src = getelementptr i128, ptr %src, i64 %offset.ext + %l.v = load i32, ptr %gep.src, align 4 + %add = add i32 %l.v, 123 + store i32 %add, ptr %invar.dst, align 8 + %iv.next = add i64 %iv, 1 + %exitcond41.not = icmp eq i64 %iv.next, 100 + br i1 %exitcond41.not, label %exit, label %loop + +exit: + ret void +} + + +define void @replicating_load_used_as_store_addr_3(ptr noalias %src, ptr noalias %dst, ptr noalias %invar.dst, i8 %x) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr_3( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], ptr noalias [[INVAR_DST:%.*]], i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = xor i8 [[X]], 10 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP11]], 111 +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1 +; CHECK-NEXT: store i8 0, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP5]] to i8 +; CHECK-NEXT: store i8 [[TMP8]], ptr [[INVAR_DST]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %xor = xor i8 %x, 10 + %ext = zext i8 %xor to i64 + %gep.src = getelementptr i8, ptr %src, i64 %ext + %l = load i8, ptr %gep.src, align 1 + %l.ext = zext i8 %l to i32 + %xor.2 = xor i32 %l.ext, 111 + %idx2.ext = zext i32 %l.ext to i64 + %gep.dst = getelementptr i8, ptr %dst, i64 %idx2.ext + store i8 0, ptr %gep.dst, align 1 + %xor.2.trunc = trunc i32 %xor.2 to i8 + store i8 %xor.2.trunc, ptr %invar.dst, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @uniform_gep_for_replicating_gep(ptr %dst) { +; CHECK-LABEL: define void @uniform_gep_for_replicating_gep( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %c = icmp eq i32 %iv, 0
+  %shift = lshr i32 %iv, 1
+  %ext = zext i1 %c to i8
+  %ext.shift = zext i32 %shift to i64
+  %gep = getelementptr i64, ptr %dst, i64 %ext.shift
+  store i8 %ext, ptr %gep, align 1
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_load_gep_widen_induction(ptr noalias %dst, ptr noalias %dst2) #0 {
+; CHECK-LABEL: define void @test_load_gep_widen_induction(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[DST2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_2]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_3]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
+; CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
+; CHECK-NEXT: store ptr null, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
+; CHECK-NEXT: store ptr null, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT: 
store ptr null, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0 +; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1 +; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: store ptr null, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 +; CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr ptr, ptr [[DST2]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 6 +; CHECK-NEXT: store <2 x ptr> [[TMP0]], ptr [[TMP17]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP1]], ptr [[TMP13]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[TMP15]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2) +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.dst.iv = getelementptr i128, ptr %dst, i64 %iv + %iv.next = add i64 %iv, 1 + store ptr null, ptr %gep.dst.iv, align 8 + %gep.dst2.iv = getelementptr ptr, ptr %dst2, i64 %iv + store ptr %gep.dst.iv, ptr %gep.dst2.iv + %exitcond.not = icmp eq i64 %iv.next, 100 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + + +define ptr @replicating_store_in_conditional_latch(ptr %p, i32 %n) #0 { +; CHECK-LABEL: define ptr @replicating_store_in_conditional_latch( +; CHECK-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 0, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[DOTCAST]], -2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 48 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 48 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 48 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 
[[OFFSET_IDX]], 144 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 24 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 24 +; CHECK-NEXT: store ptr null, ptr [[TMP12]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP13]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP14]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop.latch ] + %gep.ptr.iv = getelementptr i8, ptr %ptr.iv, i64 24 + %c = icmp eq i32 %iv, %n + br i1 %c, label %exit, label %loop.latch + +loop.latch: + %iv.next = add nsw i32 %iv, -2 + store ptr null, ptr %gep.ptr.iv, align 8 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 48 + br label %loop.header + +exit: + ret ptr %gep.ptr.iv +} + +declare void @init(ptr) + +define void @scalar_store_cost_after_discarding_interleave_group(ptr %dst, i32 %x, ptr %src) { +; CHECK-LABEL: define void @scalar_store_cost_after_discarding_interleave_group( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TEMP1:%.*]] = alloca [64 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[TEMP1]]) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP21:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TEMP1]], align 4 +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[X]], -171254 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[MUL_0]], 1 +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[SHR_0]], [[SHR_1]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: store i16 0, ptr [[TMP30]], align 2 +; CHECK-NEXT: [[GEP_0_1:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[GEP_0_1]], i64 14 +; CHECK-NEXT: store i16 0, ptr [[TMP38]], align 2 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[ADD_1]], 1 +; CHECK-NEXT: [[TMP54:%.*]] = trunc i32 [[SHR_2]] to i16 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[TMP30]], i64 2 +; CHECK-NEXT: store i16 [[TMP54]], ptr [[TMP46]], align 2 +; CHECK-NEXT: [[SUB_0:%.*]] = sub i32 0, [[MUL_0]] +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[SUB_0]], 1 +; CHECK-NEXT: [[TMP70:%.*]] = trunc i32 [[SHR_3]] to i16 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP30]], i64 12 +; CHECK-NEXT: store i16 [[TMP70]], ptr [[TMP62]], align 2 +; CHECK-NEXT: [[OR_0:%.*]] = or i32 [[X]], 1 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[OR_0]], 1 +; CHECK-NEXT: 
[[SHR_4:%.*]] = lshr i32 [[ADD_2]], 1 +; CHECK-NEXT: [[TMP86:%.*]] = trunc i32 [[SHR_4]] to i16 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr [[TMP30]], i64 4 +; CHECK-NEXT: store i16 [[TMP86]], ptr [[TMP78]], align 2 +; CHECK-NEXT: [[GEP_0_2:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[GEP_0_2]], i64 10 +; CHECK-NEXT: store i16 0, ptr [[TMP94]], align 2 +; CHECK-NEXT: [[TRUNC_3:%.*]] = trunc i32 [[TMP22]] to i16 +; CHECK-NEXT: [[OR_1:%.*]] = or i16 [[TRUNC_3]], 1 +; CHECK-NEXT: [[TMP113:%.*]] = add i16 [[OR_1]], 1 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 +; CHECK-NEXT: store i16 [[TMP113]], ptr [[TMP105]], align 2 +; CHECK-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr [[TMP30]], i64 6 +; CHECK-NEXT: store i16 0, ptr [[TMP121]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[TMP21]], 8 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[TMP21]], 128 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %temp1 = alloca [64 x i32], align 4 + call void @init(ptr %temp1) + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %1 = load i32, ptr %temp1, align 4 + %shr.0 = lshr i32 %x, 1 + %mul.0 = mul i32 %x, -171254 + %shr.1 = lshr i32 %mul.0, 1 + %add.0 = add i32 %shr.0, %shr.1 + %gep.0 = getelementptr i16, ptr %dst, i64 %iv + store i16 0, ptr %gep.0, align 2 + %gep.0.1 = getelementptr i16, ptr %dst, i64 %iv + %gep.14 = getelementptr i8, ptr %gep.0.1, i64 14 + store i16 0, ptr %gep.14, align 2 + %add.1 = add i32 %add.0, 1 + %shr.2 = lshr i32 %add.1, 1 + %trunc.0 = trunc i32 %shr.2 to i16 + %gep.2 = getelementptr i8, ptr %gep.0, i64 2 + store i16 %trunc.0, ptr %gep.2, align 2 + %sub.0 = sub i32 0, %mul.0 + %shr.3 = lshr i32 %sub.0, 1 + %trunc.1 = trunc i32 %shr.3 to i16 + %gep.12 = getelementptr i8, ptr %gep.0, i64 12 + store i16 %trunc.1, ptr %gep.12, align 2 + %or.0 = or i32 %x, 1 + %add.2 = add i32 %or.0, 1 + %shr.4 = lshr i32 %add.2, 1 + %trunc.2 = trunc i32 %shr.4 to i16 + %gep.4 = getelementptr i8, ptr %gep.0, i64 4 + store i16 %trunc.2, ptr %gep.4, align 2 + %gep.0.2 = getelementptr i16, ptr %dst, i64 %iv + %gep.10 = getelementptr i8, ptr %gep.0.2, i64 10 + store i16 0, ptr %gep.10, align 2 + %trunc.3 = trunc i32 %1 to i16 + %or.1 = or i16 %trunc.3, 1 + %add.3 = add i16 %or.1, 1 + %gep.8 = getelementptr i8, ptr %gep.0, i64 8 + store i16 %add.3, ptr %gep.8, align 2 + %gep.6 = getelementptr i8, ptr %gep.0, i64 6 + store i16 0, ptr %gep.6, align 2 + %iv.next = add i64 %iv, 8 + %ec = icmp ult i64 %iv, 128 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_prefer_vector_addressing(ptr %start, ptr %ms, ptr noalias %src) #0 { +; CHECK-LABEL: define void @test_prefer_vector_addressing( +; CHECK-SAME: ptr [[START:%.*]], ptr [[MS:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[MS1:%.*]] = ptrtoint ptr [[MS]] to i64 +; CHECK-NEXT: [[GEP_START:%.*]] = getelementptr i8, ptr [[START]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START2]], 3 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[MS1]], i64 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[UMAX]], -3 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[START2]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], [[UMIN]] +; CHECK-NEXT: [[TMP4:%.*]] = 
udiv i64 [[TMP3]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UMIN]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[GEP_START]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP9]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 9 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA14:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]] +; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA19:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %gep.start = getelementptr i8, ptr %start, i64 3 + br label %loop + +loop: + %ptr.iv = phi ptr [ %gep.start, %entry ], [ %ptr.iv.next, %loop ] + %recur = phi ptr [ %start, %entry ], [ %ptr.iv, %loop ] + %l = load i64, ptr %recur, align 1, !tbaa !0 + %gep.src = getelementptr i8, ptr %src, i64 %l + store i32 0, ptr %gep.src, align 4, !tbaa !5 + %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 3 + %ec = icmp ult ptr %ptr.iv, %ms + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @cost_scalar_load_of_address(ptr noalias %src, ptr %dst) { +; CHECK-LABEL: define void @cost_scalar_load_of_address( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i32 [[L]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[L_EXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 8 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src, align 4 + %l.ext = sext i32 %l to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %l.ext + store i32 0, ptr %gep.dst, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 8 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +%t = type { [3 x double] } +%t.2 = type { [ 64 x double ] } + +define double @test_scalarization_cost_for_load_of_address(ptr %src.0, ptr %src.1, ptr %src.2) { +; CHECK-LABEL: define double @test_scalarization_cost_for_load_of_address( +; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi double [ 3.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[T:%.*]], ptr [[SRC_0]], i64 [[IV]] +; CHECK-NEXT: [[L_0:%.*]] = load double, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 8 +; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_8]], align 8 +; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 16 +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[GEP_16]], align 8 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul double [[L_0]], 3.000000e+00 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul double [[L_1]], 3.000000e+00 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul double [[L_2]], 3.000000e+00 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd double [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[MUL256_US:%.*]] = fmul double [[ADD_1]], [[L]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[T_2:%.*]], ptr [[SRC_2]], i64 [[IV]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP11]], i64 72 +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[MUL256_US]], double [[TMP17]], double [[RED]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret double [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi double [ 3.000000e+00, %entry ], [ %red.next, %loop ] + %gep.0 = getelementptr %t, ptr 
%src.0, i64 %iv + %l.0 = load double, ptr %gep.0, align 8 + %gep.8 = getelementptr i8, ptr %gep.0, i64 8 + %l.1 = load double, ptr %gep.8, align 8 + %gep.16 = getelementptr i8, ptr %gep.0, i64 16 + %l.2 = load double, ptr %gep.16, align 8 + %mul.0 = fmul double %l.0, 3.000000e+00 + %mul.1 = fmul double %l.1, 3.000000e+00 + %mul.2 = fmul double %l.2, 3.000000e+00 + %add.0 = fadd double %mul.0, %mul.1 + %add.1 = fadd double %add.0, %mul.2 + %gep.src = getelementptr double, ptr %src.1, i64 %iv + %l = load double, ptr %gep.src, align 8 + %mul256.us = fmul double %add.1, %l + %gep.src.2 = getelementptr %t.2, ptr %src.2, i64 %iv + %gep.72 = getelementptr i8, ptr %gep.src.2, i64 72 + %l.p.2 = load ptr, ptr %gep.72, align 8 + %lv = load double, ptr %l.p.2, align 8 + %red.next = tail call double @llvm.fmuladd.f64(double %mul256.us, double %lv, double %red) + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret double %red.next +} + +attributes #0 = { "target-cpu"="neoverse-512tvb" } + +!0 = !{!1, !2, i64 0} +!1 = !{!"", !2, i64 0} +!2 = !{!"long long", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !3, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll index 403fc9f316d35..20409f66fc51f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll @@ -73,3 +73,26 @@ exit: %1 = select i1 %all.off, i32 1, i32 %0 ret i32 %1 } + +define i32 @select_vpinst_for_tail_folding(i8 %n) { +; CHECK: LV: Checking a loop in 'select_vpinst_for_tail_folding' +; CHECK: Cost of 1 for VF 2: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red> +; CHECK: Cost of 1 for VF 4: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red> +; CHECK: LV: Selecting VF: 4 + +entry: + %c = icmp ne i8 %n, 0 + %ext = zext i1 %c to i32 + br label %loop + +loop: + %iv = phi i32 [ %ext, %entry ], [ %iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %iv.next = add i32 %iv, 1 + %red.next = mul i32 %red, %iv + %ec = icmp eq i32 %iv, 12 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..fd83a012541b5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "armv7-unknown-linux-gnueabihf" + +define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) { +; CHECK-LABEL: define void @replicating_load_used_by_other_load( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[IV]], 1 +; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[IV]], 1 +; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[IV]], 2 +; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 [[IV]], 1 +; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], 2 +; CHECK-NEXT: [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]] +; CHECK-NEXT: [[OR_2:%.*]] = 
or i32 [[OR_1]], [[SHL_1]] +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]] +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]] +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]] +; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[XOR_3]], 1 +; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[IV]], 2147483646 +; CHECK-NEXT: [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]] +; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[IV]], 254 +; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 [[OR_3]], 1 +; CHECK-NEXT: [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2 +; CHECK-NEXT: [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]] +; CHECK-NEXT: [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]] +; CHECK-NEXT: [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]] +; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[XOR_6]], 255 +; CHECK-NEXT: [[XOR_7:%.*]] = xor i32 [[AND_6]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]] +; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD]] to i32 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ] + %shr = lshr i32 %iv, 1 + %and.1 = and i32 %iv, 1 + %shl.1 = shl i32 %iv, 2 + %shl.2 = shl i32 %iv, 1 + %and.2 = and i32 %shl.2, 2 + %or.1 = or i32 %and.2, %and.1 + %or.2 = or i32 %or.1, %shl.1 + %xor.1 = xor i32 %b, %or.2 + %xor.2 = xor i32 %xor.1, %arg + %shr.2 = lshr i32 %shl.1, 1 + %xor.3 = xor i32 %shr, %arg + %and.3 = and i32 %xor.3, 1 + %and.4 = and i32 %iv, 2147483646 + %or.3 = or i32 %and.3, %and.4 + %and.5 = and i32 %iv, 254 + %shl.3 = shl i32 %or.3, 1 + %xor.4 = xor i32 %shl.3, 2 + %or.4 = or i32 %and.5, %xor.4 + %xor.5 = xor i32 %shr.2, %or.4 + %xor.6 = xor i32 %xor.5, %xor.2 + %and.6 = and i32 %xor.6, 255 + %xor.7 = xor i32 %and.6, 1 + %gep = getelementptr i8, ptr %a, i32 %xor.7 + %ld = load i8, ptr %gep, align 1 + %zext = zext i8 %ld to i32 + %gep.2 = getelementptr i32, ptr null, i32 %zext + store i32 0, ptr %gep.2, align 4 + %iv.next = add i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll index 06e345f9c12ec..934f1ecb49410 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll @@ -23,7 +23,38 @@ define void @outside_user_blocks_tail_folding(ptr nocapture readonly %ptr, i32 % ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 5 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 6 +; CHECK-NEXT: 
[[TMP9:%.*]] = add i32 [[INDEX]], 7 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 9 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 10 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 11 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 13 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 14 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 15 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP18]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP20]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP10]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP13]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP16]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index c6661cb644d80..8c550a752610e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -1019,3 +1019,79 @@ latch: for.end: ret void } + +; Test for https://github.com/llvm/llvm-project/issues/159402. For invariant divisors, +; selects can be introduced outside the vector loop and their cost should not be +; considered for each loop iteration. 
+define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { +; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ -12, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[ENTRY]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X:%.*]] +; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 +; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y:%.*]] +; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 +; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[MERGE_LCSSA]] +; +; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors( +; FIXED-NEXT: entry: +; FIXED-NEXT: br label [[LOOP_HEADER:%.*]] +; FIXED: loop.header: +; FIXED-NEXT: [[IV:%.*]] = phi i16 [ -12, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; FIXED-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] +; FIXED-NEXT: br i1 [[C:%.*]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; FIXED: then: +; FIXED-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X:%.*]] +; FIXED-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 +; FIXED-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y:%.*]] +; FIXED-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 +; FIXED-NEXT: br label [[LOOP_LATCH]] +; FIXED: loop.latch: +; FIXED-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ] +; FIXED-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 +; FIXED-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 +; FIXED-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 +; FIXED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; FIXED: exit: +; FIXED-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ] +; FIXED-NEXT: ret i32 [[MERGE_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i16 [ -12, %entry ], [ %iv.next, %loop.latch ] + %narrow.iv = phi i8 [ -12, %entry ], [ %iv.next.trunc, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + %ud = udiv i8 %narrow.iv, %x + %ud.ext = zext i8 %ud to i16 + %sd = sdiv i16 %ud.ext, %y + %sd.ext = sext i16 %sd to i32 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ 0, %loop.header ], [ %sd.ext, %then ] + %iv.next = add nsw i16 %iv, 1 + %ec = icmp eq i16 %iv.next, 0 + %iv.next.trunc = trunc i16 %iv.next to i8 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i32 %merge +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index 875770cd047ed..b0d9cbe07ff7e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,10 +102,10 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: 
[[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]]
@@ -116,12 +116,13 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56, i64 64, i64 72, i64 80, i64 88, i64 96, i64 104, i64 112, i64 120, i64 128, i64 136, i64 144, i64 152, i64 160, i64 168, i64 176, i64 184, i64 192, i64 200, i64 208, i64 216, i64 224, i64 232, i64 240, i64 248>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1
+; CHECK-NEXT: [[VECTOR_GEP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 256, i64 264, i64 272, i64 280, i64 288, i64 296, i64 304, i64 312, i64 320, i64 328, i64 336, i64 344, i64 352, i64 360, i64 368, i64 376, i64 384, i64 392, i64 400, i64 408, i64 416, i64 424, i64 432, i64 440, i64 448, i64 456, i64 464, i64 472, i64 480, i64 488, i64 496, i64 504>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[VECTOR_GEP1]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31
; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
index 86758b5a24fe9..a3e3038c6b748 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
@@ -43,7 +43,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store float %v4, ptr %out4, align 4
; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store float %v4, ptr %out4, align 4
; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store float %v4, ptr %out4, align 4
-; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store float %v4, ptr %out4, align 4
;
entry:
  br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
index 53c8f59491e76..f112e82e5b291 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
@@ -43,7 +43,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, ptr %out5, align 4
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, ptr %out5, align 4
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, 
ptr %out5, align 4 -; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll index 3f4216bb3a1ef..7792ff36f9a7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store double %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll index 555bbe8e44269..110be8f6fb2f1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll @@ -39,7 +39,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store double %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store double %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll index f4fbbec3a46f5..b369f71627bbc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store i32 %v4, ptr %out4, align 4 -; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store i32 %v4, ptr %out4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll index 4f35f667276d8..22671885a0da3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4 ; AVX512: LV: Found an estimated cost of 102 for VF 32 
For instruction: store i32 %v5, ptr %out5, align 4 -; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll index fe1dad3c3effc..b2f0c31f75c9d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store i64 %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll index 881c7867614b7..55debbdc64182 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll @@ -39,7 +39,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store i64 %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll new file mode 100644 index 0000000000000..72b59028d9ae3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -0,0 +1,1198 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target triple = "x86_64-apple-macosx10.8.0" + +; Test case for https://github.com/llvm/llvm-project/issues/156091. 
+define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ptr align 4 noalias %C, ptr align 4 noalias %D, ptr noalias %E) #0 { +; CHECK-LABEL: @test_replicate_call_chain( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x float>, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP79]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison) +; CHECK-NEXT: [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]]) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6 +; CHECK-NEXT: [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]]) +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]]) +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]]) +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9 +; CHECK-NEXT: [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]]) +; CHECK-NEXT: 
[[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]]) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11 +; CHECK-NEXT: [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]]) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12 +; CHECK-NEXT: [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]]) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13 +; CHECK-NEXT: [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]]) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14 +; CHECK-NEXT: [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]]) +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15 +; CHECK-NEXT: [[TMP42:%.*]] = tail call float @llvm.pow.f32(float [[TMP41]], float [[X]]) +; CHECK-NEXT: [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]]) +; CHECK-NEXT: [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]]) +; CHECK-NEXT: [[TMP45:%.*]] = tail call float @llvm.pow.f32(float [[TMP16]], float [[X]]) +; CHECK-NEXT: [[TMP46:%.*]] = tail call float @llvm.pow.f32(float [[TMP18]], float [[X]]) +; CHECK-NEXT: [[TMP47:%.*]] = tail call float @llvm.pow.f32(float [[TMP20]], float [[X]]) +; CHECK-NEXT: [[TMP48:%.*]] = tail call float @llvm.pow.f32(float [[TMP22]], float [[X]]) +; CHECK-NEXT: [[TMP49:%.*]] = tail call float @llvm.pow.f32(float [[TMP24]], float [[X]]) +; CHECK-NEXT: [[TMP50:%.*]] = tail call float @llvm.pow.f32(float [[TMP26]], float [[X]]) +; CHECK-NEXT: [[TMP51:%.*]] = tail call float @llvm.pow.f32(float [[TMP28]], float [[X]]) +; CHECK-NEXT: [[TMP52:%.*]] = tail call float @llvm.pow.f32(float [[TMP30]], float [[X]]) +; CHECK-NEXT: [[TMP53:%.*]] = tail call float @llvm.pow.f32(float [[TMP32]], float [[X]]) +; CHECK-NEXT: [[TMP54:%.*]] = tail call float @llvm.pow.f32(float [[TMP34]], float [[X]]) +; CHECK-NEXT: [[TMP55:%.*]] = tail call float @llvm.pow.f32(float [[TMP36]], float [[X]]) +; CHECK-NEXT: [[TMP56:%.*]] = tail call float @llvm.pow.f32(float [[TMP38]], float [[X]]) +; CHECK-NEXT: [[TMP57:%.*]] = tail call float @llvm.pow.f32(float [[TMP40]], float [[X]]) +; CHECK-NEXT: [[TMP58:%.*]] = tail call float @llvm.pow.f32(float [[TMP42]], float [[X]]) +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x float> poison, float [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x float> [[TMP59]], float [[TMP44]], i32 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x float> [[TMP60]], float [[TMP45]], i32 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x float> [[TMP61]], float [[TMP46]], i32 3 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x float> [[TMP62]], float [[TMP47]], i32 4 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x float> [[TMP63]], float [[TMP48]], i32 5 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x float> [[TMP64]], float [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <16 x float> [[TMP65]], float [[TMP50]], i32 7 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x float> [[TMP66]], float [[TMP51]], i32 8 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x float> [[TMP67]], float [[TMP52]], i32 9 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <16 x float> [[TMP68]], float [[TMP53]], i32 10 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x float> [[TMP69]], float [[TMP54]], i32 11 +; 
CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x float> [[TMP70]], float [[TMP55]], i32 12 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x float> [[TMP71]], float [[TMP56]], i32 13 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x float> [[TMP72]], float [[TMP57]], i32 14 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x float> [[TMP73]], float [[TMP58]], i32 15 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr float, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr [[TMP78]], i32 4, <16 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr [[TMP78]], i32 4, <16 x i1> [[TMP6]]) +; CHECK-NEXT: store float 0.000000e+00, ptr [[E:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC_IV_NEXT:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[IV_INC:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[L_A:%.*]] = load float, ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[C_A:%.*]] = fcmp ogt float [[L_A]], 0.000000e+00 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load float, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_B:%.*]] = fcmp ogt float [[L_B]], 0.000000e+00 +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr float, ptr [[C]], i64 [[IV_INC]] +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_A]], [[C_B]] +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_C]], align 4 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[IV_MUL_2:%.*]] = shl i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV_MUL_2]] +; CHECK-NEXT: [[L_D:%.*]] = load float, ptr [[GEP_D]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[L_D]], 2.000000e+00 +; CHECK-NEXT: [[POW_1:%.*]] = tail call float @llvm.pow.f32(float [[MUL]], float [[X]]) +; CHECK-NEXT: [[POW_2:%.*]] = tail call float @llvm.pow.f32(float [[POW_1]], float [[X]]) +; CHECK-NEXT: store float [[POW_2]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: store float 0.000000e+00, ptr [[E]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[DEC_IV_NEXT]] = add i64 [[DEC_IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[DEC_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dec.iv = phi i64 [ 100, %entry ], [ %dec.iv.next, %loop.latch ] + %iv.inc = add i64 %iv, 1 + %gep.A = getelementptr inbounds float, 
ptr %A, i64 %iv + %l.A = load float, ptr %gep.A, align 4 + %c.A = fcmp ogt float %l.A, 0.0 + %gep.B = getelementptr inbounds float, ptr %B, i64 %iv + %l.B = load float, ptr %gep.B, align 4 + %c.B = fcmp ogt float %l.B, 0.0 + %gep.C = getelementptr float, ptr %C, i64 %iv.inc + %and = and i1 %c.A, %c.B + br i1 %and, label %then, label %else + +then: + store float 0.0, ptr %gep.C, align 4 + br label %loop.latch + +else: + %iv.mul.2 = shl i64 %iv, 2 + %gep.D = getelementptr i8, ptr %D, i64 %iv.mul.2 + %l.D = load float, ptr %gep.D, align 4 + %mul = fmul float %l.D, 2.0 + %pow.1 = tail call float @llvm.pow.f32(float %mul, float %x) + %pow.2 = tail call float @llvm.pow.f32(float %pow.1, float %x) + store float %pow.2, ptr %gep.C, align 4 + br label %loop.latch + +loop.latch: + store float 0.000000e+00, ptr %E, align 4 + %iv.next = add i64 %iv, 1 + %dec.iv.next = add i64 %dec.iv, -1 + %ec = icmp ne i64 %dec.iv.next, 0 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { +; CHECK-LABEL: @avx512_cond_load_cost( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]] +; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2 +; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 +; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[RES_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %c.1 = icmp slt i32 %iv, 0 + br i1 %c.1, label %if.then, label %loop.latch + +if.then: + %0 = urem i32 %a, %c + %mul = sub i32 0, %0 + %div = udiv i32 %c, %d + %or = or i32 %div, %mul + %ext = sext i32 %or to i64 + %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2 + %l = load i64, ptr %gep, align 8 + %or.2 = or i64 %l, %b + br label %loop.latch + +loop.latch: + %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ] + %iv.next = add i32 %iv, 1 + %ec = icmp ult i32 %iv, %c + br i1 %ec, label %loop.header, label %exit + +exit: + ret i64 %res +} + +define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { +; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 +;
CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK: pred.store.if8: +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK: pred.store.continue9: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label
[[PRED_STORE_CONTINUE11:%.*]] +; CHECK: pred.store.if10: +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.continue11: +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; CHECK: pred.store.if12: +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK: pred.store.continue13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 +; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK: pred.store.if14: +; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] +; CHECK: pred.store.continue15: +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 +; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: +; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 +; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: +; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 +; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; CHECK: pred.store.if20: +; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 +; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; CHECK: pred.store.if22: +; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] +; 
CHECK: pred.store.continue23: +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 +; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; CHECK: pred.store.if24: +; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.continue25: +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; CHECK: pred.store.if26: +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] +; CHECK: pred.store.continue27: +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; CHECK: pred.store.if28: +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] +; CHECK: pred.store.continue29: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; CHECK: pred.store.if30: +; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] +; CHECK: pred.store.continue31: +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; CHECK: pred.store.if32: +; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] +; CHECK: pred.store.continue33: +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 +; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] +; CHECK: pred.store.if34: +; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] +; CHECK: pred.store.continue35: +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if36: +; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 +; CHECK-NEXT: 
[[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.continue37: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 +; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] +; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] +; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]] +; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 +; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 +; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; CHECK: pred.store.if43: +; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 +; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] +; CHECK: pred.store.continue44: +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; CHECK: pred.store.if45: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 +; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.continue46: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; CHECK: pred.store.if47: +; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] +; CHECK: pred.store.continue48: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4
x i1> [[TMP90]], i32 3 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.if49: +; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 +; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.continue50: +; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 +; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] +; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.shl = shl nsw i64 %iv, 2 + %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl + %l = load double, ptr %gep.0, align 8 + %c = fcmp oeq double %l, 0.000000e+00 + br i1 %c, label %if.then, label %loop.latch + +if.then: + %gep.1 = getelementptr double, ptr %A, i64 %iv.shl + store double 0.000000e+00, ptr %gep.1, align 8 + br label %loop.latch + +loop.latch: + %iv.next = add nsw i64 %iv, 1 + %ec = icmp eq i64 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +; Test for https://github.com/llvm/llvm-project/issues/129236. 
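+; One operand of the ashr below is only known to be loop-invariant when queried via SCEV, not from the IR alone.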
+define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { +; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 +; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 +; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] +; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 +; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 +; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]] +; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1 +; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]] +; CHECK: loop.latch: +; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[P_2_LCSSA]] +; +entry: + %cmp.i = icmp eq i16 0, 0 + %conv.i = sext i16 0 to i32 + %conv5.i = sext i8 %a to i32 + br label %loop.header + +loop.header: + %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ] + br i1 %cmp.i, label %then, label %else + +then: + %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ] + %shr.i = ashr i32 %conv5.i, %p.1 + %tobool6.not.i = icmp eq i32 %shr.i, 0 + %sext.i = shl i32 %p.1, 24 + %0 = ashr exact i32 %sext.i, 24 + %1 = select i1 %tobool6.not.i, i32 %0, i32 0 + br label %loop.latch + +else: + %rem.i = urem i32 -1, %conv.i + %cmp3.i = icmp sgt i32 %rem.i, 1 + br i1 %cmp3.i, label %loop.latch, label %then + +loop.latch: + %p.2 = phi i32 [ 0, %else ], [ %1, %then ] + %iv.next = add i8 %iv, -1 + %ec = icmp eq i8 %iv.next, 0 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i32 %p.2 +} + +; Test case for https://github.com/llvm/llvm-project/issues/156066.
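+; The sdiv below divides by the constant 0 and is only reached behind a guard, so it must remain predicated when the loop is vectorized.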
+define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { +; CHECK-LABEL: @sdiv_by_zero( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP44]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i1> [[TMP46]], splat (i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK: pred.sdiv.if: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = sdiv i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] +; CHECK: pred.sdiv.continue: +; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] +; CHECK: pred.sdiv.if1: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = sdiv i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE2]] +; CHECK: pred.sdiv.continue2: +; CHECK-NEXT: [[TMP11:%.*]] = phi <8 x i32> [ [[TMP6]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] +; CHECK: pred.sdiv.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP14]], i32 2 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]] +; CHECK: pred.sdiv.continue4: +; CHECK-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP15]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] +; CHECK: pred.sdiv.if5: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP19]], i32 3 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE6]] +; CHECK: pred.sdiv.continue6: +; CHECK-NEXT: [[TMP21:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] +; CHECK: pred.sdiv.if7: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = sdiv i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> 
[[TMP21]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE8]] +; CHECK: pred.sdiv.continue8: +; CHECK-NEXT: [[TMP26:%.*]] = phi <8 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] +; CHECK: pred.sdiv.if9: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 5 +; CHECK-NEXT: [[TMP29:%.*]] = sdiv i32 [[TMP28]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP29]], i32 5 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE10]] +; CHECK: pred.sdiv.continue10: +; CHECK-NEXT: [[TMP31:%.*]] = phi <8 x i32> [ [[TMP26]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_SDIV_IF9]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] +; CHECK: pred.sdiv.if11: +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = sdiv i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP34]], i32 6 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE12]] +; CHECK: pred.sdiv.continue12: +; CHECK-NEXT: [[TMP36:%.*]] = phi <8 x i32> [ [[TMP31]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP35]], [[PRED_SDIV_IF11]] ] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] +; CHECK: pred.sdiv.if13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 7 +; CHECK-NEXT: [[TMP39:%.*]] = sdiv i32 [[TMP38]], 0 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP39]], i32 7 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE14]] +; CHECK: pred.sdiv.continue14: +; CHECK-NEXT: [[TMP41:%.*]] = phi <8 x i32> [ [[TMP36]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP40]], [[PRED_SDIV_IF13]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP46]], <8 x i32> zeroinitializer, <8 x i32> [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 0 +; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP45]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SDIV:%.*]] = sdiv i32 [[L]], 0 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[SDIV]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds 
i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 16 +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +bb: + br label %loop.header + +loop.header: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %bb ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src, align 4 + %icmp = icmp eq i32 %l, 0 + br i1 %icmp, label %loop.latch, label %then + +then: + %sdiv = sdiv i32 %l, 0 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %sdiv, %then ], [ 0, %loop.header ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 16 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +; Test case for https://github.com/llvm/llvm-project/issues/158660. +define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { +; CHECK-LABEL: @test_predicated_udiv( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <32 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE62:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE62]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.usub.sat.v32i32(<32 x i32> [[VEC_IND]], <32 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK: pred.udiv.if: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[D:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <32 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] +; CHECK: pred.udiv.continue: +; CHECK-NEXT: [[TMP6:%.*]] = phi <32 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] +; CHECK: pred.udiv.if1: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = udiv i32 [[TMP8]], [[D]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <32 x i32> [[TMP6]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] +; CHECK: pred.udiv.continue2: +; CHECK-NEXT: [[TMP11:%.*]] = phi <32 x i32> [ [[TMP6]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; CHECK: pred.udiv.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x
i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[D]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <32 x i32> [[TMP11]], i32 [[TMP14]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; CHECK: pred.udiv.continue4: +; CHECK-NEXT: [[TMP16:%.*]] = phi <32 x i32> [ [[TMP11]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; CHECK: pred.udiv.if5: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[D]] +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <32 x i32> [[TMP16]], i32 [[TMP19]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; CHECK: pred.udiv.continue6: +; CHECK-NEXT: [[TMP21:%.*]] = phi <32 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i1> [[TMP0]], i32 4 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; CHECK: pred.udiv.if7: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = udiv i32 [[TMP23]], [[D]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <32 x i32> [[TMP21]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; CHECK: pred.udiv.continue8: +; CHECK-NEXT: [[TMP26:%.*]] = phi <32 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP0]], i32 5 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; CHECK: pred.udiv.if9: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[D]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <32 x i32> [[TMP26]], i32 [[TMP29]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; CHECK: pred.udiv.continue10: +; CHECK-NEXT: [[TMP31:%.*]] = phi <32 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_UDIV_IF9]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP0]], i32 6 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; CHECK: pred.udiv.if11: +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = udiv i32 [[TMP33]], [[D]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <32 x i32> [[TMP31]], i32 [[TMP34]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; CHECK: pred.udiv.continue12: +; CHECK-NEXT: [[TMP36:%.*]] = phi <32 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP35]], [[PRED_UDIV_IF11]] ] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP0]], i32 7 +; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; CHECK: pred.udiv.if13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP38]], [[D]] +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <32 x i32> [[TMP36]], i32 [[TMP39]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; CHECK: pred.udiv.continue14: +; CHECK-NEXT: [[TMP41:%.*]] = phi <32 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP40]], [[PRED_UDIV_IF13]] ] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i1> [[TMP0]], i32 8 +; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_UDIV_IF15:%.*]], 
label [[PRED_UDIV_CONTINUE16:%.*]] +; CHECK: pred.udiv.if15: +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP44:%.*]] = udiv i32 [[TMP43]], [[D]] +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <32 x i32> [[TMP41]], i32 [[TMP44]], i32 8 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; CHECK: pred.udiv.continue16: +; CHECK-NEXT: [[TMP46:%.*]] = phi <32 x i32> [ [[TMP41]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP45]], [[PRED_UDIV_IF15]] ] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP0]], i32 9 +; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18:%.*]] +; CHECK: pred.udiv.if17: +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i32> [[TMP1]], i32 9 +; CHECK-NEXT: [[TMP49:%.*]] = udiv i32 [[TMP48]], [[D]] +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <32 x i32> [[TMP46]], i32 [[TMP49]], i32 9 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE18]] +; CHECK: pred.udiv.continue18: +; CHECK-NEXT: [[TMP51:%.*]] = phi <32 x i32> [ [[TMP46]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP50]], [[PRED_UDIV_IF17]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i1> [[TMP0]], i32 10 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20:%.*]] +; CHECK: pred.udiv.if19: +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i32> [[TMP1]], i32 10 +; CHECK-NEXT: [[TMP54:%.*]] = udiv i32 [[TMP53]], [[D]] +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <32 x i32> [[TMP51]], i32 [[TMP54]], i32 10 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE20]] +; CHECK: pred.udiv.continue20: +; CHECK-NEXT: [[TMP56:%.*]] = phi <32 x i32> [ [[TMP51]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP55]], [[PRED_UDIV_IF19]] ] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP0]], i32 11 +; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_UDIV_IF21:%.*]], label [[PRED_UDIV_CONTINUE22:%.*]] +; CHECK: pred.udiv.if21: +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP1]], i32 11 +; CHECK-NEXT: [[TMP59:%.*]] = udiv i32 [[TMP58]], [[D]] +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <32 x i32> [[TMP56]], i32 [[TMP59]], i32 11 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE22]] +; CHECK: pred.udiv.continue22: +; CHECK-NEXT: [[TMP61:%.*]] = phi <32 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE20]] ], [ [[TMP60]], [[PRED_UDIV_IF21]] ] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i1> [[TMP0]], i32 12 +; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_UDIV_IF23:%.*]], label [[PRED_UDIV_CONTINUE24:%.*]] +; CHECK: pred.udiv.if23: +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i32> [[TMP1]], i32 12 +; CHECK-NEXT: [[TMP64:%.*]] = udiv i32 [[TMP63]], [[D]] +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <32 x i32> [[TMP61]], i32 [[TMP64]], i32 12 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE24]] +; CHECK: pred.udiv.continue24: +; CHECK-NEXT: [[TMP66:%.*]] = phi <32 x i32> [ [[TMP61]], [[PRED_UDIV_CONTINUE22]] ], [ [[TMP65]], [[PRED_UDIV_IF23]] ] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP0]], i32 13 +; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_UDIV_IF25:%.*]], label [[PRED_UDIV_CONTINUE26:%.*]] +; CHECK: pred.udiv.if25: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i32> [[TMP1]], i32 13 +; CHECK-NEXT: [[TMP69:%.*]] = udiv i32 [[TMP68]], [[D]] +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <32 x i32> [[TMP66]], i32 [[TMP69]], i32 13 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE26]] +; CHECK: pred.udiv.continue26: +; CHECK-NEXT: [[TMP71:%.*]] = phi <32 x i32> [ [[TMP66]], [[PRED_UDIV_CONTINUE24]] ], [ [[TMP70]], 
[[PRED_UDIV_IF25]] ] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i1> [[TMP0]], i32 14 +; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_UDIV_IF27:%.*]], label [[PRED_UDIV_CONTINUE28:%.*]] +; CHECK: pred.udiv.if27: +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP1]], i32 14 +; CHECK-NEXT: [[TMP74:%.*]] = udiv i32 [[TMP73]], [[D]] +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <32 x i32> [[TMP71]], i32 [[TMP74]], i32 14 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE28]] +; CHECK: pred.udiv.continue28: +; CHECK-NEXT: [[TMP76:%.*]] = phi <32 x i32> [ [[TMP71]], [[PRED_UDIV_CONTINUE26]] ], [ [[TMP75]], [[PRED_UDIV_IF27]] ] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i1> [[TMP0]], i32 15 +; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_UDIV_IF29:%.*]], label [[PRED_UDIV_CONTINUE30:%.*]] +; CHECK: pred.udiv.if29: +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <32 x i32> [[TMP1]], i32 15 +; CHECK-NEXT: [[TMP79:%.*]] = udiv i32 [[TMP78]], [[D]] +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <32 x i32> [[TMP76]], i32 [[TMP79]], i32 15 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE30]] +; CHECK: pred.udiv.continue30: +; CHECK-NEXT: [[TMP81:%.*]] = phi <32 x i32> [ [[TMP76]], [[PRED_UDIV_CONTINUE28]] ], [ [[TMP80]], [[PRED_UDIV_IF29]] ] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <32 x i1> [[TMP0]], i32 16 +; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_UDIV_IF31:%.*]], label [[PRED_UDIV_CONTINUE32:%.*]] +; CHECK: pred.udiv.if31: +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <32 x i32> [[TMP1]], i32 16 +; CHECK-NEXT: [[TMP84:%.*]] = udiv i32 [[TMP83]], [[D]] +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <32 x i32> [[TMP81]], i32 [[TMP84]], i32 16 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE32]] +; CHECK: pred.udiv.continue32: +; CHECK-NEXT: [[TMP86:%.*]] = phi <32 x i32> [ [[TMP81]], [[PRED_UDIV_CONTINUE30]] ], [ [[TMP85]], [[PRED_UDIV_IF31]] ] +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <32 x i1> [[TMP0]], i32 17 +; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_UDIV_IF33:%.*]], label [[PRED_UDIV_CONTINUE34:%.*]] +; CHECK: pred.udiv.if33: +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP1]], i32 17 +; CHECK-NEXT: [[TMP89:%.*]] = udiv i32 [[TMP88]], [[D]] +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <32 x i32> [[TMP86]], i32 [[TMP89]], i32 17 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE34]] +; CHECK: pred.udiv.continue34: +; CHECK-NEXT: [[TMP91:%.*]] = phi <32 x i32> [ [[TMP86]], [[PRED_UDIV_CONTINUE32]] ], [ [[TMP90]], [[PRED_UDIV_IF33]] ] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <32 x i1> [[TMP0]], i32 18 +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_UDIV_IF35:%.*]], label [[PRED_UDIV_CONTINUE36:%.*]] +; CHECK: pred.udiv.if35: +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <32 x i32> [[TMP1]], i32 18 +; CHECK-NEXT: [[TMP94:%.*]] = udiv i32 [[TMP93]], [[D]] +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <32 x i32> [[TMP91]], i32 [[TMP94]], i32 18 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE36]] +; CHECK: pred.udiv.continue36: +; CHECK-NEXT: [[TMP96:%.*]] = phi <32 x i32> [ [[TMP91]], [[PRED_UDIV_CONTINUE34]] ], [ [[TMP95]], [[PRED_UDIV_IF35]] ] +; CHECK-NEXT: [[TMP97:%.*]] = extractelement <32 x i1> [[TMP0]], i32 19 +; CHECK-NEXT: br i1 [[TMP97]], label [[PRED_UDIV_IF37:%.*]], label [[PRED_UDIV_CONTINUE38:%.*]] +; CHECK: pred.udiv.if37: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <32 x i32> [[TMP1]], i32 19 +; CHECK-NEXT: [[TMP99:%.*]] = udiv i32 [[TMP98]], [[D]] +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <32 x i32> [[TMP96]], i32 [[TMP99]], i32 19 +; CHECK-NEXT: br label 
[[PRED_UDIV_CONTINUE38]] +; CHECK: pred.udiv.continue38: +; CHECK-NEXT: [[TMP101:%.*]] = phi <32 x i32> [ [[TMP96]], [[PRED_UDIV_CONTINUE36]] ], [ [[TMP100]], [[PRED_UDIV_IF37]] ] +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <32 x i1> [[TMP0]], i32 20 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_UDIV_IF39:%.*]], label [[PRED_UDIV_CONTINUE40:%.*]] +; CHECK: pred.udiv.if39: +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <32 x i32> [[TMP1]], i32 20 +; CHECK-NEXT: [[TMP104:%.*]] = udiv i32 [[TMP103]], [[D]] +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <32 x i32> [[TMP101]], i32 [[TMP104]], i32 20 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE40]] +; CHECK: pred.udiv.continue40: +; CHECK-NEXT: [[TMP106:%.*]] = phi <32 x i32> [ [[TMP101]], [[PRED_UDIV_CONTINUE38]] ], [ [[TMP105]], [[PRED_UDIV_IF39]] ] +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <32 x i1> [[TMP0]], i32 21 +; CHECK-NEXT: br i1 [[TMP107]], label [[PRED_UDIV_IF41:%.*]], label [[PRED_UDIV_CONTINUE42:%.*]] +; CHECK: pred.udiv.if41: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <32 x i32> [[TMP1]], i32 21 +; CHECK-NEXT: [[TMP109:%.*]] = udiv i32 [[TMP108]], [[D]] +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <32 x i32> [[TMP106]], i32 [[TMP109]], i32 21 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE42]] +; CHECK: pred.udiv.continue42: +; CHECK-NEXT: [[TMP111:%.*]] = phi <32 x i32> [ [[TMP106]], [[PRED_UDIV_CONTINUE40]] ], [ [[TMP110]], [[PRED_UDIV_IF41]] ] +; CHECK-NEXT: [[TMP112:%.*]] = extractelement <32 x i1> [[TMP0]], i32 22 +; CHECK-NEXT: br i1 [[TMP112]], label [[PRED_UDIV_IF43:%.*]], label [[PRED_UDIV_CONTINUE44:%.*]] +; CHECK: pred.udiv.if43: +; CHECK-NEXT: [[TMP113:%.*]] = extractelement <32 x i32> [[TMP1]], i32 22 +; CHECK-NEXT: [[TMP114:%.*]] = udiv i32 [[TMP113]], [[D]] +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <32 x i32> [[TMP111]], i32 [[TMP114]], i32 22 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE44]] +; CHECK: pred.udiv.continue44: +; CHECK-NEXT: [[TMP116:%.*]] = phi <32 x i32> [ [[TMP111]], [[PRED_UDIV_CONTINUE42]] ], [ [[TMP115]], [[PRED_UDIV_IF43]] ] +; CHECK-NEXT: [[TMP117:%.*]] = extractelement <32 x i1> [[TMP0]], i32 23 +; CHECK-NEXT: br i1 [[TMP117]], label [[PRED_UDIV_IF45:%.*]], label [[PRED_UDIV_CONTINUE46:%.*]] +; CHECK: pred.udiv.if45: +; CHECK-NEXT: [[TMP118:%.*]] = extractelement <32 x i32> [[TMP1]], i32 23 +; CHECK-NEXT: [[TMP119:%.*]] = udiv i32 [[TMP118]], [[D]] +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <32 x i32> [[TMP116]], i32 [[TMP119]], i32 23 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE46]] +; CHECK: pred.udiv.continue46: +; CHECK-NEXT: [[TMP121:%.*]] = phi <32 x i32> [ [[TMP116]], [[PRED_UDIV_CONTINUE44]] ], [ [[TMP120]], [[PRED_UDIV_IF45]] ] +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <32 x i1> [[TMP0]], i32 24 +; CHECK-NEXT: br i1 [[TMP122]], label [[PRED_UDIV_IF47:%.*]], label [[PRED_UDIV_CONTINUE48:%.*]] +; CHECK: pred.udiv.if47: +; CHECK-NEXT: [[TMP123:%.*]] = extractelement <32 x i32> [[TMP1]], i32 24 +; CHECK-NEXT: [[TMP124:%.*]] = udiv i32 [[TMP123]], [[D]] +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <32 x i32> [[TMP121]], i32 [[TMP124]], i32 24 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE48]] +; CHECK: pred.udiv.continue48: +; CHECK-NEXT: [[TMP126:%.*]] = phi <32 x i32> [ [[TMP121]], [[PRED_UDIV_CONTINUE46]] ], [ [[TMP125]], [[PRED_UDIV_IF47]] ] +; CHECK-NEXT: [[TMP127:%.*]] = extractelement <32 x i1> [[TMP0]], i32 25 +; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_UDIV_IF49:%.*]], label [[PRED_UDIV_CONTINUE50:%.*]] +; CHECK: pred.udiv.if49: +; CHECK-NEXT: [[TMP128:%.*]] = 
extractelement <32 x i32> [[TMP1]], i32 25 +; CHECK-NEXT: [[TMP129:%.*]] = udiv i32 [[TMP128]], [[D]] +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <32 x i32> [[TMP126]], i32 [[TMP129]], i32 25 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE50]] +; CHECK: pred.udiv.continue50: +; CHECK-NEXT: [[TMP131:%.*]] = phi <32 x i32> [ [[TMP126]], [[PRED_UDIV_CONTINUE48]] ], [ [[TMP130]], [[PRED_UDIV_IF49]] ] +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <32 x i1> [[TMP0]], i32 26 +; CHECK-NEXT: br i1 [[TMP132]], label [[PRED_UDIV_IF51:%.*]], label [[PRED_UDIV_CONTINUE52:%.*]] +; CHECK: pred.udiv.if51: +; CHECK-NEXT: [[TMP133:%.*]] = extractelement <32 x i32> [[TMP1]], i32 26 +; CHECK-NEXT: [[TMP134:%.*]] = udiv i32 [[TMP133]], [[D]] +; CHECK-NEXT: [[TMP135:%.*]] = insertelement <32 x i32> [[TMP131]], i32 [[TMP134]], i32 26 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE52]] +; CHECK: pred.udiv.continue52: +; CHECK-NEXT: [[TMP136:%.*]] = phi <32 x i32> [ [[TMP131]], [[PRED_UDIV_CONTINUE50]] ], [ [[TMP135]], [[PRED_UDIV_IF51]] ] +; CHECK-NEXT: [[TMP137:%.*]] = extractelement <32 x i1> [[TMP0]], i32 27 +; CHECK-NEXT: br i1 [[TMP137]], label [[PRED_UDIV_IF53:%.*]], label [[PRED_UDIV_CONTINUE54:%.*]] +; CHECK: pred.udiv.if53: +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <32 x i32> [[TMP1]], i32 27 +; CHECK-NEXT: [[TMP139:%.*]] = udiv i32 [[TMP138]], [[D]] +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <32 x i32> [[TMP136]], i32 [[TMP139]], i32 27 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE54]] +; CHECK: pred.udiv.continue54: +; CHECK-NEXT: [[TMP141:%.*]] = phi <32 x i32> [ [[TMP136]], [[PRED_UDIV_CONTINUE52]] ], [ [[TMP140]], [[PRED_UDIV_IF53]] ] +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <32 x i1> [[TMP0]], i32 28 +; CHECK-NEXT: br i1 [[TMP142]], label [[PRED_UDIV_IF55:%.*]], label [[PRED_UDIV_CONTINUE56:%.*]] +; CHECK: pred.udiv.if55: +; CHECK-NEXT: [[TMP143:%.*]] = extractelement <32 x i32> [[TMP1]], i32 28 +; CHECK-NEXT: [[TMP144:%.*]] = udiv i32 [[TMP143]], [[D]] +; CHECK-NEXT: [[TMP145:%.*]] = insertelement <32 x i32> [[TMP141]], i32 [[TMP144]], i32 28 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE56]] +; CHECK: pred.udiv.continue56: +; CHECK-NEXT: [[TMP146:%.*]] = phi <32 x i32> [ [[TMP141]], [[PRED_UDIV_CONTINUE54]] ], [ [[TMP145]], [[PRED_UDIV_IF55]] ] +; CHECK-NEXT: [[TMP147:%.*]] = extractelement <32 x i1> [[TMP0]], i32 29 +; CHECK-NEXT: br i1 [[TMP147]], label [[PRED_UDIV_IF57:%.*]], label [[PRED_UDIV_CONTINUE58:%.*]] +; CHECK: pred.udiv.if57: +; CHECK-NEXT: [[TMP148:%.*]] = extractelement <32 x i32> [[TMP1]], i32 29 +; CHECK-NEXT: [[TMP149:%.*]] = udiv i32 [[TMP148]], [[D]] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <32 x i32> [[TMP146]], i32 [[TMP149]], i32 29 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE58]] +; CHECK: pred.udiv.continue58: +; CHECK-NEXT: [[TMP151:%.*]] = phi <32 x i32> [ [[TMP146]], [[PRED_UDIV_CONTINUE56]] ], [ [[TMP150]], [[PRED_UDIV_IF57]] ] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <32 x i1> [[TMP0]], i32 30 +; CHECK-NEXT: br i1 [[TMP152]], label [[PRED_UDIV_IF59:%.*]], label [[PRED_UDIV_CONTINUE60:%.*]] +; CHECK: pred.udiv.if59: +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <32 x i32> [[TMP1]], i32 30 +; CHECK-NEXT: [[TMP154:%.*]] = udiv i32 [[TMP153]], [[D]] +; CHECK-NEXT: [[TMP155:%.*]] = insertelement <32 x i32> [[TMP151]], i32 [[TMP154]], i32 30 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE60]] +; CHECK: pred.udiv.continue60: +; CHECK-NEXT: [[TMP156:%.*]] = phi <32 x i32> [ [[TMP151]], [[PRED_UDIV_CONTINUE58]] ], [ [[TMP155]], [[PRED_UDIV_IF59]] ] +; 
CHECK-NEXT: [[TMP157:%.*]] = extractelement <32 x i1> [[TMP0]], i32 31 +; CHECK-NEXT: br i1 [[TMP157]], label [[PRED_UDIV_IF61:%.*]], label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.if61: +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <32 x i32> [[TMP1]], i32 31 +; CHECK-NEXT: [[TMP159:%.*]] = udiv i32 [[TMP158]], [[D]] +; CHECK-NEXT: [[TMP160:%.*]] = insertelement <32 x i32> [[TMP156]], i32 [[TMP159]], i32 31 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.continue62: +; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] +; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[BROADCAST_SPLAT]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) +; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i32 31 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT63]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP165:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT64]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT65:%.*]] = insertelement <8 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT66:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT65]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT66]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX67:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT86:%.*]], [[PRED_UDIV_CONTINUE84:%.*]] ] +; CHECK-NEXT: [[VEC_IND68:%.*]] = phi <8 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT87:%.*]], [[PRED_UDIV_CONTINUE84]] ] +; CHECK-NEXT: [[TMP166:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[VEC_IND68]], <8 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP167:%.*]] = extractelement <8 x i1> [[TMP165]], i32 0 +; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF69:%.*]], label [[PRED_UDIV_CONTINUE70:%.*]] +; CHECK: pred.udiv.if69: +; CHECK-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP166]], i32 0 +; CHECK-NEXT: [[TMP169:%.*]] = udiv i32 [[TMP168]], [[D]] +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <8 x i32> poison, i32 [[TMP169]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE70]] +; CHECK: pred.udiv.continue70: +; CHECK-NEXT: [[TMP171:%.*]] = phi <8 x i32> [ poison, [[VEC_EPILOG_VECTOR_BODY]] ], [ [[TMP170]], [[PRED_UDIV_IF69]] ] +; CHECK-NEXT: [[TMP172:%.*]] = extractelement <8 x i1> [[TMP165]], i32 1 +; CHECK-NEXT: br i1 [[TMP172]], label [[PRED_UDIV_IF71:%.*]], label [[PRED_UDIV_CONTINUE72:%.*]] +; CHECK: pred.udiv.if71: +; CHECK-NEXT: [[TMP173:%.*]] =
extractelement <8 x i32> [[TMP166]], i32 1 +; CHECK-NEXT: [[TMP174:%.*]] = udiv i32 [[TMP173]], [[D]] +; CHECK-NEXT: [[TMP175:%.*]] = insertelement <8 x i32> [[TMP171]], i32 [[TMP174]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE72]] +; CHECK: pred.udiv.continue72: +; CHECK-NEXT: [[TMP176:%.*]] = phi <8 x i32> [ [[TMP171]], [[PRED_UDIV_CONTINUE70]] ], [ [[TMP175]], [[PRED_UDIV_IF71]] ] +; CHECK-NEXT: [[TMP177:%.*]] = extractelement <8 x i1> [[TMP165]], i32 2 +; CHECK-NEXT: br i1 [[TMP177]], label [[PRED_UDIV_IF73:%.*]], label [[PRED_UDIV_CONTINUE74:%.*]] +; CHECK: pred.udiv.if73: +; CHECK-NEXT: [[TMP178:%.*]] = extractelement <8 x i32> [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP179:%.*]] = udiv i32 [[TMP178]], [[D]] +; CHECK-NEXT: [[TMP180:%.*]] = insertelement <8 x i32> [[TMP176]], i32 [[TMP179]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE74]] +; CHECK: pred.udiv.continue74: +; CHECK-NEXT: [[TMP181:%.*]] = phi <8 x i32> [ [[TMP176]], [[PRED_UDIV_CONTINUE72]] ], [ [[TMP180]], [[PRED_UDIV_IF73]] ] +; CHECK-NEXT: [[TMP182:%.*]] = extractelement <8 x i1> [[TMP165]], i32 3 +; CHECK-NEXT: br i1 [[TMP182]], label [[PRED_UDIV_IF75:%.*]], label [[PRED_UDIV_CONTINUE76:%.*]] +; CHECK: pred.udiv.if75: +; CHECK-NEXT: [[TMP183:%.*]] = extractelement <8 x i32> [[TMP166]], i32 3 +; CHECK-NEXT: [[TMP184:%.*]] = udiv i32 [[TMP183]], [[D]] +; CHECK-NEXT: [[TMP185:%.*]] = insertelement <8 x i32> [[TMP181]], i32 [[TMP184]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE76]] +; CHECK: pred.udiv.continue76: +; CHECK-NEXT: [[TMP186:%.*]] = phi <8 x i32> [ [[TMP181]], [[PRED_UDIV_CONTINUE74]] ], [ [[TMP185]], [[PRED_UDIV_IF75]] ] +; CHECK-NEXT: [[TMP187:%.*]] = extractelement <8 x i1> [[TMP165]], i32 4 +; CHECK-NEXT: br i1 [[TMP187]], label [[PRED_UDIV_IF77:%.*]], label [[PRED_UDIV_CONTINUE78:%.*]] +; CHECK: pred.udiv.if77: +; CHECK-NEXT: [[TMP188:%.*]] = extractelement <8 x i32> [[TMP166]], i32 4 +; CHECK-NEXT: [[TMP189:%.*]] = udiv i32 [[TMP188]], [[D]] +; CHECK-NEXT: [[TMP190:%.*]] = insertelement <8 x i32> [[TMP186]], i32 [[TMP189]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE78]] +; CHECK: pred.udiv.continue78: +; CHECK-NEXT: [[TMP191:%.*]] = phi <8 x i32> [ [[TMP186]], [[PRED_UDIV_CONTINUE76]] ], [ [[TMP190]], [[PRED_UDIV_IF77]] ] +; CHECK-NEXT: [[TMP192:%.*]] = extractelement <8 x i1> [[TMP165]], i32 5 +; CHECK-NEXT: br i1 [[TMP192]], label [[PRED_UDIV_IF79:%.*]], label [[PRED_UDIV_CONTINUE80:%.*]] +; CHECK: pred.udiv.if79: +; CHECK-NEXT: [[TMP193:%.*]] = extractelement <8 x i32> [[TMP166]], i32 5 +; CHECK-NEXT: [[TMP194:%.*]] = udiv i32 [[TMP193]], [[D]] +; CHECK-NEXT: [[TMP195:%.*]] = insertelement <8 x i32> [[TMP191]], i32 [[TMP194]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE80]] +; CHECK: pred.udiv.continue80: +; CHECK-NEXT: [[TMP196:%.*]] = phi <8 x i32> [ [[TMP191]], [[PRED_UDIV_CONTINUE78]] ], [ [[TMP195]], [[PRED_UDIV_IF79]] ] +; CHECK-NEXT: [[TMP197:%.*]] = extractelement <8 x i1> [[TMP165]], i32 6 +; CHECK-NEXT: br i1 [[TMP197]], label [[PRED_UDIV_IF81:%.*]], label [[PRED_UDIV_CONTINUE82:%.*]] +; CHECK: pred.udiv.if81: +; CHECK-NEXT: [[TMP198:%.*]] = extractelement <8 x i32> [[TMP166]], i32 6 +; CHECK-NEXT: [[TMP199:%.*]] = udiv i32 [[TMP198]], [[D]] +; CHECK-NEXT: [[TMP200:%.*]] = insertelement <8 x i32> [[TMP196]], i32 [[TMP199]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE82]] +; CHECK: pred.udiv.continue82: +; CHECK-NEXT: [[TMP201:%.*]] = phi <8 x i32> [ [[TMP196]], [[PRED_UDIV_CONTINUE80]] ], [ [[TMP200]], [[PRED_UDIV_IF81]] ] +; CHECK-NEXT: 
[[TMP202:%.*]] = extractelement <8 x i1> [[TMP165]], i32 7 +; CHECK-NEXT: br i1 [[TMP202]], label [[PRED_UDIV_IF83:%.*]], label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.if83: +; CHECK-NEXT: [[TMP203:%.*]] = extractelement <8 x i32> [[TMP166]], i32 7 +; CHECK-NEXT: [[TMP204:%.*]] = udiv i32 [[TMP203]], [[D]] +; CHECK-NEXT: [[TMP205:%.*]] = insertelement <8 x i32> [[TMP201]], i32 [[TMP204]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.continue84: +; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] +; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> +; CHECK-NEXT: [[PREDPHI85:%.*]] = select <8 x i1> [[BROADCAST_SPLAT64]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] +; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) +; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 +; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7 +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL88:%.*]] = phi i32 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL88]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @llvm.usub.sat.i32(i32 [[IV]], i32 1) +; CHECK-NEXT: [[UDIV:%.*]] = udiv i32 [[CALL]], [[D]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[UDIV]] to i64 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[MERGE_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + %call = tail call i32 @llvm.usub.sat.i32(i32 %iv, i32 1) + %udiv = udiv i32 %call, %d + %zext = zext i32 %udiv to i64 + br label %loop.latch + +loop.latch: + %merge = phi i64 [ %zext, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i64 %merge +} + +attributes #0 = { "target-cpu"="znver4" } +attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" } +attributes #2 = { "target-cpu"="znver3" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index e398586df348f..ef12b54fc3002 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -335,7 +335,7 @@ define void @multi_exit(ptr 
%dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] ; CHECK-NEXT: [[UMIN10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN10]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 46 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 24 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) @@ -542,356 +542,6 @@ exit: ret i1 %any.of.next } -define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { -; CHECK-LABEL: @avx512_cond_load_cost( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]] -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]] -; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2 -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]] -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]] -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ] -; CHECK-NEXT: ret i64 [[RES_LCSSA]] -; -entry: - br label %loop.header - -loop.header: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] - %c.1 = icmp slt i32 %iv, 0 - br i1 %c.1, label %if.then, label %loop.latch - -if.then: - %1 = urem i32 %a, %c - %mul = sub i32 0, %1 - %div = udiv i32 %c, %d - %or = or i32 %div, %mul - %ext = sext i32 %or to i64 - %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2 - %l = load i64, ptr %gep, align 8 - %or.2 = or i64 %l, %b - br label %loop.latch - -loop.latch: - %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ] - %iv.next = add i32 %iv, 1 - %ec = icmp ult i32 %iv, %c - br i1 %ec, label %loop.header, label %exit - -exit: - ret i64 %res -} - -define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { -; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] -; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8 -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] -; CHECK: pred.store.if8: -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] -; CHECK: pred.store.if10: -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] -; CHECK: pred.store.continue11: -;
CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] -; CHECK: pred.store.if12: -; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] -; CHECK: pred.store.continue13: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 -; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] -; CHECK: pred.store.if14: -; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] -; CHECK: pred.store.continue15: -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 -; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] -; CHECK: pred.store.if16: -; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] -; CHECK: pred.store.continue17: -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] -; CHECK: pred.store.if18: -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] -; CHECK: pred.store.continue19: -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] -; CHECK: pred.store.if20: -; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] -; CHECK: pred.store.continue21: -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; CHECK: pred.store.if22: -; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] -; CHECK: pred.store.continue23: -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; CHECK: pred.store.if24: -; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr 
[[A]], i64 [[TMP58]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] -; CHECK: pred.store.continue25: -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 -; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] -; CHECK: pred.store.if26: -; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.continue27: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] -; CHECK: pred.store.if28: -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] -; CHECK: pred.store.continue29: -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] -; CHECK: pred.store.if30: -; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] -; CHECK: pred.store.continue31: -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] -; CHECK: pred.store.if32: -; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] -; CHECK: pred.store.continue33: -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] -; CHECK: pred.store.if34: -; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] -; CHECK: pred.store.continue35: -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.if36: -; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.continue37: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 -; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] -; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] -; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]] -; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 -; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 -; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] -; CHECK: pred.store.if43: -; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 -; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 -; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] -; CHECK: pred.store.continue44: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] -; CHECK: pred.store.if45: -; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 -; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] -; CHECK: pred.store.continue46: -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; CHECK: pred.store.if47: -; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] -; CHECK: pred.store.continue48: -; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3 -; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.if49: -; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 -; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]],
align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.continue50: -; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 -; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] -; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 -; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] -; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 -; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00 -; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: if.then: -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %iv.shl = shl nsw i64 %iv, 2 - %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl - %l = load double, ptr %gep.0, align 8 - %c = fcmp oeq double %l, 0.000000e+00 - br i1 %c, label %if.then, label %loop.latch - -if.then: - %gep.1 = getelementptr double, ptr %A, i64 %iv.shl - store double 0.000000e+00, ptr %gep.1, align 8 - br label %loop.latch - -loop.latch: - %iv.next = add nsw i64 %iv, 1 - %ec = icmp eq i64 %iv, %N - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} - define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-LABEL: @cost_assume( ; CHECK-NEXT: entry: @@ -926,7 +576,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP9]], [[BIN_RDX]] @@ -947,7 +597,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP12]], [[LOOP]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] @@ 
-991,7 +641,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1015,7 +665,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1060,7 +710,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 -; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]]) ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -1074,7 +724,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[RED_MUL]] = mul nsw i64 [[RED]], [[P_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] @@ -1121,7 +771,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1138,7 +788,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[RED_MUL]] = mul i64 [[SHL]], [[RED]] ; CHECK-NEXT: [[IV_NEXT_I_I_I]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_MUL_LCSSA:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ 
[[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RED_MUL_LCSSA]] @@ -1179,7 +829,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 @@ -1195,7 +845,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 16 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP1]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[OR_LCSSA]] @@ -1268,7 +918,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]] @@ -1303,7 +953,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]] @@ -1320,7 +970,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2 ; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]] @@ -1342,72 +992,6 @@ exit: ret i32 %select.next } -; Test for https://github.com/llvm/llvm-project/issues/129236. 
-define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { -; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 -; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 -; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] -; CHECK: then: -; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] -; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 -; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 -; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: else: -; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]] -; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1 -; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]] -; CHECK: loop.latch: -; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ] -; CHECK-NEXT: ret i32 [[P_2_LCSSA]] -; -entry: - %cmp.i = icmp eq i16 0, 0 - %conv.i = sext i16 0 to i32 - %conv5.i = sext i8 %a to i32 - br label %loop.header - -loop.header: - %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ] - br i1 %cmp.i, label %then, label %else - -then: - %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ] - %shr.i = ashr i32 %conv5.i, %p.1 - %tobool6.not.i = icmp eq i32 %shr.i, 0 - %sext.i = shl i32 %p.1, 24 - %2 = ashr exact i32 %sext.i, 24 - %3 = select i1 %tobool6.not.i, i32 %2, i32 0 - br label %loop.latch - -else: - %rem.i = urem i32 -1, %conv.i - %cmp3.i = icmp sgt i32 %rem.i, 1 - br i1 %cmp3.i, label %loop.latch, label %then - -loop.latch: - %p.2 = phi i32 [ 0, %else ], [ %3, %then ] - %iv.next = add i8 %iv, -1 - %ec = icmp eq i8 %iv.next, 0 - br i1 %ec, label %exit, label %loop.header - -exit: - ret i32 %p.2 -} - declare void @llvm.assume(i1 noundef) #0 attributes #0 = { "target-cpu"="penryn" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index 86fa5d6820416..c7a2bb4fa04d7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -368,28 +368,28 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 
[[TMP7]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3 -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x ptr> [[TMP33]], ptr [[TMP20]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> [[TMP34]], ptr [[TMP31]], i32 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP32]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP28]], <4 x ptr> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3 ; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4 @@ -403,7 +403,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index de6418066dea0..d6514871b82d5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -26,8 +26,16 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 @@ -35,7 +43,7 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll index 59317fa463709..fb6c116569a0b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -69,8 +69,12 @@ define i32 @main(ptr %ptr) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP28]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4 ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll new file mode 100644
index 0000000000000..7c1efe8c922c6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -mtriple=x86_64-linux-gnu -S %s | FileCheck --check-prefix=I64 %s +; RUN: opt -p loop-vectorize -mtriple=i386-pc-linux-gnu -S %s | FileCheck --check-prefix=I32 %s + + +define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { +; I64-LABEL: define void @test_store_initially_interleave( +; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I64-NEXT: [[ITER_CHECK:.*:]] +; I64-NEXT: [[TMP4:%.*]] = add i32 [[N]], 1 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP4]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; I64: [[VECTOR_SCEVCHECK]]: +; I64-NEXT: [[TMP1:%.*]] = icmp slt i32 [[N]], 0 +; I64-NEXT: br i1 [[TMP1]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I64: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I64-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP4]], 16 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 16 +; I64-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I64-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 16, i32 [[N_MOD_VF]] +; I64-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP4]], [[TMP3]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; I64-NEXT: [[IV:%.*]] = add i32 [[INDEX]], 0 +; I64-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; I64-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 2 +; I64-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 3 +; I64-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 4 +; I64-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 5 +; I64-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 6 +; I64-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 7 +; I64-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 8 +; I64-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 9 +; I64-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 10 +; I64-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 11 +; I64-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 12 +; I64-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 13 +; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14 +; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15 +; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] +; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I64-NEXT: [[TMP27:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32
[[TMP7]] +; I64-NEXT: [[TMP28:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I64-NEXT: [[TMP29:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I64-NEXT: [[TMP30:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I64-NEXT: [[TMP31:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I64-NEXT: [[TMP32:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I64-NEXT: [[TMP33:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I64-NEXT: [[TMP34:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I64-NEXT: [[TMP35:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] +; I64-NEXT: [[TMP36:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] +; I64-NEXT: [[TMP37:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] +; I64-NEXT: [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]] +; I64-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP19]] +; I64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADD_PTR_I]], align 4 +; I64-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP25]], align 4 +; I64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP26]], align 4 +; I64-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP27]], align 4 +; I64-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP28]], align 4 +; I64-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP29]], align 4 +; I64-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP30]], align 4 +; I64-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP31]], align 4 +; I64-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP32]], align 4 +; I64-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP33]], align 4 +; I64-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP34]], align 4 +; I64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP35]], align 4 +; I64-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP36]], align 4 +; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4 +; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 +; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 +; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 +; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 +; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 +; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4 +; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 +; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4 +; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 +; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4 +; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 +; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4 +; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 +; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4 +; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 +; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4 +; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 +; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 +; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 +; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 +; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 +; I64-NEXT: store double [[TMP66]], ptr 
[[TMP50]], align 4 +; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 +; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 +; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 +; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 +; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 +; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4 +; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; I64-NEXT: [[TMP72:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I64: [[VEC_EPILOG_ITER_CHECK]]: +; I64-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP4]], [[N_VEC]] +; I64-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[N_VEC_REMAINING]], 4 +; I64-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; I64: [[VEC_EPILOG_PH]]: +; I64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I64-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP4]], 4 +; I64-NEXT: [[TMP73:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I64-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 4, i32 [[N_MOD_VF2]] +; I64-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP4]], [[TMP74]] +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I64-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; I64-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I64: [[VEC_EPILOG_VECTOR_BODY]]: +; I64-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 0 +; I64-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 1 +; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2 +; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3 +; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] +; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 +; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0 +; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1 +; I64-NEXT: store
double [[TMP89]], ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2 +; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3 +; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4 +; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I64-NEXT: [[TMP92:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I64-NEXT: br i1 [[TMP92]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; I64: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I64: [[VEC_EPILOG_SCALAR_PH]]: +; +; I32-LABEL: define void @test_store_initially_interleave( +; I32-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I32-NEXT: [[ENTRY:.*:]] +; I32-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I32: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I32-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP0]], 16 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; I32-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 16, i32 [[N_MOD_VF]] +; I32-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP2]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; I32-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 +; I32-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 +; I32-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 2 +; I32-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 3 +; I32-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 4 +; I32-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 5 +; I32-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 6 +; I32-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 7 +; I32-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 8 +; I32-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 9 +; I32-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 10 +; I32-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 11 +; I32-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 12 +; I32-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 13 +; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14 +; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15 +; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] +; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr nusw { ptr, ptr,
ptr }, ptr null, i32 [[TMP6]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I32-NEXT: [[TMP20:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I32-NEXT: [[TMP21:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I32-NEXT: [[TMP22:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I32-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I32-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I32-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I32-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I32-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP40]] +; I32-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP41]] +; I32-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP42]] +; I32-NEXT: [[TMP71:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP43]] +; I32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 4 +; I32-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 4 +; I32-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 4 +; I32-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP18]], align 4 +; I32-NEXT: [[TMP27:%.*]] = load ptr, ptr [[TMP19]], align 4 +; I32-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP20]], align 4 +; I32-NEXT: [[TMP29:%.*]] = load ptr, ptr [[TMP21]], align 4 +; I32-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP22]], align 4 +; I32-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP56]], align 4 +; I32-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP57]], align 4 +; I32-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP58]], align 4 +; I32-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP59]], align 4 +; I32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP60]], align 4 +; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4 +; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4 +; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4 +; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 +; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 +; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 +; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4 +; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 +; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 +; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 +; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 +; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 +; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 +; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 +; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 +; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 +; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 +; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 +; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 +; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 +; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 +; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 +; I32-NEXT: store double 
[[TMP65]], ptr [[TMP49]], align 4 +; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 +; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 +; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 +; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 +; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 +; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 +; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; I32-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I32: [[VEC_EPILOG_ITER_CHECK]]: +; I32-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP0]], [[N_VEC]] +; I32-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[N_VEC_REMAINING]], 4 +; I32-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; I32: [[VEC_EPILOG_PH]]: +; I32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I32-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP0]], 4 +; I32-NEXT: [[TMP72:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I32-NEXT: [[TMP73:%.*]] = select i1 [[TMP72]], i32 4, i32 [[N_MOD_VF2]] +; I32-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP0]], [[TMP73]] +; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I32-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; I32-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I32: [[VEC_EPILOG_VECTOR_BODY]]: +; I32-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[TMP74:%.*]] = add i32 [[INDEX4]], 0 +; I32-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 1 +; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2 +; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3 +; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4 +; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0 +; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1 +; 
I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2 +; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3 +; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I32-NEXT: [[TMP91:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I32-NEXT: br i1 [[TMP91]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; I32: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I32: [[VEC_EPILOG_SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %conv = uitofp i32 %iv to double + %add.ptr.i = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 %iv + %0 = load ptr, ptr %add.ptr.i, align 4 + store double %conv, ptr %0, align 4 + %inc = add i32 %iv, 1 + %ec = icmp eq i32 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) #0 { +; I64-LABEL: define void @test_store_loaded_value( +; I64-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I64-NEXT: [[BB:.*:]] +; I64-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I64-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I64: [[PH]]: +; I64-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I64-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I64-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I64-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I64-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I64-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I64-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I64-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I64-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I64-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I64-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I64-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I64-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I64-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I64-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I64-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I64-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I64-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; I64: [[SCALAR_PH]]: +; +; I32-LABEL: define void @test_store_loaded_value( +; I32-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I32-NEXT: [[BB:.*:]] +; I32-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I32-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I32: [[PH]]: +; I32-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I32-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I32-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I32-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I32-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I32-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I32-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I32-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I32-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I32-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I32-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I32-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I32-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I32-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I32-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I32-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I32-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I32-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I32-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; I32: [[SCALAR_PH]]: +; +bb: + %pre = icmp slt i32 %n, 1 + br i1 %pre, label %exit, label %ph + +ph: + %n.ext = zext i32 %n to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv + %l = load double, ptr %gep.src, align 8 + %sext = shl i64 %iv, 1 + %gep.dst = getelementptr i8, ptr %dst, i64 %sext + store double %l, ptr %gep.dst, align 8 + %ec = icmp eq i64 
%iv.next, %n.ext + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +declare i1 @cond() + +define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %ptr.c) { +; I64-LABEL: define double @test_load_used_by_other_load_scev( +; I64-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I64-NEXT: [[ENTRY:.*]]: +; I64-NEXT: br label %[[OUTER_LOOP:.*]] +; I64: [[OUTER_LOOP_LOOPEXIT:.*]]: +; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], [[INNER_LOOP:%.*]] ], [ [[TMP29:%.*]], %[[MIDDLE_BLOCK:.*]] ] +; I64-NEXT: br label %[[OUTER_LOOP]] +; I64: [[OUTER_LOOP]]: +; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ] +; I64-NEXT: [[COND:%.*]] = call i1 @cond() +; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], [[EXIT:label %.*]] +; I64: [[INNER_LOOP_PREHEADER]]: +; I64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1 +; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1 +; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]] +; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]] +; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]] +; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]] +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8 +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer +; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8 +; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8 +; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8 +; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0 +; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1 +; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer +; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2> +; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer +; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer +; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00) +; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8 +; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8 +; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0 +; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1 +; 
I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]] +; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]] +; I64-NEXT: br label %[[MIDDLE_BLOCK]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1 +; I64-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; I64-NEXT: br i1 true, label %[[OUTER_LOOP_LOOPEXIT]], label %[[SCALAR_PH]] +; I64: [[SCALAR_PH]]: +; +; I32-LABEL: define double @test_load_used_by_other_load_scev( +; I32-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I32-NEXT: [[ENTRY:.*]]: +; I32-NEXT: br label %[[OUTER_LOOP:.*]] +; I32: [[OUTER_LOOP]]: +; I32-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ] +; I32-NEXT: [[COND:%.*]] = call i1 @cond() +; I32-NEXT: br i1 [[COND]], label %[[INNER_LOOP]], label %[[EXIT:.*]] +; I32: [[INNER_LOOP]]: +; I32-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[OUTER_LOOP]] ], [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[ACCUM]], %[[OUTER_LOOP]] ], [ [[MUL1:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[IDX_PLUS1:%.*]] = add i64 [[IV]], 1 +; I32-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[GEP_A_I64:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[LOAD_IDX:%.*]] = load i64, ptr [[GEP_A_I64]], align 8 +; I32-NEXT: [[GEP_B:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[LOAD_IDX]] +; I32-NEXT: [[LOAD_A:%.*]] = load double, ptr [[PTR_A]], align 8 +; I32-NEXT: [[ADD1:%.*]] = fadd double [[LOAD_A]], 0.000000e+00 +; I32-NEXT: [[GEP_C_OFFSET:%.*]] = getelementptr i8, ptr [[GEP_C]], i64 8 +; I32-NEXT: [[LOAD_C:%.*]] = load double, ptr [[GEP_C_OFFSET]], align 8 +; I32-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00 +; I32-NEXT: [[MUL2:%.*]] = fmul double [[LOAD_C]], 0.000000e+00 +; I32-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00 +; I32-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00 +; I32-NEXT: [[LOAD_B:%.*]] = load double, ptr [[GEP_B]], align 8 +; I32-NEXT: [[DIV:%.*]] = fdiv double [[LOAD_B]], [[ADD3]] +; I32-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]] +; I32-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; I32-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1 +; I32-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP]], label %[[INNER_LOOP]] +; I32: [[EXIT]]: +; I32-NEXT: ret double [[ACCUM]] +; +entry: + br label %outer.loop + +outer.loop: + %accum = phi double [ 0.0, %entry ], [ %result, %inner.loop ] + %cond = call i1 @cond() + br i1 %cond, label %inner.loop, label %exit + +inner.loop: + %iv = phi i64 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] + %accum.inner = phi double [ %accum, %outer.loop ], [ %mul1, %inner.loop ] + %idx.plus1 = add i64 %iv, 1 + %gep.c = getelementptr i8, ptr %ptr.c, i64 %idx.plus1 + %gep.a.i64 = getelementptr i64, ptr %ptr.a, i64 %idx.plus1 + %load.idx = load i64, ptr %gep.a.i64, align 8 + %gep.b = getelementptr double, ptr %ptr.b, i64 %load.idx + %load.a = load double, ptr %ptr.a, align 8 + %add1 = fadd double %load.a, 0.000000e+00 + %gep.c.offset = getelementptr i8, ptr %gep.c, i64 8 + %load.c = load double, ptr %gep.c.offset, align 8 + %mul1 = fmul double %add1, 0.000000e+00 + %mul2 = fmul double %load.c, 0.000000e+00 + %add2 = fadd double %mul2, 0.000000e+00 + %add3 = fadd double %add2, 1.000000e+00 + %load.b = load double, ptr %gep.b, align 8 + %div = fdiv double %load.b, %add3 + %result = fsub double %accum.inner, %div + %iv.next = 
add i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1 + br i1 %exitcond, label %outer.loop, label %inner.loop + +exit: + ret double %accum +} + +attributes #0 = { "target-cpu"="znver2" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll index 166875dd55aae..b612bfb88198e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -5,7 +5,7 @@ ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %shift = ashr i32 %val, %k ; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k> -define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 { +define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr { entry: br label %body @@ -21,5 +21,31 @@ body: exit: ret void +} + +; CHECK: 'shift_and_masked_load_store' +; CHECK: Cost of 1 for VF 2: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2> +; CHECK: Cost of 1 for VF 4: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2> +; CHECK: Cost of 4 for VF 8: WIDEN ir<%shifted> = lshr ir<%iv>, ir<2> +define void @shift_and_masked_load_store(i64 %trip.count) #0 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %shifted = lshr i64 %iv, 2 + %masked.idx = and i64 %shifted, 1 + %load.ptr = getelementptr i16, ptr poison, i64 %masked.idx + %val = load i16, ptr %load.ptr, align 2 + %store.idx = shl nuw i64 %iv, 2 + %store.ptr = getelementptr i8, ptr poison, i64 %store.idx + store i16 %val, ptr %store.ptr, align 2 + %iv.next = add i64 %iv, 1 + %cmp = icmp eq i64 %iv, %trip.count + br i1 %cmp, label %exit, label %loop +exit: + ret void } + +attributes #0 = { "target-features"="+avx2" "tune-cpu"="alderlake" } diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll index df926fa6d189c..b62d9445f2597 100644 --- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll @@ -117,7 +117,10 @@ define void @const_fold_select(ptr %dst, i64 %d) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[D]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[D]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index a02fddc4cf72d..10db620d83f0d 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -134,14 +134,18 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: 
[[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index c5d318141369b..2848a2ce3d878 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -136,18 +136,18 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -516,18 +516,18 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -708,18 +708,18 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 +; 
CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -900,18 +900,18 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1059,8 +1059,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp eq i32 [[A]], 3 -; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], 3 +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = xor i1 [[TMP1]], 
true ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF1IC4: [[VECTOR_BODY]]: ; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1068,10 +1068,10 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]] +; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP0]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index fb32e907c7d44..3084f9987262d 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -19,10 +19,10 @@ define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) { ; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8> ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3 ; CHECK-NEXT: store i8 [[TMP3]], ptr [[DST]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -85,13 +85,12 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> zeroinitializer, <4 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; CHECK-NEXT: store i16 
[[TMP4]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll index 3b442a9ab4d3c..ba450e6878fe6 100644 --- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll @@ -83,6 +83,8 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -90,8 +92,8 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]] ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0 @@ -166,6 +168,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -173,8 +177,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]] ; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PREDPHI]], align 2 ; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll index df8123d5fc2d0..67844faccaf0c 100644 --- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll +++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll @@ -26,7 +26,14 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 @@ -89,7 +96,14 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 732214aa1449e..0cf5c1a715538 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -905,7 +905,8 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> -; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1> +; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-last-element ir<%zext> +; CHECK-NEXT: CLONE store vp<[[EXT]]>, ir<%p1> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll 
b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 4a4bda254bf88..f2a128b05e3fb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -272,8 +272,9 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<%lv> = load ir<%A> -; CHECK-NEXT: WIDEN ir<%cmp> = icmp uge ir<%iv>, ir<%k> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, ir<%cmp> +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp> +; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -980,12 +981,13 @@ define void @sinking_requires_duplication(ptr %addr) { ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%pred> = fcmp une ir<%0>, ir<0.000000e+00> +; CHECK-NEXT: WIDEN ir<%pred> = fcmp oeq ir<%0>, ir<0.000000e+00> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%pred> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%pred> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: @@ -1150,12 +1152,13 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds vp<[[PTR_IV]]>, ir<-1> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%ptr.iv.next>, vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%c.1> = icmp ne ir<%l>, ir<0> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<0> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%c.1> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: