diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index ca0b7823558ff..cd2be3373f556 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -636,8 +636,12 @@ class ScalarEvolution {
   /// \p GEP The GEP. The indices contained in the GEP itself are ignored,
   /// instead we use IndexExprs.
   /// \p IndexExprs The expressions for the indices.
-  LLVM_ABI const SCEV *
-  getGEPExpr(GEPOperator *GEP, const SmallVectorImpl<const SCEV *> &IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(GEPOperator *GEP,
+                                  ArrayRef<const SCEV *> IndexExprs);
+  LLVM_ABI const SCEV *getGEPExpr(const SCEV *BaseExpr,
+                                  ArrayRef<const SCEV *> IndexExprs,
+                                  Type *SrcElementTy,
+                                  GEPNoWrapFlags NW = GEPNoWrapFlags::none());
   LLVM_ABI const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW);
   LLVM_ABI const SCEV *getMinMaxExpr(SCEVTypes Kind,
                                      SmallVectorImpl<const SCEV *> &Operands);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 98b793aace7a3..e4076084d5332 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -961,12 +961,10 @@ class TargetTransformInfo {
                             TTI::TargetCostKind CostKind,
                             bool ForPoisonSrc = true,
                             ArrayRef<Value *> VL = {}) const;

-  /// Estimate the overhead of scalarizing an instructions unique
-  /// non-constant operands. The (potentially vector) types to use for each of
-  /// argument are passes via Tys.
+  /// Estimate the overhead of scalarizing operands with the given types. The
+  /// (potentially vector) type to use for each argument is passed via Tys.
   LLVM_ABI InstructionCost getOperandsScalarizationOverhead(
-      ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-      TTI::TargetCostKind CostKind) const;
+      ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const;

   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index ddc8a5eaffa94..97c9e509f1df2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -459,8 +459,7 @@ class TargetTransformInfoImplBase {
   }

   virtual InstructionCost
-  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys,
+  getOperandsScalarizationOverhead(ArrayRef<Type *> Tys,
                                    TTI::TargetCostKind CostKind) const {
     return 0;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f6936b98bf3e4..91a2dd7eb612e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -347,6 +348,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return Cost;
   }

+  /// Filter out constant and duplicated entries in \p Ops and return a vector
+  /// containing the types from \p Tys corresponding to the remaining operands.
+  static SmallVector<Type *>
+  filterConstantAndDuplicatedOperands(ArrayRef<const Value *> Ops,
+                                      ArrayRef<Type *> Tys) {
+    SmallPtrSet<const Value *, 4> UniqueOperands;
+    SmallVector<Type *> FilteredTys;
+    for (const auto &[Op, Ty] : zip_equal(Ops, Tys)) {
+      if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second)
+        continue;
+      FilteredTys.push_back(Ty);
+    }
+    return FilteredTys;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -935,29 +951,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                     CostKind);
   }

-  /// Estimate the overhead of scalarizing an instructions unique
-  /// non-constant operands. The (potentially vector) types to use for each of
-  /// argument are passes via Tys.
+  /// Estimate the overhead of scalarizing an instruction's operands. The
+  /// (potentially vector) type to use for each argument is passed via Tys.
   InstructionCost getOperandsScalarizationOverhead(
-      ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-      TTI::TargetCostKind CostKind) const override {
-    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
-
+      ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const override {
     InstructionCost Cost = 0;
-    SmallPtrSet<const Value *, 4> UniqueOperands;
-    for (int I = 0, E = Args.size(); I != E; I++) {
+    for (Type *Ty : Tys) {
       // Disregard things like metadata arguments.
-      const Value *A = Args[I];
-      Type *Ty = Tys[I];
       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
           !Ty->isPtrOrPtrVectorTy())
         continue;

-      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
-        if (auto *VecTy = dyn_cast<VectorType>(Ty))
-          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
-                                           /*Extract*/ true, CostKind);
-      }
+      if (auto *VecTy = dyn_cast<VectorType>(Ty))
+        Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
+                                         /*Extract*/ true, CostKind);
     }

     return Cost;
@@ -974,7 +982,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost Cost = getScalarizationOverhead(
         RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
     if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
+      Cost += getOperandsScalarizationOverhead(
+          filterConstantAndDuplicatedOperands(Args, Tys), CostKind);
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
@@ -2158,8 +2167,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                                 /*Insert=*/true,
                                                 /*Extract=*/false, CostKind);
         }
       }
-      ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
+      ScalarizationCost += getOperandsScalarizationOverhead(
+          filterConstantAndDuplicatedOperands(Args, ICA.getArgTypes()),
+          CostKind);
     }

     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
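To make the filtering contract above concrete, here is a small standalone
C++ sketch of the same logic using std containers in place of the LLVM ADTs
(the enum, IDs, and unit sizes are illustrative assumptions, not part of the
patch): only the types of non-constant, first-occurrence operands survive
and reach the type-based overhead query.

    #include <cassert>
    #include <set>
    #include <utility>
    #include <vector>

    enum class Ty { Scalar, Vector };

    // Mirrors filterConstantAndDuplicatedOperands: keep Tys[I] only if
    // Ops[I] is neither a constant nor a duplicate of an earlier operand.
    std::vector<Ty> filterOps(const std::vector<std::pair<int, bool>> &Ops,
                              const std::vector<Ty> &Tys) {
      std::set<int> Unique;
      std::vector<Ty> Filtered;
      for (size_t I = 0; I < Ops.size(); ++I) {
        auto [Id, IsConstant] = Ops[I];
        if (IsConstant || !Unique.insert(Id).second)
          continue;
        Filtered.push_back(Tys[I]);
      }
      return Filtered;
    }

    int main() {
      // Operands: %a, %a again (duplicate), and a constant.
      std::vector<std::pair<int, bool>> Ops = {{1, false}, {1, false}, {2, true}};
      std::vector<Ty> Tys = {Ty::Vector, Ty::Vector, Ty::Vector};
      assert(filterOps(Ops, Tys).size() == 1); // only the first %a remains
    }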
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 239849e670350..53494e1aee1cc 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -3493,17 +3493,25 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
           /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
           // We can currently only fold X%N if X is constant.
           const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
-          if (StartC && !DivInt.urem(StepInt) &&
-              getZeroExtendExpr(AR, ExtTy) ==
-                  getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
-                                getZeroExtendExpr(Step, ExtTy),
-                                AR->getLoop(), SCEV::FlagAnyWrap)) {
+          if (StartC && !DivInt.urem(StepInt)) {
             const APInt &StartInt = StartC->getAPInt();
             const APInt &StartRem = StartInt.urem(StepInt);
-            if (StartRem != 0) {
-              const SCEV *NewLHS =
-                  getAddRecExpr(getConstant(StartInt - StartRem), Step,
-                                AR->getLoop(), SCEV::FlagNW);
+            bool NoWrap =
+                getZeroExtendExpr(AR, ExtTy) ==
+                getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+                              getZeroExtendExpr(Step, ExtTy), AR->getLoop(),
+                              SCEV::FlagAnyWrap);
+
+            // With N <= C and both N and C powers of 2, the transformation
+            // {X,+,N}/C => {(X - X%N),+,N}/C preserves division results even
+            // if wrapping occurs, as the division results remain equivalent
+            // for all offsets in [(X - X%N), X).
+            bool CanFoldWithWrap = StepInt.ule(DivInt) && // N <= C
+                                   StepInt.isPowerOf2() && DivInt.isPowerOf2();
+            if (StartRem != 0 && (NoWrap || CanFoldWithWrap)) {
+              const SCEV *NewLHS = getAddRecExpr(
+                  getConstant(StartInt - StartRem), Step, AR->getLoop(),
+                  NoWrap ? SCEV::FlagNW : SCEV::FlagAnyWrap);
               if (LHS != NewLHS) {
                 LHS = NewLHS;
@@ -3770,13 +3778,11 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   return getOrCreateAddRecExpr(Operands, L, Flags);
 }

-const SCEV *
-ScalarEvolution::getGEPExpr(GEPOperator *GEP,
-                            const SmallVectorImpl<const SCEV *> &IndexExprs) {
+const SCEV *ScalarEvolution::getGEPExpr(GEPOperator *GEP,
+                                        ArrayRef<const SCEV *> IndexExprs) {
   const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand());
   // getSCEV(Base)->getType() has the same address space as Base->getType()
   // because SCEV::getType() preserves the address space.
-  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   GEPNoWrapFlags NW = GEP->getNoWrapFlags();
   if (NW != GEPNoWrapFlags::none()) {
     // We'd like to propagate flags from the IR to the corresponding SCEV nodes,
@@ -3789,13 +3795,20 @@ ScalarEvolution::getGEPExpr,
       NW = GEPNoWrapFlags::none();
   }

+  return getGEPExpr(BaseExpr, IndexExprs, GEP->getSourceElementType(), NW);
+}
+
+const SCEV *ScalarEvolution::getGEPExpr(const SCEV *BaseExpr,
+                                        ArrayRef<const SCEV *> IndexExprs,
+                                        Type *SrcElementTy, GEPNoWrapFlags NW) {
   SCEV::NoWrapFlags OffsetWrap = SCEV::FlagAnyWrap;
   if (NW.hasNoUnsignedSignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNSW);
   if (NW.hasNoUnsignedWrap())
     OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNUW);

-  Type *CurTy = GEP->getType();
+  Type *CurTy = BaseExpr->getType();
+  Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
   bool FirstIter = true;
   SmallVector<const SCEV *, 4> Offsets;
   for (const SCEV *IndexExpr : IndexExprs) {
@@ -3814,7 +3827,7 @@ ScalarEvolution::getGEPExpr,
       if (FirstIter) {
         assert(isa<PointerType>(CurTy) &&
                "The first index of a GEP indexes a pointer");
-        CurTy = GEP->getSourceElementType();
+        CurTy = SrcElementTy;
         FirstIter = false;
       } else {
         CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8a470ebf85a16..ae7e738ea5a26 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -637,9 +637,8 @@ InstructionCost TargetTransformInfo::getScalarizationOverhead(
 }

 InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
-    TTI::TargetCostKind CostKind) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
+    ArrayRef<Type *> Tys, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Tys, CostKind);
 }

 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
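To make the wrap reasoning in the ScalarEvolution comment above concrete,
here is a small standalone C++ check (illustrative values; not part of the
patch). With step N = 2 and divisor C = 4, both powers of 2 and N <= C,
{X,+,N}/C and {(X - X%N),+,N}/C agree lane by lane even when the 8-bit
induction variable wraps:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t N = 2, C = 4, X = 255; // X % N = 1; the IV will wrap
      uint8_t A = X;                       // {X,+,N}
      uint8_t B = X - X % N;               // {(X - X%N),+,N}, here 254
      for (int I = 0; I < 300; ++I) {      // crosses the 8-bit wrap
        assert(A / C == B / C);            // division results stay equal
        A += N;
        B += N;
      }
      return 0;
    }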
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b5193f65593de..699f31f744342 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1152,7 +1152,10 @@ class LoopVectorizationCostModel {
   CallWideningDecision getCallWideningDecision(CallInst *CI,
                                                ElementCount VF) const {
     assert(!VF.isScalar() && "Expected vector VF");
-    return CallWideningDecisions.at({CI, VF});
+    auto I = CallWideningDecisions.find({CI, VF});
+    if (I == CallWideningDecisions.end())
+      return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+    return I->second;
   }

   /// Return True if instruction \p I is an optimizable truncate whose operand
@@ -1562,7 +1565,7 @@ class LoopVectorizationCostModel {
   /// A type representing the costs for instructions if they were to be
   /// scalarized rather than vectorized. The entries are Instruction-Cost
   /// pairs.
-  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
+  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

   /// A set containing all BasicBlocks that are known to present after
   /// vectorization as a predicated block.
@@ -1596,7 +1599,7 @@ class LoopVectorizationCostModel {
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
-  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
+  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

   /// Holds the instructions known to be uniform after vectorization.
   /// The data is collected per VF.
@@ -1665,7 +1668,9 @@ class LoopVectorizationCostModel {
     Instruction *I = dyn_cast<Instruction>(V);
     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
         TheLoop->isLoopInvariant(I) ||
-        getWideningDecision(I, VF) == CM_Scalarize)
+        getWideningDecision(I, VF) == CM_Scalarize ||
+        (isa<CallInst>(I) &&
+         getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
       return false;

     // Assume we can vectorize V (and hence we need extraction) if the
@@ -1680,8 +1685,16 @@ class LoopVectorizationCostModel {
   /// Returns a range containing only operands needing to be extracted.
   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                    ElementCount VF) const {
-    return SmallVector<Value *, 4>(make_filter_range(
-        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+    SmallPtrSet<Value *, 4> UniqueOperands;
+    SmallVector<Value *, 4> Res;
+    for (Value *Op : Ops) {
+      if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
+          !needsExtract(Op, VF))
+        continue;
+      Res.push_back(Op);
+    }
+    return Res;
   }

 public:
@@ -3037,10 +3050,9 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // likely.
     ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
   }

-  InstructionCost SafeDivisorCost = 0;
+  InstructionCost SafeDivisorCost = 0;
   auto *VecTy = toVectorTy(I->getType(), VF);
-
   // The cost of the select guard to ensure all lanes are well defined
   // after we speculate above any internal control flow.
   SafeDivisorCost +=
       TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
                              toVectorTy(Type::getInt1Ty(I->getContext()), VF),
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -3048,19 +3060,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,

-  // Certain instructions can be cheaper to vectorize if they have a constant
-  // second vector operand. One example of this are shifts on x86.
-  Value *Op2 = I->getOperand(1);
-  auto Op2Info = TTI.getOperandInfo(Op2);
-  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
-      Legal->isInvariant(Op2))
-    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-
   SmallVector<const Value *, 4> Operands(I->operand_values());
   SafeDivisorCost += TTI.getArithmeticInstrCost(
       I->getOpcode(), VecTy, CostKind,
       {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-      Op2Info, Operands, I);
+      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+      Operands, I);
   return {ScalarizationCost, SafeDivisorCost};
 }
@@ -3311,6 +3316,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
       if (!Ptr)
         continue;

+      // If the pointer can be proven to be uniform, always add it to the
+      // worklist.
+      if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
+        AddToWorklistIfAllowed(cast<Instruction>(Ptr));
+
       if (IsUniformMemOpUse(&I))
         AddToWorklistIfAllowed(&I);

@@ -4284,6 +4294,25 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
         if (!VPI)
           continue;
         switch (VPI->getOpcode()) {
+        // Selects are only modelled in the legacy cost model for safe
+        // divisors.
+        case Instruction::Select: {
+          VPValue *VPV = VPI->getVPSingleValue();
+          if (VPV->getNumUsers() == 1) {
+            if (auto *WR = dyn_cast<VPWidenRecipe>(*VPV->user_begin())) {
+              switch (WR->getOpcode()) {
+              case Instruction::UDiv:
+              case Instruction::SDiv:
+              case Instruction::URem:
+              case Instruction::SRem:
+                continue;
+              default:
+                break;
+              }
+            }
+          }
+          [[fallthrough]];
+        }
        case VPInstruction::ActiveLaneMask:
        case VPInstruction::ExplicitVectorLength:
          C += VPI->cost(VF, CostCtx);
@@ -4925,7 +4954,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
       if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
           !useEmulatedMaskMemRefHack(&I, VF) &&
           computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {
-        ScalarCostsVF.insert_range(ScalarCosts);
+        for (const auto &[I, IC] : ScalarCosts)
+          ScalarCostsVF.insert({I, IC});
         // Check if we decided to scalarize a call. If so, update the widening
         // decision of the call to CM_Scalarize with the computed scalar cost.
         for (const auto &[I, Cost] : ScalarCosts) {
@@ -5564,8 +5594,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
   SmallVector<Type *> Tys;
   for (auto *V : filterExtractingOperands(Ops, VF))
     Tys.push_back(maybeVectorizeType(V->getType(), VF));
-  return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), Tys, CostKind);
+  return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind);
 }

 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -5687,9 +5716,18 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       // If the instructions belongs to an interleave group, the whole group
       // receives the same decision. The whole group receives the cost, but
       // the cost will actually be assigned to one instruction.
-      if (const auto *Group = getInterleavedAccessGroup(&I))
-        setWideningDecision(Group, VF, Decision, Cost);
-      else
+      if (const auto *Group = getInterleavedAccessGroup(&I)) {
+        if (Decision == CM_Scalarize) {
+          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+            if (auto *I = Group->getMember(Idx)) {
+              setWideningDecision(I, VF, Decision,
+                                  getMemInstScalarizationCost(I, VF));
+            }
+          }
+        } else {
+          setWideningDecision(Group, VF, Decision, Cost);
+        }
+      } else
         setWideningDecision(&I, VF, Decision, Cost);
     }
   }
@@ -5725,6 +5763,20 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
           Worklist.push_back(InstOp);
       }

+  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
+    // If there are direct memory op users of the newly scalarized load,
+    // their cost may have changed because there's no scalarization
+    // overhead for the operand. Update it.
+    for (User *U : LI->users()) {
+      if (!isa<LoadInst, StoreInst>(U))
+        continue;
+      if (getWideningDecision(cast<Instruction>(U), VF) != CM_Scalarize)
+        continue;
+      setWideningDecision(
+          cast<Instruction>(U), VF, CM_Scalarize,
+          getMemInstScalarizationCost(cast<Instruction>(U), VF));
+    }
+  };
   for (auto *I : AddrDefs) {
     if (isa<LoadInst>(I)) {
       // Setting the desired widening decision should ideally be handled in
@@ -5732,20 +5784,26 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       // if the loaded register is involved in an address computation, it is
       // instead changed here when we know this is the case.
       InstWidening Decision = getWideningDecision(I, VF);
-      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
-        // Scalarize a widened load of address.
+      if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
+          (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
+           Decision == CM_Scalarize)) {
+        // Scalarize a widened load of address or update the cost of a scalar
+        // load of an address.
         setWideningDecision(
             I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
-      else if (const auto *Group = getInterleavedAccessGroup(I)) {
+        UpdateMemOpUserCost(cast<LoadInst>(I));
+      } else if (const auto *Group = getInterleavedAccessGroup(I)) {
         // Scalarize an interleave group of address loads.
         for (unsigned I = 0; I < Group->getFactor(); ++I) {
-          if (Instruction *Member = Group->getMember(I))
+          if (Instruction *Member = Group->getMember(I)) {
             setWideningDecision(
                 Member, VF, CM_Scalarize,
                 (VF.getKnownMinValue() *
                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
+            UpdateMemOpUserCost(cast<LoadInst>(Member));
+          }
         }
       }
     } else {
@@ -6912,6 +6970,28 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     return nullptr;
   };

+  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
+  // the select doesn't need to be considered for the vector loop cost; go with
+  // the more accurate VPlan-based cost model.
+  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
+    auto *VPI = dyn_cast<VPInstruction>(&R);
+    if (!VPI || VPI->getOpcode() != Instruction::Select ||
+        VPI->getNumUsers() != 1)
+      continue;
+
+    if (auto *WR = dyn_cast<VPWidenRecipe>(*VPI->user_begin())) {
+      switch (WR->getOpcode()) {
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return true;
+      default:
+        break;
+      }
+    }
+  }
+
   DenseSet<Instruction *> SeenInstrs;
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -6952,12 +7032,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       if (Instruction *UI = GetInstructionForCost(&R)) {
         // If we adjusted the predicate of the recipe, the cost in the legacy
         // cost model may be different.
-        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
-          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
-               WidenCmp->getOpcode() == Instruction::FCmp) &&
-              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
-            return true;
-        }
+        using namespace VPlanPatternMatch;
+        CmpPredicate Pred;
+        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
+            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
+                cast<CmpInst>(UI)->getPredicate())
+          return true;
         SeenInstrs.insert(UI);
       }
     }
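A short standalone illustration of the safe-divisor pattern the checks above
look for (illustrative names; not part of the patch): a select feeding a
div/rem forces inactive lanes to divide by 1, so the select's cost belongs to
the predication scheme rather than the plain divide.

    #include <cassert>

    // Scalar semantics the safe-divisor transform preserves.
    unsigned safeUDiv(unsigned X, unsigned D, bool LaneActive) {
      unsigned Divisor = LaneActive ? D : 1; // the hoistable/widened select
      return X / Divisor;                    // never divides by zero
    }

    int main() {
      assert(safeUDiv(10, 2, true) == 5);
      assert(safeUDiv(10, 0, false) == 10); // inactive lane: result unused
    }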
@@ -7217,8 +7297,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
-  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
+  VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
+  VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
   bool HasBranchWeights =
       hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
   if (HasBranchWeights) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 40a55656bfa7e..a0f3b97b4b9c2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -372,6 +372,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     set(Def, VectorValue);
   } else {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
+    assert(isa<VPInstruction>(Def) &&
+           "Explicit BuildVector recipes must have handled packing for "
+           "non-VPInstructions.");
     // Initialize packing with insertelements to start from poison.
     VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
@@ -1074,12 +1077,17 @@ void VPlan::execute(VPTransformState *State) {

 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   // For now only return the cost of the vector loop region, ignoring any other
-  // blocks, like the preheader or middle blocks.
+  // blocks, like the preheader or middle blocks, except for checking them for
+  // recipes with invalid costs.
   InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);

-  // If any instructions in the middle block are invalid return invalid.
-  // TODO: Remove once no VPlans with VF == vscale x 1 and first-order
-  // recurrences are created.
-  if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+  // If the cost of the loop region is invalid, or any recipe in the skeleton
+  // outside loop regions is invalid, return an invalid cost.
+  if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
+                                    vp_depth_first_shallow(getEntry())),
+                                [&VF, &Ctx](VPBasicBlock *VPBB) {
+                                  return !VPBB->cost(VF, Ctx).isValid();
+                                }))
     return InstructionCost::getInvalid();

   return Cost;
@@ -1684,3 +1692,38 @@ VPCostContext::getOperandInfo(VPValue *V) const {

   return TTI::getOperandInfo(V->getLiveInIRValue());
 }
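A toy model of the overhead sum computed by the new helper below (unit
weights and names are illustrative assumptions; real costs come from the TTI
hooks): one insert per produced result lane plus one extract per used vector
operand lane.

    #include <cassert>

    unsigned scalarizationOverhead(unsigned VF, bool ProducesValue,
                                   unsigned NumVectorOperands) {
      unsigned InsertCost = ProducesValue ? VF : 0;  // pack the result
      unsigned ExtractCost = NumVectorOperands * VF; // unpack the operands
      return InsertCost + ExtractCost;
    }

    int main() {
      // VF=4, produces a value, two vector operands: 4 inserts + 8 extracts.
      assert(scalarizationOverhead(4, true, 2) == 12);
    }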
+
+InstructionCost VPCostContext::getScalarizationOverhead(
+    Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+    bool AlwaysIncludeReplicatingR) {
+  if (VF.isScalar())
+    return 0;
+
+  InstructionCost ScalarizationCost = 0;
+  // Compute the cost of scalarizing the result if needed.
+  if (!ResultTy->isVoidTy()) {
+    for (Type *VectorTy :
+         to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+      ScalarizationCost += TTI.getScalarizationOverhead(
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+          /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+  }
+  // Compute the cost of scalarizing the operands, skipping ones that do not
+  // require extraction/scalarization and do not incur any overhead.
+  SmallPtrSet<const VPValue *, 4> UniqueOperands;
+  SmallVector<Type *> Tys;
+  for (auto *Op : Operands) {
+    if (Op->isLiveIn() ||
+        (!AlwaysIncludeReplicatingR && isa<VPReplicateRecipe>(Op)) ||
+        (isa<VPReplicateRecipe>(Op) &&
+         cast<VPReplicateRecipe>(Op)->getOpcode() == Instruction::Load) ||
+        !UniqueOperands.insert(Op).second)
+      continue;
+    Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
+  }
+  return ScalarizationCost +
+         TTI.getOperandsScalarizationOverhead(Tys, CostKind);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0115c6f4331c4..9ddc040cf6c70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -806,6 +806,9 @@ class VPIRFlags {
   GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }

+  /// Returns true if the recipe has a comparison predicate.
+  bool hasPredicate() const { return OpType == OperationType::Cmp; }
+
   /// Returns true if the recipe has fast-math flags.
   bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; }

@@ -899,6 +902,10 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
   }

   void execute(VPTransformState &State) override = 0;
+
+  /// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.
+  InstructionCost getCostForRecipeWithOpcode(unsigned Opcode, ElementCount VF,
+                                             VPCostContext &Ctx) const;
 };

 /// Helper to access the operand that contains the unroll part for this recipe
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 7075fb1f39613..d7f74fc0df7b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -374,6 +374,15 @@ struct VPCostContext {
   /// legacy cost model for \p VF. Only used to check for additional VPlan
   /// simplifications.
   bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
+
+  /// Estimate the overhead of scalarizing a recipe with result type
+  /// \p ResultTy and \p Operands with \p VF. This is a convenience wrapper
+  /// for the type-based getScalarizationOverhead API. If
+  /// \p AlwaysIncludeReplicatingR is true, always compute the cost of
+  /// scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };

 /// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 0d206993cb043..4b940c28b158a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -461,6 +461,99 @@
 m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_BinaryOr(Op0, Op1);
 }

+/// Cmp_match is a variant of BinaryRecipe_match that also binds the comparison
+/// predicate. Opcodes must be Instruction::ICmp, Instruction::FCmp, or both.
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct Cmp_match {
+  static_assert((sizeof...(Opcodes) == 1 || sizeof...(Opcodes) == 2) &&
+                "Expected one or two opcodes");
+  static_assert(
+      ((Opcodes == Instruction::ICmp || Opcodes == Instruction::FCmp) && ...) &&
+      "Expected a compare instruction opcode");
+
+  CmpPredicate *Predicate = nullptr;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  Cmp_match(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1)
+      : Predicate(&Pred), Op0(Op0), Op1(Op1) {}
+  Cmp_match(const Op0_t &Op0, const Op1_t &Op1) : Op0(Op0), Op1(Op1) {}
+
+  bool match(const VPValue *V) const {
+    auto *DefR = V->getDefiningRecipe();
+    return DefR && match(DefR);
+  }
+
+  bool match(const VPRecipeBase *V) const {
+    if ((m_Binary<Opcodes>(Op0, Op1).match(V) || ...)) {
+      if (Predicate)
+        *Predicate = cast<VPRecipeWithIRFlags>(V)->getPredicate();
+      return true;
+    }
+    return false;
+  }
+};
+
+/// SpecificCmp_match is a variant of Cmp_match that matches the comparison
+/// predicate, instead of binding it.
+template <typename Op0_t, typename Op1_t, unsigned... Opcodes>
+struct SpecificCmp_match {
+  const CmpPredicate Predicate;
+  Op0_t Op0;
+  Op1_t Op1;
+
+  SpecificCmp_match(CmpPredicate Pred, const Op0_t &LHS, const Op1_t &RHS)
+      : Predicate(Pred), Op0(LHS), Op1(RHS) {}
+
+  bool match(const VPValue *V) const {
+    CmpPredicate CurrentPred;
+    return Cmp_match<Op0_t, Op1_t, Opcodes...>(CurrentPred, Op0, Op1)
+               .match(V) &&
+           CmpPredicate::getMatching(CurrentPred, Predicate);
+  }
+};
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp> m_ICmp(const Op0_t &Op0,
+                                                         const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp>
+m_ICmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp>(Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>
+m_SpecificICmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp>(MatchPred, Op0,
+                                                            Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(Op0,
+                                                                       Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_Cmp(CmpPredicate &Pred, const Op0_t &Op0, const Op1_t &Op1) {
+  return Cmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      Pred, Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>
+m_SpecificCmp(CmpPredicate MatchPred, const Op0_t &Op0, const Op1_t &Op1) {
+  return SpecificCmp_match<Op0_t, Op1_t, Instruction::ICmp, Instruction::FCmp>(
+      MatchPred, Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t>
 using GEPLikeRecipe_match =
     BinaryRecipe_match<Op0_t, Op1_t, Instruction::GetElementPtr, false,
                        VPWidenGEPRecipe, VPReplicateRecipe>;
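A hedged usage sketch for the matchers defined above, mirroring how the
LoopVectorize change earlier in this patch uses them (`R` stands for some
recipe visited during a traversal; that context is assumed, not shown here):

    // Bind the predicate of any integer/FP compare recipe:
    CmpPredicate Pred;
    if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())))
      ; // Pred now holds the recipe's comparison predicate.

    // Match only integer equality compares, without binding:
    if (match(&R, m_SpecificICmp(CmpInst::ICMP_EQ, m_VPValue(), m_VPValue())))
      ; // R is an ICmp recipe with predicate eq.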
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ ... @@
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;

 using VectorParts = SmallVector<Value *, 2>;

@@ -308,7 +309,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF,
   VPRecipeBase *OpR = Op->getDefiningRecipe();

   // If the partial reduction is predicated, a select will be operand 0
-  using namespace llvm::VPlanPatternMatch;
   if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) {
     OpR = Op->getDefiningRecipe();
   }
@@ -465,6 +465,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::Load:
   case VPInstruction::AnyOf:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::BuildStructVector:
+  case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::ExplicitVectorLength:
@@ -901,31 +903,105 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
 }

+InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
+    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
+  Type *ScalarTy = Ctx.Types.inferScalarType(this);
+  Type *ResultTy = VF.isVector() ? toVectorTy(ScalarTy, VF) : ScalarTy;
+  switch (Opcode) {
+  case Instruction::FNeg:
+    return Ctx.TTI.getArithmeticInstrCost(Opcode, ResultTy, Ctx.CostKind);
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    TargetTransformInfo::OperandValueInfo RHSInfo = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+
+    if (VF.isVector()) {
+      // Certain instructions can be cheaper to vectorize if they have a
+      // constant second vector operand. One example of this are shifts on x86.
+      VPValue *RHS = getOperand(1);
+      RHSInfo = Ctx.getOperandInfo(RHS);
+
+      if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
+          getOperand(1)->isDefinedOutsideLoopRegions())
+        RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
+    }
+
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, ResultTy, Ctx.CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        RHSInfo, Operands, CtxI, &Ctx.TLI);
+  }
+  case Instruction::Freeze:
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy,
+                                          Ctx.CostKind);
+  case Instruction::ExtractValue:
+    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
+                                             Ctx.CostKind);
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ScalarOpTy = Ctx.Types.inferScalarType(getOperand(0));
+    Type *OpTy = VF.isVector() ? toVectorTy(ScalarOpTy, VF) : ScalarOpTy;
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    return Ctx.TTI.getCmpSelInstrCost(
+        Opcode, OpTy, CmpInst::makeCmpResultType(OpTy), getPredicate(),
+        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
+        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
+  }
+  }
+  llvm_unreachable("called for unsupported opcode");
+}
+
 InstructionCost VPInstruction::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   if (Instruction::isBinaryOp(getOpcode())) {
-    Type *ResTy = Ctx.Types.inferScalarType(this);
-    if (!vputils::onlyFirstLaneUsed(this))
-      ResTy = toVectorTy(ResTy, VF);
-
-    if (!getUnderlyingValue()) {
-      switch (getOpcode()) {
-      case Instruction::FMul:
-        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
-      default:
-        // TODO: Compute cost for VPInstructions without underlying values once
-        // the legacy cost model has been retired.
-        return 0;
-      }
+    if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
+      // TODO: Compute cost for VPInstructions without underlying values once
+      // the legacy cost model has been retired.
+      return 0;
     }

     assert(!doesGeneratePerAllLanes() &&
            "Should only generate a vector value or single scalar, not scalars "
            "for all lanes.");
-    return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
+    return getCostForRecipeWithOpcode(
+        getOpcode(),
+        vputils::onlyFirstLaneUsed(this) ? ElementCount::getFixed(1) : VF, Ctx);
   }

   switch (getOpcode()) {
+  case Instruction::Select: {
+    llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+    match(getOperand(0), m_Cmp(Pred, m_VPValue(), m_VPValue()));
+    auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+    auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+    if (!vputils::onlyFirstLaneUsed(this)) {
+      CondTy = toVectorTy(CondTy, VF);
+      VecTy = toVectorTy(VecTy, VF);
+    }
+    return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+                                      Ctx.CostKind);
+  }
   case Instruction::ExtractElement: {
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
@@ -1542,18 +1618,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
   State.set(this, V);
 }

-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
-                                                    VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by
+/// \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+                                            ArrayRef<const VPValue *> Operands,
+                                            const VPRecipeWithIRFlags &R,
+                                            ElementCount VF,
+                                            VPCostContext &Ctx) {
   // Some backends analyze intrinsic arguments to determine cost. Use the
   // underlying value for the operand if it has one. Otherwise try to use the
   // operand of the underlying call instruction, if there is one. Otherwise
   // clear Arguments.
   // TODO: Rework TTI interface to be independent of concrete IR values.
   SmallVector<const Value *> Arguments;
-  for (const auto &[Idx, Op] : enumerate(operands())) {
+  for (const auto &[Idx, Op] : enumerate(Operands)) {
     auto *V = Op->getUnderlyingValue();
     if (!V) {
-      if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+      if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
         Arguments.push_back(UI->getArgOperand(Idx));
         continue;
       }
@@ -1563,21 +1643,31 @@
     Arguments.push_back(V);
   }

-  Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+  Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+  Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
   SmallVector<Type *> ParamTys;
-  for (unsigned I = 0; I != getNumOperands(); ++I)
-    ParamTys.push_back(
-        toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+  for (const VPValue *Op : Operands) {
+    ParamTys.push_back(VF.isVector()
+                           ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+                           : Ctx.Types.inferScalarType(Op));
+  }

   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
-  FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+  FastMathFlags FMF =
+      R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(
-      VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
-      dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+      ID, RetTy, Arguments, ParamTys, FMF,
+      dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
       InstructionCost::getInvalid(), &Ctx.TLI);
   return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
 }

+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+                                                    VPCostContext &Ctx) const {
+  SmallVector<const VPValue *> ArgOps(operands());
+  return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx);
+}
+
 StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }
@@ -1741,7 +1831,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
   Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);

   VPValue *Op0, *Op1;
-  using namespace llvm::VPlanPatternMatch;
   if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
       (match(this, m_LogicalAnd(m_VPValue(Op0), m_VPValue(Op1))) ||
        match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1))))) {
@@ -1963,20 +2052,15 @@ void VPWidenRecipe::execute(VPTransformState &State) {

 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   switch (Opcode) {
-  case Instruction::FNeg: {
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, Ctx.CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
-  }
-
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
   case Instruction::URem:
-    // More complex computation, let the legacy cost-model handle this for now.
-    return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+    // If the div/rem operation isn't safe to speculate and requires
+    // predication, then the only way we can even create a vplan is to insert
+    // a select on the second input operand to ensure we use the value of 1
+    // for the inactive lanes. The select will be costed separately.
+  case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -1990,45 +2074,12 @@
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    VPValue *RHS = getOperand(1);
-    // Certain instructions can be cheaper to vectorize if they have a constant
-    // second vector operand. One example of this are shifts on x86.
-    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(RHS);
-
-    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
-        getOperand(1)->isDefinedOutsideLoopRegions())
-      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-
-    SmallVector<const Value *, 4> Operands;
-    if (CtxI)
-      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, Ctx.CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        RHSInfo, Operands, CtxI, &Ctx.TLI);
-  }
-  case Instruction::Freeze: {
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
-    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
-                                          Ctx.CostKind);
-  }
-  case Instruction::ExtractValue: {
-    return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue,
-                                             Ctx.CostKind);
-  }
+  case Instruction::Xor:
+  case Instruction::Freeze:
+  case Instruction::ExtractValue:
   case Instruction::ICmp:
-  case Instruction::FCmp: {
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-    Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getCmpSelInstrCost(
-        Opcode, VectorTy, CmpInst::makeCmpResultType(VectorTy), getPredicate(),
-        Ctx.CostKind, {TTI::OK_AnyValue, TTI::OP_None},
-        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
-  }
+  case Instruction::FCmp:
+    return getCostForRecipeWithOpcode(getOpcode(), VF, Ctx);
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -2863,6 +2914,9 @@ static void scalarizeInstruction(const Instruction *Instr,
   RepRecipe->applyFlags(*Cloned);
   RepRecipe->applyMetadata(*Cloned);

+  if (RepRecipe->hasPredicate())
+    cast<CmpInst>(Cloned)->setPredicate(RepRecipe->getPredicate());
+
   if (auto DL = RepRecipe->getDebugLoc())
     State.setDebugLocFrom(DL);

@@ -2931,6 +2985,83 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }

+/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
+/// which the legacy cost model computes a SCEV expression when computing the
+/// address cost. Computing SCEVs for VPValues is incomplete and returns
+/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
+/// those cases we fall back to the legacy cost model. Otherwise return
+/// nullptr.
+static const SCEV *getAddressAccessSCEV(const VPValue *Ptr, ScalarEvolution &SE,
+                                        const Loop *L) {
+  using namespace llvm::VPlanPatternMatch;
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR) ||
+                 match(Ptr, m_GetElementPtr(m_VPValue(), m_VPValue()))))
+    return nullptr;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return nullptr;
+  }
+
+  return vputils::getSCEVExprForVPValue(Ptr, SE, L);
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
+    // Skip blends that use V only through a compare by checking if any
+    // incoming value was already visited.
+    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
+                         [&](unsigned I) {
+                           return Seen.contains(
+                               Blend->getIncomingValue(I)->getDefiningRecipe());
+                         }))
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    // The legacy cost model only supports scalarizing loads/stores with phi
+    // addresses, if the phi is directly used as load/store address. Don't
+    // traverse further for Blends.
+    if (Blend)
+      continue;
+
+    append_range(WorkList, Cur->users());
+  }
+  return false;
+}
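The traversal above is a standard visited-set worklist walk over transitive
users. Here is the same shape in plain standalone C++ (Node is an
illustrative stand-in for VPlan recipes; not part of the patch):

    #include <cassert>
    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Users;
      bool IsAddressUse = false; // a user consuming its operand as an address
    };

    // Answers whether any transitive user consumes the value as an address.
    bool feedsAddress(Node *Start) {
      std::set<Node *> Seen;
      std::vector<Node *> WorkList = {Start};
      while (!WorkList.empty()) {
        Node *Cur = WorkList.back();
        WorkList.pop_back();
        if (!Seen.insert(Cur).second) // skip already-visited nodes
          continue;
        if (Cur->IsAddressUse)
          return true;
        WorkList.insert(WorkList.end(), Cur->Users.begin(), Cur->Users.end());
      }
      return false;
    }

    int main() {
      Node Addr{{}, true}, Mid{{&Addr}, false}, Root{{&Mid}, false};
      assert(feedsAddress(&Root));
    }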

 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
   // transform, avoid computing their cost multiple times for now.
   Ctx.SkipCostComputation.insert(UI);

-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  Type *ResultTy = Ctx.Types.inferScalarType(this);
   switch (UI->getOpcode()) {
   case Instruction::GetElementPtr:
     // We mark this instruction as zero-cost because the cost of GEPs in
     // vectorized code depends on whether the corresponding memory instruction
     // is scalarized or not. Therefore, we handle GEPs with the memory
     // instruction cost.
     return 0;
+  case Instruction::Call: {
+    auto *CalledFn =
+        cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
+
+    SmallVector<const VPValue *> ArgOps(drop_end(operands()));
+    SmallVector<Type *> Tys;
+    for (const VPValue *ArgOp : ArgOps)
+      Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+    if (CalledFn->isIntrinsic())
+      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+      switch (CalledFn->getIntrinsicID()) {
+      case Intrinsic::assume:
+      case Intrinsic::lifetime_end:
+      case Intrinsic::lifetime_start:
+      case Intrinsic::sideeffect:
+      case Intrinsic::pseudoprobe:
+      case Intrinsic::experimental_noalias_scope_decl: {
+        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                    ElementCount::getFixed(1), Ctx) == 0 &&
+               "scalarizing intrinsic should be free");
+        return InstructionCost(0);
+      }
+      default:
+        break;
+      }
+
+    Type *ResultTy = Ctx.Types.inferScalarType(this);
+    InstructionCost ScalarCallCost =
+        Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+    if (isSingleScalar()) {
+      if (CalledFn->isIntrinsic())
+        ScalarCallCost = std::min(
+            ScalarCallCost,
+            getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+                                 ElementCount::getFixed(1), Ctx));
+      return ScalarCallCost;
+    }
+
+    if (VF.isScalable())
+      return InstructionCost::getInvalid();
+
+    return ScalarCallCost * VF.getFixedValue() +
+           Ctx.getScalarizationOverhead(ResultTy, ArgOps, VF);
+  }
   case Instruction::Add:
   case Instruction::Sub:
   case Instruction::FAdd:
@@ -2960,14 +3135,91 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor: {
-    auto Op2Info = Ctx.getOperandInfo(getOperand(1));
-    SmallVector<const Value *> Operands(UI->operand_values());
-    return Ctx.TTI.getArithmeticInstrCost(
-               UI->getOpcode(), ResultTy, CostKind,
-               {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-               Op2Info, Operands, UI, &Ctx.TLI) *
+  case Instruction::Xor:
+    return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
+                                      Ctx) *
            (isSingleScalar() ? 1 : VF.getFixedValue());
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem: {
+    InstructionCost ScalarCost =
+        getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    ScalarCost = ScalarCost * VF.getFixedValue() +
+                 Ctx.getScalarizationOverhead(Ctx.Types.inferScalarType(this),
+                                              to_vector(operands()), VF);
+    // If the recipe is not predicated (i.e. not in a replicate region), return
+    // the scalar cost. Otherwise handle predicated cost.
+    if (!getParent()->getParent()->isReplicator())
+      return ScalarCost;
+
+    // Account for the phi nodes that we will create.
+    ScalarCost += VF.getFixedValue() *
+                  Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
+    // Scale the cost by the probability of executing the predicated blocks.
+    // This assumes the predicated block for each vector lane is equally
+    // likely.
+    ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
+    return ScalarCost;
+  }
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
+    // TODO: See getMemInstScalarizationCost for how to handle replicating and
+    // predicated cases.
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    const SCEV *PtrSCEV = getAddressAccessSCEV(PtrOp, Ctx.SE, Ctx.L);
+    if (isa_and_nonnull<SCEVCouldNotCompute>(PtrSCEV))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    bool UsedByLoadStoreAddress =
+        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(this);
+    InstructionCost ScalarCost =
+        ScalarMemOpCost +
+        Ctx.TTI.getAddressComputationCost(
+            PtrTy, UsedByLoadStoreAddress ? nullptr : &Ctx.SE, PtrSCEV);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently
+    // we don't assign scalarization overhead in general if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    if (!UsedByLoadStoreAddress) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
+  }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b612ac3b19c0f..479b9d3371169 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1296,6 +1296,20 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
         continue;

       auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
+      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+          vputils::isSingleScalar(RepR->getOperand(1))) {
+        auto *Clone = new VPReplicateRecipe(
+            RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
+            true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
+        Clone->insertBefore(RepOrWidenR);
+        auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
+                                      {Clone->getOperand(0)});
+        Ext->insertBefore(Clone);
+        Clone->setOperand(0, Ext);
+        RepR->eraseFromParent();
+        continue;
+      }
+
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
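A small standalone model of the narrowing performed above (illustrative;
not part of the patch): a store to a loop-invariant address is only
observable for the final lane, so storing the last element of the stored
vector once is equivalent to storing every lane in sequence.

    #include <cassert>

    int main() {
      int Lanes[4] = {1, 2, 3, 4};
      int SlotA = 0, SlotB = 0;
      for (int L : Lanes)
        SlotA = L;      // replicated store, lane by lane
      SlotB = Lanes[3]; // single store of the last element
      assert(SlotA == SlotB);
    }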
@@ -1988,9 +2002,8 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(simplifyBlends, Plan);
-  runPass(removeDeadRecipes, Plan);
-  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(legalizeAndOptimizeInductions, Plan);
+  runPass(narrowToSingleScalarRecipes, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
   runPass(removeBranchOnConst, Plan);
@@ -3417,6 +3430,52 @@ void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
   }
 }

+void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
+  if (Plan.hasScalarVFOnly())
+    return;
+
+  VPTypeAnalysis TypeInfo(Plan);
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getEntry()));
+  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(LoopRegion->getEntry()));
+  // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes,
+  // excluding ones in replicate regions. Those are not materialized explicitly
+  // yet. Those vector users are still handled in VPReplicateRecipe::execute(),
+  // via shouldPack().
+  // TODO: materialize build vectors for replicating recipes in replicating
+  // regions.
+  // TODO: materialize build vectors for VPInstructions.
+  for (VPBasicBlock *VPBB :
+       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      auto UsesVectorOrInsideReplicateRegion = [RepR, LoopRegion](VPUser *U) {
+        VPRegionBlock *ParentRegion =
+            cast<VPRecipeBase>(U)->getParent()->getParent();
+        return !U->usesScalars(RepR) || ParentRegion != LoopRegion;
+      };
+      if (!RepR || RepR->isSingleScalar() ||
+          none_of(RepR->users(), UsesVectorOrInsideReplicateRegion))
+        continue;
+
+      Type *ScalarTy = TypeInfo.inferScalarType(RepR);
+      unsigned Opcode = ScalarTy->isStructTy()
+                            ? VPInstruction::BuildStructVector
+                            : VPInstruction::BuildVector;
+      auto *BuildVector = new VPInstruction(Opcode, {RepR});
+      BuildVector->insertAfter(RepR);
+
+      RepR->replaceUsesWithIf(
+          BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
+                           VPUser &U, unsigned) {
+            return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
+          });
+    }
+  }
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
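Conceptually, a BuildVector recipe just combines one scalar definition per
lane into a single vector value; replicateByVF later rewrites it to take all
lane definitions as operands. A minimal standalone sketch of that semantics
(fixed VF of 4; names are illustrative):

    #include <array>
    #include <cassert>

    // One scalar definition per lane becomes a single vector value.
    std::array<int, 4> buildVector(int L0, int L1, int L2, int L3) {
      return {L0, L1, L2, L3};
    }

    int main() { assert(buildVector(1, 2, 3, 4)[2] == 3); }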
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index dfd9fd09ff9d4..71af37b0af48b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -248,6 +248,10 @@ struct VPlanTransforms {
   static void sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
                                    const Loop *L);

+  /// Add explicit Build[Struct]Vector recipes that combine multiple scalar
+  /// values into single vectors.
+  static void materializeBuildVectors(VPlan &Plan);
+
   /// Try to convert a plan with interleave groups with VF elements to a plan
   /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements, if all transformed interleave groups access the full vector
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index b89cd21595efd..9061bcbc98f47 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -455,10 +455,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
   VPlanTransforms::removeDeadRecipes(Plan);
 }

-/// Create a single-scalar clone of \p RepR for lane \p Lane.
-static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
-                                       Type *IdxTy, VPReplicateRecipe *RepR,
-                                       VPLane Lane) {
+/// Create a single-scalar clone of \p RepR for lane \p Lane. Use
+/// \p Def2LaneDefs to look up scalar definitions for operands of \p RepR.
+static VPReplicateRecipe *
+cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
+             VPReplicateRecipe *RepR, VPLane Lane,
+             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : RepR->operands()) {
@@ -471,6 +473,14 @@
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
     }
+    // If Op is a definition that has been unrolled, directly use the clone for
+    // the corresponding lane.
+    auto LaneDefs = Def2LaneDefs.find(Op);
+    if (LaneDefs != Def2LaneDefs.end()) {
+      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
+      continue;
+    }
+
     // Look through buildvector to avoid unnecessary extracts.
     if (match(Op, m_BuildVector())) {
       NewOps.push_back(
@@ -503,6 +513,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
   auto VPBBsToUnroll =
       concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+  // A mapping of current VPValue definitions to collections of new VPValues
+  // defined per lane. Serves to hook up potential users of current VPValue
+  // definitions that are replicated-per-VF later.
+  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
+  // The removal of current recipes being replaced by new ones needs to be
+  // delayed until Def2LaneDefs is no longer in use.
+  SmallVector<VPRecipeBase *> ToRemove;
   for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@@ -511,39 +528,42 @@
       VPBuilder Builder(RepR);
       if (RepR->getNumUsers() == 0) {
-        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
-            vputils::isSingleScalar(RepR->getOperand(1))) {
-          // Stores to invariant addresses need to store the last lane only.
-          cloneForLane(Plan, Builder, IdxTy, RepR,
-                       VPLane::getLastLaneForVF(VF));
-        } else {
-          // Create single-scalar version of RepR for all lanes.
-          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
-        }
+        // Create single-scalar version of RepR for all lanes.
+        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
+          cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
         RepR->eraseFromParent();
         continue;
       }

       /// Create single-scalar version of RepR for all lanes.
       SmallVector<VPValue *> LaneDefs;
       for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
-        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
+        LaneDefs.push_back(
+            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
+      Def2LaneDefs[RepR] = LaneDefs;
 
       /// Users that only demand the first lane can use the definition for lane
       /// 0.
       RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
         return U.onlyFirstLaneUsed(RepR);
       });
 
-      // If needed, create a Build(Struct)Vector recipe to insert the scalar
-      // lane values into a vector.
-      Type *ResTy = RepR->getUnderlyingInstr()->getType();
-      VPValue *VecRes = Builder.createNaryOp(
-          ResTy->isStructTy() ? VPInstruction::BuildStructVector
-                              : VPInstruction::BuildVector,
-          LaneDefs);
-      RepR->replaceAllUsesWith(VecRes);
-      RepR->eraseFromParent();
+      // Update each build vector user that currently has RepR as its only
+      // operand, to have all LaneDefs as its operands.
+      for (VPUser *U : to_vector(RepR->users())) {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
+                     VPI->getOpcode() != VPInstruction::BuildStructVector))
+          continue;
+        assert(VPI->getNumOperands() == 1 &&
+               "Build(Struct)Vector must have a single operand before "
+               "replicating by VF");
+        VPI->setOperand(0, LaneDefs[0]);
+        for (VPValue *LaneDef : drop_begin(LaneDefs))
+          VPI->addOperand(LaneDef);
+      }
+      ToRemove.push_back(RepR);
     }
   }
+  for (auto *R : reverse(ToRemove))
+    R->eraseFromParent();
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index a1c71512a1ed2..1e45380db9759 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -137,8 +137,9 @@ const SCEV *vputils::getSCEVExprForVPValue(const VPValue *V,
           IndexExprs.push_back(IndexExpr);
         }
 
-        auto *GEP = cast<GEPOperator>(R->getUnderlyingInstr());
-        return SE.getGEPExpr(const_cast<GEPOperator *>(GEP), IndexExprs);
+        Type *SrcElementTy = cast<GetElementPtrInst>(R->getUnderlyingInstr())
+                                 ->getSourceElementType();
+        return SE.getGEPExpr(Base, IndexExprs, SrcElementTy);
       })
       .Default([&SE](const VPRecipeBase *) { return SE.getCouldNotCompute(); });
 }
diff --git a/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll b/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll
new file mode 100644
index 0000000000000..9a9a6a7d45931
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/addrec-may-wrap-udiv-canonicalize.ll
@@ -0,0 +1,402 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes='print<scalar-evolution>' -disable-output 2>&1 | FileCheck %s
+
+declare void @use(i64)
+
+define void @test_step2_div4(i64 %n) {
+; CHECK-LABEL: 'test_step2_div4'
+; CHECK-NEXT: Classifying expressions for: @test_step2_div4
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({0,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({0,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({2,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {-1,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({-2,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_div4
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
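+
+; Worked example for @test_step2_div4 above: {1,+,2}<%loop> produces
+; 1, 3, 5, 7, ..., and dividing by 4 gives 0, 0, 1, 1, ..., exactly the
+; quotients of ({0,+,2}<%loop> /u 4) over 0, 2, 4, 6, ... Rounding a constant
+; addrec start down to a multiple of the step never changes the udiv result
+; when the step divides the divisor (ignoring unsigned wrap), which is why
+; %div.1 is printed as ({0,+,2}<%loop> /u 4).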
+
+define void @test_step3_div6(i64 %n) {
+; CHECK-LABEL: 'test_step3_div6'
+; CHECK-NEXT: Classifying expressions for: @test_step3_div6
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 6
+; CHECK-NEXT: --> ({0,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 6
+; CHECK-NEXT: --> ({1,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 6
+; CHECK-NEXT: --> ({2,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {-1,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 6
+; CHECK-NEXT: --> ({-1,+,3}<%loop> /u 6) U: [0,3074457345618258603) S: [0,3074457345618258603) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 3
+; CHECK-NEXT: --> {3,+,3}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step3_div6
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 6
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 6
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 6
+  call void @use(i64 %div.2)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 6
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 3
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+
+define void @test_step4_div4(i64 %n) {
+; CHECK-LABEL: 'test_step4_div4'
+; CHECK-NEXT: Classifying expressions for: @test_step4_div4
+; CHECK-NEXT: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {0,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {1,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {2,+,4}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {3,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({0,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {4,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({4,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {5,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({4,+,4}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: %iv.next = add i64 %iv, 4
+; CHECK-NEXT: --> {4,+,4}<%loop> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test_step4_div4
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.next = add i64 %iv, 4
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_step2_start_outer_add_rec_step_16(i64 %n, i64 %m) {
+; CHECK-LABEL: 'test_step2_start_outer_add_rec_step_16'
+; CHECK-NEXT: Classifying expressions for: @test_step2_start_outer_add_rec_step_16
+; CHECK-NEXT: %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+; CHECK-NEXT: --> {0,+,16}<%outer.header> U: [0,-15) S: [-9223372036854775808,9223372036854775793) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {{\{\{}}3,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({{\{\{}}3,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {{\{\{}}-1,+,16}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({{\{\{}}-1,+,16}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.0 = udiv i64 %iv, 3
+; CHECK-NEXT: --> ({{\{\{}}0,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.1 = udiv i64 %iv.1, 3
+; CHECK-NEXT: --> ({{\{\{}}1,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.2 = udiv i64 %iv.2, 3
+; CHECK-NEXT: --> ({{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.4 = udiv i64 %iv.4, 3
+; CHECK-NEXT: --> ({{\{\{}}4,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.5 = udiv i64 %iv.5, 3
+; CHECK-NEXT: --> ({{\{\{}}5,+,16}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,16}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %outer.iv.next = add i64 %outer.iv, 16
+; CHECK-NEXT: --> {16,+,16}<%outer.header> U: [0,-15) S: [-9223372036854775808,9223372036854775793) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_start_outer_add_rec_step_16
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Predicated backedge-taken count is (%m /u 16)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated constant max backedge-taken count is i64 1152921504606846975
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated symbolic max backedge-taken count is (%m /u 16)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i4 (trunc i64 %m to i4) to i64) == 0
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %div3.0 = udiv i64 %iv, 3
+  call void @use(i64 %div3.0)
+  %div3.1 = udiv i64 %iv.1, 3
+  call void @use(i64 %div3.1)
+  %div3.2 = udiv i64 %iv.2, 3
+  call void @use(i64 %div3.2)
+  %div3.4 = udiv i64 %iv.4, 3
+  call void @use(i64 %div3.4)
+  %div3.5 = udiv i64 %iv.5, 3
+  call void @use(i64 %div3.5)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %outer.latch
+
+outer.latch:
+  %outer.iv.next = add i64 %outer.iv, 16
+  %outer.ec = icmp eq i64 %outer.iv, %m
+  br i1 %outer.ec, label %exit, label %outer.header
+
+exit:
+  ret void
+}
+
+define void @test_step2_div4_start_outer_add_rec_step_2(i64 %n, i64 %m) {
+; CHECK-LABEL: 'test_step2_div4_start_outer_add_rec_step_2'
+; CHECK-NEXT: Classifying expressions for: @test_step2_div4_start_outer_add_rec_step_2
+; CHECK-NEXT: %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+; CHECK-NEXT: --> {0,+,2}<%outer.header> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+; CHECK-NEXT: --> {{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.0 = udiv i64 %iv, 4
+; CHECK-NEXT: --> ({{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.1 = add i64 %iv, 1
+; CHECK-NEXT: --> {{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.1 = udiv i64 %iv.1, 4
+; CHECK-NEXT: --> ({{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.2 = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.2 = udiv i64 %iv.2, 4
+; CHECK-NEXT: --> ({{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.3 = add i64 %iv, 3
+; CHECK-NEXT: --> {{\{\{}}3,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.3 = udiv i64 %iv.3, 4
+; CHECK-NEXT: --> ({{\{\{}}3,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.4 = add i64 %iv, 4
+; CHECK-NEXT: --> {{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.4 = udiv i64 %iv.4, 4
+; CHECK-NEXT: --> ({{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.5 = add i64 %iv, 5
+; CHECK-NEXT: --> {{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.5 = udiv i64 %iv.5, 4
+; CHECK-NEXT: --> ({{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.neg.1 = add i64 %iv, -1
+; CHECK-NEXT: --> {{\{\{}}-1,+,2}<%outer.header>,+,2}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div.neg.1 = udiv i64 %iv.neg.1, 4
+; CHECK-NEXT: --> ({{\{\{}}-1,+,2}<%outer.header>,+,2}<%loop> /u 4) U: [0,4611686018427387904) S: [0,4611686018427387904) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.0 = udiv i64 %iv, 3
+; CHECK-NEXT: --> ({{\{\{}}0,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.1 = udiv i64 %iv.1, 3
+; CHECK-NEXT: --> ({{\{\{}}1,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.2 = udiv i64 %iv.2, 3
+; CHECK-NEXT: --> ({{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.4 = udiv i64 %iv.4, 3
+; CHECK-NEXT: --> ({{\{\{}}4,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517205) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %div3.5 = udiv i64 %iv.5, 3
+; CHECK-NEXT: --> ({{\{\{}}5,+,2}<%outer.header>,+,2}<%loop> /u 3) U: [0,6148914691236517206) S: [0,6148914691236517206) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %iv.next = add i64 %iv, 2
+; CHECK-NEXT: --> {{\{\{}}2,+,2}<%outer.header>,+,2}<%loop> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable, %outer.header: Variant }
+; CHECK-NEXT: %outer.iv.next = add i64 %outer.iv, 2
+; CHECK-NEXT: --> {2,+,2}<%outer.header> U: [0,-1) S: [-9223372036854775808,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %outer.header: Computable, %loop: Invariant }
+; CHECK-NEXT: Determining loop execution counts for: @test_step2_div4_start_outer_add_rec_step_2
+; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %outer.header: Predicated backedge-taken count is (%m /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated constant max backedge-taken count is i64 9223372036854775807
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+; CHECK-NEXT: Loop %outer.header: Predicated symbolic max backedge-taken count is (%m /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: Equal predicate: (zext i1 (trunc i64 %m to i1) to i64) == 0
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %outer.iv, %outer.header ], [ %iv.next, %loop ]
+  %div.0 = udiv i64 %iv, 4
+  call void @use(i64 %div.0)
+  %iv.1 = add i64 %iv, 1
+  %div.1 = udiv i64 %iv.1, 4
+  call void @use(i64 %div.1)
+  %iv.2 = add i64 %iv, 2
+  %div.2 = udiv i64 %iv.2, 4
+  call void @use(i64 %div.2)
+  %iv.3 = add i64 %iv, 3
+  %div.3 = udiv i64 %iv.3, 4
+  call void @use(i64 %div.3)
+  %iv.4 = add i64 %iv, 4
+  %div.4 = udiv i64 %iv.4, 4
+  call void @use(i64 %div.4)
+  %iv.5 = add i64 %iv, 5
+  %div.5 = udiv i64 %iv.5, 4
+  call void @use(i64 %div.5)
+  %iv.neg.1 = add i64 %iv, -1
+  %div.neg.1 = udiv i64 %iv.neg.1, 4
+  call void @use(i64 %div.neg.1)
+  %div3.0 = udiv i64 %iv, 3
+  call void @use(i64 %div3.0)
+  %div3.1 = udiv i64 %iv.1, 3
+  call void @use(i64 %div3.1)
+  %div3.2 = udiv i64 %iv.2, 3
+  call void @use(i64 %div3.2)
+  %div3.4 = udiv i64 %iv.4, 3
+  call void @use(i64 %div3.4)
+  %div3.5 = udiv i64 %iv.5, 3
+  call void @use(i64 %div3.5)
+  call void @use(i64 %div.neg.1)
+  %iv.next = add i64 %iv, 2
+  %cond = icmp slt i64 %iv, %n
+  br i1 %cond, label %loop, label %outer.latch
+
+outer.latch:
+  %outer.iv.next = add i64 %outer.iv, 2
+  %outer.ec = icmp eq i64 %outer.iv, %m
+  br i1 %outer.ec, label %exit, label %outer.header
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 6029095bbe7b1..6af3b9841a64c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -1002,12 +1002,12 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8
 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]]
 ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
-; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00
-; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00
-; 
TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = xor i1 [[TMP7]], true -; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[TMP8]], true -; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP9]], i1 false -; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP10]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = fcmp ogt double [[TMP5]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = xor i1 [[TMP9]], true +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = xor i1 [[TMP10]], true +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = select i1 [[ACTIVE_LANE_MASK]], i1 [[TMP7]], i1 false +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], i1 [[TMP8]], i1 false ; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP11]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP12]], double 1.000000e+00, double 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll index 1bacae764f760..e925c253439fe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -5,8 +5,8 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-COST: Checking a loop in 'fixed_width' -; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}> -; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}> +; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, vp<{{.+}}> +; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, vp<{{.+}}> ; CHECK-COST: Selecting VF: 1. 
; We should decide this loop is not worth vectorising using fixed width vectors diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll new file mode 100644 index 0000000000000..f4047c5822094 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr151664-cost-hoisted-vector-scalable.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 5 +; RUN: opt -passes=loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s + +define void @cost_hoisted_vector_code(ptr %p, float %arg) { +; CHECK-LABEL: define void @cost_hoisted_vector_code( +; CHECK-SAME: ptr [[P:%.*]], float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[ARG]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> zeroinitializer) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = add i64 1, [[INDEX1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP8]], i32 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP4]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], -8 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ] + %res = tail call float @llvm.minimumnum.f32(float %arg, float 0.0) + %gep.p.red = getelementptr float, ptr %p, i64 %iv + store float %res, ptr %gep.p.red, align 4 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, 0 + br i1 %exit.cond, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +declare float @llvm.minimumnum.f32(float, float) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll index 019d2ee9886a6..1e46e212d44b9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll @@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux" ; Test case from https://github.com/llvm/llvm-project/issues/148431. 
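 ;
 ; (The checks below were regenerated for this patch: the vectorizer now picks
 ; VF 4 with an active-lane-mask for this loop instead of VF 16.)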
 define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 %n, i64 %off) #0 {
 ; CHECK-LABEL: define void @test_predicated_load_cast_hint(
-; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) {
+; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[N_EXT:%.*]] = sext i8 [[N]] to i32
 ; CHECK-NEXT: [[N_SUB:%.*]] = add i32 [[N_EXT]], -15
@@ -67,205 +67,67 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8
 ; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
 ; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 15
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP2]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]])
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE50:.*]] ]
-; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i8 [[DOTCAST]], 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT17]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT18]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ule <16 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i8> poison, i8 [[TMP26]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT19]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT20]] to <16 x i64>
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP25]], i32 0
-; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 4, i8 8, i8 12>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> 
[[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64> +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: -; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP30:%.*]] = zext i8 [[TMP29]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP30]], i64 [[OFF]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP32]], 1 -; CHECK-NEXT: store i64 [[TMP33]], ptr [[TMP31]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] -; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP25]], i32 1 -; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] -; CHECK: [[PRED_STORE_IF21]]: -; CHECK-NEXT: [[TMP35:%.*]] = add i8 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP36:%.*]] = zext i8 [[TMP35]] to i64 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP36]], i64 [[OFF]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP38]], 1 -; CHECK-NEXT: store i64 [[TMP39]], ptr [[TMP37]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] -; CHECK: [[PRED_STORE_CONTINUE22]]: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP25]], i32 2 -; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] -; CHECK: [[PRED_STORE_IF23]]: -; CHECK-NEXT: [[TMP41:%.*]] = add i8 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP42:%.*]] = zext i8 [[TMP41]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP42]], i64 [[OFF]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2 -; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP44]], 1 -; CHECK-NEXT: store i64 [[TMP45]], ptr [[TMP43]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] -; CHECK: [[PRED_STORE_CONTINUE24]]: -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i1> [[TMP25]], i32 3 -; CHECK-NEXT: br i1 [[TMP46]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] -; CHECK: [[PRED_STORE_IF25]]: -; CHECK-NEXT: [[TMP47:%.*]] = add i8 [[OFFSET_IDX]], 12 -; CHECK-NEXT: [[TMP48:%.*]] = zext i8 [[TMP47]] to i64 -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP48]], i64 [[OFF]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP51:%.*]] = or i64 [[TMP50]], 1 -; CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP49]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] -; CHECK: [[PRED_STORE_CONTINUE26]]: -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP25]], i32 4 -; CHECK-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] -; CHECK: [[PRED_STORE_IF27]]: -; CHECK-NEXT: [[TMP53:%.*]] = add i8 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP54:%.*]] = zext i8 [[TMP53]] to i64 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP54]], i64 [[OFF]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4 -; 
CHECK-NEXT: [[TMP57:%.*]] = or i64 [[TMP56]], 1 -; CHECK-NEXT: store i64 [[TMP57]], ptr [[TMP55]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] -; CHECK: [[PRED_STORE_CONTINUE28]]: -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i1> [[TMP25]], i32 5 -; CHECK-NEXT: br i1 [[TMP58]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] -; CHECK: [[PRED_STORE_IF29]]: -; CHECK-NEXT: [[TMP59:%.*]] = add i8 [[OFFSET_IDX]], 20 -; CHECK-NEXT: [[TMP60:%.*]] = zext i8 [[TMP59]] to i64 -; CHECK-NEXT: [[TMP61:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP60]], i64 [[OFF]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP63:%.*]] = or i64 [[TMP62]], 1 -; CHECK-NEXT: store i64 [[TMP63]], ptr [[TMP61]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] -; CHECK: [[PRED_STORE_CONTINUE30]]: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP25]], i32 6 -; CHECK-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] -; CHECK: [[PRED_STORE_IF31]]: -; CHECK-NEXT: [[TMP65:%.*]] = add i8 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[TMP66:%.*]] = zext i8 [[TMP65]] to i64 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP66]], i64 [[OFF]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6 -; CHECK-NEXT: [[TMP69:%.*]] = or i64 [[TMP68]], 1 -; CHECK-NEXT: store i64 [[TMP69]], ptr [[TMP67]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; CHECK: [[PRED_STORE_CONTINUE32]]: -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i1> [[TMP25]], i32 7 -; CHECK-NEXT: br i1 [[TMP70]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] -; CHECK: [[PRED_STORE_IF33]]: -; CHECK-NEXT: [[TMP71:%.*]] = add i8 [[OFFSET_IDX]], 28 -; CHECK-NEXT: [[TMP72:%.*]] = zext i8 [[TMP71]] to i64 -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP72]], i64 [[OFF]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7 -; CHECK-NEXT: [[TMP75:%.*]] = or i64 [[TMP74]], 1 -; CHECK-NEXT: store i64 [[TMP75]], ptr [[TMP73]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE34]] -; CHECK: [[PRED_STORE_CONTINUE34]]: -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i1> [[TMP25]], i32 8 -; CHECK-NEXT: br i1 [[TMP76]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]] -; CHECK: [[PRED_STORE_IF35]]: -; CHECK-NEXT: [[TMP77:%.*]] = add i8 [[OFFSET_IDX]], 32 -; CHECK-NEXT: [[TMP78:%.*]] = zext i8 [[TMP77]] to i64 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP78]], i64 [[OFF]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8 -; CHECK-NEXT: [[TMP81:%.*]] = or i64 [[TMP80]], 1 -; CHECK-NEXT: store i64 [[TMP81]], ptr [[TMP79]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE36]] -; CHECK: [[PRED_STORE_CONTINUE36]]: -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP25]], i32 9 -; CHECK-NEXT: br i1 [[TMP82]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]] -; CHECK: [[PRED_STORE_IF37]]: -; CHECK-NEXT: [[TMP83:%.*]] = add i8 [[OFFSET_IDX]], 36 -; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP83]] to i64 -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP84]], i64 [[OFF]] -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9 -; 
CHECK-NEXT: [[TMP87:%.*]] = or i64 [[TMP86]], 1 -; CHECK-NEXT: store i64 [[TMP87]], ptr [[TMP85]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE38]] -; CHECK: [[PRED_STORE_CONTINUE38]]: -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x i1> [[TMP25]], i32 10 -; CHECK-NEXT: br i1 [[TMP88]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]] -; CHECK: [[PRED_STORE_IF39]]: -; CHECK-NEXT: [[TMP89:%.*]] = add i8 [[OFFSET_IDX]], 40 -; CHECK-NEXT: [[TMP90:%.*]] = zext i8 [[TMP89]] to i64 -; CHECK-NEXT: [[TMP91:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP90]], i64 [[OFF]] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10 -; CHECK-NEXT: [[TMP93:%.*]] = or i64 [[TMP92]], 1 -; CHECK-NEXT: store i64 [[TMP93]], ptr [[TMP91]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE40]] -; CHECK: [[PRED_STORE_CONTINUE40]]: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x i1> [[TMP25]], i32 11 -; CHECK-NEXT: br i1 [[TMP94]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]] -; CHECK: [[PRED_STORE_IF41]]: -; CHECK-NEXT: [[TMP95:%.*]] = add i8 [[OFFSET_IDX]], 44 -; CHECK-NEXT: [[TMP96:%.*]] = zext i8 [[TMP95]] to i64 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP96]], i64 [[OFF]] -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11 -; CHECK-NEXT: [[TMP99:%.*]] = or i64 [[TMP98]], 1 -; CHECK-NEXT: store i64 [[TMP99]], ptr [[TMP97]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE42]] -; CHECK: [[PRED_STORE_CONTINUE42]]: -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x i1> [[TMP25]], i32 12 -; CHECK-NEXT: br i1 [[TMP100]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]] -; CHECK: [[PRED_STORE_IF43]]: -; CHECK-NEXT: [[TMP101:%.*]] = add i8 [[OFFSET_IDX]], 48 -; CHECK-NEXT: [[TMP102:%.*]] = zext i8 [[TMP101]] to i64 +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12 +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 ; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1 ; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE44]] -; CHECK: [[PRED_STORE_CONTINUE44]]: -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <16 x i1> [[TMP25]], i32 13 -; CHECK-NEXT: br i1 [[TMP106]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46:.*]] -; CHECK: [[PRED_STORE_IF45]]: -; CHECK-NEXT: [[TMP107:%.*]] = add i8 [[OFFSET_IDX]], 52 -; CHECK-NEXT: [[TMP108:%.*]] = zext i8 [[TMP107]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; CHECK: [[PRED_STORE_IF17]]: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1 ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]] -; CHECK-NEXT: [[TMP110:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1 ; CHECK-NEXT: store i64 [[TMP111]], ptr 
[[TMP109]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]] -; CHECK: [[PRED_STORE_CONTINUE46]]: -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x i1> [[TMP25]], i32 14 -; CHECK-NEXT: br i1 [[TMP112]], label %[[PRED_STORE_IF47:.*]], label %[[PRED_STORE_CONTINUE48:.*]] -; CHECK: [[PRED_STORE_IF47]]: -; CHECK-NEXT: [[TMP113:%.*]] = add i8 [[OFFSET_IDX]], 56 -; CHECK-NEXT: [[TMP114:%.*]] = zext i8 [[TMP113]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; CHECK: [[PRED_STORE_CONTINUE18]]: +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; CHECK: [[PRED_STORE_IF19]]: +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2 ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]] -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14 +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 ; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1 ; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE48]] -; CHECK: [[PRED_STORE_CONTINUE48]]: -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP25]], i32 15 -; CHECK-NEXT: br i1 [[TMP118]], label %[[PRED_STORE_IF49:.*]], label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_IF49]]: -; CHECK-NEXT: [[TMP119:%.*]] = add i8 [[OFFSET_IDX]], 60 -; CHECK-NEXT: [[TMP120:%.*]] = zext i8 [[TMP119]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; CHECK: [[PRED_STORE_CONTINUE20]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3 ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]] -; CHECK-NEXT: [[TMP122:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 ; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1 ; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_CONTINUE50]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: ; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP124:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP124]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16) +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP48]], i32 0 +; CHECK-NEXT: br i1 [[TMP49]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -310,6 +172,219 @@ exit: ret 
void } +; Check computing costs for sdiv/udiv with invariant divisor and tail folding. +; From https://github.com/llvm/llvm-project/issues/160354. +define void @srem_sdiv_with_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #0 { +; CHECK-LABEL: define void @srem_sdiv_with_tail_folding( +; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: [[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +; Check computing costs for predicated sdiv/udiv with invariant divisor without tail folding. +; From https://github.com/llvm/llvm-project/issues/160356. 
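+; The sdiv only executes under the %c guard, so the vector body below
+; scalarizes it: each lane extracts its mask bit, branches, divides the
+; extracted scalar and re-inserts the result (the PRED_SDIV_IF and
+; PRED_SDIV_CONTINUE blocks), before the predicated stores.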
+define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #1 {
+; CHECK-LABEL: define void @srem_sdiv_without_tail_folding(
+; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[END]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[END]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[END]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[D_0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -1)
+; CHECK-NEXT: [[TMP1:%.*]] = srem <4 x i32> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], splat (i32 1)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]]
+; CHECK: [[PRED_SDIV_IF]]:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv i32 [[TMP5]], [[D_1]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]]
+; CHECK: [[PRED_SDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]]
+; CHECK: [[PRED_SDIV_IF1]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[D_1]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]]
+; CHECK: [[PRED_SDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], %[[PRED_SDIV_IF1]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
+; CHECK: [[PRED_SDIV_IF3]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[D_1]]
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
+; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]]
+; CHECK: [[PRED_SDIV_CONTINUE4]]:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], %[[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]]
+; CHECK: [[PRED_SDIV_IF5]]:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 
[[TMP20]], [[D_1]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP23]], splat (i32 1) +; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; CHECK: [[PRED_STORE_IF7]]: +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: store i32 [[TMP37]], ptr [[TMP36]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[END]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: 
[[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +attributes #0 = { "target-cpu"="neoverse-v1" } +attributes #1 = { "target-cpu"="neoverse-v2" } + !0 = distinct !{!0, !1, !2, !3} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} @@ -328,4 +403,6 @@ exit: ; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META10]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..cce9596c5bb39 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -0,0 +1,622 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "arm64-apple-macosx15.0.0" + +define void @replicating_load_used_as_store_addr(ptr noalias %A) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr( +; CHECK-SAME: ptr noalias [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr ptr, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[TMP8]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.A = getelementptr ptr, ptr %A, i64 %iv + %l.p = load ptr, ptr %gep.A, align 8 + %iv.trunc = trunc i64 %iv.next to i32 + store i32 %iv.trunc, ptr %l.p, align 4 + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @replicating_load_used_as_store_addr_2(ptr noalias %invar.dst, ptr noalias %invar.src, ptr noalias %src) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr_2( +; CHECK-SAME: ptr noalias [[INVAR_DST:%.*]], ptr noalias [[INVAR_SRC:%.*]], ptr noalias [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[INVAR_SRC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 123 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[INVAR_DST]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 
[[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %l.offset = load i32, ptr %invar.src, align 4 + %offset.ext = sext i32 %l.offset to i64 + %gep.src = getelementptr i128, ptr %src, i64 %offset.ext + %l.v = load i32, ptr %gep.src, align 4 + %add = add i32 %l.v, 123 + store i32 %add, ptr %invar.dst, align 8 + %iv.next = add i64 %iv, 1 + %exitcond41.not = icmp eq i64 %iv.next, 100 + br i1 %exitcond41.not, label %exit, label %loop + +exit: + ret void +} + + +define void @replicating_load_used_as_store_addr_3(ptr noalias %src, ptr noalias %dst, ptr noalias %invar.dst, i8 %x) { +; CHECK-LABEL: define void @replicating_load_used_as_store_addr_3( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], ptr noalias [[INVAR_DST:%.*]], i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = xor i8 [[X]], 10 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP11]], 111 +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1 +; CHECK-NEXT: store i8 0, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP5]] to i8 +; CHECK-NEXT: store i8 [[TMP8]], ptr [[INVAR_DST]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %xor = xor i8 %x, 10 + %ext = zext i8 %xor to i64 + %gep.src = getelementptr i8, ptr %src, i64 %ext + %l = load i8, ptr %gep.src, align 1 + %l.ext = zext i8 %l to i32 + %xor.2 = xor i32 %l.ext, 111 + %idx2.ext = zext i32 %l.ext to i64 + %gep.dst = getelementptr i8, ptr %dst, i64 %idx2.ext + store i8 0, ptr %gep.dst, align 1 + %xor.2.trunc = trunc i32 %xor.2 to i8 + store i8 %xor.2.trunc, ptr %invar.dst, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @uniform_gep_for_replicating_gep(ptr %dst) { +; CHECK-LABEL: define void @uniform_gep_for_replicating_gep( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: 
[[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i8> [[TMP4]], i32 1
+; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %c = icmp eq i32 %iv, 0
+  %shift = lshr i32 %iv, 1
+  %ext = zext i1 %c to i8
+  %ext.shift = zext i32 %shift to i64
+  %gep = getelementptr i64, ptr %dst, i64 %ext.shift
+  store i8 %ext, ptr %gep, align 1
+  %iv.next = add i32 %iv, 1
+  %ec = icmp eq i32 %iv, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_load_gep_widen_induction(ptr noalias %dst, ptr noalias %dst2) #0 {
+; CHECK-LABEL: define void @test_load_gep_widen_induction(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[DST2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
+; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_2]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_3]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
+; CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
+; CHECK-NEXT: store ptr null, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
+; CHECK-NEXT: store ptr null, ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
+; CHECK-NEXT: 
store ptr null, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0 +; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1 +; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: store ptr null, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 +; CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr ptr, ptr [[DST2]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 6 +; CHECK-NEXT: store <2 x ptr> [[TMP0]], ptr [[TMP17]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP1]], ptr [[TMP13]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP2]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[TMP15]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_3]], splat (i64 2) +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 false, [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.dst.iv = getelementptr i128, ptr %dst, i64 %iv + %iv.next = add i64 %iv, 1 + store ptr null, ptr %gep.dst.iv, align 8 + %gep.dst2.iv = getelementptr ptr, ptr %dst2, i64 %iv + store ptr %gep.dst.iv, ptr %gep.dst2.iv + %exitcond.not = icmp eq i64 %iv.next, 100 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + + +define ptr @replicating_store_in_conditional_latch(ptr %p, i32 %n) #0 { +; CHECK-LABEL: define ptr @replicating_store_in_conditional_latch( +; CHECK-SAME: ptr [[P:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 0, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[DOTCAST]], -2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 48 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 48 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 48 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 
[[OFFSET_IDX]], 144 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 24 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 24 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 24 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 24 +; CHECK-NEXT: store ptr null, ptr [[TMP12]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP13]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP14]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop.latch ] + %gep.ptr.iv = getelementptr i8, ptr %ptr.iv, i64 24 + %c = icmp eq i32 %iv, %n + br i1 %c, label %exit, label %loop.latch + +loop.latch: + %iv.next = add nsw i32 %iv, -2 + store ptr null, ptr %gep.ptr.iv, align 8 + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 48 + br label %loop.header + +exit: + ret ptr %gep.ptr.iv +} + +declare void @init(ptr) + +define void @scalar_store_cost_after_discarding_interleave_group(ptr %dst, i32 %x, ptr %src) { +; CHECK-LABEL: define void @scalar_store_cost_after_discarding_interleave_group( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[X:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TEMP1:%.*]] = alloca [64 x i32], align 4 +; CHECK-NEXT: call void @init(ptr [[TEMP1]]) +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[TMP21:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TEMP1]], align 4 +; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 1 +; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[X]], -171254 +; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[MUL_0]], 1 +; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[SHR_0]], [[SHR_1]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: store i16 0, ptr [[TMP30]], align 2 +; CHECK-NEXT: [[GEP_0_1:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[GEP_0_1]], i64 14 +; CHECK-NEXT: store i16 0, ptr [[TMP38]], align 2 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], 1 +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[ADD_1]], 1 +; CHECK-NEXT: [[TMP54:%.*]] = trunc i32 [[SHR_2]] to i16 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[TMP30]], i64 2 +; CHECK-NEXT: store i16 [[TMP54]], ptr [[TMP46]], align 2 +; CHECK-NEXT: [[SUB_0:%.*]] = sub i32 0, [[MUL_0]] +; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[SUB_0]], 1 +; CHECK-NEXT: [[TMP70:%.*]] = trunc i32 [[SHR_3]] to i16 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP30]], i64 12 +; CHECK-NEXT: store i16 [[TMP70]], ptr [[TMP62]], align 2 +; CHECK-NEXT: [[OR_0:%.*]] = or i32 [[X]], 1 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[OR_0]], 1 +; CHECK-NEXT: 
[[SHR_4:%.*]] = lshr i32 [[ADD_2]], 1 +; CHECK-NEXT: [[TMP86:%.*]] = trunc i32 [[SHR_4]] to i16 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr [[TMP30]], i64 4 +; CHECK-NEXT: store i16 [[TMP86]], ptr [[TMP78]], align 2 +; CHECK-NEXT: [[GEP_0_2:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[GEP_0_2]], i64 10 +; CHECK-NEXT: store i16 0, ptr [[TMP94]], align 2 +; CHECK-NEXT: [[TRUNC_3:%.*]] = trunc i32 [[TMP22]] to i16 +; CHECK-NEXT: [[OR_1:%.*]] = or i16 [[TRUNC_3]], 1 +; CHECK-NEXT: [[TMP113:%.*]] = add i16 [[OR_1]], 1 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 +; CHECK-NEXT: store i16 [[TMP113]], ptr [[TMP105]], align 2 +; CHECK-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr [[TMP30]], i64 6 +; CHECK-NEXT: store i16 0, ptr [[TMP121]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[TMP21]], 8 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[TMP21]], 128 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %temp1 = alloca [64 x i32], align 4 + call void @init(ptr %temp1) + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %1 = load i32, ptr %temp1, align 4 + %shr.0 = lshr i32 %x, 1 + %mul.0 = mul i32 %x, -171254 + %shr.1 = lshr i32 %mul.0, 1 + %add.0 = add i32 %shr.0, %shr.1 + %gep.0 = getelementptr i16, ptr %dst, i64 %iv + store i16 0, ptr %gep.0, align 2 + %gep.0.1 = getelementptr i16, ptr %dst, i64 %iv + %gep.14 = getelementptr i8, ptr %gep.0.1, i64 14 + store i16 0, ptr %gep.14, align 2 + %add.1 = add i32 %add.0, 1 + %shr.2 = lshr i32 %add.1, 1 + %trunc.0 = trunc i32 %shr.2 to i16 + %gep.2 = getelementptr i8, ptr %gep.0, i64 2 + store i16 %trunc.0, ptr %gep.2, align 2 + %sub.0 = sub i32 0, %mul.0 + %shr.3 = lshr i32 %sub.0, 1 + %trunc.1 = trunc i32 %shr.3 to i16 + %gep.12 = getelementptr i8, ptr %gep.0, i64 12 + store i16 %trunc.1, ptr %gep.12, align 2 + %or.0 = or i32 %x, 1 + %add.2 = add i32 %or.0, 1 + %shr.4 = lshr i32 %add.2, 1 + %trunc.2 = trunc i32 %shr.4 to i16 + %gep.4 = getelementptr i8, ptr %gep.0, i64 4 + store i16 %trunc.2, ptr %gep.4, align 2 + %gep.0.2 = getelementptr i16, ptr %dst, i64 %iv + %gep.10 = getelementptr i8, ptr %gep.0.2, i64 10 + store i16 0, ptr %gep.10, align 2 + %trunc.3 = trunc i32 %1 to i16 + %or.1 = or i16 %trunc.3, 1 + %add.3 = add i16 %or.1, 1 + %gep.8 = getelementptr i8, ptr %gep.0, i64 8 + store i16 %add.3, ptr %gep.8, align 2 + %gep.6 = getelementptr i8, ptr %gep.0, i64 6 + store i16 0, ptr %gep.6, align 2 + %iv.next = add i64 %iv, 8 + %ec = icmp ult i64 %iv, 128 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_prefer_vector_addressing(ptr %start, ptr %ms, ptr noalias %src) #0 { +; CHECK-LABEL: define void @test_prefer_vector_addressing( +; CHECK-SAME: ptr [[START:%.*]], ptr [[MS:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[MS1:%.*]] = ptrtoint ptr [[MS]] to i64 +; CHECK-NEXT: [[GEP_START:%.*]] = getelementptr i8, ptr [[START]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[START2]], 3 +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[MS1]], i64 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[UMAX]], -3 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[START2]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], [[UMIN]] +; CHECK-NEXT: [[TMP4:%.*]] = 
udiv i64 [[TMP3]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UMIN]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[GEP_START]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[N_VEC]], 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP9]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 9 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA14:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]] +; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA19:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + %gep.start = getelementptr i8, ptr %start, i64 3 + br label %loop + +loop: + %ptr.iv = phi ptr [ %gep.start, %entry ], [ %ptr.iv.next, %loop ] + %recur = phi ptr [ %start, %entry ], [ %ptr.iv, %loop ] + %l = load i64, ptr %recur, align 1, !tbaa !0 + %gep.src = getelementptr i8, ptr %src, i64 %l + store i32 0, ptr %gep.src, align 4, !tbaa !5 + %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 3 + %ec = icmp ult ptr %ptr.iv, %ms + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @cost_scalar_load_of_address(ptr noalias %src, ptr %dst) { +; CHECK-LABEL: define void @cost_scalar_load_of_address( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr [[DST:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i32 [[L]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[L_EXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 8 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src, align 4 + %l.ext = sext i32 %l to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %l.ext + store i32 0, ptr %gep.dst, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 8 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +%t = type { [3 x double] } +%t.2 = type { [ 64 x double ] } + +define double @test_scalarization_cost_for_load_of_address(ptr %src.0, ptr %src.1, ptr %src.2) { +; CHECK-LABEL: define double @test_scalarization_cost_for_load_of_address( +; CHECK-SAME: ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], ptr [[SRC_2:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi double [ 3.000000e+00, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr [[T:%.*]], ptr [[SRC_0]], i64 [[IV]] +; CHECK-NEXT: [[L_0:%.*]] = load double, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 8 +; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_8]], align 8 +; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[GEP_0]], i64 16 +; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[GEP_16]], align 8 +; CHECK-NEXT: [[MUL_0:%.*]] = fmul double [[L_0]], 3.000000e+00 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul double [[L_1]], 3.000000e+00 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul double [[L_2]], 3.000000e+00 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd double [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[MUL256_US:%.*]] = fmul double [[ADD_1]], [[L]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[T_2:%.*]], ptr [[SRC_2]], i64 [[IV]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP11]], i64 72 +; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[MUL256_US]], double [[TMP17]], double [[RED]]) +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret double [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi double [ 3.000000e+00, %entry ], [ %red.next, %loop ] + %gep.0 = getelementptr %t, ptr 
%src.0, i64 %iv + %l.0 = load double, ptr %gep.0, align 8 + %gep.8 = getelementptr i8, ptr %gep.0, i64 8 + %l.1 = load double, ptr %gep.8, align 8 + %gep.16 = getelementptr i8, ptr %gep.0, i64 16 + %l.2 = load double, ptr %gep.16, align 8 + %mul.0 = fmul double %l.0, 3.000000e+00 + %mul.1 = fmul double %l.1, 3.000000e+00 + %mul.2 = fmul double %l.2, 3.000000e+00 + %add.0 = fadd double %mul.0, %mul.1 + %add.1 = fadd double %add.0, %mul.2 + %gep.src = getelementptr double, ptr %src.1, i64 %iv + %l = load double, ptr %gep.src, align 8 + %mul256.us = fmul double %add.1, %l + %gep.src.2 = getelementptr %t.2, ptr %src.2, i64 %iv + %gep.72 = getelementptr i8, ptr %gep.src.2, i64 72 + %l.p.2 = load ptr, ptr %gep.72, align 8 + %lv = load double, ptr %l.p.2, align 8 + %red.next = tail call double @llvm.fmuladd.f64(double %mul256.us, double %lv, double %red) + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret double %red.next +} + +attributes #0 = { "target-cpu"="neoverse-512tvb" } + +!0 = !{!1, !2, i64 0} +!1 = !{!"", !2, i64 0} +!2 = !{!"long long", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !3, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll index 403fc9f316d35..20409f66fc51f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll @@ -73,3 +73,26 @@ exit: %1 = select i1 %all.off, i32 1, i32 %0 ret i32 %1 } + +define i32 @select_vpinst_for_tail_folding(i8 %n) { +; CHECK: LV: Checking a loop in 'select_vpinst_for_tail_folding' +; CHECK: Cost of 1 for VF 2: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red> +; CHECK: Cost of 1 for VF 4: EMIT vp<{{.+}}> = select vp<{{.+}}>, ir<%red.next>, ir<%red> +; CHECK: LV: Selecting VF: 4 + +entry: + %c = icmp ne i8 %n, 0 + %ext = zext i1 %c to i32 + br label %loop + +loop: + %iv = phi i32 [ %ext, %entry ], [ %iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %iv.next = add i32 %iv, 1 + %red.next = mul i32 %red, %iv + %ec = icmp eq i32 %iv, 12 + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..fd83a012541b5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/replicating-load-store-costs.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "armv7-unknown-linux-gnueabihf" + +define void @replicating_load_used_by_other_load(i32 %arg, ptr %a, i32 %b) { +; CHECK-LABEL: define void @replicating_load_used_by_other_load( +; CHECK-SAME: i32 [[ARG:%.*]], ptr [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[ARG]], %[[ENTRY]] ] +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[IV]], 1 +; CHECK-NEXT: [[AND_1:%.*]] = and i32 [[IV]], 1 +; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[IV]], 2 +; CHECK-NEXT: [[SHL_2:%.*]] = shl i32 [[IV]], 1 +; CHECK-NEXT: [[AND_2:%.*]] = and i32 [[SHL_2]], 2 +; CHECK-NEXT: [[OR_1:%.*]] = or i32 [[AND_2]], [[AND_1]] +; CHECK-NEXT: [[OR_2:%.*]] = 
or i32 [[OR_1]], [[SHL_1]] +; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[B]], [[OR_2]] +; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[XOR_1]], [[ARG]] +; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[SHL_1]], 1 +; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[SHR]], [[ARG]] +; CHECK-NEXT: [[AND_3:%.*]] = and i32 [[XOR_3]], 1 +; CHECK-NEXT: [[AND_4:%.*]] = and i32 [[IV]], 2147483646 +; CHECK-NEXT: [[OR_3:%.*]] = or i32 [[AND_3]], [[AND_4]] +; CHECK-NEXT: [[AND_5:%.*]] = and i32 [[IV]], 254 +; CHECK-NEXT: [[SHL_3:%.*]] = shl i32 [[OR_3]], 1 +; CHECK-NEXT: [[XOR_4:%.*]] = xor i32 [[SHL_3]], 2 +; CHECK-NEXT: [[OR_4:%.*]] = or i32 [[AND_5]], [[XOR_4]] +; CHECK-NEXT: [[XOR_5:%.*]] = xor i32 [[SHR_2]], [[OR_4]] +; CHECK-NEXT: [[XOR_6:%.*]] = xor i32 [[XOR_5]], [[XOR_2]] +; CHECK-NEXT: [[AND_6:%.*]] = and i32 [[XOR_6]], 255 +; CHECK-NEXT: [[XOR_7:%.*]] = xor i32 [[AND_6]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i32 [[XOR_7]] +; CHECK-NEXT: [[LD:%.*]] = load i8, ptr [[GEP]], align 1 +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD]] to i32 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr null, i32 [[ZEXT]] +; CHECK-NEXT: store i32 0, ptr [[GEP_2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 100 +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ %arg, %entry ] + %shr = lshr i32 %iv, 1 + %and.1 = and i32 %iv, 1 + %shl.1 = shl i32 %iv, 2 + %shl.2 = shl i32 %iv, 1 + %and.2 = and i32 %shl.2, 2 + %or.1 = or i32 %and.2, %and.1 + %or.2 = or i32 %or.1, %shl.1 + %xor.1 = xor i32 %b, %or.2 + %xor.2 = xor i32 %xor.1, %arg + %shr.2 = lshr i32 %shl.1, 1 + %xor.3 = xor i32 %shr, %arg + %and.3 = and i32 %xor.3, 1 + %and.4 = and i32 %iv, 2147483646 + %or.3 = or i32 %and.3, %and.4 + %and.5 = and i32 %iv, 254 + %shl.3 = shl i32 %or.3, 1 + %xor.4 = xor i32 %shl.3, 2 + %or.4 = or i32 %and.5, %xor.4 + %xor.5 = xor i32 %shr.2, %or.4 + %xor.6 = xor i32 %xor.5, %xor.2 + %and.6 = and i32 %xor.6, 255 + %xor.7 = xor i32 %and.6, 1 + %gep = getelementptr i8, ptr %a, i32 %xor.7 + %ld = load i8, ptr %gep, align 1 + %zext = zext i8 %ld to i32 + %gep.2 = getelementptr i32, ptr null, i32 %zext + store i32 0, ptr %gep.2, align 4 + %iv.next = add i32 %iv, 1 + %cmp = icmp eq i32 %iv.next, 100 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll index 06e345f9c12ec..934f1ecb49410 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll @@ -23,7 +23,38 @@ define void @outside_user_blocks_tail_folding(ptr nocapture readonly %ptr, i32 % ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 5 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 6 +; CHECK-NEXT: 
[[TMP9:%.*]] = add i32 [[INDEX]], 7 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 9 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 10 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 11 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 13 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 14 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 15 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP18]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP20]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP10]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP13]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP14]] +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP16]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index c6661cb644d80..8c550a752610e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -1019,3 +1019,79 @@ latch: for.end: ret void } + +; Test for https://github.com/llvm/llvm-project/issues/159402. For invariant divisors, +; selects can be introduced outside the vector loop and their cost should not be +; considered for each loop iteration. 
+define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { +; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ -12, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[ENTRY]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X:%.*]] +; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 +; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y:%.*]] +; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 +; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[MERGE_LCSSA]] +; +; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors( +; FIXED-NEXT: entry: +; FIXED-NEXT: br label [[LOOP_HEADER:%.*]] +; FIXED: loop.header: +; FIXED-NEXT: [[IV:%.*]] = phi i16 [ -12, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; FIXED-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] +; FIXED-NEXT: br i1 [[C:%.*]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; FIXED: then: +; FIXED-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X:%.*]] +; FIXED-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 +; FIXED-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y:%.*]] +; FIXED-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 +; FIXED-NEXT: br label [[LOOP_LATCH]] +; FIXED: loop.latch: +; FIXED-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ] +; FIXED-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 +; FIXED-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 +; FIXED-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 +; FIXED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; FIXED: exit: +; FIXED-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ] +; FIXED-NEXT: ret i32 [[MERGE_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i16 [ -12, %entry ], [ %iv.next, %loop.latch ] + %narrow.iv = phi i8 [ -12, %entry ], [ %iv.next.trunc, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + %ud = udiv i8 %narrow.iv, %x + %ud.ext = zext i8 %ud to i16 + %sd = sdiv i16 %ud.ext, %y + %sd.ext = sext i16 %sd to i32 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ 0, %loop.header ], [ %sd.ext, %then ] + %iv.next = add nsw i16 %iv, 1 + %ec = icmp eq i16 %iv.next, 0 + %iv.next.trunc = trunc i16 %iv.next to i8 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i32 %merge +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index 875770cd047ed..b0d9cbe07ff7e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,10 +102,10 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: 
[[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]]
@@ -116,12 +116,13 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56, i64 64, i64 72, i64 80, i64 88, i64 96, i64 104, i64 112, i64 120, i64 128, i64 136, i64 144, i64 152, i64 160, i64 168, i64 176, i64 184, i64 192, i64 200, i64 208, i64 216, i64 224, i64 232, i64 240, i64 248>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1
+; CHECK-NEXT: [[VECTOR_GEP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> <i64 256, i64 264, i64 272, i64 280, i64 288, i64 296, i64 304, i64 312, i64 320, i64 328, i64 336, i64 344, i64 352, i64 360, i64 368, i64 376, i64 384, i64 392, i64 400, i64 408, i64 416, i64 424, i64 432, i64 440, i64 448, i64 456, i64 464, i64 472, i64 480, i64 488, i64 496, i64 504>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[VECTOR_GEP1]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31
; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
index 86758b5a24fe9..a3e3038c6b748 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
@@ -43,7 +43,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store float %v4, ptr %out4, align 4
; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store float %v4, ptr %out4, align 4
; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store float %v4, ptr %out4, align 4
-; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store float %v4, ptr %out4, align 4
;
entry:
  br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
index 53c8f59491e76..f112e82e5b291 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll
@@ -43,7 +43,6 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, ptr %out5, align 4
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, ptr %out5, align 4
; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, 
ptr %out5, align 4 -; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll index 3f4216bb3a1ef..7792ff36f9a7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store double %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll index 555bbe8e44269..110be8f6fb2f1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll @@ -39,7 +39,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store double %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store double %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll index f4fbbec3a46f5..b369f71627bbc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store i32 %v4, ptr %out4, align 4 -; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store i32 %v4, ptr %out4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll index 4f35f667276d8..22671885a0da3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4 ; AVX512: LV: Found an estimated cost of 102 for VF 32 
For instruction: store i32 %v5, ptr %out5, align 4 -; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll index fe1dad3c3effc..b2f0c31f75c9d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll @@ -43,7 +43,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 ; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store i64 %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll index 881c7867614b7..55debbdc64182 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll @@ -39,7 +39,6 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store i64 %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll new file mode 100644 index 0000000000000..72b59028d9ae3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -0,0 +1,1198 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target triple = "x86_64-apple-macosx10.8.0" + +; Test case for https://github.com/llvm/llvm-project/issues/156091. 
+define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ptr align 4 noalias %C, ptr align 4 noalias %D, ptr noalias %E) #0 { +; CHECK-LABEL: @test_replicate_call_chain( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x float>, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <16 x float> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP79]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison) +; CHECK-NEXT: [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]]) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]]) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6 +; CHECK-NEXT: [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]]) +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]]) +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]]) +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9 +; CHECK-NEXT: [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]]) +; CHECK-NEXT: 
[[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]]) +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11 +; CHECK-NEXT: [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]]) +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12 +; CHECK-NEXT: [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]]) +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13 +; CHECK-NEXT: [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]]) +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14 +; CHECK-NEXT: [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]]) +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15 +; CHECK-NEXT: [[TMP42:%.*]] = tail call float @llvm.pow.f32(float [[TMP41]], float [[X]]) +; CHECK-NEXT: [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]]) +; CHECK-NEXT: [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]]) +; CHECK-NEXT: [[TMP45:%.*]] = tail call float @llvm.pow.f32(float [[TMP16]], float [[X]]) +; CHECK-NEXT: [[TMP46:%.*]] = tail call float @llvm.pow.f32(float [[TMP18]], float [[X]]) +; CHECK-NEXT: [[TMP47:%.*]] = tail call float @llvm.pow.f32(float [[TMP20]], float [[X]]) +; CHECK-NEXT: [[TMP48:%.*]] = tail call float @llvm.pow.f32(float [[TMP22]], float [[X]]) +; CHECK-NEXT: [[TMP49:%.*]] = tail call float @llvm.pow.f32(float [[TMP24]], float [[X]]) +; CHECK-NEXT: [[TMP50:%.*]] = tail call float @llvm.pow.f32(float [[TMP26]], float [[X]]) +; CHECK-NEXT: [[TMP51:%.*]] = tail call float @llvm.pow.f32(float [[TMP28]], float [[X]]) +; CHECK-NEXT: [[TMP52:%.*]] = tail call float @llvm.pow.f32(float [[TMP30]], float [[X]]) +; CHECK-NEXT: [[TMP53:%.*]] = tail call float @llvm.pow.f32(float [[TMP32]], float [[X]]) +; CHECK-NEXT: [[TMP54:%.*]] = tail call float @llvm.pow.f32(float [[TMP34]], float [[X]]) +; CHECK-NEXT: [[TMP55:%.*]] = tail call float @llvm.pow.f32(float [[TMP36]], float [[X]]) +; CHECK-NEXT: [[TMP56:%.*]] = tail call float @llvm.pow.f32(float [[TMP38]], float [[X]]) +; CHECK-NEXT: [[TMP57:%.*]] = tail call float @llvm.pow.f32(float [[TMP40]], float [[X]]) +; CHECK-NEXT: [[TMP58:%.*]] = tail call float @llvm.pow.f32(float [[TMP42]], float [[X]]) +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <16 x float> poison, float [[TMP43]], i32 0 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x float> [[TMP59]], float [[TMP44]], i32 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <16 x float> [[TMP60]], float [[TMP45]], i32 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x float> [[TMP61]], float [[TMP46]], i32 3 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <16 x float> [[TMP62]], float [[TMP47]], i32 4 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x float> [[TMP63]], float [[TMP48]], i32 5 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x float> [[TMP64]], float [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <16 x float> [[TMP65]], float [[TMP50]], i32 7 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x float> [[TMP66]], float [[TMP51]], i32 8 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x float> [[TMP67]], float [[TMP52]], i32 9 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <16 x float> [[TMP68]], float [[TMP53]], i32 10 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x float> [[TMP69]], float [[TMP54]], i32 11 +; 
CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x float> [[TMP70]], float [[TMP55]], i32 12 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x float> [[TMP71]], float [[TMP56]], i32 13 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x float> [[TMP72]], float [[TMP57]], i32 14 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x float> [[TMP73]], float [[TMP58]], i32 15 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr float, ptr [[TMP5]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr [[TMP78]], i32 4, <16 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr [[TMP78]], i32 4, <16 x i1> [[TMP6]]) +; CHECK-NEXT: store float 0.000000e+00, ptr [[E:%.*]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[DEC_IV_NEXT:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[IV_INC:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[L_A:%.*]] = load float, ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[C_A:%.*]] = fcmp ogt float [[L_A]], 0.000000e+00 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load float, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_B:%.*]] = fcmp ogt float [[L_B]], 0.000000e+00 +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr float, ptr [[C]], i64 [[IV_INC]] +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_A]], [[C_B]] +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_C]], align 4 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[IV_MUL_2:%.*]] = shl i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV_MUL_2]] +; CHECK-NEXT: [[L_D:%.*]] = load float, ptr [[GEP_D]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[L_D]], 2.000000e+00 +; CHECK-NEXT: [[POW_1:%.*]] = tail call float @llvm.pow.f32(float [[MUL]], float [[X]]) +; CHECK-NEXT: [[POW_2:%.*]] = tail call float @llvm.pow.f32(float [[POW_1]], float [[X]]) +; CHECK-NEXT: store float [[POW_2]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: store float 0.000000e+00, ptr [[E]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[DEC_IV_NEXT]] = add i64 [[DEC_IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[DEC_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %dec.iv = phi i64 [ 100, %entry ], [ %dec.iv.next, %loop.latch ] + %iv.inc = add i64 %iv, 1 + %gep.A = getelementptr inbounds float, 
ptr %A, i64 %iv + %l.A = load float, ptr %gep.A, align 4 + %c.A = fcmp ogt float %l.A, 0.0 + %gep.B = getelementptr inbounds float, ptr %B, i64 %iv + %l.B = load float, ptr %gep.B, align 4 + %c.B = fcmp ogt float %l.B, 0.0 + %gep.C = getelementptr float, ptr %C, i64 %iv.inc + %and = and i1 %c.A, %c.B + br i1 %and, label %then, label %else + +then: + store float 0.0, ptr %gep.C, align 4 + br label %loop.latch + +else: + %iv.mul.2 = shl i64 %iv, 2 + %gep.D = getelementptr i8, ptr %D, i64 %iv.mul.2 + %l.D = load float, ptr %gep.D, align 4 + %mul = fmul float %l.D, 2.0 + %pow.1 = tail call float @llvm.pow.f32(float %mul, float %x) + %pow.2 = tail call float @llvm.pow.f32(float %pow.1, float %x) + store float %pow.2, ptr %gep.C, align 4 + br label %loop.latch + +loop.latch: + store float 0.000000e+00, ptr %E, align 4 + %iv.next = add i64 %iv, 1 + %dec.iv.next = add i64 %dec.iv, -1 + %ec = icmp ne i64 %dec.iv.next, 0 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { +; CHECK-LABEL: @avx512_cond_load_cost( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]] +; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2 +; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 +; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[RES_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %c.1 = icmp slt i32 %iv, 0 + br i1 %c.1, label %if.then, label %loop.latch + +if.then: + %0 = urem i32 %a, %c + %mul = sub i32 0, %0 + %div = udiv i32 %c, %d + %or = or i32 %div, %mul + %ext = sext i32 %or to i64 + %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2 + %l = load i64, ptr %gep, align 8 + %or.2 = or i64 %l, %b + br label %loop.latch + +loop.latch: + %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ] + %iv.next = add i32 %iv, 1 + %ec = icmp ult i32 %iv, %c + br i1 %ec, label %loop.header, label %exit + +exit: + ret i64 %res +} + +define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { +; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 +;
CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 +; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK: pred.store.if8: +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK: pred.store.continue9: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label
[[PRED_STORE_CONTINUE11:%.*]] +; CHECK: pred.store.if10: +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK: pred.store.continue11: +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; CHECK: pred.store.if12: +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK: pred.store.continue13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 +; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK: pred.store.if14: +; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] +; CHECK: pred.store.continue15: +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 +; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK: pred.store.if16: +; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK: pred.store.continue17: +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 +; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK: pred.store.if18: +; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK: pred.store.continue19: +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 +; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; CHECK: pred.store.if20: +; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 +; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; CHECK: pred.store.if22: +; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] +; 
CHECK: pred.store.continue23: +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 +; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; CHECK: pred.store.if24: +; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP58]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.continue25: +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; CHECK: pred.store.if26: +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] +; CHECK: pred.store.continue27: +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; CHECK: pred.store.if28: +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] +; CHECK: pred.store.continue29: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; CHECK: pred.store.if30: +; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] +; CHECK: pred.store.continue31: +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; CHECK: pred.store.if32: +; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] +; CHECK: pred.store.continue33: +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 +; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] +; CHECK: pred.store.if34: +; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] +; CHECK: pred.store.continue35: +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.if36: +; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 +; CHECK-NEXT: 
[[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] +; CHECK: pred.store.continue37: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 +; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] +; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] +; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]] +; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 +; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 +; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; CHECK: pred.store.if43: +; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 +; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] +; CHECK: pred.store.continue44: +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; CHECK: pred.store.if45: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 +; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 +; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.continue46: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; CHECK: pred.store.if47: +; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 +; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 +; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] +; CHECK: pred.store.continue48: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4
x i1> [[TMP90]], i32 3 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.if49: +; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 +; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] +; CHECK: pred.store.continue50: +; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 +; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] +; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] +; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.shl = shl nsw i64 %iv, 2 + %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl + %l = load double, ptr %gep.0, align 8 + %c = fcmp oeq double %l, 0.000000e+00 + br i1 %c, label %if.then, label %loop.latch + +if.then: + %gep.1 = getelementptr double, ptr %A, i64 %iv.shl + store double 0.000000e+00, ptr %gep.1, align 8 + br label %loop.latch + +loop.latch: + %iv.next = add nsw i64 %iv, 1 + %ec = icmp eq i64 %iv, %N + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +; Test for https://github.com/llvm/llvm-project/issues/129236. 
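+; One operand of the ashr below is only known to be loop-invariant when queried via SCEV, not from the IR alone.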
+define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { +; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 +; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 +; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] +; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 +; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 +; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]] +; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1 +; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]] +; CHECK: loop.latch: +; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[P_2_LCSSA]] +; +entry: + %cmp.i = icmp eq i16 0, 0 + %conv.i = sext i16 0 to i32 + %conv5.i = sext i8 %a to i32 + br label %loop.header + +loop.header: + %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ] + br i1 %cmp.i, label %then, label %else + +then: + %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ] + %shr.i = ashr i32 %conv5.i, %p.1 + %tobool6.not.i = icmp eq i32 %shr.i, 0 + %sext.i = shl i32 %p.1, 24 + %0 = ashr exact i32 %sext.i, 24 + %1 = select i1 %tobool6.not.i, i32 %0, i32 0 + br label %loop.latch + +else: + %rem.i = urem i32 -1, %conv.i + %cmp3.i = icmp sgt i32 %rem.i, 1 + br i1 %cmp3.i, label %loop.latch, label %then + +loop.latch: + %p.2 = phi i32 [ 0, %else ], [ %1, %then ] + %iv.next = add i8 %iv, -1 + %ec = icmp eq i8 %iv.next, 0 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i32 %p.2 +} + +; Test case for https://github.com/llvm/llvm-project/issues/156066.
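+; The sdiv below divides by the constant 0 and is only reached behind a guard, so it must remain predicated when the loop is vectorized.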
+define void @sdiv_by_zero(ptr noalias %src, ptr noalias %dst, i32 %d) #2 { +; CHECK-LABEL: @sdiv_by_zero( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE14:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP44]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i1> [[TMP46]], splat (i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] +; CHECK: pred.sdiv.if: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = sdiv i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]] +; CHECK: pred.sdiv.continue: +; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]] +; CHECK: pred.sdiv.if1: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = sdiv i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE2]] +; CHECK: pred.sdiv.continue2: +; CHECK-NEXT: [[TMP11:%.*]] = phi <8 x i32> [ [[TMP6]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]] +; CHECK: pred.sdiv.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sdiv i32 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP14]], i32 2 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]] +; CHECK: pred.sdiv.continue4: +; CHECK-NEXT: [[TMP16:%.*]] = phi <8 x i32> [ [[TMP11]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP15]], [[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]] +; CHECK: pred.sdiv.if5: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 [[TMP18]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP16]], i32 [[TMP19]], i32 3 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE6]] +; CHECK: pred.sdiv.continue6: +; CHECK-NEXT: [[TMP21:%.*]] = phi <8 x i32> [ [[TMP16]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]] +; CHECK: pred.sdiv.if7: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = sdiv i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> 
[[TMP21]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE8]] +; CHECK: pred.sdiv.continue8: +; CHECK-NEXT: [[TMP26:%.*]] = phi <8 x i32> [ [[TMP21]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_SDIV_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]] +; CHECK: pred.sdiv.if9: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 5 +; CHECK-NEXT: [[TMP29:%.*]] = sdiv i32 [[TMP28]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP29]], i32 5 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE10]] +; CHECK: pred.sdiv.continue10: +; CHECK-NEXT: [[TMP31:%.*]] = phi <8 x i32> [ [[TMP26]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_SDIV_IF9]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]] +; CHECK: pred.sdiv.if11: +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = sdiv i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP34]], i32 6 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE12]] +; CHECK: pred.sdiv.continue12: +; CHECK-NEXT: [[TMP36:%.*]] = phi <8 x i32> [ [[TMP31]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP35]], [[PRED_SDIV_IF11]] ] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14]] +; CHECK: pred.sdiv.if13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i32> [[WIDE_LOAD]], i32 7 +; CHECK-NEXT: [[TMP39:%.*]] = sdiv i32 [[TMP38]], 0 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP39]], i32 7 +; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE14]] +; CHECK: pred.sdiv.continue14: +; CHECK-NEXT: [[TMP41:%.*]] = phi <8 x i32> [ [[TMP36]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP40]], [[PRED_SDIV_IF13]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP46]], <8 x i32> zeroinitializer, <8 x i32> [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 0 +; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP45]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SDIV:%.*]] = sdiv i32 [[L]], 0 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[SDIV]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds 
i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 16 +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +bb: + br label %loop.header + +loop.header: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %bb ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src, align 4 + %icmp = icmp eq i32 %l, 0 + br i1 %icmp, label %loop.latch, label %then + +then: + %sdiv = sdiv i32 %l, 0 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %sdiv, %then ], [ 0, %loop.header ] + %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %merge, ptr %gep.dst, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 16 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +; Test case for https://github.com/llvm/llvm-project/issues/158660. +define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { +; CHECK-LABEL: @test_predicated_udiv( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = xor <32 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE62:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE62]] ] +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i32> @llvm.usub.sat.v32i32(<32 x i32> [[VEC_IND]], <32 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK: pred.udiv.if: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <32 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[D:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <32 x i32> poison, i32 [[TMP4]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] +; CHECK: pred.udiv.continue: +; CHECK-NEXT: [[TMP6:%.*]] = phi <32 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <32 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2:%.*]] +; CHECK: pred.udiv.if1: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = udiv i32 [[TMP8]], [[D]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <32 x i32> [[TMP6]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE2]] +; CHECK: pred.udiv.continue2: +; CHECK-NEXT: [[TMP11:%.*]] = phi <32 x i32> [ [[TMP6]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP10]], [[PRED_UDIV_IF1]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <32 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]] +; CHECK: pred.udiv.if3: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <32 x
i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i32 [[TMP13]], [[D]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <32 x i32> [[TMP11]], i32 [[TMP14]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE4]] +; CHECK: pred.udiv.continue4: +; CHECK-NEXT: [[TMP16:%.*]] = phi <32 x i32> [ [[TMP11]], [[PRED_UDIV_CONTINUE2]] ], [ [[TMP15]], [[PRED_UDIV_IF3]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <32 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]] +; CHECK: pred.udiv.if5: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <32 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 [[TMP18]], [[D]] +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <32 x i32> [[TMP16]], i32 [[TMP19]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE6]] +; CHECK: pred.udiv.continue6: +; CHECK-NEXT: [[TMP21:%.*]] = phi <32 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP20]], [[PRED_UDIV_IF5]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <32 x i1> [[TMP0]], i32 4 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]] +; CHECK: pred.udiv.if7: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <32 x i32> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = udiv i32 [[TMP23]], [[D]] +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <32 x i32> [[TMP21]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE8]] +; CHECK: pred.udiv.continue8: +; CHECK-NEXT: [[TMP26:%.*]] = phi <32 x i32> [ [[TMP21]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP25]], [[PRED_UDIV_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <32 x i1> [[TMP0]], i32 5 +; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]] +; CHECK: pred.udiv.if9: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <32 x i32> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP29:%.*]] = udiv i32 [[TMP28]], [[D]] +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <32 x i32> [[TMP26]], i32 [[TMP29]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE10]] +; CHECK: pred.udiv.continue10: +; CHECK-NEXT: [[TMP31:%.*]] = phi <32 x i32> [ [[TMP26]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP30]], [[PRED_UDIV_IF9]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <32 x i1> [[TMP0]], i32 6 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]] +; CHECK: pred.udiv.if11: +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <32 x i32> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP34:%.*]] = udiv i32 [[TMP33]], [[D]] +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <32 x i32> [[TMP31]], i32 [[TMP34]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE12]] +; CHECK: pred.udiv.continue12: +; CHECK-NEXT: [[TMP36:%.*]] = phi <32 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE10]] ], [ [[TMP35]], [[PRED_UDIV_IF11]] ] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <32 x i1> [[TMP0]], i32 7 +; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]] +; CHECK: pred.udiv.if13: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <32 x i32> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP39:%.*]] = udiv i32 [[TMP38]], [[D]] +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <32 x i32> [[TMP36]], i32 [[TMP39]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE14]] +; CHECK: pred.udiv.continue14: +; CHECK-NEXT: [[TMP41:%.*]] = phi <32 x i32> [ [[TMP36]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP40]], [[PRED_UDIV_IF13]] ] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <32 x i1> [[TMP0]], i32 8 +; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_UDIV_IF15:%.*]], 
label [[PRED_UDIV_CONTINUE16:%.*]] +; CHECK: pred.udiv.if15: +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <32 x i32> [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP44:%.*]] = udiv i32 [[TMP43]], [[D]] +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <32 x i32> [[TMP41]], i32 [[TMP44]], i32 8 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE16]] +; CHECK: pred.udiv.continue16: +; CHECK-NEXT: [[TMP46:%.*]] = phi <32 x i32> [ [[TMP41]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP45]], [[PRED_UDIV_IF15]] ] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <32 x i1> [[TMP0]], i32 9 +; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18:%.*]] +; CHECK: pred.udiv.if17: +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <32 x i32> [[TMP1]], i32 9 +; CHECK-NEXT: [[TMP49:%.*]] = udiv i32 [[TMP48]], [[D]] +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <32 x i32> [[TMP46]], i32 [[TMP49]], i32 9 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE18]] +; CHECK: pred.udiv.continue18: +; CHECK-NEXT: [[TMP51:%.*]] = phi <32 x i32> [ [[TMP46]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP50]], [[PRED_UDIV_IF17]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <32 x i1> [[TMP0]], i32 10 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20:%.*]] +; CHECK: pred.udiv.if19: +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <32 x i32> [[TMP1]], i32 10 +; CHECK-NEXT: [[TMP54:%.*]] = udiv i32 [[TMP53]], [[D]] +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <32 x i32> [[TMP51]], i32 [[TMP54]], i32 10 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE20]] +; CHECK: pred.udiv.continue20: +; CHECK-NEXT: [[TMP56:%.*]] = phi <32 x i32> [ [[TMP51]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP55]], [[PRED_UDIV_IF19]] ] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <32 x i1> [[TMP0]], i32 11 +; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_UDIV_IF21:%.*]], label [[PRED_UDIV_CONTINUE22:%.*]] +; CHECK: pred.udiv.if21: +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <32 x i32> [[TMP1]], i32 11 +; CHECK-NEXT: [[TMP59:%.*]] = udiv i32 [[TMP58]], [[D]] +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <32 x i32> [[TMP56]], i32 [[TMP59]], i32 11 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE22]] +; CHECK: pred.udiv.continue22: +; CHECK-NEXT: [[TMP61:%.*]] = phi <32 x i32> [ [[TMP56]], [[PRED_UDIV_CONTINUE20]] ], [ [[TMP60]], [[PRED_UDIV_IF21]] ] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <32 x i1> [[TMP0]], i32 12 +; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_UDIV_IF23:%.*]], label [[PRED_UDIV_CONTINUE24:%.*]] +; CHECK: pred.udiv.if23: +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <32 x i32> [[TMP1]], i32 12 +; CHECK-NEXT: [[TMP64:%.*]] = udiv i32 [[TMP63]], [[D]] +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <32 x i32> [[TMP61]], i32 [[TMP64]], i32 12 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE24]] +; CHECK: pred.udiv.continue24: +; CHECK-NEXT: [[TMP66:%.*]] = phi <32 x i32> [ [[TMP61]], [[PRED_UDIV_CONTINUE22]] ], [ [[TMP65]], [[PRED_UDIV_IF23]] ] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <32 x i1> [[TMP0]], i32 13 +; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_UDIV_IF25:%.*]], label [[PRED_UDIV_CONTINUE26:%.*]] +; CHECK: pred.udiv.if25: +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <32 x i32> [[TMP1]], i32 13 +; CHECK-NEXT: [[TMP69:%.*]] = udiv i32 [[TMP68]], [[D]] +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <32 x i32> [[TMP66]], i32 [[TMP69]], i32 13 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE26]] +; CHECK: pred.udiv.continue26: +; CHECK-NEXT: [[TMP71:%.*]] = phi <32 x i32> [ [[TMP66]], [[PRED_UDIV_CONTINUE24]] ], [ [[TMP70]], 
[[PRED_UDIV_IF25]] ] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <32 x i1> [[TMP0]], i32 14 +; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_UDIV_IF27:%.*]], label [[PRED_UDIV_CONTINUE28:%.*]] +; CHECK: pred.udiv.if27: +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <32 x i32> [[TMP1]], i32 14 +; CHECK-NEXT: [[TMP74:%.*]] = udiv i32 [[TMP73]], [[D]] +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <32 x i32> [[TMP71]], i32 [[TMP74]], i32 14 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE28]] +; CHECK: pred.udiv.continue28: +; CHECK-NEXT: [[TMP76:%.*]] = phi <32 x i32> [ [[TMP71]], [[PRED_UDIV_CONTINUE26]] ], [ [[TMP75]], [[PRED_UDIV_IF27]] ] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <32 x i1> [[TMP0]], i32 15 +; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_UDIV_IF29:%.*]], label [[PRED_UDIV_CONTINUE30:%.*]] +; CHECK: pred.udiv.if29: +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <32 x i32> [[TMP1]], i32 15 +; CHECK-NEXT: [[TMP79:%.*]] = udiv i32 [[TMP78]], [[D]] +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <32 x i32> [[TMP76]], i32 [[TMP79]], i32 15 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE30]] +; CHECK: pred.udiv.continue30: +; CHECK-NEXT: [[TMP81:%.*]] = phi <32 x i32> [ [[TMP76]], [[PRED_UDIV_CONTINUE28]] ], [ [[TMP80]], [[PRED_UDIV_IF29]] ] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <32 x i1> [[TMP0]], i32 16 +; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_UDIV_IF31:%.*]], label [[PRED_UDIV_CONTINUE32:%.*]] +; CHECK: pred.udiv.if31: +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <32 x i32> [[TMP1]], i32 16 +; CHECK-NEXT: [[TMP84:%.*]] = udiv i32 [[TMP83]], [[D]] +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <32 x i32> [[TMP81]], i32 [[TMP84]], i32 16 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE32]] +; CHECK: pred.udiv.continue32: +; CHECK-NEXT: [[TMP86:%.*]] = phi <32 x i32> [ [[TMP81]], [[PRED_UDIV_CONTINUE30]] ], [ [[TMP85]], [[PRED_UDIV_IF31]] ] +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <32 x i1> [[TMP0]], i32 17 +; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_UDIV_IF33:%.*]], label [[PRED_UDIV_CONTINUE34:%.*]] +; CHECK: pred.udiv.if33: +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <32 x i32> [[TMP1]], i32 17 +; CHECK-NEXT: [[TMP89:%.*]] = udiv i32 [[TMP88]], [[D]] +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <32 x i32> [[TMP86]], i32 [[TMP89]], i32 17 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE34]] +; CHECK: pred.udiv.continue34: +; CHECK-NEXT: [[TMP91:%.*]] = phi <32 x i32> [ [[TMP86]], [[PRED_UDIV_CONTINUE32]] ], [ [[TMP90]], [[PRED_UDIV_IF33]] ] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <32 x i1> [[TMP0]], i32 18 +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_UDIV_IF35:%.*]], label [[PRED_UDIV_CONTINUE36:%.*]] +; CHECK: pred.udiv.if35: +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <32 x i32> [[TMP1]], i32 18 +; CHECK-NEXT: [[TMP94:%.*]] = udiv i32 [[TMP93]], [[D]] +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <32 x i32> [[TMP91]], i32 [[TMP94]], i32 18 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE36]] +; CHECK: pred.udiv.continue36: +; CHECK-NEXT: [[TMP96:%.*]] = phi <32 x i32> [ [[TMP91]], [[PRED_UDIV_CONTINUE34]] ], [ [[TMP95]], [[PRED_UDIV_IF35]] ] +; CHECK-NEXT: [[TMP97:%.*]] = extractelement <32 x i1> [[TMP0]], i32 19 +; CHECK-NEXT: br i1 [[TMP97]], label [[PRED_UDIV_IF37:%.*]], label [[PRED_UDIV_CONTINUE38:%.*]] +; CHECK: pred.udiv.if37: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <32 x i32> [[TMP1]], i32 19 +; CHECK-NEXT: [[TMP99:%.*]] = udiv i32 [[TMP98]], [[D]] +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <32 x i32> [[TMP96]], i32 [[TMP99]], i32 19 +; CHECK-NEXT: br label 
[[PRED_UDIV_CONTINUE38]] +; CHECK: pred.udiv.continue38: +; CHECK-NEXT: [[TMP101:%.*]] = phi <32 x i32> [ [[TMP96]], [[PRED_UDIV_CONTINUE36]] ], [ [[TMP100]], [[PRED_UDIV_IF37]] ] +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <32 x i1> [[TMP0]], i32 20 +; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_UDIV_IF39:%.*]], label [[PRED_UDIV_CONTINUE40:%.*]] +; CHECK: pred.udiv.if39: +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <32 x i32> [[TMP1]], i32 20 +; CHECK-NEXT: [[TMP104:%.*]] = udiv i32 [[TMP103]], [[D]] +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <32 x i32> [[TMP101]], i32 [[TMP104]], i32 20 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE40]] +; CHECK: pred.udiv.continue40: +; CHECK-NEXT: [[TMP106:%.*]] = phi <32 x i32> [ [[TMP101]], [[PRED_UDIV_CONTINUE38]] ], [ [[TMP105]], [[PRED_UDIV_IF39]] ] +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <32 x i1> [[TMP0]], i32 21 +; CHECK-NEXT: br i1 [[TMP107]], label [[PRED_UDIV_IF41:%.*]], label [[PRED_UDIV_CONTINUE42:%.*]] +; CHECK: pred.udiv.if41: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <32 x i32> [[TMP1]], i32 21 +; CHECK-NEXT: [[TMP109:%.*]] = udiv i32 [[TMP108]], [[D]] +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <32 x i32> [[TMP106]], i32 [[TMP109]], i32 21 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE42]] +; CHECK: pred.udiv.continue42: +; CHECK-NEXT: [[TMP111:%.*]] = phi <32 x i32> [ [[TMP106]], [[PRED_UDIV_CONTINUE40]] ], [ [[TMP110]], [[PRED_UDIV_IF41]] ] +; CHECK-NEXT: [[TMP112:%.*]] = extractelement <32 x i1> [[TMP0]], i32 22 +; CHECK-NEXT: br i1 [[TMP112]], label [[PRED_UDIV_IF43:%.*]], label [[PRED_UDIV_CONTINUE44:%.*]] +; CHECK: pred.udiv.if43: +; CHECK-NEXT: [[TMP113:%.*]] = extractelement <32 x i32> [[TMP1]], i32 22 +; CHECK-NEXT: [[TMP114:%.*]] = udiv i32 [[TMP113]], [[D]] +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <32 x i32> [[TMP111]], i32 [[TMP114]], i32 22 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE44]] +; CHECK: pred.udiv.continue44: +; CHECK-NEXT: [[TMP116:%.*]] = phi <32 x i32> [ [[TMP111]], [[PRED_UDIV_CONTINUE42]] ], [ [[TMP115]], [[PRED_UDIV_IF43]] ] +; CHECK-NEXT: [[TMP117:%.*]] = extractelement <32 x i1> [[TMP0]], i32 23 +; CHECK-NEXT: br i1 [[TMP117]], label [[PRED_UDIV_IF45:%.*]], label [[PRED_UDIV_CONTINUE46:%.*]] +; CHECK: pred.udiv.if45: +; CHECK-NEXT: [[TMP118:%.*]] = extractelement <32 x i32> [[TMP1]], i32 23 +; CHECK-NEXT: [[TMP119:%.*]] = udiv i32 [[TMP118]], [[D]] +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <32 x i32> [[TMP116]], i32 [[TMP119]], i32 23 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE46]] +; CHECK: pred.udiv.continue46: +; CHECK-NEXT: [[TMP121:%.*]] = phi <32 x i32> [ [[TMP116]], [[PRED_UDIV_CONTINUE44]] ], [ [[TMP120]], [[PRED_UDIV_IF45]] ] +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <32 x i1> [[TMP0]], i32 24 +; CHECK-NEXT: br i1 [[TMP122]], label [[PRED_UDIV_IF47:%.*]], label [[PRED_UDIV_CONTINUE48:%.*]] +; CHECK: pred.udiv.if47: +; CHECK-NEXT: [[TMP123:%.*]] = extractelement <32 x i32> [[TMP1]], i32 24 +; CHECK-NEXT: [[TMP124:%.*]] = udiv i32 [[TMP123]], [[D]] +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <32 x i32> [[TMP121]], i32 [[TMP124]], i32 24 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE48]] +; CHECK: pred.udiv.continue48: +; CHECK-NEXT: [[TMP126:%.*]] = phi <32 x i32> [ [[TMP121]], [[PRED_UDIV_CONTINUE46]] ], [ [[TMP125]], [[PRED_UDIV_IF47]] ] +; CHECK-NEXT: [[TMP127:%.*]] = extractelement <32 x i1> [[TMP0]], i32 25 +; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_UDIV_IF49:%.*]], label [[PRED_UDIV_CONTINUE50:%.*]] +; CHECK: pred.udiv.if49: +; CHECK-NEXT: [[TMP128:%.*]] = 
extractelement <32 x i32> [[TMP1]], i32 25 +; CHECK-NEXT: [[TMP129:%.*]] = udiv i32 [[TMP128]], [[D]] +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <32 x i32> [[TMP126]], i32 [[TMP129]], i32 25 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE50]] +; CHECK: pred.udiv.continue50: +; CHECK-NEXT: [[TMP131:%.*]] = phi <32 x i32> [ [[TMP126]], [[PRED_UDIV_CONTINUE48]] ], [ [[TMP130]], [[PRED_UDIV_IF49]] ] +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <32 x i1> [[TMP0]], i32 26 +; CHECK-NEXT: br i1 [[TMP132]], label [[PRED_UDIV_IF51:%.*]], label [[PRED_UDIV_CONTINUE52:%.*]] +; CHECK: pred.udiv.if51: +; CHECK-NEXT: [[TMP133:%.*]] = extractelement <32 x i32> [[TMP1]], i32 26 +; CHECK-NEXT: [[TMP134:%.*]] = udiv i32 [[TMP133]], [[D]] +; CHECK-NEXT: [[TMP135:%.*]] = insertelement <32 x i32> [[TMP131]], i32 [[TMP134]], i32 26 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE52]] +; CHECK: pred.udiv.continue52: +; CHECK-NEXT: [[TMP136:%.*]] = phi <32 x i32> [ [[TMP131]], [[PRED_UDIV_CONTINUE50]] ], [ [[TMP135]], [[PRED_UDIV_IF51]] ] +; CHECK-NEXT: [[TMP137:%.*]] = extractelement <32 x i1> [[TMP0]], i32 27 +; CHECK-NEXT: br i1 [[TMP137]], label [[PRED_UDIV_IF53:%.*]], label [[PRED_UDIV_CONTINUE54:%.*]] +; CHECK: pred.udiv.if53: +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <32 x i32> [[TMP1]], i32 27 +; CHECK-NEXT: [[TMP139:%.*]] = udiv i32 [[TMP138]], [[D]] +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <32 x i32> [[TMP136]], i32 [[TMP139]], i32 27 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE54]] +; CHECK: pred.udiv.continue54: +; CHECK-NEXT: [[TMP141:%.*]] = phi <32 x i32> [ [[TMP136]], [[PRED_UDIV_CONTINUE52]] ], [ [[TMP140]], [[PRED_UDIV_IF53]] ] +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <32 x i1> [[TMP0]], i32 28 +; CHECK-NEXT: br i1 [[TMP142]], label [[PRED_UDIV_IF55:%.*]], label [[PRED_UDIV_CONTINUE56:%.*]] +; CHECK: pred.udiv.if55: +; CHECK-NEXT: [[TMP143:%.*]] = extractelement <32 x i32> [[TMP1]], i32 28 +; CHECK-NEXT: [[TMP144:%.*]] = udiv i32 [[TMP143]], [[D]] +; CHECK-NEXT: [[TMP145:%.*]] = insertelement <32 x i32> [[TMP141]], i32 [[TMP144]], i32 28 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE56]] +; CHECK: pred.udiv.continue56: +; CHECK-NEXT: [[TMP146:%.*]] = phi <32 x i32> [ [[TMP141]], [[PRED_UDIV_CONTINUE54]] ], [ [[TMP145]], [[PRED_UDIV_IF55]] ] +; CHECK-NEXT: [[TMP147:%.*]] = extractelement <32 x i1> [[TMP0]], i32 29 +; CHECK-NEXT: br i1 [[TMP147]], label [[PRED_UDIV_IF57:%.*]], label [[PRED_UDIV_CONTINUE58:%.*]] +; CHECK: pred.udiv.if57: +; CHECK-NEXT: [[TMP148:%.*]] = extractelement <32 x i32> [[TMP1]], i32 29 +; CHECK-NEXT: [[TMP149:%.*]] = udiv i32 [[TMP148]], [[D]] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <32 x i32> [[TMP146]], i32 [[TMP149]], i32 29 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE58]] +; CHECK: pred.udiv.continue58: +; CHECK-NEXT: [[TMP151:%.*]] = phi <32 x i32> [ [[TMP146]], [[PRED_UDIV_CONTINUE56]] ], [ [[TMP150]], [[PRED_UDIV_IF57]] ] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <32 x i1> [[TMP0]], i32 30 +; CHECK-NEXT: br i1 [[TMP152]], label [[PRED_UDIV_IF59:%.*]], label [[PRED_UDIV_CONTINUE60:%.*]] +; CHECK: pred.udiv.if59: +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <32 x i32> [[TMP1]], i32 30 +; CHECK-NEXT: [[TMP154:%.*]] = udiv i32 [[TMP153]], [[D]] +; CHECK-NEXT: [[TMP155:%.*]] = insertelement <32 x i32> [[TMP151]], i32 [[TMP154]], i32 30 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE60]] +; CHECK: pred.udiv.continue60: +; CHECK-NEXT: [[TMP156:%.*]] = phi <32 x i32> [ [[TMP151]], [[PRED_UDIV_CONTINUE58]] ], [ [[TMP155]], [[PRED_UDIV_IF59]] ] +; 
CHECK-NEXT: [[TMP157:%.*]] = extractelement <32 x i1> [[TMP0]], i32 31 +; CHECK-NEXT: br i1 [[TMP157]], label [[PRED_UDIV_IF61:%.*]], label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.if61: +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <32 x i32> [[TMP1]], i32 31 +; CHECK-NEXT: [[TMP159:%.*]] = udiv i32 [[TMP158]], [[D]] +; CHECK-NEXT: [[TMP160:%.*]] = insertelement <32 x i32> [[TMP156]], i32 [[TMP159]], i32 31 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE62]] +; CHECK: pred.udiv.continue62: +; CHECK-NEXT: [[TMP161:%.*]] = phi <32 x i32> [ [[TMP156]], [[PRED_UDIV_CONTINUE60]] ], [ [[TMP160]], [[PRED_UDIV_IF61]] ] +; CHECK-NEXT: [[TMP162:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> +; CHECK-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[BROADCAST_SPLAT]], <32 x i64> zeroinitializer, <32 x i64> [[TMP162]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], splat (i32 32) +; CHECK-NEXT: [[TMP163:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i32 31 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT63]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP165:%.*]] = xor <8 x i1> [[BROADCAST_SPLAT64]], splat (i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT65:%.*]] = insertelement <8 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT66:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT65]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT66]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX67:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT86:%.*]], [[PRED_UDIV_CONTINUE84:%.*]] ] +; CHECK-NEXT: [[VEC_IND68:%.*]] = phi <8 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT87:%.*]], [[PRED_UDIV_CONTINUE84]] ] +; CHECK-NEXT: [[TMP166:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[VEC_IND68]], <8 x i32> splat (i32 1)) +; CHECK-NEXT: [[TMP167:%.*]] = extractelement <8 x i1> [[TMP165]], i32 0 +; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF69:%.*]], label [[PRED_UDIV_CONTINUE70:%.*]] +; CHECK: pred.udiv.if69: +; CHECK-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP166]], i32 0 +; CHECK-NEXT: [[TMP169:%.*]] = udiv i32 [[TMP168]], [[D]] +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <8 x i32> poison, i32 [[TMP169]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE70]] +; CHECK: pred.udiv.continue70: +; CHECK-NEXT: [[TMP171:%.*]] = phi <8 x i32> [ poison, [[VEC_EPILOG_VECTOR_BODY]] ], [ [[TMP170]], [[PRED_UDIV_IF69]] ] +; CHECK-NEXT: [[TMP172:%.*]] = extractelement <8 x i1> [[TMP165]], i32 1 +; CHECK-NEXT: br i1 [[TMP172]], label [[PRED_UDIV_IF71:%.*]], label [[PRED_UDIV_CONTINUE72:%.*]] +; CHECK: pred.udiv.if71: +; CHECK-NEXT: [[TMP173:%.*]] =
extractelement <8 x i32> [[TMP166]], i32 1 +; CHECK-NEXT: [[TMP174:%.*]] = udiv i32 [[TMP173]], [[D]] +; CHECK-NEXT: [[TMP175:%.*]] = insertelement <8 x i32> [[TMP171]], i32 [[TMP174]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE72]] +; CHECK: pred.udiv.continue72: +; CHECK-NEXT: [[TMP176:%.*]] = phi <8 x i32> [ [[TMP171]], [[PRED_UDIV_CONTINUE70]] ], [ [[TMP175]], [[PRED_UDIV_IF71]] ] +; CHECK-NEXT: [[TMP177:%.*]] = extractelement <8 x i1> [[TMP165]], i32 2 +; CHECK-NEXT: br i1 [[TMP177]], label [[PRED_UDIV_IF73:%.*]], label [[PRED_UDIV_CONTINUE74:%.*]] +; CHECK: pred.udiv.if73: +; CHECK-NEXT: [[TMP178:%.*]] = extractelement <8 x i32> [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP179:%.*]] = udiv i32 [[TMP178]], [[D]] +; CHECK-NEXT: [[TMP180:%.*]] = insertelement <8 x i32> [[TMP176]], i32 [[TMP179]], i32 2 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE74]] +; CHECK: pred.udiv.continue74: +; CHECK-NEXT: [[TMP181:%.*]] = phi <8 x i32> [ [[TMP176]], [[PRED_UDIV_CONTINUE72]] ], [ [[TMP180]], [[PRED_UDIV_IF73]] ] +; CHECK-NEXT: [[TMP182:%.*]] = extractelement <8 x i1> [[TMP165]], i32 3 +; CHECK-NEXT: br i1 [[TMP182]], label [[PRED_UDIV_IF75:%.*]], label [[PRED_UDIV_CONTINUE76:%.*]] +; CHECK: pred.udiv.if75: +; CHECK-NEXT: [[TMP183:%.*]] = extractelement <8 x i32> [[TMP166]], i32 3 +; CHECK-NEXT: [[TMP184:%.*]] = udiv i32 [[TMP183]], [[D]] +; CHECK-NEXT: [[TMP185:%.*]] = insertelement <8 x i32> [[TMP181]], i32 [[TMP184]], i32 3 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE76]] +; CHECK: pred.udiv.continue76: +; CHECK-NEXT: [[TMP186:%.*]] = phi <8 x i32> [ [[TMP181]], [[PRED_UDIV_CONTINUE74]] ], [ [[TMP185]], [[PRED_UDIV_IF75]] ] +; CHECK-NEXT: [[TMP187:%.*]] = extractelement <8 x i1> [[TMP165]], i32 4 +; CHECK-NEXT: br i1 [[TMP187]], label [[PRED_UDIV_IF77:%.*]], label [[PRED_UDIV_CONTINUE78:%.*]] +; CHECK: pred.udiv.if77: +; CHECK-NEXT: [[TMP188:%.*]] = extractelement <8 x i32> [[TMP166]], i32 4 +; CHECK-NEXT: [[TMP189:%.*]] = udiv i32 [[TMP188]], [[D]] +; CHECK-NEXT: [[TMP190:%.*]] = insertelement <8 x i32> [[TMP186]], i32 [[TMP189]], i32 4 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE78]] +; CHECK: pred.udiv.continue78: +; CHECK-NEXT: [[TMP191:%.*]] = phi <8 x i32> [ [[TMP186]], [[PRED_UDIV_CONTINUE76]] ], [ [[TMP190]], [[PRED_UDIV_IF77]] ] +; CHECK-NEXT: [[TMP192:%.*]] = extractelement <8 x i1> [[TMP165]], i32 5 +; CHECK-NEXT: br i1 [[TMP192]], label [[PRED_UDIV_IF79:%.*]], label [[PRED_UDIV_CONTINUE80:%.*]] +; CHECK: pred.udiv.if79: +; CHECK-NEXT: [[TMP193:%.*]] = extractelement <8 x i32> [[TMP166]], i32 5 +; CHECK-NEXT: [[TMP194:%.*]] = udiv i32 [[TMP193]], [[D]] +; CHECK-NEXT: [[TMP195:%.*]] = insertelement <8 x i32> [[TMP191]], i32 [[TMP194]], i32 5 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE80]] +; CHECK: pred.udiv.continue80: +; CHECK-NEXT: [[TMP196:%.*]] = phi <8 x i32> [ [[TMP191]], [[PRED_UDIV_CONTINUE78]] ], [ [[TMP195]], [[PRED_UDIV_IF79]] ] +; CHECK-NEXT: [[TMP197:%.*]] = extractelement <8 x i1> [[TMP165]], i32 6 +; CHECK-NEXT: br i1 [[TMP197]], label [[PRED_UDIV_IF81:%.*]], label [[PRED_UDIV_CONTINUE82:%.*]] +; CHECK: pred.udiv.if81: +; CHECK-NEXT: [[TMP198:%.*]] = extractelement <8 x i32> [[TMP166]], i32 6 +; CHECK-NEXT: [[TMP199:%.*]] = udiv i32 [[TMP198]], [[D]] +; CHECK-NEXT: [[TMP200:%.*]] = insertelement <8 x i32> [[TMP196]], i32 [[TMP199]], i32 6 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE82]] +; CHECK: pred.udiv.continue82: +; CHECK-NEXT: [[TMP201:%.*]] = phi <8 x i32> [ [[TMP196]], [[PRED_UDIV_CONTINUE80]] ], [ [[TMP200]], [[PRED_UDIV_IF81]] ] +; CHECK-NEXT: 
[[TMP202:%.*]] = extractelement <8 x i1> [[TMP165]], i32 7 +; CHECK-NEXT: br i1 [[TMP202]], label [[PRED_UDIV_IF83:%.*]], label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.if83: +; CHECK-NEXT: [[TMP203:%.*]] = extractelement <8 x i32> [[TMP166]], i32 7 +; CHECK-NEXT: [[TMP204:%.*]] = udiv i32 [[TMP203]], [[D]] +; CHECK-NEXT: [[TMP205:%.*]] = insertelement <8 x i32> [[TMP201]], i32 [[TMP204]], i32 7 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE84]] +; CHECK: pred.udiv.continue84: +; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] +; CHECK-NEXT: [[TMP207:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> +; CHECK-NEXT: [[PREDPHI85:%.*]] = select <8 x i1> [[BROADCAST_SPLAT64]], <8 x i64> zeroinitializer, <8 x i64> [[TMP207]] +; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) +; CHECK-NEXT: [[TMP208:%.*]] = icmp eq i32 [[INDEX_NEXT86]], 1000 +; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i32 7 +; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL88:%.*]] = phi i32 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL88]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @llvm.usub.sat.i32(i32 [[IV]], i32 1) +; CHECK-NEXT: [[UDIV:%.*]] = udiv i32 [[CALL]], [[D]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[UDIV]] to i64 +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[MERGE:%.*]] = phi i64 [ [[ZEXT]], [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i64 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP164]], [[MIDDLE_BLOCK]] ], [ [[TMP209]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[MERGE_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + %call = tail call i32 @llvm.usub.sat.i32(i32 %iv, i32 1) + %udiv = udiv i32 %call, %d + %zext = zext i32 %udiv to i64 + br label %loop.latch + +loop.latch: + %merge = phi i64 [ %zext, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret i64 %merge +} + +attributes #0 = { "target-cpu"="znver4" } +attributes #1 = { "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" } +attributes #2 = { "target-cpu"="znver3" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index e398586df348f..ef12b54fc3002 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -335,7 +335,7 @@ define void @multi_exit(ptr 
%dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] ; CHECK-NEXT: [[UMIN10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[A:%.*]]) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[UMIN10]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 46 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 24 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[B]], i64 1) @@ -542,356 +542,6 @@ exit: ret i1 %any.of.next } -define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { -; CHECK-LABEL: @avx512_cond_load_cost( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP0:%.*]] = urem i32 [[A:%.*]], [[C:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = sub i32 0, [[TMP0]] -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[C]], [[D:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[DIV]], [[MUL]] -; CHECK-NEXT: [[EXT:%.*]] = sext i32 [[OR]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], i64 [[EXT]], i32 2 -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[OR_2:%.*]] = or i64 [[L]], [[B:%.*]] -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[OR_2]], [[IF_THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], [[C]] -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT:%.*]] -; CHECK: exit: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[LOOP_LATCH]] ] -; CHECK-NEXT: ret i64 [[RES_LCSSA]] -; -entry: - br label %loop.header - -loop.header: - %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] - %c.1 = icmp slt i32 %iv, 0 - br i1 %c.1, label %if.then, label %loop.latch - -if.then: - %1 = urem i32 %a, %c - %mul = sub i32 0, %1 - %div = udiv i32 %c, %d - %or = or i32 %div, %mul - %ext = sext i32 %or to i64 - %gep = getelementptr { i64, i64, i64 }, ptr %src, i64 %ext, i32 2 - %l = load i64, ptr %gep, align 8 - %or.2 = or i64 %l, %b - br label %loop.latch - -loop.latch: - %res = phi i64 [ 0, %loop.header ], [ %or.2, %if.then ] - %iv.next = add i32 %iv, 1 - %ec = icmp ult i32 %iv, %c - br i1 %ec, label %loop.header, label %exit - -exit: - ret i64 %res -} - -define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { -; CHECK-LABEL: @cost_duplicate_recipe_for_sinking( -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] -; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE37:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr nusw double, ptr [[A:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <16 x double>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x double> [[WIDE_VEC1]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x double>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x double> [[WIDE_VEC2]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <16 x double>, ptr [[TMP14]], align 8 -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x double> [[WIDE_VEC3]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[TMP19:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC4]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC5]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC6]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP19]], i32 0 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = shl nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP24]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP25]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP19]], i32 1 -; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] -; CHECK: pred.store.if8: -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP28:%.*]] = shl nsw i64 [[TMP27]], 2 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP28]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP29]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP19]], i32 2 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] -; CHECK: pred.store.if10: -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP32:%.*]] = shl nsw i64 [[TMP31]], 2 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP32]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]] -; CHECK: pred.store.continue11: -;
CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP19]], i32 3 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] -; CHECK: pred.store.if12: -; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP36:%.*]] = shl nsw i64 [[TMP35]], 2 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP36]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]] -; CHECK: pred.store.continue13: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP20]], i32 0 -; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] -; CHECK: pred.store.if14: -; CHECK-NEXT: [[TMP88:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP39:%.*]] = shl nsw i64 [[TMP88]], 2 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP39]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP40]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] -; CHECK: pred.store.continue15: -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i1> [[TMP20]], i32 1 -; CHECK-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] -; CHECK: pred.store.if16: -; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP43:%.*]] = shl nsw i64 [[TMP42]], 2 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP43]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP44]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] -; CHECK: pred.store.continue17: -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i1> [[TMP20]], i32 2 -; CHECK-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] -; CHECK: pred.store.if18: -; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP47:%.*]] = shl nsw i64 [[TMP46]], 2 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP47]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP48]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] -; CHECK: pred.store.continue19: -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP20]], i32 3 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] -; CHECK: pred.store.if20: -; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP51:%.*]] = shl nsw i64 [[TMP50]], 2 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP51]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP52]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] -; CHECK: pred.store.continue21: -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; CHECK: pred.store.if22: -; CHECK-NEXT: [[TMP107:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP54:%.*]] = shl nsw i64 [[TMP107]], 2 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP54]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP55]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] -; CHECK: pred.store.continue23: -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP21]], i32 1 -; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; CHECK: pred.store.if24: -; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw i64 [[TMP57]], 2 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr double, ptr 
[[A]], i64 [[TMP58]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP59]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] -; CHECK: pred.store.continue25: -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP21]], i32 2 -; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] -; CHECK: pred.store.if26: -; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP62:%.*]] = shl nsw i64 [[TMP61]], 2 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP62]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP63]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.continue27: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP21]], i32 3 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] -; CHECK: pred.store.if28: -; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP66:%.*]] = shl nsw i64 [[TMP65]], 2 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP66]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP67]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE29]] -; CHECK: pred.store.continue29: -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP22]], i32 0 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] -; CHECK: pred.store.if30: -; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP69:%.*]] = shl nsw i64 [[TMP108]], 2 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP69]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP70]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE31]] -; CHECK: pred.store.continue31: -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP22]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] -; CHECK: pred.store.if32: -; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[TMP72]], 2 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP73]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP74]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE33]] -; CHECK: pred.store.continue33: -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <4 x i1> [[TMP22]], i32 2 -; CHECK-NEXT: br i1 [[TMP75]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35:%.*]] -; CHECK: pred.store.if34: -; CHECK-NEXT: [[TMP76:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP77:%.*]] = shl nsw i64 [[TMP76]], 2 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP77]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP78]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE35]] -; CHECK: pred.store.continue35: -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP22]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.if36: -; CHECK-NEXT: [[TMP80:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP81:%.*]] = shl nsw i64 [[TMP80]], 2 -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP81]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP82]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE37]] -; CHECK: pred.store.continue37: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF38:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[N_MOD_VF38]], 0 -; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i64 4, i64 [[N_MOD_VF38]] -; CHECK-NEXT: [[N_VEC39:%.*]] = sub i64 [[TMP0]], [[TMP85]] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX40:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL1]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_STORE_CONTINUE50:%.*]] ] -; CHECK-NEXT: [[TMP87:%.*]] = shl nsw i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP89:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[TMP87]] -; CHECK-NEXT: [[WIDE_VEC41:%.*]] = load <16 x double>, ptr [[TMP89]], align 8 -; CHECK-NEXT: [[STRIDED_VEC42:%.*]] = shufflevector <16 x double> [[WIDE_VEC41]], <16 x double> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> -; CHECK-NEXT: [[TMP90:%.*]] = fcmp oeq <4 x double> [[STRIDED_VEC42]], zeroinitializer -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i1> [[TMP90]], i32 0 -; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] -; CHECK: pred.store.if43: -; CHECK-NEXT: [[TMP86:%.*]] = add i64 [[INDEX40]], 0 -; CHECK-NEXT: [[TMP92:%.*]] = shl nsw i64 [[TMP86]], 2 -; CHECK-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP92]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP93]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] -; CHECK: pred.store.continue44: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP90]], i32 1 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] -; CHECK: pred.store.if45: -; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[INDEX40]], 1 -; CHECK-NEXT: [[TMP96:%.*]] = shl nsw i64 [[TMP95]], 2 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP96]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP97]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] -; CHECK: pred.store.continue46: -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP90]], i32 2 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; CHECK: pred.store.if47: -; CHECK-NEXT: [[TMP99:%.*]] = add i64 [[INDEX40]], 2 -; CHECK-NEXT: [[TMP100:%.*]] = shl nsw i64 [[TMP99]], 2 -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP100]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP101]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE48]] -; CHECK: pred.store.continue48: -; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i1> [[TMP90]], i32 3 -; CHECK-NEXT: br i1 [[TMP102]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.if49: -; CHECK-NEXT: [[TMP103:%.*]] = add i64 [[INDEX40]], 3 -; CHECK-NEXT: [[TMP104:%.*]] = shl nsw i64 [[TMP103]], 2 -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP104]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP105]],
align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE50]] -; CHECK: pred.store.continue50: -; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[INDEX40]], 4 -; CHECK-NEXT: [[TMP106:%.*]] = icmp eq i64 [[INDEX_NEXT51]], [[N_VEC39]] -; CHECK-NEXT: br i1 [[TMP106]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_SHL:%.*]] = shl nsw i64 [[IV]], 2 -; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr nusw double, ptr [[A]], i64 [[IV_SHL]] -; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_0]], align 8 -; CHECK-NEXT: [[C:%.*]] = fcmp oeq double [[L]], 0.000000e+00 -; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: if.then: -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[A]], i64 [[IV_SHL]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -entry: - br label %loop.header - -loop.header: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] - %iv.shl = shl nsw i64 %iv, 2 - %gep.0 = getelementptr nusw double, ptr %A, i64 %iv.shl - %l = load double, ptr %gep.0, align 8 - %c = fcmp oeq double %l, 0.000000e+00 - br i1 %c, label %if.then, label %loop.latch - -if.then: - %gep.1 = getelementptr double, ptr %A, i64 %iv.shl - store double 0.000000e+00, ptr %gep.1, align 8 - br label %loop.latch - -loop.latch: - %iv.next = add nsw i64 %iv, 1 - %ec = icmp eq i64 %iv, %N - br i1 %ec, label %exit, label %loop.header - -exit: - ret void -} - define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-LABEL: @cost_assume( ; CHECK-NEXT: entry: @@ -926,7 +576,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP9]], [[BIN_RDX]] @@ -947,7 +597,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP12]], [[LOOP]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] @@ 
-991,7 +641,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1015,7 +665,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1060,7 +710,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 -; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]]) ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -1074,7 +724,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[RED_MUL]] = mul nsw i64 [[RED]], [[P_EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] @@ -1121,7 +771,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1138,7 +788,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[RED_MUL]] = mul i64 [[SHL]], [[RED]] ; CHECK-NEXT: [[IV_NEXT_I_I_I]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_MUL_LCSSA:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ 
[[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RED_MUL_LCSSA]] @@ -1179,7 +829,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 @@ -1195,7 +845,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 16 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP1]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[OR_LCSSA]] @@ -1268,7 +918,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]] @@ -1303,7 +953,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]] @@ -1320,7 +970,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2 ; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]] @@ -1342,72 +992,6 @@ exit: ret i32 %select.next } -; Test for https://github.com/llvm/llvm-project/issues/129236. 
-define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) { -; CHECK-LABEL: @cost_ashr_with_op_known_invariant_via_scev( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i16 0, 0 -; CHECK-NEXT: [[CONV_I:%.*]] = sext i16 0 to i32 -; CHECK-NEXT: [[CONV5_I:%.*]] = sext i8 [[A:%.*]] to i32 -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 100, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: br i1 [[CMP_I]], label [[THEN:%.*]], label [[ELSE:%.*]] -; CHECK: then: -; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[REM_I:%.*]], [[ELSE]] ], [ 0, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[CONV5_I]], [[P_1]] -; CHECK-NEXT: [[TOBOOL6_NOT_I:%.*]] = icmp eq i32 [[SHR_I]], 0 -; CHECK-NEXT: [[SEXT_I:%.*]] = shl i32 [[P_1]], 24 -; CHECK-NEXT: [[TMP0:%.*]] = ashr exact i32 [[SEXT_I]], 24 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TOBOOL6_NOT_I]], i32 [[TMP0]], i32 0 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: else: -; CHECK-NEXT: [[REM_I]] = urem i32 -1, [[CONV_I]] -; CHECK-NEXT: [[CMP3_I:%.*]] = icmp sgt i32 [[REM_I]], 1 -; CHECK-NEXT: br i1 [[CMP3_I]], label [[LOOP_LATCH]], label [[THEN]] -; CHECK: loop.latch: -; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 0, [[ELSE]] ], [ [[TMP1]], [[THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], -1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]] -; CHECK: exit: -; CHECK-NEXT: [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ] -; CHECK-NEXT: ret i32 [[P_2_LCSSA]] -; -entry: - %cmp.i = icmp eq i16 0, 0 - %conv.i = sext i16 0 to i32 - %conv5.i = sext i8 %a to i32 - br label %loop.header - -loop.header: - %iv = phi i8 [ 100, %entry ], [ %iv.next, %loop.latch ] - br i1 %cmp.i, label %then, label %else - -then: - %p.1 = phi i32 [ %rem.i, %else ], [ 0, %loop.header ] - %shr.i = ashr i32 %conv5.i, %p.1 - %tobool6.not.i = icmp eq i32 %shr.i, 0 - %sext.i = shl i32 %p.1, 24 - %2 = ashr exact i32 %sext.i, 24 - %3 = select i1 %tobool6.not.i, i32 %2, i32 0 - br label %loop.latch - -else: - %rem.i = urem i32 -1, %conv.i - %cmp3.i = icmp sgt i32 %rem.i, 1 - br i1 %cmp3.i, label %loop.latch, label %then - -loop.latch: - %p.2 = phi i32 [ 0, %else ], [ %3, %then ] - %iv.next = add i8 %iv, -1 - %ec = icmp eq i8 %iv.next, 0 - br i1 %ec, label %exit, label %loop.header - -exit: - ret i32 %p.2 -} - declare void @llvm.assume(i1 noundef) #0 attributes #0 = { "target-cpu"="penryn" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index 86fa5d6820416..c7a2bb4fa04d7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -368,28 +368,28 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 
[[TMP7]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x ptr> [[TMP17]], ptr [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x ptr> [[TMP18]], ptr [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x ptr> [[TMP19]], ptr [[TMP16]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3 -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x ptr> [[TMP33]], ptr [[TMP20]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x ptr> [[TMP34]], ptr [[TMP31]], i32 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x ptr> [[TMP35]], ptr [[TMP32]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP28]], <4 x ptr> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[STRIDED_VEC]], i32 3 ; CHECK-NEXT: store float [[TMP30]], ptr [[C:%.*]], align 4 @@ -403,7 +403,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index de6418066dea0..d6514871b82d5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -26,8 +26,16 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP9]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP10]], 1 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 @@ -35,7 +43,7 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i8> [[WIDE_LOAD]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP29]] = xor <8 x i8> [[REVERSE]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP10]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll index 59317fa463709..fb6c116569a0b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -69,8 +69,12 @@ define i32 @main(ptr %ptr) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP28]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4 ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll new file mode 100644
index 0000000000000..7c1efe8c922c6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -0,0 +1,589 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -mtriple=x86_64-linux-gnu -S %s | FileCheck --check-prefix=I64 %s +; RUN: opt -p loop-vectorize -mtriple=i386-pc-linux-gnu -S %s | FileCheck --check-prefix=I32 %s + + +define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { +; I64-LABEL: define void @test_store_initially_interleave( +; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I64-NEXT: [[ITER_CHECK:.*:]] +; I64-NEXT: [[TMP4:%.*]] = add i32 [[N]], 1 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP4]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; I64: [[VECTOR_SCEVCHECK]]: +; I64-NEXT: [[TMP1:%.*]] = icmp slt i32 [[N]], 0 +; I64-NEXT: br i1 [[TMP1]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I64: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I64-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP4]], 16 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 16 +; I64-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I64-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 16, i32 [[N_MOD_VF]] +; I64-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP4]], [[TMP3]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; I64-NEXT: [[IV:%.*]] = add i32 [[INDEX]], 0 +; I64-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; I64-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 2 +; I64-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 3 +; I64-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 4 +; I64-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 5 +; I64-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 6 +; I64-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 7 +; I64-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 8 +; I64-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 9 +; I64-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 10 +; I64-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 11 +; I64-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 12 +; I64-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 13 +; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14 +; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15 +; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] +; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I64-NEXT: [[TMP27:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32
[[TMP7]] +; I64-NEXT: [[TMP28:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I64-NEXT: [[TMP29:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I64-NEXT: [[TMP30:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I64-NEXT: [[TMP31:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I64-NEXT: [[TMP32:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I64-NEXT: [[TMP33:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I64-NEXT: [[TMP34:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I64-NEXT: [[TMP35:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] +; I64-NEXT: [[TMP36:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] +; I64-NEXT: [[TMP37:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] +; I64-NEXT: [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]] +; I64-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP19]] +; I64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADD_PTR_I]], align 4 +; I64-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP25]], align 4 +; I64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP26]], align 4 +; I64-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP27]], align 4 +; I64-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP28]], align 4 +; I64-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP29]], align 4 +; I64-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP30]], align 4 +; I64-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP31]], align 4 +; I64-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP32]], align 4 +; I64-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP33]], align 4 +; I64-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP34]], align 4 +; I64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP35]], align 4 +; I64-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP36]], align 4 +; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4 +; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 +; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 +; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 +; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 +; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 +; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4 +; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 +; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4 +; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 +; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4 +; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 +; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4 +; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 +; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4 +; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 +; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4 +; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 +; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 +; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 +; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 +; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 +; I64-NEXT: store double [[TMP66]], ptr 
[[TMP50]], align 4 +; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 +; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 +; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 +; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 +; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 +; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4 +; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; I64-NEXT: [[TMP72:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I64: [[VEC_EPILOG_ITER_CHECK]]: +; I64-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP4]], [[N_VEC]] +; I64-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[N_VEC_REMAINING]], 4 +; I64-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; I64: [[VEC_EPILOG_PH]]: +; I64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I64-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP4]], 4 +; I64-NEXT: [[TMP73:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I64-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 4, i32 [[N_MOD_VF2]] +; I64-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP4]], [[TMP74]] +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I64-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; I64-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I64: [[VEC_EPILOG_VECTOR_BODY]]: +; I64-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 0 +; I64-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 1 +; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2 +; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3 +; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] +; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 +; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0 +; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1 +; I64-NEXT: store
double [[TMP89]], ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2 +; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3 +; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4 +; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I64-NEXT: [[TMP92:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I64-NEXT: br i1 [[TMP92]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; I64: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I64: [[VEC_EPILOG_SCALAR_PH]]: +; +; I32-LABEL: define void @test_store_initially_interleave( +; I32-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I32-NEXT: [[ENTRY:.*:]] +; I32-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I32: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I32-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP0]], 16 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; I32-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 16, i32 [[N_MOD_VF]] +; I32-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP2]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; I32-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 +; I32-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 +; I32-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 2 +; I32-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 3 +; I32-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 4 +; I32-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 5 +; I32-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 6 +; I32-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 7 +; I32-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 8 +; I32-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 9 +; I32-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 10 +; I32-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 11 +; I32-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 12 +; I32-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 13 +; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14 +; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15 +; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] +; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr nusw { ptr, ptr,
ptr }, ptr null, i32 [[TMP6]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I32-NEXT: [[TMP20:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I32-NEXT: [[TMP21:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I32-NEXT: [[TMP22:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I32-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I32-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I32-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I32-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I32-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP40]] +; I32-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP41]] +; I32-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP42]] +; I32-NEXT: [[TMP71:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP43]] +; I32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 4 +; I32-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 4 +; I32-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 4 +; I32-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP18]], align 4 +; I32-NEXT: [[TMP27:%.*]] = load ptr, ptr [[TMP19]], align 4 +; I32-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP20]], align 4 +; I32-NEXT: [[TMP29:%.*]] = load ptr, ptr [[TMP21]], align 4 +; I32-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP22]], align 4 +; I32-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP56]], align 4 +; I32-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP57]], align 4 +; I32-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP58]], align 4 +; I32-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP59]], align 4 +; I32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP60]], align 4 +; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4 +; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4 +; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4 +; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 +; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 +; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 +; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4 +; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 +; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 +; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 +; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 +; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 +; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 +; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 +; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 +; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 +; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 +; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 +; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 +; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 +; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 +; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 +; I32-NEXT: store double 
[[TMP65]], ptr [[TMP49]], align 4 +; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 +; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 +; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 +; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 +; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 +; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 +; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; I32-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I32: [[VEC_EPILOG_ITER_CHECK]]: +; I32-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP0]], [[N_VEC]] +; I32-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[N_VEC_REMAINING]], 4 +; I32-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; I32: [[VEC_EPILOG_PH]]: +; I32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I32-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP0]], 4 +; I32-NEXT: [[TMP72:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I32-NEXT: [[TMP73:%.*]] = select i1 [[TMP72]], i32 4, i32 [[N_MOD_VF2]] +; I32-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP0]], [[TMP73]] +; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I32-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; I32-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I32: [[VEC_EPILOG_VECTOR_BODY]]: +; I32-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[TMP74:%.*]] = add i32 [[INDEX4]], 0 +; I32-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 1 +; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2 +; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3 +; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4 +; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0 +; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1 +; 
I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2 +; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3 +; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I32-NEXT: [[TMP91:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I32-NEXT: br i1 [[TMP91]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; I32: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I32: [[VEC_EPILOG_SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %conv = uitofp i32 %iv to double + %add.ptr.i = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 %iv + %0 = load ptr, ptr %add.ptr.i, align 4 + store double %conv, ptr %0, align 4 + %inc = add i32 %iv, 1 + %ec = icmp eq i32 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) #0 { +; I64-LABEL: define void @test_store_loaded_value( +; I64-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I64-NEXT: [[BB:.*:]] +; I64-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I64-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I64: [[PH]]: +; I64-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I64-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I64-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I64-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I64-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I64-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I64-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I64-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I64-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I64-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I64-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I64-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I64-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I64-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I64-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I64-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I64-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I64-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; I64: [[SCALAR_PH]]: +; +; I32-LABEL: define void @test_store_loaded_value( +; I32-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I32-NEXT: [[BB:.*:]] +; I32-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I32-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I32: [[PH]]: +; I32-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I32-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I32-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I32-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I32-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I32-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I32-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I32-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I32-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I32-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I32-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I32-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I32-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I32-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I32-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I32-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I32-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I32-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I32-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; I32: [[SCALAR_PH]]: +; +bb: + %pre = icmp slt i32 %n, 1 + br i1 %pre, label %exit, label %ph + +ph: + %n.ext = zext i32 %n to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv + %l = load double, ptr %gep.src, align 8 + %sext = shl i64 %iv, 1 + %gep.dst = getelementptr i8, ptr %dst, i64 %sext + store double %l, ptr %gep.dst, align 8 + %ec = icmp eq i64 
%iv.next, %n.ext + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +declare i1 @cond() + +define double @test_load_used_by_other_load_scev(ptr %ptr.a, ptr %ptr.b, ptr %ptr.c) { +; I64-LABEL: define double @test_load_used_by_other_load_scev( +; I64-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I64-NEXT: [[ENTRY:.*]]: +; I64-NEXT: br label %[[OUTER_LOOP:.*]] +; I64: [[OUTER_LOOP_LOOPEXIT:.*]]: +; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], [[INNER_LOOP:%.*]] ], [ [[TMP29:%.*]], %[[MIDDLE_BLOCK:.*]] ] +; I64-NEXT: br label %[[OUTER_LOOP]] +; I64: [[OUTER_LOOP]]: +; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ] +; I64-NEXT: [[COND:%.*]] = call i1 @cond() +; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], [[EXIT:label %.*]] +; I64: [[INNER_LOOP_PREHEADER]]: +; I64-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1 +; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1 +; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]] +; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]] +; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]] +; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]] +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8 +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer +; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8 +; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 +; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8 +; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8 +; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0 +; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1 +; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer +; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer +; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2> +; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer +; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer +; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00) +; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8 +; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8 +; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0 +; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1 +; 
I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]] +; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]] +; I64-NEXT: br label %[[MIDDLE_BLOCK]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1 +; I64-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; I64-NEXT: br i1 true, label %[[OUTER_LOOP_LOOPEXIT]], label %[[SCALAR_PH]] +; I64: [[SCALAR_PH]]: +; +; I32-LABEL: define double @test_load_used_by_other_load_scev( +; I32-SAME: ptr [[PTR_A:%.*]], ptr [[PTR_B:%.*]], ptr [[PTR_C:%.*]]) { +; I32-NEXT: [[ENTRY:.*]]: +; I32-NEXT: br label %[[OUTER_LOOP:.*]] +; I32: [[OUTER_LOOP]]: +; I32-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ] +; I32-NEXT: [[COND:%.*]] = call i1 @cond() +; I32-NEXT: br i1 [[COND]], label %[[INNER_LOOP]], label %[[EXIT:.*]] +; I32: [[INNER_LOOP]]: +; I32-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[OUTER_LOOP]] ], [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[ACCUM]], %[[OUTER_LOOP]] ], [ [[MUL1:%.*]], %[[INNER_LOOP]] ] +; I32-NEXT: [[IDX_PLUS1:%.*]] = add i64 [[IV]], 1 +; I32-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[GEP_A_I64:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[IDX_PLUS1]] +; I32-NEXT: [[LOAD_IDX:%.*]] = load i64, ptr [[GEP_A_I64]], align 8 +; I32-NEXT: [[GEP_B:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[LOAD_IDX]] +; I32-NEXT: [[LOAD_A:%.*]] = load double, ptr [[PTR_A]], align 8 +; I32-NEXT: [[ADD1:%.*]] = fadd double [[LOAD_A]], 0.000000e+00 +; I32-NEXT: [[GEP_C_OFFSET:%.*]] = getelementptr i8, ptr [[GEP_C]], i64 8 +; I32-NEXT: [[LOAD_C:%.*]] = load double, ptr [[GEP_C_OFFSET]], align 8 +; I32-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00 +; I32-NEXT: [[MUL2:%.*]] = fmul double [[LOAD_C]], 0.000000e+00 +; I32-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00 +; I32-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00 +; I32-NEXT: [[LOAD_B:%.*]] = load double, ptr [[GEP_B]], align 8 +; I32-NEXT: [[DIV:%.*]] = fdiv double [[LOAD_B]], [[ADD3]] +; I32-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]] +; I32-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; I32-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1 +; I32-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP]], label %[[INNER_LOOP]] +; I32: [[EXIT]]: +; I32-NEXT: ret double [[ACCUM]] +; +entry: + br label %outer.loop + +outer.loop: + %accum = phi double [ 0.0, %entry ], [ %result, %inner.loop ] + %cond = call i1 @cond() + br i1 %cond, label %inner.loop, label %exit + +inner.loop: + %iv = phi i64 [ 0, %outer.loop ], [ %iv.next, %inner.loop ] + %accum.inner = phi double [ %accum, %outer.loop ], [ %mul1, %inner.loop ] + %idx.plus1 = add i64 %iv, 1 + %gep.c = getelementptr i8, ptr %ptr.c, i64 %idx.plus1 + %gep.a.i64 = getelementptr i64, ptr %ptr.a, i64 %idx.plus1 + %load.idx = load i64, ptr %gep.a.i64, align 8 + %gep.b = getelementptr double, ptr %ptr.b, i64 %load.idx + %load.a = load double, ptr %ptr.a, align 8 + %add1 = fadd double %load.a, 0.000000e+00 + %gep.c.offset = getelementptr i8, ptr %gep.c, i64 8 + %load.c = load double, ptr %gep.c.offset, align 8 + %mul1 = fmul double %add1, 0.000000e+00 + %mul2 = fmul double %load.c, 0.000000e+00 + %add2 = fadd double %mul2, 0.000000e+00 + %add3 = fadd double %add2, 1.000000e+00 + %load.b = load double, ptr %gep.b, align 8 + %div = fdiv double %load.b, %add3 + %result = fsub double %accum.inner, %div + %iv.next = 
add i64 %iv, 1 + %exitcond = icmp eq i64 %iv, 1 + br i1 %exitcond, label %outer.loop, label %inner.loop + +exit: + ret double %accum +} + +attributes #0 = { "target-cpu"="znver2" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll index 166875dd55aae..b612bfb88198e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -5,7 +5,7 @@ ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %shift = ashr i32 %val, %k ; CHECK: Cost of 2 for VF 2: WIDEN ir<%shift> = ashr ir<%val>, ir<%k> ; CHECK: Cost of 2 for VF 4: WIDEN ir<%shift> = ashr ir<%val>, ir<%k> -define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr #0 { +define void @foo(ptr nocapture %p, i32 %k) local_unnamed_addr { entry: br label %body @@ -21,5 +21,31 @@ body: exit: ret void +} + +; CHECK: 'shift_and_masked_load_store' +; CHECK: Cost of 1 for VF 2: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2> +; CHECK: Cost of 1 for VF 4: CLONE ir<%shifted> = lshr vp<{{.+}}>, ir<2> +; CHECK: Cost of 4 for VF 8: WIDEN ir<%shifted> = lshr ir<%iv>, ir<2> +define void @shift_and_masked_load_store(i64 %trip.count) #0 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %shifted = lshr i64 %iv, 2 + %masked.idx = and i64 %shifted, 1 + %load.ptr = getelementptr i16, ptr poison, i64 %masked.idx + %val = load i16, ptr %load.ptr, align 2 + %store.idx = shl nuw i64 %iv, 2 + %store.ptr = getelementptr i8, ptr poison, i64 %store.idx + store i16 %val, ptr %store.ptr, align 2 + %iv.next = add i64 %iv, 1 + %cmp = icmp eq i64 %iv, %trip.count + br i1 %cmp, label %exit, label %loop +exit: + ret void } + +attributes #0 = { "target-features"="+avx2" "tune-cpu"="alderlake" } diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll index df926fa6d189c..b62d9445f2597 100644 --- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll @@ -117,7 +117,10 @@ define void @const_fold_select(ptr %dst, i64 %d) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[D]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[D]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], splat (i64 1) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index a02fddc4cf72d..10db620d83f0d 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -134,14 +134,18 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: 
[[N_VEC:%.*]] = and i64 [[SMAX2]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i64 3 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = insertelement <4 x i1> poison, i1 [[CMP]], i64 3 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META12:![0-9]+]] -; CHECK-NEXT: store i32 [[K]], ptr [[A]], align 4, !alias.scope [[META12]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[A]], align 4, !alias.scope [[META12]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index c5d318141369b..2848a2ce3d878 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -136,18 +136,18 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -516,18 +516,18 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 3 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP9]], 3 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 3 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP11]], 3 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP8]], 3 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP9]], 3 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP11]], 3 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -708,18 +708,18 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp fast ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP9]], 3.000000e+00 +; 
CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp fast ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -900,18 +900,18 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) { ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP5]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4 ; CHECK-VF1IC4-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4 -; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = xor i1 [[TMP12]], true -; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = xor i1 [[TMP13]], true -; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = xor i1 [[TMP14]], true -; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = xor i1 [[TMP15]], true -; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP16]] -; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP17]] -; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP18]] -; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP19]] +; CHECK-VF1IC4-NEXT: [[TMP16:%.*]] = fcmp ueq float [[TMP8]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP17:%.*]] = fcmp ueq float [[TMP9]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP18:%.*]] = fcmp ueq float [[TMP10]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP19:%.*]] = fcmp ueq float [[TMP11]], 3.000000e+00 +; CHECK-VF1IC4-NEXT: [[TMP12:%.*]] = xor i1 [[TMP16]], true +; CHECK-VF1IC4-NEXT: [[TMP13:%.*]] = xor i1 [[TMP17]], true +; CHECK-VF1IC4-NEXT: [[TMP14:%.*]] = xor i1 [[TMP18]], true +; CHECK-VF1IC4-NEXT: [[TMP15:%.*]] = xor i1 [[TMP19]], true +; CHECK-VF1IC4-NEXT: [[TMP20]] = or i1 [[VEC_PHI]], [[TMP12]] +; CHECK-VF1IC4-NEXT: [[TMP21]] = or i1 [[VEC_PHI1]], [[TMP13]] +; CHECK-VF1IC4-NEXT: [[TMP22]] = or i1 [[VEC_PHI2]], [[TMP14]] +; CHECK-VF1IC4-NEXT: [[TMP23]] = or i1 [[VEC_PHI3]], [[TMP15]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1059,8 +1059,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; CHECK-VF1IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = icmp eq i32 [[A]], 3 -; CHECK-VF1IC4-NEXT: [[TMP4:%.*]] = xor i1 [[TMP0]], true +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = icmp eq i32 [[A]], 3 +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = xor i1 [[TMP1]], 
true ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF1IC4: [[VECTOR_BODY]]: ; CHECK-VF1IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1068,10 +1068,10 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF1IC4-NEXT: [[VEC_PHI1:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP4]] -; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP4]] +; CHECK-VF1IC4-NEXT: [[TMP5]] = or i1 [[VEC_PHI]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP6]] = or i1 [[VEC_PHI1]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP7]] = or i1 [[VEC_PHI2]], [[TMP0]] +; CHECK-VF1IC4-NEXT: [[TMP8]] = or i1 [[VEC_PHI3]], [[TMP0]] ; CHECK-VF1IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF1IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF1IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll index fb32e907c7d44..3084f9987262d 100644 --- a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll +++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll @@ -19,10 +19,10 @@ define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) { ; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8> ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3 ; CHECK-NEXT: store i8 [[TMP3]], ptr [[DST]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -85,13 +85,12 @@ define void @single_scalar_cast_stored(ptr %src, ptr %dst, i32 %n) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i16> [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[BROADCAST_SPLAT]], splat (i16 15) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i16> zeroinitializer, <4 x i16> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: store i16 [[TMP5]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; CHECK-NEXT: store i16 
[[TMP4]], ptr [[DST]], align 2, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll index 3b442a9ab4d3c..ba450e6878fe6 100644 --- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll @@ -83,6 +83,8 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -90,8 +92,8 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]] ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0 @@ -166,6 +168,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -173,8 +177,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP3]], [[X]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP4]], ptr poison, ptr [[B]] ; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PREDPHI]], align 2 ; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll index df8123d5fc2d0..67844faccaf0c 100644 --- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll +++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll @@ -26,7 +26,14 @@ define void @basic_loop(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 @@ -89,7 +96,14 @@ define void @metadata(ptr nocapture readonly %ptr, i32 %size, ptr %pos) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP6]] +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[NEXT_GEP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 732214aa1449e..0cf5c1a715538 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -905,7 +905,8 @@ define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> -; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1> +; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-last-element ir<%zext> +; CHECK-NEXT: CLONE store vp<[[EXT]]>, ir<%p1> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll 
b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 4a4bda254bf88..f2a128b05e3fb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -272,8 +272,9 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<%lv> = load ir<%A> -; CHECK-NEXT: WIDEN ir<%cmp> = icmp uge ir<%iv>, ir<%k> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, ir<%cmp> +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%iv>, ir<%k> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp> +; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -980,12 +981,13 @@ define void @sinking_requires_duplication(ptr %addr) { ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep> ; CHECK-NEXT: WIDEN ir<%0> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%pred> = fcmp une ir<%0>, ir<0.000000e+00> +; CHECK-NEXT: WIDEN ir<%pred> = fcmp oeq ir<%0>, ir<0.000000e+00> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%pred> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%pred> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: @@ -1150,12 +1152,13 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) { ; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds vp<[[PTR_IV]]>, ir<-1> ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%ptr.iv.next>, vp<[[VF]]> ; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%c.1> = icmp ne ir<%l>, ir<0> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<0> +; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%c.1> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NOT]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: