308 changes: 308 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4221,6 +4221,46 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
setValue(&I, StoreNode);
}

void SelectionDAGBuilder::visitStoreVP(const CallInst &I) {
SDLoc sdl = getCurSDLoc();

auto getVPStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
Value * &VLen, unsigned & Alignment) {
// llvm.vp.store.*(Src0, Ptr, Mask, VLen)
Src0 = I.getArgOperand(0);
Ptr = I.getArgOperand(1);
Alignment = I.getParamAlignment(1);
Mask = I.getArgOperand(2);
VLen = I.getArgOperand(3);
};

Value *PtrOperand, *MaskOperand, *Src0Operand, *VLenOperand;
unsigned Alignment = 0;
getVPStoreOps(PtrOperand, MaskOperand, Src0Operand, VLenOperand, Alignment);

SDValue Ptr = getValue(PtrOperand);
SDValue Src0 = getValue(Src0Operand);
SDValue Mask = getValue(MaskOperand);
SDValue VLen = getValue(VLenOperand);

EVT VT = Src0.getValueType();
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);

AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);

MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(PtrOperand),
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
SDValue StoreNode = DAG.getStoreVP(getRoot(), sdl, Src0, Ptr, Mask, VLen, VT,
MMO, false /* Truncating */);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
}
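// Example (taken from the accompanying tests): a call such as
//   call void @llvm.vp.store.v16i32.p0v16i32(<16 x i32> %i0, <16 x i32>* %p1,
//                                            <16 x i1> %m, i32 %n)
// arrives with the (Src0, Ptr, Mask, VLen) operand order read above and is
// lowered to a single VP store node that carries the mask and the explicit
// vector length alongside the usual chain and pointer operands.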

// Get a uniform base for the Gather/Scatter intrinsic.
// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
// We try to represent it as a base pointer + vector of indices.
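// For example, a pointer vector produced as
//   %ptrs = getelementptr double, double* %base, <8 x i64> %offsets
// can typically be described as Base = %base, Index = %offsets and a Scale
// equal to the element size, whereas an arbitrary vector of pointers falls
// back to Base = 0, Index = <the pointer vector>, Scale = 1 (see the VP
// gather and scatter visitors below).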
@@ -4465,6 +4505,162 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
setValue(&I, Gather);
}

void SelectionDAGBuilder::visitGatherVP(const CallInst &I) {
SDLoc sdl = getCurSDLoc();

// @llvm.vp.gather.*(Ptrs, Mask, VLen)
const Value *Ptr = I.getArgOperand(0);
SDValue Mask = getValue(I.getArgOperand(1));
SDValue VLen = getValue(I.getArgOperand(2));

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned Alignment = I.getParamAlignment(0);
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);

AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
ISD::MemIndexType IndexType;
SDValue Scale;
const Value *BasePtr = Ptr;
bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale, this);
bool ConstantMemory = false;
if (UniformBase && AA &&
AA->pointsToConstantMemory(
MemoryLocation(BasePtr,
LocationSize::precise(
DAG.getDataLayout().getTypeStoreSize(I.getType())),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
ConstantMemory = true;
}

MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
MachineMemOperand::MOLoad, VT.getStoreSize(),
Alignment, AAInfo, Ranges);

if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { Root, Base, Index, Scale, Mask, VLen };
SDValue Gather = DAG.getGatherVP(DAG.getVTList(VT, MVT::Other), VT, sdl, Ops, MMO, IndexType);

SDValue OutChain = Gather.getValue(1);
if (!ConstantMemory)
PendingLoads.push_back(OutChain);
setValue(&I, Gather);
}
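// By analogy with the vp.scatter test below, a gather is expected to look
// like (type mangling assumed):
//   %v = call <16 x i32> @llvm.vp.gather.v16i32.v16p0i32(<16 x i32*> %p0,
//                                                        <16 x i1> %m, i32 %n)
// It follows the (Ptrs, Mask, VLen) layout documented above; when no uniform
// base is found, the pointer vector itself becomes the index with a zero base
// and a scale of one.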

void SelectionDAGBuilder::visitScatterVP(const CallInst &I) {
SDLoc sdl = getCurSDLoc();

// llvm.vp.scatter.*(Src0, Ptrs, Mask, VLen)
const Value *Ptr = I.getArgOperand(1);
SDValue Src0 = getValue(I.getArgOperand(0));
SDValue Mask = getValue(I.getArgOperand(2));
SDValue VLen = getValue(I.getArgOperand(3));
EVT VT = Src0.getValueType();
unsigned Alignment = I.getParamAlignment(1);
if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);

SDValue Base;
SDValue Index;
ISD::MemIndexType IndexType;
SDValue Scale;
const Value *BasePtr = Ptr;
bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale, this);

const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
MachineMemOperand *MMO = DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
MachineMemOperand::MOStore, VT.getStoreSize(),
Alignment, AAInfo);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { getRoot(), Src0, Base, Index, Scale, Mask, VLen };
SDValue Scatter = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, sdl,
Ops, MMO, IndexType);
DAG.setRoot(Scatter);
setValue(&I, Scatter);
}

void SelectionDAGBuilder::visitLoadVP(const CallInst &I) {
SDLoc sdl = getCurSDLoc();

auto getVPLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&VLen,
unsigned &Alignment) {
// @llvm.vp.load.*(Ptr, Mask, VLen)
Ptr = I.getArgOperand(0);
Alignment = I.getParamAlignment(0);
Mask = I.getArgOperand(1);
VLen = I.getArgOperand(2);
};

Value *PtrOperand, *MaskOperand, *VLenOperand;
unsigned Alignment;
getVPLoadOps(PtrOperand, MaskOperand, VLenOperand, Alignment);

SDValue Ptr = getValue(PtrOperand);
SDValue VLen = getValue(VLenOperand);
SDValue Mask = getValue(MaskOperand);

// Infer the return type.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValValueVTs);
assert(ValValueVTs.size() == 1 && "splitting not implemented");
EVT VT = ValValueVTs[0];

if (!Alignment)
Alignment = DAG.getEVTAlignment(VT);

AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

// Do not serialize (non-volatile) VP loads of constant memory with anything.
bool AddToChain =
!AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand,
LocationSize::precise(
DAG.getDataLayout().getTypeStoreSize(I.getType())),
AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();

MachineMemOperand *MMO =
DAG.getMachineFunction().
getMachineMemOperand(MachinePointerInfo(PtrOperand),
MachineMemOperand::MOLoad, VT.getStoreSize(),
Alignment, AAInfo, Ranges);

SDValue Load = DAG.getLoadVP(VT, sdl, InChain, Ptr, Mask, VLen, VT, MMO,
ISD::NON_EXTLOAD);
if (AddToChain)
PendingLoads.push_back(Load.getValue(1));
setValue(&I, Load);
}

void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering SuccessOrdering = I.getSuccessOrdering();
@@ -6132,6 +6328,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
#include "llvm/IR/ConstrainedOps.def"
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
return;

#define REGISTER_VP_INTRINSIC(VPID,MASKPOS,VLENPOS) case Intrinsic::VPID:
#include "llvm/IR/VPIntrinsics.def"
visitVectorPredicationIntrinsic(cast<VPIntrinsic>(I));
return;

case Intrinsic::fmuladd: {
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
@@ -6980,6 +7182,112 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
setValue(&FPI, FPResult);
}

void SelectionDAGBuilder::visitCmpVP(const VPIntrinsic &I) {
ISD::CondCode Condition;
CmpInst::Predicate Pred = I.getCmpPredicate();
bool IsFP = I.getOperand(0)->getType()->isFPOrFPVectorTy();
if (IsFP) {
Condition = getFCmpCondCode(Pred);
auto *FPMO = dyn_cast<FPMathOperator>(&I);
if ((FPMO && FPMO->hasNoNaNs()) || TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
} else {
Condition = getICmpCondCode(Pred);
}

SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
// Operand 2 is the encoded condition code; the mask and vector length follow.
SDValue MaskOp = getValue(I.getOperand(3));
SDValue LenOp = getValue(I.getOperand(4));

EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
I.getType());
setValue(&I, DAG.getVPSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition, MaskOp, LenOp));
}
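// Note on the operand layout assumed here: operands 0 and 1 are the vectors
// being compared, operand 2 is the predicate encoded as an i8 constant (see
// IRBuilderBase::CreateVectorPredicatedCmp), and operands 3 and 4 are the
// mask and the explicit vector length.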

void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
const VPIntrinsic &VPIntrin) {
SDLoc sdl = getCurSDLoc();
unsigned Opcode;
switch (VPIntrin.getIntrinsicID()) {
default:
llvm_unreachable("Unexpected vector predication intrinsic");

case Intrinsic::vp_load:
visitLoadVP(VPIntrin);
return;
case Intrinsic::vp_store:
visitStoreVP(VPIntrin);
return;
case Intrinsic::vp_gather:
visitGatherVP(VPIntrin);
return;
case Intrinsic::vp_scatter:
visitScatterVP(VPIntrin);
return;

case Intrinsic::vp_fcmp:
case Intrinsic::vp_icmp:
visitCmpVP(VPIntrin);
return;

// Generic mappings
#define HANDLE_VP_TO_SDNODE(VPID, NODEID) \
case Intrinsic::VPID: Opcode = ISD::NODEID; break;
#include "llvm/IR/VPIntrinsics.def"
}

// TODO memory evl: SDValue Chain = getRoot();

SmallVector<EVT, 4> ValueVTs;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
ComputeValueVTs(TLI, DAG.getDataLayout(), VPIntrin.getType(), ValueVTs);
SDVTList VTs = DAG.getVTList(ValueVTs);

// ValueVTs.push_back(MVT::Other); // Out chain

// Collect the operands, skipping any rounding-mode or exception-behavior
// metadata arguments.
SmallVector<SDValue,7> OpValues;
auto ExceptPosOpt = VPIntrinsic::GetExceptionBehaviorParamPos(VPIntrin.getIntrinsicID());
auto RoundingModePosOpt = VPIntrinsic::GetRoundingModeParamPos(VPIntrin.getIntrinsicID());
for (int i = 0; i < (int) VPIntrin.getNumArgOperands(); ++i) {
if (ExceptPosOpt && (i == ExceptPosOpt.getValue())) continue;
if (RoundingModePosOpt && (i == RoundingModePosOpt.getValue())) continue;
OpValues.push_back(getValue(VPIntrin.getArgOperand(i)));
}
SDValue Result = DAG.getNode(Opcode, sdl, VTs, OpValues);


SDNodeFlags NodeFlags;

// The node cannot raise fp exceptions unless the intrinsic is constrained.
NodeFlags.setNoFPExcept(!VPIntrin.isConstrainedOp());

// Copy fast-math flags where available.
if (auto *FPIntrin = dyn_cast<FPMathOperator>(&VPIntrin))
NodeFlags.copyFMF(*FPIntrin);

if (VPIntrin.isReductionOp()) {
NodeFlags.setVectorReduction(true);
}

// Attach chain
SDValue VPResult;
if (Result.getNode()->getNumValues() == 2) {
SDValue OutChain = Result.getValue(1);
DAG.setRoot(OutChain);
VPResult = Result.getValue(0);
} else {
VPResult = Result;
}

// attach flags and return
if (NodeFlags.isDefined()) VPResult.getNode()->setFlags(NodeFlags);
setValue(&VPIntrin, VPResult);
}
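// Sketch of the generic path above: an intrinsic such as llvm.vp.fadd maps,
// via HANDLE_VP_TO_SDNODE in VPIntrinsics.def, to its VP ISD opcode and is
// emitted as a node whose operands are the intrinsic's value operands plus
// the mask and vector length, with the rounding-mode and exception-behavior
// metadata arguments dropped (only the default fp environment is accepted by
// the verifier for now).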

std::pair<SDValue, SDValue>
SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
const BasicBlock *EHPadBB) {
6 changes: 6 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -768,6 +768,12 @@ class SelectionDAGBuilder {
void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
void visitVectorPredicationIntrinsic(const VPIntrinsic &VPI);
void visitCmpVP(const VPIntrinsic &I);
void visitLoadVP(const CallInst &I);
void visitStoreVP(const CallInst &I);
void visitGatherVP(const CallInst &I);
void visitScatterVP(const CallInst &I);

void visitVAStart(const CallInst &I);
void visitVAArg(const VAArgInst &I);
5 changes: 5 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -454,6 +454,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";

// Vector Predication
#define REGISTER_VP_SDNODE(NODEID,NAME,MASKPOS,VLENPOS) \
case ISD::NODEID: return NAME;
#include "llvm/IR/VPIntrinsics.def"
}
}

5 changes: 5 additions & 0 deletions llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -673,6 +673,11 @@ void TargetPassConfig::addIRPasses() {
// Instrument function entry and exit, e.g. with calls to mcount().
addPass(createPostInlineEntryExitInstrumenterPass());

// Expand vector predication intrinsics into standard IR instructions.
// This pass has to run before ScalarizeMaskedMemIntrin and ExpandReduction
// passes since it emits those kinds of intrinsics.
addPass(createExpandVectorPredicationPass());

// Add scalarization of target's unsupported masked memory intrinsics pass.
// the unsupported intrinsic will be replaced with a chain of basic blocks,
// that stores/loads element one-by-one if the appropriate mask bit is set.
6 changes: 6 additions & 0 deletions llvm/lib/IR/Attributes.cpp
@@ -330,6 +330,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "builtin";
if (hasAttribute(Attribute::Convergent))
return "convergent";
if (hasAttribute(Attribute::VectorLength))
return "vlen";
if (hasAttribute(Attribute::SwiftError))
return "swifterror";
if (hasAttribute(Attribute::SwiftSelf))
@@ -346,6 +348,10 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "inreg";
if (hasAttribute(Attribute::JumpTable))
return "jumptable";
if (hasAttribute(Attribute::Mask))
return "mask";
if (hasAttribute(Attribute::Passthru))
return "passthru";
if (hasAttribute(Attribute::MinSize))
return "minsize";
if (hasAttribute(Attribute::Naked))
3 changes: 2 additions & 1 deletion llvm/lib/IR/CMakeLists.txt
@@ -45,17 +45,18 @@ add_llvm_component_library(LLVMCore
PassManager.cpp
PassRegistry.cpp
PassTimingInfo.cpp
PredicatedInst.cpp
SafepointIRVerifier.cpp
ProfileSummary.cpp
Statepoint.cpp
Type.cpp
TypeFinder.cpp
Use.cpp
User.cpp
VPBuilder.cpp
Value.cpp
ValueSymbolTable.cpp
Verifier.cpp

ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/IR

22 changes: 21 additions & 1 deletion llvm/lib/IR/FPEnv.cpp
@@ -12,8 +12,9 @@
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Metadata.h"

namespace llvm {

@@ -91,4 +92,23 @@ getAPFloatRoundingMode(fp::RoundingMode RM) {
}
llvm_unreachable("Unexpected rounding mode");
}

Value *GetConstrainedFPExcept(LLVMContext &Context,
fp::ExceptionBehavior UseExcept) {
Optional<StringRef> ExceptStr = ExceptionBehaviorToStr(UseExcept);
assert(ExceptStr.hasValue() && "Garbage strict exception behavior!");
auto *ExceptMDS = MDString::get(Context, ExceptStr.getValue());

return MetadataAsValue::get(Context, ExceptMDS);
}

Value *GetConstrainedFPRounding(LLVMContext &Context,
fp::RoundingMode UseRounding) {
Optional<StringRef> RoundingStr = RoundingModeToStr(UseRounding);
assert(RoundingStr.hasValue() && "Garbage strict rounding mode!");
auto *RoundingMDS = MDString::get(Context, RoundingStr.getValue());

return MetadataAsValue::get(Context, RoundingMDS);
}

} // namespace llvm
54 changes: 54 additions & 0 deletions llvm/lib/IR/IRBuilder.cpp
@@ -521,6 +521,60 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
return createCallHelper(TheFn, Ops, this, Name);
}

/// Create a call to a vector-predicated intrinsic (VP).
/// \p OC - The LLVM IR Opcode of the operation
/// \p Params - Intrinsic operand list
/// \p FMFSource - Copy source for Fast Math Flags
/// \p Name - name of the result variable
Instruction *IRBuilderBase::CreateVectorPredicatedInst(unsigned OC,
ArrayRef<Value *> Params,
Instruction *FMFSource,
const Twine &Name) {

Module *M = BB->getParent()->getParent();

Intrinsic::ID VPID = VPIntrinsic::GetForOpcode(OC);
auto VPFunc = VPIntrinsic::GetDeclarationForParams(M, VPID, Params);
auto *VPCall = createCallHelper(VPFunc, Params, this, Name);

// transfer fast math flags
if (FMFSource && isa<FPMathOperator>(FMFSource)) {
VPCall->copyFastMathFlags(FMFSource);
}

return VPCall;
}
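// Usage sketch (illustrative only, not part of this change): emitting a
// predicated fadd through the helper above. The fp metadata arguments mirror
// the llvm.vp.fadd signature used in the tests and assume the default fp
// environment; the helper name and the visibility of the FPEnv helpers added
// by this patch are assumptions.
static Value *emitVPFAdd(IRBuilderBase &B, Value *A, Value *C, Value *Mask,
                         Value *EVL, Instruction *FMFSource) {
  LLVMContext &Ctx = B.getContext();
  SmallVector<Value *, 6> Params = {
      A, C, GetConstrainedFPRounding(Ctx, fp::RoundingMode::rmToNearest),
      GetConstrainedFPExcept(Ctx, fp::ExceptionBehavior::ebIgnore), Mask, EVL};
  return B.CreateVectorPredicatedInst(Instruction::FAdd, Params, FMFSource,
                                      "vp.fadd");
}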

/// Create a call to a vector-predicated comparison intrinsic (VP).
/// \p Pred - comparison predicate
/// \p FirstParam - First vector operand
/// \p SndParam - Second vector operand
/// \p MaskParam - Mask operand
/// \p VectorLengthParam - Vector length operand
/// \p Name - name of the result variable
Instruction *IRBuilderBase::CreateVectorPredicatedCmp(
CmpInst::Predicate Pred, Value *FirstParam, Value *SndParam,
Value *MaskParam, Value *VectorLengthParam, const Twine &Name) {

Module *M = BB->getParent()->getParent();

// Encode the comparison predicate as an i8 immediate operand.
uint8_t RawPred = static_cast<uint8_t>(Pred);
auto Int8Ty = Type::getInt8Ty(getContext());
auto PredParam = ConstantInt::get(Int8Ty, RawPred, false);

Intrinsic::ID VPID = FirstParam->getType()->isIntOrIntVectorTy()
? Intrinsic::vp_icmp
: Intrinsic::vp_fcmp;

auto VPFunc = VPIntrinsic::GetDeclarationForParams(
M, VPID, {FirstParam, SndParam, PredParam, MaskParam, VectorLengthParam});

return createCallHelper(
VPFunc, {FirstParam, SndParam, PredParam, MaskParam, VectorLengthParam},
this, Name);
}
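// Usage sketch (illustrative only, not part of this change): a caller
// emitting a predicated integer equality comparison with the helper above.
static Value *emitVPICmpEQ(IRBuilderBase &B, Value *A, Value *C, Value *Mask,
                           Value *EVL) {
  // Produces a call to llvm.vp.icmp with the predicate encoded as an i8.
  return B.CreateVectorPredicatedCmp(CmpInst::ICMP_EQ, A, C, Mask, EVL,
                                     "vp.cmp");
}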

/// Create a call to a Masked Gather intrinsic.
/// \p Ptrs - vector of pointers for loading
/// \p Align - alignment for one element
523 changes: 498 additions & 25 deletions llvm/lib/IR/IntrinsicInst.cpp

Large diffs are not rendered by default.

115 changes: 115 additions & 0 deletions llvm/lib/IR/PredicatedInst.cpp
@@ -0,0 +1,115 @@
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instruction.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/PredicatedInst.h>

namespace {
using namespace llvm;
using ShortValueVec = SmallVector<Value *, 4>;
} // namespace

namespace llvm {

bool PredicatedInstruction::canIgnoreVectorLengthParam() const {
auto VPI = dyn_cast<VPIntrinsic>(this);
if (!VPI)
return true;

return VPI->canIgnoreVectorLengthParam();
}

FastMathFlags PredicatedInstruction::getFastMathFlags() const {
return cast<Instruction>(this)->getFastMathFlags();
}

void PredicatedOperator::copyIRFlags(const Value *V, bool IncludeWrapFlags) {
auto *I = dyn_cast<Instruction>(this);
if (I)
I->copyIRFlags(V, IncludeWrapFlags);
}

bool
PredicatedInstruction::isVectorReduction() const {
auto VPI = dyn_cast<VPIntrinsic>(this);
if (VPI) {
return VPI->isReductionOp();
}
auto II = dyn_cast<IntrinsicInst>(this);
if (!II) return false;

switch (II->getIntrinsicID()) {
default:
return false;

case Intrinsic::experimental_vector_reduce_add:
case Intrinsic::experimental_vector_reduce_mul:
case Intrinsic::experimental_vector_reduce_and:
case Intrinsic::experimental_vector_reduce_or:
case Intrinsic::experimental_vector_reduce_xor:
case Intrinsic::experimental_vector_reduce_smin:
case Intrinsic::experimental_vector_reduce_smax:
case Intrinsic::experimental_vector_reduce_umin:
case Intrinsic::experimental_vector_reduce_umax:
case Intrinsic::experimental_vector_reduce_v2_fadd:
case Intrinsic::experimental_vector_reduce_v2_fmul:
case Intrinsic::experimental_vector_reduce_fmin:
case Intrinsic::experimental_vector_reduce_fmax:
return true;
}
}

Instruction *PredicatedBinaryOperator::Create(
Module *Mod, Value *Mask, Value *VectorLen, Instruction::BinaryOps Opc,
Value *V1, Value *V2, const Twine &Name, BasicBlock *InsertAtEnd,
Instruction *InsertBefore) {
assert(!(InsertAtEnd && InsertBefore));
auto VPID = VPIntrinsic::GetForOpcode(Opc);

// Default Code Path
if ((!Mod || (!Mask && !VectorLen)) || VPID == Intrinsic::not_intrinsic) {
if (InsertAtEnd) {
return BinaryOperator::Create(Opc, V1, V2, Name, InsertAtEnd);
} else {
return BinaryOperator::Create(Opc, V1, V2, Name, InsertBefore);
}
}

assert(Mod && "Need a module to emit VP Intrinsics");

// Fetch the VP intrinsic
auto &VecTy = cast<VectorType>(*V1->getType());
auto TypeTokens = VPIntrinsic::GetTypeTokens(VPID);
auto *VPFunc = Intrinsic::getDeclaration(
Mod, VPID,
VPIntrinsic::EncodeTypeTokens(TypeTokens, &VecTy, nullptr, VecTy));

// Encode default environment fp behavior
LLVMContext &Ctx = V1->getContext();
SmallVector<Value *, 6> BinOpArgs({V1, V2});
if (VPIntrinsic::HasRoundingModeParam(VPID)) {
BinOpArgs.push_back(
GetConstrainedFPRounding(Ctx, fp::RoundingMode::rmToNearest));
}
if (VPIntrinsic::HasExceptionBehaviorParam(VPID)) {
BinOpArgs.push_back(
GetConstrainedFPExcept(Ctx, fp::ExceptionBehavior::ebIgnore));
}

BinOpArgs.push_back(Mask);
BinOpArgs.push_back(VectorLen);

CallInst *CI;
if (InsertAtEnd) {
CI = CallInst::Create(VPFunc, BinOpArgs, Name, InsertAtEnd);
} else {
CI = CallInst::Create(VPFunc, BinOpArgs, Name, InsertBefore);
}

// The VP inst does not touch memory if the exception behavior is
// "fpexcept.ignore".
CI->setDoesNotAccessMemory();
return CI;
}
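// Usage sketch (illustrative only, not part of this change): when the module
// is non-null, Mask or VectorLen is set, and a VP mapping exists for the
// opcode, this emits a call to llvm.vp.fsub; otherwise it falls back to a
// plain BinaryOperator.
static Instruction *createPredicatedFSub(Module *M, Value *A, Value *B,
                                         Value *Mask, Value *EVL,
                                         Instruction *InsertBefore) {
  return PredicatedBinaryOperator::Create(M, Mask, EVL, Instruction::FSub, A,
                                          B, "pred.fsub",
                                          /*InsertAtEnd=*/nullptr,
                                          InsertBefore);
}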

} // namespace llvm
182 changes: 182 additions & 0 deletions llvm/lib/IR/VPBuilder.cpp
@@ -0,0 +1,182 @@
#include <llvm/ADT/SmallVector.h>
#include <llvm/IR/FPEnv.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/PredicatedInst.h>
#include <llvm/IR/VPBuilder.h>

namespace {
using namespace llvm;
using ShortTypeVec = VPIntrinsic::ShortTypeVec;
using ShortValueVec = SmallVector<Value *, 4>;
} // namespace

namespace llvm {

Module &VPBuilder::getModule() const {
return *Builder.GetInsertBlock()->getParent()->getParent();
}

Value &VPBuilder::RequestPred() {
if (Mask)
return *Mask;

auto *boolTy = Builder.getInt1Ty();
auto *maskTy = VectorType::get(boolTy, StaticVectorLength);
return *ConstantInt::getAllOnesValue(maskTy);
}

Value &VPBuilder::RequestEVL() {
if (ExplicitVectorLength)
return *ExplicitVectorLength;

auto *intTy = Builder.getInt32Ty();
return *ConstantInt::get(intTy, StaticVectorLength);
}

Value *VPBuilder::CreateVectorCopy(Instruction &Inst, ValArray VecOpArray) {
auto OC = Inst.getOpcode();
auto VPID = VPIntrinsic::GetForOpcode(OC);
if (VPID == Intrinsic::not_intrinsic) {
return nullptr;
}

Optional<int> MaskPosOpt = VPIntrinsic::GetMaskParamPos(VPID);
Optional<int> VLenPosOpt = VPIntrinsic::GetVectorLengthParamPos(VPID);
Optional<int> FPRoundPosOpt = VPIntrinsic::GetRoundingModeParamPos(VPID);
Optional<int> FPExceptPosOpt =
VPIntrinsic::GetExceptionBehaviorParamPos(VPID);

Optional<int> CmpPredPos = None;
if (isa<CmpInst>(Inst)) {
CmpPredPos = 2;
}

// TODO transfer alignment

// construct VP vector operands (including pred and evl)
SmallVector<Value *, 6> VecParams;
for (size_t i = 0; i < Inst.getNumOperands() + 5; ++i) {
if (MaskPosOpt && (i == (size_t)MaskPosOpt.getValue())) {
// First operand of select is mask (singular exception)
if (VPID != Intrinsic::vp_select)
VecParams.push_back(&RequestPred());
}
if (VLenPosOpt && (i == (size_t)VLenPosOpt.getValue())) {
VecParams.push_back(&RequestEVL());
}
if (FPRoundPosOpt && (i == (size_t)FPRoundPosOpt.getValue())) {
// TODO decode fp env from constrained intrinsics
VecParams.push_back(GetConstrainedFPRounding(
Builder.getContext(), fp::RoundingMode::rmToNearest));
}
if (FPExceptPosOpt && (i == (size_t)FPExceptPosOpt.getValue())) {
// TODO decode fp env from constrained intrinsics
VecParams.push_back(GetConstrainedFPExcept(
Builder.getContext(), fp::ExceptionBehavior::ebIgnore));
}
if (CmpPredPos && (i == (size_t)CmpPredPos.getValue())) {
auto &CmpI = cast<CmpInst>(Inst);
VecParams.push_back(ConstantInt::get(
Type::getInt8Ty(Builder.getContext()), CmpI.getPredicate()));
}
if (i < VecOpArray.size())
VecParams.push_back(VecOpArray[i]);
}

Type *ScaRetTy = Inst.getType();
Type *VecRetTy = ScaRetTy->isVoidTy() ? ScaRetTy : &getVectorType(*ScaRetTy);
auto &M = *Builder.GetInsertBlock()->getParent()->getParent();
auto VPDecl =
VPIntrinsic::GetDeclarationForParams(&M, VPID, VecParams, VecRetTy);

return Builder.CreateCall(VPDecl, VecParams, Inst.getName() + ".vp");
}
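// For instance (illustrative): widening a scalar fadd this way yields a call
// to llvm.vp.fadd whose operands are the two widened values followed by the
// default rounding-mode and exception-behavior metadata, the requested mask
// (all-ones if none was set) and the explicit vector length, inserted at the
// positions reported by VPIntrinsics.def.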

VectorType &VPBuilder::getVectorType(Type &ElementTy) {
return *VectorType::get(&ElementTy, StaticVectorLength);
}

Value &VPBuilder::CreateContiguousStore(Value &Val, Value &ElemPointer,
MaybeAlign AlignOpt) {
auto &PointerTy = cast<PointerType>(*ElemPointer.getType());
auto &VecTy = getVectorType(*PointerTy.getPointerElementType());
auto *VecPtrTy = VecTy.getPointerTo(PointerTy.getAddressSpace());
auto *VecPtr = Builder.CreatePointerCast(&ElemPointer, VecPtrTy);

auto *StoreFunc = Intrinsic::getDeclaration(&getModule(), Intrinsic::vp_store,
{&VecTy, VecPtrTy});
ShortValueVec Args{&Val, VecPtr, &RequestPred(), &RequestEVL()};
CallInst &StoreCall = *Builder.CreateCall(StoreFunc, Args);
if (AlignOpt.hasValue()) {
unsigned PtrPos =
VPIntrinsic::GetMemoryPointerParamPos(Intrinsic::vp_store).getValue();
StoreCall.addParamAttr(
PtrPos, Attribute::getWithAlignment(getContext(), AlignOpt.getValue()));
}
return StoreCall;
}

Value &VPBuilder::CreateContiguousLoad(Value &ElemPointer,
MaybeAlign AlignOpt) {
auto &PointerTy = cast<PointerType>(*ElemPointer.getType());
auto &VecTy = getVectorType(*PointerTy.getPointerElementType());
auto *VecPtrTy = VecTy.getPointerTo(PointerTy.getAddressSpace());
auto *VecPtr = Builder.CreatePointerCast(&ElemPointer, VecPtrTy);

auto *LoadFunc = Intrinsic::getDeclaration(&getModule(), Intrinsic::vp_load,
{&VecTy, VecPtrTy});
ShortValueVec Args{VecPtr, &RequestPred(), &RequestEVL()};
CallInst &LoadCall = *Builder.CreateCall(LoadFunc, Args);
if (AlignOpt.hasValue()) {
unsigned PtrPos =
VPIntrinsic::GetMemoryPointerParamPos(Intrinsic::vp_load).getValue();
LoadCall.addParamAttr(
PtrPos, Attribute::getWithAlignment(getContext(), AlignOpt.getValue()));
}
return LoadCall;
}

Value &VPBuilder::CreateScatter(Value &Val, Value &PointerVec,
MaybeAlign AlignOpt) {
auto *ScatterFunc =
Intrinsic::getDeclaration(&getModule(), Intrinsic::vp_scatter,
{Val.getType(), PointerVec.getType()});
ShortValueVec Args{&Val, &PointerVec, &RequestPred(), &RequestEVL()};
CallInst &ScatterCall = *Builder.CreateCall(ScatterFunc, Args);
if (AlignOpt.hasValue()) {
unsigned PtrPos =
VPIntrinsic::GetMemoryPointerParamPos(Intrinsic::vp_scatter).getValue();
ScatterCall.addParamAttr(
PtrPos, Attribute::getWithAlignment(getContext(), AlignOpt.getValue()));
}
return ScatterCall;
}

Value &VPBuilder::CreateGather(Value &PointerVec, MaybeAlign AlignOpt) {
auto &PointerVecTy = cast<VectorType>(*PointerVec.getType());
auto &ElemTy = *cast<PointerType>(*PointerVecTy.getVectorElementType())
.getPointerElementType();
auto &VecTy = *VectorType::get(&ElemTy, PointerVecTy.getNumElements());
auto *GatherFunc = Intrinsic::getDeclaration(
&getModule(), Intrinsic::vp_gather, {&VecTy, &PointerVecTy});

ShortValueVec Args{&PointerVec, &RequestPred(), &RequestEVL()};
CallInst &GatherCall = *Builder.CreateCall(GatherFunc, Args);
if (AlignOpt.hasValue()) {
unsigned PtrPos =
VPIntrinsic::GetMemoryPointerParamPos(Intrinsic::vp_gather).getValue();
GatherCall.addParamAttr(
PtrPos, Attribute::getWithAlignment(getContext(), AlignOpt.getValue()));
}
return GatherCall;
}

Value *VPBuilder::CreateVectorShift(Value *SrcVal, Value *Amount, Twine Name) {
auto D = VPIntrinsic::GetDeclarationForParams(
&getModule(), Intrinsic::vp_vshift, {SrcVal, Amount});
return Builder.CreateCall(D, {SrcVal, Amount, &RequestPred(), &RequestEVL()},
Name);
}

} // namespace llvm
45 changes: 45 additions & 0 deletions llvm/lib/IR/Verifier.cpp
@@ -93,6 +93,7 @@
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -477,6 +478,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
void visitUserOp2(Instruction &I) { visitUserOp1(I); }
void visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call);
void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI);
void visitVPIntrinsic(VPIntrinsic &FPI);
void visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII);
void visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI);
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
@@ -1705,11 +1707,14 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
if (Attrs.isEmpty())
return;

bool SawMask = false;
bool SawNest = false;
bool SawPassthru = false;
bool SawReturned = false;
bool SawSRet = false;
bool SawSwiftSelf = false;
bool SawSwiftError = false;
bool SawVectorLength = false;

// Verify return value attributes.
AttributeSet RetAttrs = Attrs.getRetAttributes();
@@ -1778,12 +1783,33 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
SawSwiftError = true;
}

if (ArgAttrs.hasAttribute(Attribute::VectorLength)) {
Assert(!SawVectorLength, "Cannot have multiple 'vlen' parameters!",
V);
SawVectorLength = true;
}

if (ArgAttrs.hasAttribute(Attribute::Passthru)) {
Assert(!SawPassthru, "Cannot have multiple 'passthru' parameters!",
V);
SawPassthru = true;
}

if (ArgAttrs.hasAttribute(Attribute::Mask)) {
Assert(!SawMask, "Cannot have multiple 'mask' parameters!",
V);
SawMask = true;
}

if (ArgAttrs.hasAttribute(Attribute::InAlloca)) {
Assert(i == FT->getNumParams() - 1,
"inalloca isn't on the last parameter!", V);
}
}

Assert(!SawPassthru || SawMask,
"Cannot have 'passthru' parameter without 'mask' parameter!", V);

if (!Attrs.hasAttributes(AttributeList::FunctionIndex))
return;

@@ -4371,6 +4397,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
#include "llvm/IR/ConstrainedOps.def"
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
break;

#define REGISTER_VP_INTRINSIC(VPID,MASKPOS,VLENPOS) \
case Intrinsic::VPID:
#include "llvm/IR/VPIntrinsics.def"
visitVPIntrinsic(cast<VPIntrinsic>(Call));
break;

case Intrinsic::dbg_declare: // llvm.dbg.declare
Assert(isa<MetadataAsValue>(Call.getArgOperand(0)),
"invalid llvm.dbg.declare intrinsic call 1", Call);
@@ -4813,6 +4846,18 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
return nullptr;
}

void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
Assert(!VPI.isConstrainedOp(),
"VP intrinsics only support the default fp environment for now "
"(round.tonearest; fpexcept.ignore).",
&VPI);
// Once a constrained fp environment is permitted on VP intrinsics, also
// check that the fp metadata arguments are well formed.
if (VPI.isConstrainedOp()) {
Assert(VPI.getExceptionBehavior() != None,
"invalid exception behavior argument", &VPI);
Assert(VPI.getRoundingMode() != None, "invalid rounding mode argument",
&VPI);
}
}

void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
unsigned NumOperands;
bool HasRoundingMD;
107 changes: 68 additions & 39 deletions llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -24,6 +24,9 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/PredicatedInst.h"
#include "llvm/IR/VPBuilder.h"
#include "llvm/IR/MatcherCast.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/AlignOf.h"
@@ -2054,29 +2057,33 @@

/// This eliminates floating-point negation in either 'fneg(X)' or
/// 'fsub(-0.0, X)' form by combining into a constant operand.
template<typename MatchContextType>
static Instruction *foldFNegIntoConstant(Instruction &I) {
Value *X;
Constant *C;

MatchContextType MC(cast<Value>(&I));
MatchContextBuilder<MatchContextType> MCBuilder(MC);

// Fold negation into constant operand. This is limited with one-use because
// fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
// -(X * C) --> X * (-C)
// FIXME: It's arguable whether these should be m_OneUse or not. The current
// belief is that the FNeg allows for better reassociation opportunities.
if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
// -(X * C) --> X * (-C)
if (MC.try_match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
return MCBuilder.CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
// -(X / C) --> X / (-C)
if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
if (MC.try_match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
return MCBuilder.CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
// -(C / X) --> (-C) / X
if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
if (MC.try_match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
return MCBuilder.CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);

// With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
// -(X + C) --> -X + -C --> -C - X
if (I.hasNoSignedZeros() &&
match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
MC.try_match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
return MCBuilder.CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);

return nullptr;
}
@@ -2104,7 +2111,7 @@ Instruction *InstCombiner::visitFNeg(UnaryOperator &I) {
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);

if (Instruction *X = foldFNegIntoConstant(I))
if (Instruction *X = foldFNegIntoConstant<EmptyContext>(I))
return X;

Value *X, *Y;
@@ -2120,6 +2127,17 @@
return nullptr;
}

Instruction *InstCombiner::visitPredicatedFSub(PredicatedBinaryOperator& I) {
auto * Inst = cast<Instruction>(&I);
PredicatedContext PC(&I);
if (Value *V = SimplifyPredicatedFSubInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(Inst), PC))
return replaceInstUsesWith(*Inst, V);

return visitFSubGeneric<Instruction, PredicatedContext>(*Inst);
}

Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
@@ -2129,6 +2147,14 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;

return visitFSubGeneric<BinaryOperator, EmptyContext>(I);
}

template<typename BinaryOpTy, typename MatchContextType>
Instruction *InstCombiner::visitFSubGeneric(BinaryOpTy &I) {
MatchContextType MC(cast<Value>(&I));
MatchContextBuilder<MatchContextType> MCBuilder(MC);

// Subtraction from -0.0 is the canonical form of fneg.
// fsub -0.0, X ==> fneg X
// fsub nsz 0.0, X ==> fneg nsz X
@@ -2137,10 +2163,10 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
// fsub -0.0, Denorm ==> +-0
// fneg Denorm ==> -Denorm
Value *Op;
if (match(&I, m_FNeg(m_Value(Op))))
return UnaryOperator::CreateFNegFMF(Op, &I);
if (MC.try_match(&I, m_FNeg(m_Value(Op))))
return MCBuilder.CreateFNegFMF(Op, &I);

if (Instruction *X = foldFNegIntoConstant(I))
if (Instruction *X = foldFNegIntoConstant<MatchContextType>(I))
return X;

if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
@@ -2157,17 +2183,17 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
// killed later. We still limit that particular transform with 'hasOneUse'
// because an fneg is assumed better/cheaper than a generic fsub.
if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
if (MC.try_match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
Value *NewSub = MCBuilder.CreateFSubFMF(Builder, Y, X, &I);
return MCBuilder.CreateFAddFMF(Op0, NewSub, &I);
}
}

// (-X) - Op1 --> -(X + Op1)
if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
return UnaryOperator::CreateFNegFMF(FAdd, &I);
MC.try_match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
Value *FAdd = MCBuilder.CreateFAddFMF(Builder, X, Op1, &I);
return MCBuilder.CreateFNegFMF(FAdd, &I);
}

if (isa<Constant>(Op0))
@@ -2178,22 +2204,22 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
// X - C --> X + (-C)
// But don't transform constant expressions because there's an inverse fold
// for X + (-Y) --> X - Y.
if (match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1))
return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
if (MC.try_match(Op1, m_Constant(C)) && !isa<ConstantExpr>(Op1))
return MCBuilder.CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);

// X - (-Y) --> X + Y
if (match(Op1, m_FNeg(m_Value(Y))))
return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
if (MC.try_match(Op1, m_FNeg(m_Value(Y))))
return MCBuilder.CreateFAddFMF(Op0, Y, &I);

// Similar to above, but look through a cast of the negated value:
// X - (fptrunc(-Y)) --> X + fptrunc(Y)
Type *Ty = I.getType();
if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
if (MC.try_match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
return MCBuilder.CreateFAddFMF(Op0, MCBuilder.CreateFPTrunc(Builder, Y, Ty), &I);

// X - (fpext(-Y)) --> X + fpext(Y)
if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
if (MC.try_match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
return MCBuilder.CreateFAddFMF(Op0, MCBuilder.CreateFPExt(Builder, Y, Ty), &I);

// Similar to above, but look through fmul/fdiv of the negated value:
// Op0 - (-X * Y) --> Op0 + (X * Y)
@@ -2211,32 +2237,35 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
}

// Handle special cases for FSub with selects feeding the operation
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
if (auto * PlainBinOp = dyn_cast<BinaryOperator>(&I))
if (Value *V = SimplifySelectsFeedingBinaryOp(*PlainBinOp, Op0, Op1))
return replaceInstUsesWith(I, V);

if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
// (Y - X) - Y --> -X
if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
return UnaryOperator::CreateFNegFMF(X, &I);
if (MC.try_match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
return MCBuilder.CreateFNegFMF(X, &I);

// Y - (X + Y) --> -X
// Y - (Y + X) --> -X
if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
return UnaryOperator::CreateFNegFMF(X, &I);
if (MC.try_match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
return MCBuilder.CreateFNegFMF(X, &I);

// (X * C) - X --> X * (C - 1.0)
if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
if (MC.try_match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
return MCBuilder.CreateFMulFMF(Op1, CSubOne, &I);
}
// X - (X * C) --> X * (1.0 - C)
if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
if (MC.try_match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
return MCBuilder.CreateFMulFMF(Op0, OneSubC, &I);
}

if (Instruction *F = factorizeFAddFSub(I, Builder))
return F;
if (auto * PlainBinOp = dyn_cast<BinaryOperator>(&I)) {
if (Instruction *F = factorizeFAddFSub(*PlainBinOp, Builder))
return F;
}

// TODO: This performs reassociative folds for FP ops. Some fraction of the
// functionality has been subsumed by simple pattern matching here and in
12 changes: 11 additions & 1 deletion llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PredicatedInst.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
@@ -1878,6 +1879,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return &CI;
}

// Predicated (VP) instruction patterns.
if (auto *VPInst = dyn_cast<VPIntrinsic>(&CI)) {
auto *PredInst = cast<PredicatedInstruction>(VPInst);
if (Instruction *Result = visitPredicatedInstruction(PredInst))
return Result;
}

IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
if (!II) return visitCallBase(CI);

@@ -1954,7 +1963,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}

// For vector result intrinsics, use the generic demanded vector support.
// For vector result intrinsics, use the generic demanded vector support to
// simplify any operands before moving on to the per-intrinsic rules.
if (II->getType()->isVectorTy()) {
auto VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
13 changes: 13 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -30,6 +30,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PredicatedInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Use.h"
@@ -369,6 +370,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
Value *OptimizePointerDifference(
Value *LHS, Value *RHS, Type *Ty, bool isNUW);
Instruction *visitSub(BinaryOperator &I);
template <typename BinaryOpTy, typename MatchContextType>
Instruction *visitFSubGeneric(BinaryOpTy &I);
Instruction *visitPredicatedFSub(PredicatedBinaryOperator &I);
Instruction *visitFSub(BinaryOperator &I);
Instruction *visitMul(BinaryOperator &I);
Instruction *visitFMul(BinaryOperator &I);
@@ -445,6 +448,16 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
Instruction *visitVAEndInst(VAEndInst &I);
Instruction *visitFreeze(FreezeInst &I);

// Entry point for vector-predicated (VP) instruction patterns.
Instruction *visitPredicatedInstruction(PredicatedInstruction * PI) {
switch (PI->getOpcode()) {
default:
return nullptr;
case Instruction::FSub:
return visitPredicatedFSub(cast<PredicatedBinaryOperator>(*PI));
}
}

/// Specify what to return for unhandled instructions.
Instruction *visitInstruction(Instruction &I) { return nullptr; }

3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -863,6 +863,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::InaccessibleMemOnly:
case Attribute::InaccessibleMemOrArgMemOnly:
case Attribute::JumpTable:
case Attribute::Mask:
case Attribute::Naked:
case Attribute::Nest:
case Attribute::NoAlias:
@@ -872,6 +873,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::NoSync:
case Attribute::None:
case Attribute::NonNull:
case Attribute::Passthru:
case Attribute::ReadNone:
case Attribute::ReadOnly:
case Attribute::Returned:
@@ -883,6 +885,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::SwiftError:
case Attribute::SwiftSelf:
case Attribute::WillReturn:
case Attribute::VectorLength:
case Attribute::WriteOnly:
case Attribute::ZExt:
case Attribute::ImmArg:
5 changes: 5 additions & 0 deletions llvm/test/Bitcode/attributes.ll
@@ -374,6 +374,11 @@ define void @f63() sanitize_memtag
ret void;
}

; CHECK: define <8 x double> @f64(<8 x double> passthru %0, <8 x i1> mask %1, i32 vlen %2) {
define <8 x double> @f64(<8 x double> passthru, <8 x i1> mask, i32 vlen) {
ret <8 x double> undef
}

; CHECK: attributes #0 = { noreturn }
; CHECK: attributes #1 = { nounwind }
; CHECK: attributes #2 = { readnone }
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -25,6 +25,7 @@
; CHECK-NEXT: Lower constant intrinsics
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: AArch64 Stack Tagging
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -50,6 +50,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Dominator Tree Construction
1 change: 1 addition & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -35,6 +35,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Dominator Tree Construction
245 changes: 245 additions & 0 deletions llvm/test/CodeGen/Generic/expand-vp.ll

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -28,6 +28,7 @@
; CHECK-NEXT: Lower constant intrinsics
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Expand indirectbr instructions
1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -47,6 +47,7 @@
; CHECK-NEXT: Constant Hoisting
; CHECK-NEXT: Partially inline calls to library functions
; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Dominator Tree Construction
45 changes: 45 additions & 0 deletions llvm/test/Transforms/InstCombine/vp-fsub.ll
@@ -0,0 +1,45 @@
; RUN: opt < %s -instcombine -S | FileCheck %s

; PR4374

define <4 x float> @test1_vp(<4 x float> %x, <4 x float> %y, <4 x i1> %M, i32 %L) {
; CHECK-LABEL: @test1_vp(
;
%t1 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
%t2 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, <4 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
ret <4 x float> %t2
}

; Can't do anything with the test above because -0.0 - 0.0 = -0.0, but if we have nsz:
; -(X - Y) --> Y - X

; TODO predicated FAdd folding
define <4 x float> @neg_sub_nsz_vp(<4 x float> %x, <4 x float> %y, <4 x i1> %M, i32 %L) {
; CH***-LABEL: @neg_sub_nsz_vp(
;
%t1 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
%t2 = call nsz <4 x float> @llvm.vp.fsub.v4f32(<4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, <4 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
ret <4 x float> %t2
}

; With nsz: Z - (X - Y) --> Z + (Y - X)

define <4 x float> @sub_sub_nsz_vp(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x i1> %M, i32 %L) {
; CHECK-LABEL: @sub_sub_nsz_vp(
; CHECK-NEXT: %1 = call nsz <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %y, <4 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #
; CHECK-NEXT: %t2 = call nsz <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %z, <4 x float> %1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #
; CHECK-NEXT: ret <4 x float> %t2
%t1 = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %x, <4 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
%t2 = call nsz <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %z, <4 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <4 x i1> %M, i32 %L) #0
ret <4 x float> %t2
}



; Function Attrs: nounwind readnone
declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata, <4 x i1> mask, i32 vlen)

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, metadata, metadata, <4 x i1> mask, i32 vlen)

attributes #0 = { readnone }
55 changes: 55 additions & 0 deletions llvm/test/Transforms/InstSimplify/vp-fsub.ll
@@ -0,0 +1,55 @@
; RUN: opt < %s -instsimplify -S | FileCheck %s

define <8 x double> @fsub_fadd_fold_vp_xy(<8 x double> %x, <8 x double> %y, <8 x i1> %m, i32 %len) {
; CHECK-LABEL: fsub_fadd_fold_vp_xy
; CHECK: ret <8 x double> %x
%tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %x, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
%res0 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
ret <8 x double> %res0
}

define <8 x double> @fsub_fadd_fold_vp_zw(<8 x double> %z, <8 x double> %w, <8 x i1> %m, i32 %len) {
; CHECK-LABEL: fsub_fadd_fold_vp_zw
; CHECK: ret <8 x double> %z
%tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %w, <8 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
%res1 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %w, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
ret <8 x double> %res1
}

; REQUIRES-CONSTRAINED-VP: define <8 x double> @fsub_fadd_fold_vp_yx_fpexcept(<8 x double> %x, <8 x double> %y, <8 x i1> %m, i32 %len) #0 {
; REQUIRES-CONSTRAINED-VP: ; *HECK-LABEL: fsub_fadd_fold_vp_yx
; REQUIRES-CONSTRAINED-VP: ; *HECK-NEXT: %tmp =
; REQUIRES-CONSTRAINED-VP: ; *HECK-NEXT: %res2 =
; REQUIRES-CONSTRAINED-VP: ; *HECK-NEXT: ret
; REQUIRES-CONSTRAINED-VP: %tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %y, <8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict", <8 x i1> %m, i32 %len)
; REQUIRES-CONSTRAINED-VP: %res2 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict", <8 x i1> %m, i32 %len)
; REQUIRES-CONSTRAINED-VP: ret <8 x double> %res2
; REQUIRES-CONSTRAINED-VP: }

define <8 x double> @fsub_fadd_fold_vp_yx_olen(<8 x double> %x, <8 x double> %y, <8 x i1> %m, i32 %len, i32 %otherLen) {
; CHECK-LABEL: fsub_fadd_fold_vp_yx_olen
; CHECK-NEXT: %tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %y, <8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %otherLen)
; CHECK-NEXT: %res3 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
; CHECK-NEXT: ret <8 x double> %res3
%tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %y, <8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %otherLen)
%res3 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
ret <8 x double> %res3
}

define <8 x double> @fsub_fadd_fold_vp_yx_omask(<8 x double> %x, <8 x double> %y, <8 x i1> %m, i32 %len, <8 x i1> %othermask) {
; CHECK-LABEL: fsub_fadd_fold_vp_yx_omask
; CHECK-NEXT: %tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %y, <8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
; CHECK-NEXT: %res4 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %othermask, i32 %len)
; CHECK-NEXT: ret <8 x double> %res4
%tmp = call reassoc nsz <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %y, <8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %len)
%res4 = call reassoc nsz <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %tmp, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %othermask, i32 %len)
ret <8 x double> %res4
}

; Function Attrs: nounwind readnone
declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)

; Function Attrs: nounwind readnone
declare <8 x double> @llvm.vp.fsub.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)

attributes #0 = { strictfp }
17 changes: 17 additions & 0 deletions llvm/test/Verifier/vp-intrinsics-constrained.ll
@@ -0,0 +1,17 @@
; RUN: not opt -S < %s |& FileCheck %s
; CHECK: VP intrinsics only support the default fp environment for now (round.tonearest; fpexcept.ignore).
; CHECK: error: input module is broken!

define void @test_vp_strictfp(<8 x double> %f0, <8 x double> %f1, <8 x double> %f2, <8 x double> %f3, <8 x i1> %m, i32 %n) #0 {
%r0 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.strict", <8 x i1> %m, i32 %n)
ret void
}

define void @test_vp_rounding(<8 x double> %f0, <8 x double> %f1, <8 x double> %f2, <8 x double> %f3, <8 x i1> %m, i32 %n) #0 {
%r0 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tozero", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
ret void
}

declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)

attributes #0 = { strictfp }
182 changes: 169 additions & 13 deletions llvm/test/Verifier/vp-intrinsics.ll
@@ -17,18 +17,174 @@ define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) {
ret void
}

define void @test_vp_constrainedfp(<8 x double> %f0, <8 x double> %f1, <8 x double> %f2, <8 x double> %f3, <8 x i1> %m, i32 %n) {
%r0 = call <8 x double> @llvm.vp.fadd.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r1 = call <8 x double> @llvm.vp.fsub.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r2 = call <8 x double> @llvm.vp.fmul.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r3 = call <8 x double> @llvm.vp.fdiv.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r4 = call <8 x double> @llvm.vp.frem.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r5 = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %f0, <8 x double> %f1, <8 x double> %f2, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r6 = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %f2, metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r7 = call <8 x double> @llvm.vp.minnum.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r8 = call <8 x double> @llvm.vp.maxnum.v8f64(<8 x double> %f0, <8 x double> %f1, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
ret void
}

define void @test_vp_fpcast(<8 x double> %x, <8 x i64> %y, <8 x float> %z, <8 x i1> %m, i32 %n) {
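; Note: fptosi, fptoui and fpext take only the exception-behavior metadata;
; the remaining conversions take both rounding-mode and exception metadata.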
%r0 = call <8 x i64> @llvm.vp.fptosi.v8i64v8f64(<8 x double> %x, metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r1 = call <8 x i64> @llvm.vp.fptoui.v8i64v8f64(<8 x double> %x, metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r2 = call <8 x double> @llvm.vp.sitofp.v8f64v8i64(<8 x i64> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r3 = call <8 x double> @llvm.vp.uitofp.v8f64v8i64(<8 x i64> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r4 = call <8 x double> @llvm.vp.rint.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r7 = call <8 x double> @llvm.vp.round.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rA = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rB = call <8 x double> @llvm.vp.ceil.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rC = call <8 x double> @llvm.vp.floor.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rD = call <8 x double> @llvm.vp.trunc.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rE = call <8 x float> @llvm.vp.fptrunc.v8f32v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%rF = call <8 x double> @llvm.vp.fpext.v8f64v8f32(<8 x float> %z, metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
ret void
}

define void @test_vp_fpfuncs(<8 x double> %x, <8 x double> %y, <8 x i1> %m, i32 %n) {
%r0 = call <8 x double> @llvm.vp.pow.v8f64(<8 x double> %x, <8 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r1 = call <8 x double> @llvm.vp.powi.v8f64(<8 x double> %x, i32 %n, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r2 = call <8 x double> @llvm.vp.sqrt.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r3 = call <8 x double> @llvm.vp.sin.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r4 = call <8 x double> @llvm.vp.cos.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r5 = call <8 x double> @llvm.vp.log.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r6 = call <8 x double> @llvm.vp.log10.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r7 = call <8 x double> @llvm.vp.log2.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r8 = call <8 x double> @llvm.vp.exp.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
%r9 = call <8 x double> @llvm.vp.exp2.v8f64(<8 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore", <8 x i1> %m, i32 %n)
ret void
}

define void @test_mem(<16 x i32*> %p0, <16 x i32>* %p1, <16 x i32> %i0, <16 x i1> %m, i32 %n) {
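; The contiguous store/load operate on a single <16 x i32>* base (%p1), while
; scatter/gather operate on a <16 x i32*> vector of pointers (%p0).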
call void @llvm.vp.store.v16i32.p0v16i32(<16 x i32> %i0, <16 x i32>* %p1, <16 x i1> %m, i32 %n)
call void @llvm.vp.scatter.v16i32.v16p0i32(<16 x i32> %i0 , <16 x i32*> %p0, <16 x i1> %m, i32 %n)
%l0 = call <16 x i32> @llvm.vp.load.v16i32.p0v16i32(<16 x i32>* %p1, <16 x i1> %m, i32 %n)
%l1 = call <16 x i32> @llvm.vp.gather.v16i32.v16p0i32(<16 x i32*> %p0, <16 x i1> %m, i32 %n)
ret void
}

define void @test_reduce_fp(<16 x float> %v, <16 x i1> %m, i32 %n) {
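; fadd/fmul reductions take an explicit scalar start value; fmin/fmax take
; only the vector operand (see the declarations further down).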
%r0 = call float @llvm.vp.reduce.fadd.v16f32(float 0.0, <16 x float> %v, <16 x i1> %m, i32 %n)
%r1 = call float @llvm.vp.reduce.fmul.v16f32(float 42.0, <16 x float> %v, <16 x i1> %m, i32 %n)
%r2 = call float @llvm.vp.reduce.fmin.v16f32(<16 x float> %v, <16 x i1> %m, i32 %n)
%r3 = call float @llvm.vp.reduce.fmax.v16f32(<16 x float> %v, <16 x i1> %m, i32 %n)
ret void
}

define void @test_reduce_int(<16 x i32> %v, <16 x i1> %m, i32 %n) {
%r0 = call i32 @llvm.vp.reduce.add.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r1 = call i32 @llvm.vp.reduce.mul.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r2 = call i32 @llvm.vp.reduce.and.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r3 = call i32 @llvm.vp.reduce.xor.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r4 = call i32 @llvm.vp.reduce.or.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r5 = call i32 @llvm.vp.reduce.smin.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r6 = call i32 @llvm.vp.reduce.smax.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r7 = call i32 @llvm.vp.reduce.umin.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
%r8 = call i32 @llvm.vp.reduce.umax.v16i32(<16 x i32> %v, <16 x i1> %m, i32 %n)
ret void
}

define void @test_shuffle(<16 x float> %v0, <16 x float> %v1, <16 x i1> %m, i32 %k, i32 %n) {
%r0 = call <16 x float> @llvm.vp.select.v16f32(<16 x i1> %m, <16 x float> %v0, <16 x float> %v1, i32 %n)
%r1 = call <16 x float> @llvm.vp.compose.v16f32(<16 x float> %v0, <16 x float> %v1, i32 %k, i32 %n)
%r2 = call <16 x float> @llvm.vp.vshift.v16f32(<16 x float> %v0, i32 %k, <16 x i1> %m, i32 %n)
%r3 = call <16 x float> @llvm.vp.compress.v16f32(<16 x float> %v0, <16 x i1> %m, i32 %n)
%r4 = call <16 x float> @llvm.vp.expand.v16f32(<16 x float> %v0, <16 x i1> %m, i32 %n)
ret void
}

define void @test_xcmp(<16 x i32> %i0, <16 x i32> %i1, <16 x float> %f0, <16 x float> %f1,<16 x i1> %m, i32 %n) {
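; The i8 immediates are assumed to encode CmpInst predicate values
; (38 = icmp sgt, 10 = fcmp ugt).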
%r0 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> %i0, <16 x i32> %i1, i8 38, <16 x i1> %m, i32 %n)
%r1 = call <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float> %f0, <16 x float> %f1, i8 10, <16 x i1> %m, i32 %n)
ret void
}

; integer arith
declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
; bit arith
declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)
declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen)

; floating point arith
declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fsub.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fma.v8f64(<8 x double>, <8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fneg.v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.minnum.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.maxnum.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)

; cast & conversions
declare <8 x i64> @llvm.vp.fptosi.v8i64v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen)
declare <8 x i64> @llvm.vp.fptoui.v8i64v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.sitofp.v8f64v8i64(<8 x i64>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.uitofp.v8f64v8i64(<8 x i64>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.trunc.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x float> @llvm.vp.fptrunc.v8f32v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.fpext.v8f64v8f32(<8 x float>, metadata, <8 x i1> mask, i32 vlen)

; math ops
declare <8 x double> @llvm.vp.pow.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.powi.v8f64(<8 x double>, i32, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.sqrt.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.sin.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.cos.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.log.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.log10.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.log2.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.exp.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)
declare <8 x double> @llvm.vp.exp2.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen)

; memory
declare void @llvm.vp.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, <16 x i1> mask, i32 vlen)
declare <16 x i32> @llvm.vp.load.v16i32.p0v16i32(<16 x i32>*, <16 x i1> mask, i32 vlen)
declare void @llvm.vp.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, <16 x i1> mask, i32 vlen)
declare <16 x i32> @llvm.vp.gather.v16i32.v16p0i32(<16 x i32*>, <16 x i1> mask, i32 vlen)

; reductions
declare float @llvm.vp.reduce.fadd.v16f32(float, <16 x float>, <16 x i1> mask, i32 vlen)
declare float @llvm.vp.reduce.fmul.v16f32(float, <16 x float>, <16 x i1> mask, i32 vlen)
declare float @llvm.vp.reduce.fmin.v16f32(<16 x float>, <16 x i1> mask, i32 vlen)
declare float @llvm.vp.reduce.fmax.v16f32(<16 x float>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.add.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.mul.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.and.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.xor.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.or.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.smax.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.smin.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.umax.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)
declare i32 @llvm.vp.reduce.umin.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen)

; shuffles
declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float>, i32 vlen)
declare <16 x float> @llvm.vp.compose.v16f32(<16 x float>, <16 x float>, i32, i32 vlen)
declare <16 x float> @llvm.vp.vshift.v16f32(<16 x float>, i32, <16 x i1>, i32 vlen)
declare <16 x float> @llvm.vp.compress.v16f32(<16 x float>, <16 x i1>, i32 vlen)
declare <16 x float> @llvm.vp.expand.v16f32(<16 x float>, <16 x i1> mask, i32 vlen)

; icmp , fcmp
declare <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32>, <16 x i32>, i8, <16 x i1> mask, i32 vlen)
declare <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float>, <16 x float>, i8, <16 x i1> mask, i32 vlen)
13 changes: 13 additions & 0 deletions llvm/test/Verifier/vp_attributes.ll
@@ -0,0 +1,13 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s

declare void @a(<16 x i1> mask %a, <16 x i1> mask %b)
; CHECK: Cannot have multiple 'mask' parameters!

declare void @b(<16 x i1> mask %a, i32 vlen %x, i32 vlen %y)
; CHECK: Cannot have multiple 'vlen' parameters!

declare <16 x double> @c(<16 x double> passthru %a)
; CHECK: Cannot have 'passthru' parameter without 'mask' parameter!

declare <16 x double> @d(<16 x double> passthru %a, <16 x i1> mask %M, <16 x double> passthru %b)
; CHECK: Cannot have multiple 'passthru' parameters!
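; For reference, a declaration that combines the attributes at most once each
; (and keeps 'passthru' together with 'mask') should presumably be accepted:
; declare <16 x double> @e(<16 x double> passthru %a, <16 x i1> mask %m, i32 vlen %n)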
1 change: 1 addition & 0 deletions llvm/tools/llc/llc.cpp
@@ -318,6 +318,7 @@ int main(int argc, char **argv) {
initializeVectorization(*Registry);
initializeScalarizeMaskedMemIntrinPass(*Registry);
initializeExpandReductionsPass(*Registry);
initializeExpandVectorPredicationPass(*Registry);
initializeHardwareLoopsPass(*Registry);

// Initialize debugging passes.
1 change: 1 addition & 0 deletions llvm/tools/opt/opt.cpp
@@ -594,6 +594,7 @@ int main(int argc, char **argv) {
initializePostInlineEntryExitInstrumenterPass(Registry);
initializeUnreachableBlockElimLegacyPassPass(Registry);
initializeExpandReductionsPass(Registry);
initializeExpandVectorPredicationPass(Registry);
initializeWasmEHPreparePass(Registry);
initializeWriteBitcodePassPass(Registry);
initializeHardwareLoopsPass(Registry);
99 changes: 86 additions & 13 deletions llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -32,19 +32,74 @@ class VPIntrinsicTest : public testing::Test {

std::unique_ptr<Module> CreateVPDeclarationModule() {
return parseAssemblyString(
" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ",
" declare <8 x double> @llvm.vp.fadd.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fsub.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fmul.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fdiv.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.frem.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fma.v8f64(<8 x double>, <8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fneg.v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.minnum.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.maxnum.v8f64(<8 x double>, <8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1> mask, i32 vlen) "
" declare void @llvm.vp.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, <16 x i1> mask, i32 vlen) "
" declare void @llvm.vp.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, <16 x i1> mask, i32 vlen) "
" declare <16 x i32> @llvm.vp.load.v16i32.p0v16i32(<16 x i32>*, <16 x i1> mask, i32 vlen) "
" declare <16 x i32> @llvm.vp.gather.v16i32.v16p0i32(<16 x i32*>, <16 x i1> mask, i32 vlen) "
" declare float @llvm.vp.reduce.fadd.v16f32(float, <16 x float>, <16 x i1> mask, i32 vlen) "
" declare float @llvm.vp.reduce.fmul.v16f32(float, <16 x float>, <16 x i1> mask, i32 vlen) "
" declare float @llvm.vp.reduce.fmin.v16f32(<16 x float>, <16 x i1> mask, i32 vlen) "
" declare float @llvm.vp.reduce.fmax.v16f32(<16 x float>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.add.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.mul.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.and.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.xor.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.or.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.smin.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.smax.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.umin.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare i32 @llvm.vp.reduce.umax.v16i32(<16 x i32>, <16 x i1> mask, i32 vlen) "
" declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float>, i32 vlen) "
" declare <16 x float> @llvm.vp.compose.v16f32(<16 x float>, <16 x float>, i32, i32 vlen) "
" declare <16 x float> @llvm.vp.vshift.v16f32(<16 x float>, i32, <16 x i1>, i32 vlen) "
" declare <16 x float> @llvm.vp.compress.v16f32(<16 x float>, <16 x i1>, i32 vlen) "
" declare <16 x float> @llvm.vp.expand.v16f32(<16 x float>, <16 x i1> mask, i32 vlen) "
" declare <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32>, <16 x i32>, i8, <16 x i1> mask, i32 vlen) "
" declare <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float>, <16 x float>, i8, <16 x i1> mask, i32 vlen) "
" declare <8 x i64> @llvm.vp.fptosi.v8i64v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x i64> @llvm.vp.fptoui.v8i64v8f64(<8 x double>, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.sitofp.v8f64v8i64(<8 x i64>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.uitofp.v8f64v8i64(<8 x i64>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.trunc.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x float> @llvm.vp.fptrunc.v8f32v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.fpext.v8f64v8f32(<8 x float>, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.pow.v8f64(<8 x double>, <8 x double> %y, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.powi.v8f64(<8 x double>, i32 %y, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.sqrt.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.sin.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.cos.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.log.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.log10.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.log2.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.exp.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) "
" declare <8 x double> @llvm.vp.exp2.v8f64(<8 x double>, metadata, metadata, <8 x i1> mask, i32 vlen) ",
Err, C);
}
};
@@ -116,6 +171,24 @@ TEST_F(VPIntrinsicTest, GetParamPos) {
Type *VecLenParamType = F.getArg(VecLenParamPos.getValue())->getType();
ASSERT_TRUE(VecLenParamType->isIntegerTy(32));
}

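// The Get*ParamPos queries below are assumed to return None for intrinsics
// that lack the corresponding operand, hence the hasValue() guards.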
Optional<int> MemPtrParamPos = VPIntrinsic::GetMemoryPointerParamPos(F.getIntrinsicID());
if (MemPtrParamPos.hasValue()) {
Type *MemPtrParamType = F.getArg(MemPtrParamPos.getValue())->getType();
ASSERT_TRUE(MemPtrParamType->isPtrOrPtrVectorTy());
}

Optional<int> RoundingParamPos = VPIntrinsic::GetRoundingModeParamPos(F.getIntrinsicID());
if (RoundingParamPos.hasValue()) {
Type *RoundingParamType = F.getArg(RoundingParamPos.getValue())->getType();
ASSERT_TRUE(RoundingParamType->isMetadataTy());
}

Optional<int> ExceptParamPos = VPIntrinsic::GetExceptionBehaviorParamPos(F.getIntrinsicID());
if (ExceptParamPos.hasValue()) {
Type *ExceptParamType = F.getArg(ExceptParamPos.getValue())->getType();
ASSERT_TRUE(ExceptParamType->isMetadataTy());
}
}
}

5 changes: 4 additions & 1 deletion llvm/utils/TableGen/CodeGenIntrinsics.h
@@ -149,7 +149,10 @@ struct CodeGenIntrinsic {
ReadOnly,
WriteOnly,
ReadNone,
ImmArg
ImmArg,
Mask,
VectorLength,
Passthru
};

std::vector<std::pair<unsigned, ArgAttribute>> ArgumentAttributes;
9 changes: 9 additions & 0 deletions llvm/utils/TableGen/CodeGenTarget.cpp
@@ -791,6 +791,15 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
} else if (Property->isSubClassOf("Returned")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, Returned));
} else if (Property->isSubClassOf("VectorLength")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, VectorLength));
} else if (Property->isSubClassOf("Mask")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, Mask));
} else if (Property->isSubClassOf("Passthru")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, Passthru));
} else if (Property->isSubClassOf("ReadOnly")) {
unsigned ArgNo = Property->getValueAsInt("ArgNo");
ArgumentAttributes.push_back(std::make_pair(ArgNo, ReadOnly));
18 changes: 18 additions & 0 deletions llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -687,6 +687,24 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
OS << "Attribute::Returned";
addComma = true;
break;
case CodeGenIntrinsic::VectorLength:
if (addComma)
OS << ",";
OS << "Attribute::VectorLength";
addComma = true;
break;
case CodeGenIntrinsic::Mask:
if (addComma)
OS << ",";
OS << "Attribute::Mask";
addComma = true;
break;
case CodeGenIntrinsic::Passthru:
if (addComma)
OS << ",";
OS << "Attribute::Passthru";
addComma = true;
break;
case CodeGenIntrinsic::ReadOnly:
if (addComma)
OS << ",";