From e3b60597769f79a8abc19fb8ef1f321d9adc1358 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 7 Apr 2020 14:45:16 +0100 Subject: [PATCH] [X86][SSE] combineX86ShufflesConstants - early out for zeroable vectors (PR45443) Shuffle combining can insert zero byte-sized elements into the shuffle mask, which combineX86ShufflesConstants will attempt to fold without taking into account whether the byte-sized type is legal (e.g. AVX512F-only targets). If we have a fully zeroable vector then we should just return a zero version of the root type, otherwise if the type isn't valid we should bail. Fixes PR45443 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++++++- llvm/test/CodeGen/X86/pr45443.ll | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/pr45443.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 777e376d3c4ff..a9db423caec2d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34749,6 +34749,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, return SDValue(); // Shuffle the constant bits according to the mask. + SDLoc DL(Root); APInt UndefElts(NumMaskElts, 0); APInt ZeroElts(NumMaskElts, 0); APInt ConstantElts(NumMaskElts, 0); @@ -34786,6 +34787,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, } assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); + // Attempt to create a zero vector. + if ((UndefElts | ZeroElts).isAllOnesValue()) + return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL); + // Create the constant data. 
MVT MaskSVT; if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) @@ -34794,8 +34799,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef Ops, MaskSVT = MVT::getIntegerVT(MaskSizeInBits); MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); + if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) + return SDValue(); - SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); return DAG.getBitcast(VT, CstOp); } diff --git a/llvm/test/CodeGen/X86/pr45443.ll b/llvm/test/CodeGen/X86/pr45443.ll new file mode 100644 index 0000000000000..1e40ab94e9ca4 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr45443.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64 + +define <16 x float> @PR45443() { +; CHECK-LABEL: PR45443: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 +; CHECK-NEXT: ret{{[l|q]}} +bb: + %tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> , <4 x i32> ) + %tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> , <16 x float> undef) + %tmp5 = icmp ult <16 x i32> %tmp, + %tmp6 = and <16 x i32> %tmp, + %tmp7 = icmp ne <16 x i32> %tmp6, zeroinitializer + %tmp8 = and <16 x i1> %tmp7, %tmp5 + %tmp9 = select fast <16 x i1> %tmp8, <16 x float> , <16 x float> %tmp4 + ret <16 x float> %tmp9 +} +declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) +declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>)