[X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) #133083
Conversation
…bvector(broadcast)) -> blend shuffle(base,broadcast)
If the broadcast is already the full vector width, try to prefer a blend over a vector insertion, which usually has lower latency (and sometimes a lower uop count).
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes: If the broadcast is already the full vector width, try to prefer a blend over a vector insertion, which usually has lower latency (and sometimes a lower uop count).
Full diff: https://github.com/llvm/llvm-project/pull/133083.diff
4 Files Affected:
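As a worked illustration of the mask the fold builds - a minimal standalone sketch, not LLVM code, with element counts assumed to mirror the ymm vpblendd patterns in the tests below (OpVT = v8i32, SubVecVT = v4i32, IdxVal = 4):

#include <cstdio>
#include <vector>

int main() {
  // Assumed example values: 8-element base vector, 4-element subvector
  // inserted at index 4.
  const int VecNumElts = 8, SubVecNumElts = 4, IdxVal = 4;
  std::vector<int> Mask(VecNumElts);
  // First create an identity shuffle mask: element i comes from the base.
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i;
  // Now blend the broadcast: indices >= VecNumElts select from the second
  // shuffle operand (the broadcast) at the same in-lane positions.
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + IdxVal + VecNumElts;
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 1 2 3 12 13 14 15
  std::printf("\n");
}

That mask is exactly the "ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]" vpblendd pattern that replaces vinserti128 throughout the test diffs below.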
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b128a6dadbbb6..0828fc008c59a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58794,6 +58794,8 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
uint64_t IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
if (Vec.isUndef() && SubVec.isUndef())
return DAG.getUNDEF(OpVT);
@@ -58853,10 +58855,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 ||
!(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
+ SDValue ExtSrc = SubVec.getOperand(0);
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
- int VecNumElts = OpVT.getVectorNumElements();
- int SubVecNumElts = SubVecVT.getVectorNumElements();
SmallVector<int, 64> Mask(VecNumElts);
// First create an identity shuffle mask.
for (int i = 0; i != VecNumElts; ++i)
@@ -58864,8 +58865,24 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
// Now insert the extracted portion.
for (int i = 0; i != SubVecNumElts; ++i)
Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
-
- return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
+ }
+ // If we're broadcasting, see if we can use a blend instead of
+ // extract/insert pair. For subvector broadcasts, we must ensure that the
+ // subvector is aligned with the insertion/extractions.
+ if (ExtSrc.getOpcode() == X86ISD::VBROADCAST ||
+ ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+ (ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 &&
+ cast<MemIntrinsicSDNode>(ExtSrc)->getMemoryVT() == SubVecVT)) {
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now blend the broadcast.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + IdxVal + VecNumElts;
+ return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
}
}
@@ -58913,7 +58930,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
// If we're splatting the lower half subvector of a full vector load into the
// upper half, attempt to create a subvector broadcast.
// TODO: Drop hasOneUse checks.
- if (IdxVal == (OpVT.getVectorNumElements() / 2) &&
+ if (IdxVal == (VecNumElts / 2) &&
Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits()) &&
(Vec.hasOneUse() || SubVec.hasOneUse())) {
auto *VecLd = dyn_cast<LoadSDNode>(Vec);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 2dfa515d9f05c..ce62d7a278b1a 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2837,8 +2837,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2855,8 +2854,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -2872,7 +2870,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3100,8 +3098,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3118,8 +3115,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3135,7 +3131,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3868,12 +3864,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3885,12 +3880,11 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 305509ca7fc3f..52f856befa130 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -2239,7 +2239,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2253,7 +2253,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2267,7 +2267,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -2458,7 +2458,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2472,7 +2472,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
@@ -2486,7 +2486,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14]
; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -3095,7 +3095,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
@@ -3107,7 +3107,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index e88a651d29cef..89330122fa239 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -350,7 +350,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6],ymm5[7]
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7]
; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,u,2,u,u,u,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7]
@@ -2837,8 +2837,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
Why did we insert the value we never used?
I think it's a hasOneUse issue - let me take a look.
Yes it was multiple uses (2 EXTRACT_SUBVECTOR of different widths) - I've created #133130 but this needs a little work.
@@ -2239,7 +2239,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14]
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
Why does broadcast matter here? I assume we can replace vinsert with vpblend in any case.
We have an existing fold to convert INSERT_SUBVECTOR to SHUFFLE_VECTOR - but we don't fold cases where we extract the lowest subvector, as that would either infinite-loop or create cross-lane shuffles. BROADCAST is a special case because we know we can avoid cross-lane shuffles entirely.
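To make the broadcast special case concrete, here is a minimal standalone sketch (assumed values, not LLVM code): every element of a broadcast is identical, so reading the subvector from offset IdxVal instead of ExtIdxVal cannot change the result, and the in-lane blend mask is always safe:

#include <array>
#include <cassert>

int main() {
  std::array<int, 8> Bcast;
  Bcast.fill(42); // every element identical, as with X86ISD::VBROADCAST
  // Assumed example values: extract at offset 0, insert at offset 4.
  const int SubVecNumElts = 4, ExtIdxVal = 0, IdxVal = 4;
  // The generic fold would read elements ExtIdxVal..ExtIdxVal+3 into
  // positions IdxVal..IdxVal+3 (cross-lane); with a broadcast, reading
  // elements IdxVal..IdxVal+3 instead (in-lane) gives identical values.
  for (int i = 0; i != SubVecNumElts; ++i)
    assert(Bcast[i + ExtIdxVal] == Bcast[i + IdxVal]);
}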
I was thinking of some peephole optimization, but I didn't find an existing one. X86FixupInstTuning might be the most relevant.
LGTM.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/7703 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/51/builds/13341 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/64/builds/2635 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/66/builds/11786 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/10220 Here is the relevant piece of the build log for reference:
This change is causing some of our internal tests to time out after 5 minutes (they were completing in 3 minutes before).
This reverts commit 6c21716. It touches a file in common with llvm#133083, which is causing failures.
…tor(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast)" (#133340) Reverts llvm/llvm-project#133083. This causes BuildBot failures and timeouts in some of our internal tests (3 min => over 5 min).
@googlewalt Please can you send me a repro asap - cheers
Haven't figured out how to get the IR repro yet, but it looks like this is non-deterministically hitting an infinite loop. Sample thread dumps from three separate cases:
If you have the full IR, are you able to use bugpoint with a large timeout value (and a bit of patience!)?
The timeout (probably infinite loop) happens when a tool lowers to MCJIT machine code with this snippet:
llvm::MCContext* mc_context;
llvm::legacy::PassManager codegen_passes;
target_machine->addPassesToEmitMC(codegen_passes, mc_context, ostream);
codegen_passes.run(module); // Timeout/infinite loop happens here
I dumped the IR before that snippet, and it looks something like:
; ModuleID = '/tmp/repro.ll'
source_filename = "__repro"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) uwtable
define noalias noundef ptr @widget(ptr readonly captures(none) %arg) local_unnamed_addr #0 {
bb:
%getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 24
%load = load ptr, ptr %getelementptr, align 8
%load1 = load ptr, ptr %load, align 8, !invariant.load !2, !dereferenceable !3, !align !4
%getelementptr2 = getelementptr i8, ptr %load, i64 16
%load3 = load ptr, ptr %getelementptr2, align 8, !invariant.load !2, !dereferenceable !5, !align !4
%load4 = load i64, ptr %load1, align 32, !invariant.load !2, !noalias !6
%add = add i64 %load4, 1
%insertelement = insertelement <16 x i64> poison, i64 %load4, i64 0
%insertelement5 = insertelement <16 x i64> %insertelement, i64 %add, i64 1
%shufflevector = shufflevector <16 x i64> %insertelement5, <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%icmp = icmp ugt <16 x i64> %shufflevector, <i64 9223372036854775806, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>
%icmp6 = icmp slt <16 x i64> %shufflevector, <i64 9223372036854775806, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>
%shufflevector7 = shufflevector <16 x i1> %icmp, <16 x i1> %icmp6, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%zext = zext <16 x i1> %shufflevector7 to <16 x i8>
store <16 x i8> %zext, ptr %load3, align 32, !alias.scope !6
%getelementptr8 = getelementptr inbounds nuw i8, ptr %load3, i64 16
%insertelement9 = insertelement <8 x i64> poison, i64 %add, i64 0
%shufflevector10 = shufflevector <8 x i64> %insertelement9, <8 x i64> poison, <8 x i32> zeroinitializer
%icmp11 = icmp slt <8 x i64> %shufflevector10, <i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24>
%zext12 = zext <8 x i1> %icmp11 to <8 x i8>
store <8 x i8> %zext12, ptr %getelementptr8, align 16, !alias.scope !6
%icmp13 = icmp slt i64 %add, 25
%zext14 = zext i1 %icmp13 to i8
%getelementptr15 = getelementptr inbounds nuw i8, ptr %load3, i64 24
store i8 %zext14, ptr %getelementptr15, align 8, !alias.scope !6
ret ptr null
}
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) uwtable "frame-pointer"="all" "prefer-vector-width"="256" }
!llvm.module.flags = !{!0, !1}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 1, !"xx", i64 18}
!2 = !{}
!3 = !{i64 8}
!4 = !{i64 32}
!5 = !{i64 25}
!6 = !{!7}
!7 = !{!"result slice: {index:1, offset:0, size:25}", !8}
!8 = !{!"xx"} When running the original test, it fails ~half the time. (A single test case takes 2 seconds when it passes, but otherwise gets killed by the test runner at 60s). However, I can't get standard LLVM tool (opt, lli, ...) to fail when directly processing this IR. I'm not sure what the difference is yet. |
Reduces diff for an updated version of llvm#133083
@rupprecht @googlewalt I think I have a repro now - just working on reduction
) Reduces diff for an updated version of #133083
define void @widget() #0 {
%load4 = load i64, ptr poison, align 32
%add = add i64 %load4, 1
%insertelement5 = insertelement <16 x i64> zeroinitializer, i64 %add, i64 1
%shufflevector = shufflevector <16 x i64> %insertelement5, <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%icmp6 = icmp slt <16 x i64> %shufflevector, <i64 9223372036854775806, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16>
%shufflevector7 = shufflevector <16 x i1> poison, <16 x i1> %icmp6, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%zext = zext <16 x i1> %shufflevector7 to <16 x i8>
store <16 x i8> %zext, ptr poison, align 32
ret void
}
attributes #0 = { "target-cpu"="skx" }
…oadcast)) pattern identified in #133083 Infinite loop check
…bvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion, which usually has lower latency (and sometimes a lower uop count) and reduces changes in vector sizes that can interfere with further combines. Updated version of llvm#133083, which led to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern; this variant creates the BLENDI/SHUF128 nodes directly.
…#133705) Reduces diff for an updated version of llvm#133083
…oadcast)) pattern identified in llvm#133083 Infinite loop check
…bvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) (#133724) If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion, which usually has lower latency (and sometimes a lower uop count) and reduces changes in vector sizes that can interfere with further combines. Updated version of #133083, which led to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern; this variant creates the BLENDI/SHUF128 nodes directly.
…bvector(broadcast)) -> blend shuffle(base,broadcast) (REAPPLIED) (llvm#133724) If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion, which usually has lower latency (and sometimes a lower uop count) and reduces changes in vector sizes that can interfere with further combines. Updated version of llvm#133083, which led to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern; this variant creates the BLENDI/SHUF128 nodes directly.