[X86] lowerShuffleAsVTRUNC - use combineConcatVectorOps to catch more "cheap" concats #145876
Merged
Conversation
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
Patch is 551.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145876.diff
3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c26dd6e2dc2f..307a237e2955c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10312,6 +10312,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable,
return true;
}
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+ ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned Depth = 0);
+
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
@@ -10692,7 +10697,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unexpected VTRUNC type");
- if (!Subtarget.hasAVX512())
+ if (!Subtarget.hasAVX512() ||
+ (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
return SDValue();
unsigned NumElts = VT.getVectorNumElements();
@@ -10721,30 +10727,19 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
bool UndefUppers =
UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
+ // As we're using both sources then we need to concat them together
+ // and truncate from the double-sized src.
+ MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
+
// For offset truncations, ensure that the concat is cheap.
- if (Offset) {
- auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
- if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
- return Lo.getOperand(0) == Hi.getOperand(0);
- if (ISD::isNormalLoad(Lo.getNode()) &&
- ISD::isNormalLoad(Hi.getNode())) {
- auto *LDLo = cast<LoadSDNode>(Lo);
- auto *LDHi = cast<LoadSDNode>(Hi);
- return DAG.areNonVolatileConsecutiveLoads(
- LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
- }
- return false;
- };
- if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
+ SDValue Src =
+ combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
+ if (!Src) {
+ if (Offset)
continue;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
}
- // As we're using both sources then we need to concat them together
- // and truncate from the double-sized src.
- MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
- SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
-
MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
Src = DAG.getBitcast(SrcVT, Src);
@@ -42183,11 +42178,6 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
return SDValue();
}
-static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
- ArrayRef<SDValue> Ops, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- unsigned Depth = 0);
-
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
SelectionDAG &DAG,
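For readers skimming the raw diff, here is a minimal restatement of the new control flow inside lowerShuffleAsVTRUNC, copied from the hunk above with explanatory comments added; it is a sketch for orientation, not a standalone implementation. Note also that the 256-bit path is now additionally gated on Subtarget.useAVX512Regs():

  // The shuffle draws elements from both sources, so build a double-width
  // concat and truncate from it.
  MVT ConcatVT = VT.getDoubleNumVectorElementsVT();

  // First ask combineConcatVectorOps whether {V1, V2} folds into a "cheap"
  // concat (for example adjacent subvectors of one wide vector, or
  // consecutive loads). This call replaces the hand-rolled IsCheapConcat
  // lambda that the patch removes.
  SDValue Src =
      combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
  if (!Src) {
    // Offset truncations are only worthwhile when the concat is free, so
    // give up on this candidate and try the next source-element width.
    if (Offset)
      continue;
    // Otherwise fall back to an explicit CONCAT_VECTORS node.
    Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
  }

The forward declaration of combineConcatVectorOps is hoisted earlier in the file so that lowerShuffleAsVTRUNC can call it; the later declaration is deleted.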
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index ae4f85ce42a19..01aacc1e06258 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1297,68 +1297,59 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride4_vf16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512-NEXT: vpmovqw %ymm1, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512-NEXT: vpmovqw %zmm4, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpsrlq $16, %ymm1, %ymm7
+; AVX512-NEXT: vpmovqw %ymm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512-NEXT: vpmovqw %zmm3, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm4, %zmm6
+; AVX512-NEXT: vpmovqw %zmm6, %xmm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm6
+; AVX512-NEXT: vpmovqw %ymm6, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT: vpsrlq $32, %zmm4, %zmm7
+; AVX512-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %ymm1, %ymm1
+; AVX512-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm3, %zmm5
-; AVX512-NEXT: vpmovqw %zmm5, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm8
-; AVX512-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpsrlq $48, %zmm3, %zmm3
-; AVX512-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpsrlq $48, %zmm4, %zmm2
+; AVX512-NEXT: vpmovqw %zmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm2, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm6, (%rcx)
; AVX512-NEXT: vmovdqa %ymm1, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1411,68 +1402,59 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-LABEL: load_i16_stride4_vf16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-NEXT: vpmovqw %zmm4, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %ymm1, %ymm7
+; AVX512DQ-NEXT: vpmovqw %ymm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm4, %zmm6
+; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm6
+; AVX512DQ-NEXT: vpmovqw %ymm6, %xmm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT: vpsrlq $32, %zmm4, %zmm7
+; AVX512DQ-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm3, %zmm5
-; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm3, %zmm8
-; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %zmm4, %zmm2
+; AVX512DQ-NEXT: vpmovqw %zmm2, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm2, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm5, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm6, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2713,128 +2695,110 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512-NEXT: vpmovqw %ymm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa 240(%rdi), %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpmovqw %ymm2, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpmovqw %zmm1, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa 240(%rdi), %xmm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpmovqw %zmm1, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512-NEXT: vpmovqw %ymm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-NEXT: vpmovqw %zmm0, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm14
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3...
[truncated]
anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request on Jun 26, 2025: … "cheap" concats (llvm#145876)
rlavaee pushed a commit to rlavaee/llvm-project that referenced this pull request on Jul 1, 2025: … "cheap" concats (llvm#145876)