[X86] lowerShuffleAsVTRUNC - use combineConcatVectorOps to catch more "cheap" concats #145876

Merged: 1 commit merged into llvm:main from the x86-vtrunc-concat branch on Jun 26, 2025

Conversation

RKSimon (Collaborator) commented Jun 26, 2025

No description provided.

llvmbot (Member) commented Jun 26, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Patch is 551.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145876.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+16-26)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll (+650-812)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll (+3021-3427)
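
In short, the change in lowerShuffleAsVTRUNC replaces a hand-rolled "cheap concat" check with a call to combineConcatVectorOps. Below is a sketch using the names from the diff; the helper name getTruncConcatSrc is hypothetical, introduced here only to package the diff's logic, so treat this as an illustration of the approach rather than the verbatim upstream code.

// Old behaviour (removed by this patch): when both shuffle sources were
// needed and the truncation started at a non-zero element offset, only two
// hand-rolled "cheap concat" patterns were accepted before giving up.
static bool isCheapConcatOld(SDValue Lo, SDValue Hi, SelectionDAG &DAG) {
  // (a) Both halves are extracted from the same wider vector.
  if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
    return Lo.getOperand(0) == Hi.getOperand(0);
  // (b) Both halves are consecutive, non-volatile loads.
  if (ISD::isNormalLoad(Lo.getNode()) && ISD::isNormalLoad(Hi.getNode()))
    return DAG.areNonVolatileConsecutiveLoads(
        cast<LoadSDNode>(Hi), cast<LoadSDNode>(Lo),
        Lo.getValueType().getStoreSize(), 1);
  return false;
}

// New behaviour: defer to combineConcatVectorOps (now forward-declared near
// the top of the file), which recognises a wider set of foldable concat
// patterns. If it fails, offset truncations give up, and only the offset-free
// case falls back to a plain CONCAT_VECTORS node.
static SDValue getTruncConcatSrc(const SDLoc &DL, MVT VT, SDValue V1,
                                 SDValue V2, unsigned Offset,
                                 SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
  if (SDValue Src =
          combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget))
    return Src;
  if (Offset)
    return SDValue(); // only worthwhile when the concat is cheap
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
}

The test churn reflects this: in the visible hunks of the two interleaved-load tests, the AVX512/AVX512DQ paths now lower through VPSRLQ + VPMOVQW on the concatenated 256-bit source instead of the longer PSHUFD/PSHUFLW/PUNPCKLDQ sequences they used before.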
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c26dd6e2dc2f..307a237e2955c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10312,6 +10312,11 @@ static bool isNonZeroElementsInOrder(const APInt &Zeroable,
   return true;
 }
 
+static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
+                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget,
+                                      unsigned Depth = 0);
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                       ArrayRef<int> Mask, SDValue V1,
@@ -10692,7 +10697,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
                                     SelectionDAG &DAG) {
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unexpected VTRUNC type");
-  if (!Subtarget.hasAVX512())
+  if (!Subtarget.hasAVX512() ||
+      (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
     return SDValue();
 
   unsigned NumElts = VT.getVectorNumElements();
@@ -10721,30 +10727,19 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
       bool UndefUppers =
           UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
 
+      // As we're using both sources then we need to concat them together
+      // and truncate from the double-sized src.
+      MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
+
       // For offset truncations, ensure that the concat is cheap.
-      if (Offset) {
-        auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
-          if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-              Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
-            return Lo.getOperand(0) == Hi.getOperand(0);
-          if (ISD::isNormalLoad(Lo.getNode()) &&
-              ISD::isNormalLoad(Hi.getNode())) {
-            auto *LDLo = cast<LoadSDNode>(Lo);
-            auto *LDHi = cast<LoadSDNode>(Hi);
-            return DAG.areNonVolatileConsecutiveLoads(
-                LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
-          }
-          return false;
-        };
-        if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
+      SDValue Src =
+          combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
+      if (!Src) {
+        if (Offset)
           continue;
+        Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
       }
 
-      // As we're using both sources then we need to concat them together
-      // and truncate from the double-sized src.
-      MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
-      SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
-
       MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
       MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
       Src = DAG.getBitcast(SrcVT, Src);
@@ -42183,11 +42178,6 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
   return SDValue();
 }
 
-static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
-                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
-                                      const X86Subtarget &Subtarget,
-                                      unsigned Depth = 0);
-
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                     SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index ae4f85ce42a19..01aacc1e06258 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1297,68 +1297,59 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512-LABEL: load_i16_stride4_vf16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512-NEXT:    vpmovqw %ymm1, %xmm0
 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm2
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512-NEXT:    vpmovqw %zmm4, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT:    vpsrlq $16, %ymm1, %ymm7
+; AVX512-NEXT:    vpmovqw %ymm7, %xmm7
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm6
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT:    vpsrlq $16, %zmm4, %zmm6
+; AVX512-NEXT:    vpmovqw %zmm6, %xmm6
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm6
+; AVX512-NEXT:    vpmovqw %ymm6, %xmm6
+; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT:    vpsrlq $32, %zmm4, %zmm7
+; AVX512-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT:    vpsrlq $48, %ymm1, %ymm1
+; AVX512-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT:    vpsrlq $16, %zmm3, %zmm5
-; AVX512-NEXT:    vpmovqw %zmm5, %xmm5
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT:    vpsrlq $32, %zmm3, %zmm8
-; AVX512-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT:    vpsrlq $48, %zmm3, %zmm3
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT:    vpsrlq $48, %zmm4, %zmm2
+; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX512-NEXT:    vmovdqa %ymm2, (%rdx)
-; AVX512-NEXT:    vmovdqa %ymm5, (%rcx)
+; AVX512-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512-NEXT:    vmovdqa %ymm6, (%rcx)
 ; AVX512-NEXT:    vmovdqa %ymm1, (%r8)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1411,68 +1402,59 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ;
 ; AVX512DQ-LABEL: load_i16_stride4_vf16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpmovqw %ymm0, %xmm0
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-NEXT:    vpmovqw %ymm1, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 112(%rdi), %xmm2
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-NEXT:    vpmovqw %zmm4, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %ymm1, %ymm7
+; AVX512DQ-NEXT:    vpmovqw %ymm7, %xmm7
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm3
-; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm6
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm7
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT:    vpsrlq $16, %zmm4, %zmm6
+; AVX512DQ-NEXT:    vpmovqw %zmm6, %xmm6
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $32, %ymm1, %ymm6
+; AVX512DQ-NEXT:    vpmovqw %ymm6, %xmm6
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT:    vpsrlq $32, %zmm4, %zmm7
+; AVX512DQ-NEXT:    vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpmovqw %ymm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
 ; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT:    vpsrlq $16, %zmm3, %zmm5
-; AVX512DQ-NEXT:    vpmovqw %zmm5, %xmm5
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT:    vpsrlq $32, %zmm3, %zmm8
-; AVX512DQ-NEXT:    vpmovqw %zmm8, %xmm8
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT:    vpsrlq $48, %zmm3, %zmm3
-; AVX512DQ-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT:    vpsrlq $48, %zmm4, %zmm2
+; AVX512DQ-NEXT:    vpmovqw %zmm2, %xmm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX512DQ-NEXT:    vmovdqa %ymm2, (%rdx)
-; AVX512DQ-NEXT:    vmovdqa %ymm5, (%rcx)
+; AVX512DQ-NEXT:    vmovdqa %ymm3, (%rdx)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rcx)
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, (%r8)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2713,128 +2695,110 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm1
 ; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm2
-; AVX512-NEXT:    vpmovqw %ymm2, %xmm2
-; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm6
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT:    vpmovqw %ymm2, %xmm3
 ; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm3
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm5
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm7
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT:    vpmovqw %zmm1, %xmm4
+; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7]
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm3
-; AVX512-NEXT:    vpmovqw %ymm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm10
-; AVX512-NEXT:    vmovdqa 112(%rdi), %xmm3
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm4
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm11
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm10
-; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm13
-; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm14
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3...
[truncated]

RKSimon merged commit fe4b403 into llvm:main on Jun 26, 2025
9 checks passed
RKSimon deleted the x86-vtrunc-concat branch June 26, 2025 13:12
anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request Jun 26, 2025
rlavaee pushed a commit to rlavaee/llvm-project that referenced this pull request Jul 1, 2025