[AArch64] Override canCombineStoreAndExtract #145825

Open
AZero13 wants to merge 2 commits into main

Conversation

AZero13 (Contributor) commented Jun 26, 2025

ST1 lane stores extract a vector lane and store it in a single instruction. We can use them for any vector that fits in a 64-bit or 128-bit register (D and Q regs).

AZero13 added 2 commits June 25, 2025 21:13
ST1 instructions extract and store in one instruction. We can map them to 64-bit and 128-bit registers (V and Q regs).
llvmbot (Member) commented Jun 26, 2025

@llvm/pr-subscribers-backend-aarch64

Author: AZero13 (AZero13)

Changes

ST1 lane stores extract a vector lane and store it in a single instruction. We can use them for any vector that fits in a 64-bit or 128-bit register (D and Q regs).
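As a concrete sketch of what this enables (essentially the first function in the new test file below, renamed for illustration), CodeGenPrepare keeps the or on the vector side so the final store can be emitted as a single ST1 lane store; the commented assembly is taken from the test's checks, rewritten here in generic AArch64 syntax:

; Expected codegen with this change (the test below uses the equivalent
; Apple-style "orr.2s v0, #1" / "st1.s { v0 }[1], [x1]" spellings):
;   ldr  d0, [x0]
;   orr  v0.2s, #1
;   st1  { v0.s }[1], [x1]
;   ret
define void @st1_lane_store_example(<2 x i32>* %addr, i32* %dest) {
  %vec  = load <2 x i32>, <2 x i32>* %addr, align 8  ; the whole vector lives in a D register
  %lane = extractelement <2 x i32> %vec, i32 1       ; constant lane index, as the hook requires
  %res  = or i32 %lane, 1                            ; promoted to a vector orr by CodeGenPrepare
  store i32 %res, i32* %dest, align 4                ; lowered to an ST1 lane store
  ret void
}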


Full diff: https://github.com/llvm/llvm-project/pull/145825.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+32)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+3)
  • (added) llvm/test/CodeGen/AArch64/vector-promotion.ll (+434)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 13835747c91e5..85ad7b59236c7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28261,6 +28261,38 @@ Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
   return TargetLowering::getSSPStackGuardCheck(M);
 }
 
+bool AArch64TargetLowering::canCombineStoreAndExtract(Type *VectorTy,
+                                                      Value *Idx,
+                                                      unsigned &Cost) const {
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store+extract of a vector type, it is
+  // better to leave this as a float, as we have more freedom in the
+  // addressing modes for floating-point values.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+
+  // Reject scalable vectors - ST1 lane indexing only works with fixed-size NEON
+  // vectors
+  if (cast<VectorType>(VectorTy)->isScalableTy())
+    return false;
+
+  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
+  // We can do a store + vector extract on any vector that fits perfectly in a V
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
+}
+
 Value *
 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 89f90ee2b7707..8f7e9c53a5e5e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -354,6 +354,9 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
+  bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                 unsigned &Cost) const override;
+
   bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                         const MachineFunction &MF) const override;
 
diff --git a/llvm/test/CodeGen/AArch64/vector-promotion.ll b/llvm/test/CodeGen/AArch64/vector-promotion.ll
new file mode 100644
index 0000000000000..6e5ac8c2abf52
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-promotion.ll
@@ -0,0 +1,434 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    orr.2s v0, #1
+; ASM-NEXT:    st1.s { v0 }[1], [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+;
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov w8, s0
+; ASM-NEXT:    cmp w8, w1
+; ASM-NEXT:    cset w8, eq
+; ASM-NEXT:    strb w8, [x2]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, i1* %dest, align 4
+  ret void
+}
+
+
+; BB2
+;
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM:       ; %bb.0: ; %bb1
+; ASM-NEXT:    tbz w2, #0, LBB2_2
+; ASM-NEXT:  ; %bb.1: ; %bb2
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov w8, s0
+; ASM-NEXT:    orr w8, w8, #0x1
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:  LBB2_2: ; %end
+; ASM-NEXT:    ret
+bb1:
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  br i1 %bool, label %bb2, label %end
+bb2:
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  br label %end
+end:
+  ret void
+}
+
+;
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    orr.2s v0, #1
+; ASM-NEXT:    str s0, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  %out4 = or i32 %out3, 1
+  %out5 = or i32 %out4, 1
+  %out6 = or i32 %out5, 1
+  %out7 = or i32 %out6, 1
+  store i32 %out7, i32* %dest, align 4
+  ret void
+}
+
+;
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: unsupportedMultiUses:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    orr w0, w8, #0x1
+; ASM-NEXT:    str w0, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret i32 %out
+}
+
+; Check that we promote using a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; Scalar version:
+; Vector version:
+;
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: udivCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #18725 ; =0x4925
+; ASM-NEXT:    movk w9, #9362, lsl #16
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    umull x9, w8, w9
+; ASM-NEXT:    lsr x9, x9, #32
+; ASM-NEXT:    sub w8, w8, w9
+; ASM-NEXT:    add w8, w9, w8, lsr #1
+; ASM-NEXT:    lsr w8, w8, #2
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: uremCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #18725 ; =0x4925
+; ASM-NEXT:    movk w9, #9362, lsl #16
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    umull x9, w8, w9
+; ASM-NEXT:    lsr x9, x9, #32
+; ASM-NEXT:    sub w10, w8, w9
+; ASM-NEXT:    add w9, w9, w10, lsr #1
+; ASM-NEXT:    lsr w9, w9, #2
+; ASM-NEXT:    sub w9, w9, w9, lsl #3
+; ASM-NEXT:    add w8, w8, w9
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = urem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sdivCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #9363 ; =0x2493
+; ASM-NEXT:    movk w9, #37449, lsl #16
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    smull x9, w8, w9
+; ASM-NEXT:    lsr x9, x9, #32
+; ASM-NEXT:    add w8, w9, w8
+; ASM-NEXT:    asr w9, w8, #2
+; ASM-NEXT:    add w8, w9, w8, lsr #31
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = sdiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sremCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #9363 ; =0x2493
+; ASM-NEXT:    movk w9, #37449, lsl #16
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    smull x9, w8, w9
+; ASM-NEXT:    lsr x9, x9, #32
+; ASM-NEXT:    add w9, w9, w8
+; ASM-NEXT:    asr w10, w9, #2
+; ASM-NEXT:    add w9, w10, w9, lsr #31
+; ASM-NEXT:    sub w9, w9, w9, lsl #3
+; ASM-NEXT:    add w8, w8, w9
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fdivCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov s1, #7.00000000
+; ASM-NEXT:    mov s0, v0[1]
+; ASM-NEXT:    fdiv s0, s0, s1
+; ASM-NEXT:    str s0, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fdiv float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fremCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT:    .cfi_def_cfa_offset 32
+; ASM-NEXT:    .cfi_offset w30, -8
+; ASM-NEXT:    .cfi_offset w29, -16
+; ASM-NEXT:    .cfi_offset w19, -24
+; ASM-NEXT:    .cfi_offset w20, -32
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov s1, #7.00000000
+; ASM-NEXT:    mov x19, x1
+; ASM-NEXT:    mov s0, v0[1]
+; ASM-NEXT:    bl _fmodf
+; ASM-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT:    str s0, [x19]
+; ASM-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT:    ret
+  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefDivCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #7 ; =0x7
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    udiv w8, w9, w8
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefRemCase:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov w9, #7 ; =0x7
+; ASM-NEXT:    mov.s w8, v0[1]
+; ASM-NEXT:    sdiv w10, w9, w8
+; ASM-NEXT:    msub w8, w10, w8, w9
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefConstantFRemCaseWithFastMath:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT:    .cfi_def_cfa_offset 32
+; ASM-NEXT:    .cfi_offset w30, -8
+; ASM-NEXT:    .cfi_offset w29, -16
+; ASM-NEXT:    .cfi_offset w19, -24
+; ASM-NEXT:    .cfi_offset w20, -32
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov s1, #7.00000000
+; ASM-NEXT:    mov x19, x1
+; ASM-NEXT:    mov s0, v0[1]
+; ASM-NEXT:    bl _fmodf
+; ASM-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT:    str s0, [x19]
+; ASM-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT:    ret
+  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefVectorFRemCaseWithFastMath:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT:    .cfi_def_cfa_offset 32
+; ASM-NEXT:    .cfi_offset w30, -8
+; ASM-NEXT:    .cfi_offset w29, -16
+; ASM-NEXT:    .cfi_offset w19, -24
+; ASM-NEXT:    .cfi_offset w20, -32
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    mov x19, x1
+; ASM-NEXT:    mov s1, v0[1]
+; ASM-NEXT:    fmov s0, #7.00000000
+; ASM-NEXT:    bl _fmodf
+; ASM-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT:    str s0, [x19]
+; ASM-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT:    ret
+  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float 7.0, %extract
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we are able to promote a floating point value.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotionFloat:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr d0, [x0]
+; ASM-NEXT:    fmov s1, #1.00000000
+; ASM-NEXT:    mov s0, v0[1]
+; ASM-NEXT:    fadd s0, s0, s1
+; ASM-NEXT:    str s0, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fadd float %extract, 1.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS mode, as variable indices are expensive
+; to lower.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+; ASM-LABEL: simpleOneInstructionPromotionVariableIdx:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ; kill: def $w2 killed $w2 def $x2
+; ASM-NEXT:    and x8, x2, #0x1
+; ASM-NEXT:    ldr w8, [x0, x8, lsl #2]
+; ASM-NEXT:    orr w8, w8, #0x1
+; ASM-NEXT:    str w8, [x1]
+; ASM-NEXT:    ret
+  %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, although the actual assembly would be better if we
+; promoted it.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion8x8:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    movi.8b v0, #1
+; ASM-NEXT:    ldr d1, [x0]
+; ASM-NEXT:    orr.8b v0, v1, v0
+; ASM-NEXT:    st1.b { v0 }[1], [x1]
+; ASM-NEXT:    ret
+  %in1 = load <8 x i8>, <8 x i8>* %addr1, align 8
+  %extract = extractelement <8 x i8> %in1, i32 1
+  %out = or i8 %extract, 1
+  store i8 %out, i8* %dest, align 4
+  ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
+define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion4x32:
+; ASM:       ; %bb.0:
+; ASM-NEXT:    ldr q0, [x0]
+; ASM-NEXT:    orr.4s v0, #1
+; ASM-NEXT:    st1.s { v0 }[1], [x1]
+; ASM-NEXT:    ret
+  %in1 = load <4 x i32>, <4 x i32>* %addr1, align 8
+  %extract = extractelement <4 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 1
+  ret void
+}

AZero13 (Contributor, Author) commented Jun 26, 2025

@davemgreen


⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AArch64/vector-promotion.ll llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.h

The following files introduce new uses of undef:

  • llvm/test/CodeGen/AArch64/vector-promotion.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

Comment on lines +424 to +428
; ASM: ; %bb.0:
; ASM-NEXT: ldr q0, [x0]
; ASM-NEXT: orr.4s v0, #1
; ASM-NEXT: st1.s { v0 }[1], [x1]
; ASM-NEXT: ret
Member

Currently, this generates:

        ldr     w8, [x0, #4]
        orr     w8, w8, #0x1
        str     w8, [x1]
        ret

Which seems like it could be preferred over vector operations. Similar for simpleOneInstructionPromotion8x8.

; Check that we optimized the sequence correctly when it can be
; lowered on a Q register.
;
; Make sure we got rid of any expensive vmov.32 instructions.
MacDue (Member) commented Jun 30, 2025

These comments are referring to ARM not AArch64 instructions.

Comment on lines +1 to +3
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
Member

The opt RUN lines are never checked (the test has no IR-BOTH/IR-NORMAL/IR-STRESS check lines).

ret void
}

;
MacDue (Member) commented Jun 30, 2025

It seems like quite a few of these test cases don't change with this patch.


3 participants