[AArch64][SVE] Fold integer lane extract and store to FPR store #129756
base: main
Conversation
This helps avoid some pointless fmovs to GPRs, which may be slow in streaming mode.
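For illustration, the test updates below show the kind of change this enables when lane 0 of a scalable integer vector is stored. In streaming(-compatible) mode the value previously bounced through a GPR:

  fmov w8, s0
  str  w8, [x0]

With the new patterns it is stored directly from the FP/SIMD register:

  str  s0, [x0]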
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes: This helps avoid some pointless fmovs to GPRs, which may be slow in streaming mode.
Patch is 21.52 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/129756.diff
10 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c836f3138a45f..3bfc1a922357a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4581,8 +4581,6 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-} // AddedComplexity = 10
-
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4591,6 +4589,8 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+} // AddedComplexity = 10
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4365e573d8b16..c5a246296ae0b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1988,6 +1988,38 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
+ // i64 -> i32
+ defm : SVEVecStoreLane0Pat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
+ // i64 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
+ // i16 -> i16 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
+
+ // Lane 0 stores
+ defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+ }
+
// Insert subvectors into FP SVE vectors.
foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
new file mode 100644
index 0000000000000..22b136ac194cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; TODO: Improve codegen for non-zero extract indices.
+
+define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: str w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: str x8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 1
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 0
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 3
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane0_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 0
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 1
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane0_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 0
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.b[7]
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strb w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 7
+ store i8 %0, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.h[3]
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strh w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 3
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 0
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ store i32 %trunc, ptr %ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ store i64 %reduce, ptr %ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ store i16 %trunc, ptr %ptr, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i32 %trunc, ptr %out_ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: stur x8, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i64 %reduce, ptr %out_ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur h0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i16 %trunc, ptr %out_ptr, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 668dc18df6a0b..89f790210e193 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -332,15 +332,14 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: add z2.d, z5.d, z2.d
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d
+; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: faddv d0, p0, z3.d
-; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: faddv d1, p0, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str s2, [x4]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1
-; CHECK-NEXT: str w8, [x4]
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 6644be11a02ba..ffef6f74f2d36 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,7 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 7d6336a43a4fd..9e1d342663f0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
; CHECK-NEXT: sunpklo z1.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov z1.d, z1.d[1]
+; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: fmov x10, d2
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov z0.d, z0.d[1]
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x9, x10, [x8]
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x10, x10, [x8, #16]
-; CHECK-NEXT: stp x11, x12, [x8, #64]
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x12, x12, [x8, #80]
-; CHECK-NEXT: stp x10, x10, [x8, #48]
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x9, x10, [x8, #32]
-; CHECK-NEXT: stp x12, x12, [x8, #112]
-; CHECK-NEXT: stp x11, x12, [x8, #96]
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: stp x9, x9, [x8, #8]
+; CHECK-NEXT: asr x11, x11, #63
+; CHECK-NEXT: stp x9, x10, [x8, #24]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: stp x11, x11, [x8, #72]
+; CHECK-NEXT: stp x9, x9, [x8, #48]
+; CHECK-NEXT: str x9, [x8, #40]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: stp x11, x10, [x8, #88]
+; CHECK-NEXT: stp x9, x9, [x8, #112]
+; CHECK-NEXT: str x9, [x8, #104]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 613543310f2c3..aa1adfd306a4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -75,8 +75,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: strb w8, [x19, #2]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [x19]
+; CHECK-NEXT: str h1, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -120,14 +119,12 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldp q0, q2, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: strb w8, [x19, #8]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x19]
+; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index c8cea6ebabd48..434e24bf48724 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -121,9 +121,8 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_3: // %cond.store
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: ...
[truncated]
I feel like I've said this before, but why is this not just a DAG combine? Some of the results look preferable regardless of streaming requirements.
I was simply implementing it in a similar manner to the existing Neon patterns. Most of these patterns are enabled for the non-streaming case too. I hadn't enabled the non-zero extract case, since it looks like it might not be worthwhile, but it seems like DUP could have more throughput vs …
In which case, if there's no functional reason not to, then please go the DAG-combine route rather than ISel. From what I can see, you're essentially just converting integer-based extract_elts into floating-point ones when they feed a store?
I've not tried this myself, but I figured that inserting bitcasts might result in the generic DAGCombiner reverting some of those decisions, unless you'd put in some AArch64-specific instruction to stop the DAGCombiner from doing so. I had a think about what would be required and concluded it might not be trivial either, so I was happy with the current patterns.
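For context, a minimal sketch of the DAG-combine approach being discussed might look like the following. This is not what the patch implements (the patch uses ISel patterns), and the helper name and its wiring into AArch64's store-combine hooks are assumptions; it only covers the non-truncating lane-0 case, and, as noted above, the introduced bitcast could be undone by the generic DAGCombiner unless that is prevented.

  // Hypothetical sketch, not part of this patch: rewrite
  // (store (extract_vector_elt nxvNiM:$vec, 0), ptr) so the element is
  // produced as a floating-point value of the same width, letting ISel
  // store it directly from an FPR instead of via fmov + str.
  static SDValue combineStoreOfLane0Extract(StoreSDNode *ST, SelectionDAG &DAG) {
    SDValue Val = ST->getValue();
    if (ST->isTruncatingStore() || Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isNullConstant(Val.getOperand(1)))
      return SDValue();

    EVT VecVT = Val.getOperand(0).getValueType();
    unsigned EltBits = VecVT.getScalarSizeInBits();
    if (!VecVT.isScalableVector() || !VecVT.getVectorElementType().isInteger() ||
        (EltBits != 16 && EltBits != 32 && EltBits != 64))
      return SDValue();

    // Reinterpret e.g. nxv4i32 as nxv4f32 and extract lane 0 as an f32.
    EVT FPEltVT = EVT::getFloatingPointVT(EltBits);
    EVT FPVecVT = VecVT.changeVectorElementType(FPEltVT);
    SDLoc DL(ST);
    SDValue FPVec = DAG.getBitcast(FPVecVT, Val.getOperand(0));
    SDValue FPElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPEltVT, FPVec,
                                Val.getOperand(1));
    return DAG.getStore(ST->getChain(), DL, FPElt, ST->getBasePtr(),
                        ST->getMemOperand());
  }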
// TODO: Consider allowing Neon (a lot of churn, not necessarily better).
if (!VectorVT.isScalableVector())
I think this could be enabled for Neon too, but it causes a huge amount of churn, and more code in some cases. I've not looked into it in detail yet -- so for now it's disabled.