[AArch64][SVE] Fold integer lane extract and store to FPR store #129756
base: main
Conversation
This helps avoid some pointless fmovs to GPRs, which may be slow in streaming mode.
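For illustration, the test updates below show the kind of change this enables when lane 0 of a scalable integer vector is stored. In streaming(-compatible) mode the value previously bounced through a GPR:

  fmov w8, s0
  str  w8, [x0]

With the new patterns it is stored directly from the FP/SIMD register:

  str  s0, [x0]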
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes: This helps avoid some pointless fmovs to GPRs, which may be slow in streaming mode.
Patch is 21.52 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/129756.diff
10 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c836f3138a45f..3bfc1a922357a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4581,8 +4581,6 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-} // AddedComplexity = 10
-
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4591,6 +4589,8 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+} // AddedComplexity = 10
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 4365e573d8b16..c5a246296ae0b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1988,6 +1988,38 @@ let Predicates = [HasSVE_or_SME] in {
def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))),
(UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>;
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ multiclass SVEVecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv4i32, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv4i32, i32, f16, hsub, simm9, STURHi>;
+ // i64 -> i32
+ defm : SVEVecStoreLane0Pat<am_indexed32, truncstorei32, nxv2i64, i64, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled32, truncstorei32, nxv2i64, i64, f32, ssub, simm9, STURSi>;
+ // i64 -> i16
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv2i64, i64, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv2i64, i64, f16, hsub, simm9, STURHi>;
+ // i16 -> i16 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLane0Pat<am_indexed16, truncstorei16, nxv8i16, i32, f16, hsub, uimm12s4, STRHui>;
+ defm : SVEVecStoreLane0Pat<am_unscaled16, truncstorei16, nxv8i16, i32, f16, hsub, simm9, STURHi>;
+
+ // Lane 0 stores
+ defm : SVEVecStoreLane0Pat<am_indexed32, store, nxv4i32, i32, f32, ssub, uimm12s4, STRSui>;
+ defm : SVEVecStoreLane0Pat<am_indexed64, store, nxv2i64, i64, f64, dsub, uimm12s4, STRDui>;
+ }
+
// Insert subvectors into FP SVE vectors.
foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in
foreach idx = [0, 2] in
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
new file mode 100644
index 0000000000000..22b136ac194cc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; TODO: Improve codegen for non-zero extract indices.
+
+define void @test_str_lane_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, v0.s[3]
+; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: str w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_s32(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_lane0_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ store i32 %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, v0.d[1]
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: str x8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 1
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_s64(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_lane0_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 0
+ store i64 %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 3
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane0_f32(ptr %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: test_str_lane0_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f32:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x float> %b, i32 0
+ store float %0, ptr %a, align 4
+ ret void
+}
+
+define void @test_str_lane_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[1]
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1]
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 1
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane0_f64(ptr %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: test_str_lane0_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_f64:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x double> %b, i32 0
+ store double %0, ptr %a, align 8
+ ret void
+}
+
+define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.b[7]
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strb w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 16 x i8> %b, i32 7
+ store i8 %0, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_lane_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umov w8, v0.h[3]
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3]
+; STREAMING-COMPAT-NEXT: fmov w8, s0
+; STREAMING-COMPAT-NEXT: strh w8, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 3
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_lane0_s16(ptr %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_str_lane0_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s16:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 8 x i16> %b, i32 0
+ store i16 %0, ptr %a, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str s0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ store i32 %trunc, ptr %ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str d0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ store i64 %reduce, ptr %ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str h0, [x0]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ store i16 %trunc, ptr %ptr, align 2
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur s0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i32
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i32 %trunc, ptr %out_ptr, align 4
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: stur x8, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: fmov x8, d0
+; STREAMING-COMPAT-NEXT: stur x8, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i64 %reduce, ptr %out_ptr, align 8
+ ret void
+}
+
+define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur h0, [x0, #-32]
+; CHECK-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur h0, [x0, #-32]
+; STREAMING-COMPAT-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i16
+ %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8
+ store i16 %trunc, ptr %out_ptr, align 2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 668dc18df6a0b..89f790210e193 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -332,15 +332,14 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: add z2.d, z5.d, z2.d
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: // %bb.2: // %middle.block
-; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d
; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d
+; CHECK-NEXT: uaddv d2, p0, z2.d
; CHECK-NEXT: faddv d0, p0, z3.d
-; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: faddv d1, p0, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str s2, [x4]
; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1
-; CHECK-NEXT: str w8, [x4]
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 6644be11a02ba..ffef6f74f2d36 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -95,8 +95,7 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: mov z1.s, z0.s[1]
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: bitcast_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 7d6336a43a4fd..9e1d342663f0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
; CHECK-NEXT: sunpklo z1.d, z0.s
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.d, z0.s
+; CHECK-NEXT: mov z2.d, z1.d[1]
; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov z1.d, z1.d[1]
+; CHECK-NEXT: str d1, [x8]
+; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: fmov x10, d2
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov z0.d, z0.d[1]
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x9, x10, [x8]
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x10, x10, [x8, #16]
-; CHECK-NEXT: stp x11, x12, [x8, #64]
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: stp x12, x12, [x8, #80]
-; CHECK-NEXT: stp x10, x10, [x8, #48]
-; CHECK-NEXT: asr x12, x11, #63
-; CHECK-NEXT: stp x9, x10, [x8, #32]
-; CHECK-NEXT: stp x12, x12, [x8, #112]
-; CHECK-NEXT: stp x11, x12, [x8, #96]
+; CHECK-NEXT: asr x9, x9, #63
+; CHECK-NEXT: stp x9, x9, [x8, #8]
+; CHECK-NEXT: asr x11, x11, #63
+; CHECK-NEXT: stp x9, x10, [x8, #24]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: fmov x10, d0
+; CHECK-NEXT: stp x11, x11, [x8, #72]
+; CHECK-NEXT: stp x9, x9, [x8, #48]
+; CHECK-NEXT: str x9, [x8, #40]
+; CHECK-NEXT: asr x9, x10, #63
+; CHECK-NEXT: stp x11, x10, [x8, #88]
+; CHECK-NEXT: stp x9, x9, [x8, #112]
+; CHECK-NEXT: str x9, [x8, #104]
; CHECK-NEXT: ret
;
; NONEON-NOSVE-LABEL: load_sext_v4i32i256:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 613543310f2c3..aa1adfd306a4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -75,8 +75,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: strb w8, [x19, #2]
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strh w8, [x19]
+; CHECK-NEXT: str h1, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -120,14 +119,12 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: ldp q0, q2, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w8, s2
; CHECK-NEXT: strb w8, [x19, #8]
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: str x8, [x19]
+; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index c8cea6ebabd48..434e24bf48724 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -121,9 +121,8 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_3: // %cond.store
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: ...
[truncated]
I feel like I've said this before, but why is this not just a DAG combine? Some of the results look preferable regardless of streaming requirements.
I was simply implementing it in a similar manner to the existing Neon patterns. Most of these patterns are enabled for the non-streaming case too. I hadn't enabled the non-zero extract case, since it looks like it might not be worthwhile, but it seems like DUP could have more throughput vs …
In which case, if there's no functional reason not to, then please go the DAG-combine route rather than ISel. From what I can see, you're essentially just converting integer-based extract_elts into floating-point ones when they feed a store?
I've not tried this myself, but I figured that inserting bitcasts might result in the generic DAGCombiner reverting some of those decisions, unless you'd put in some AArch64-specific instruction to stop the DAGCombiner from doing so. I had a think about what would be required and concluded it might not be trivial either, so I was happy with the current patterns.
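For context, a minimal sketch of the DAG-combine approach being discussed might look like the following. This is not what the patch implements (the patch uses ISel patterns), and the helper name and its wiring into AArch64's store-combine hooks are assumptions; it only covers the non-truncating lane-0 case, and, as noted above, the introduced bitcast could be undone by the generic DAGCombiner unless that is prevented.

  // Hypothetical sketch, not part of this patch: rewrite
  // (store (extract_vector_elt nxvNiM:$vec, 0), ptr) so the element is
  // produced as a floating-point value of the same width, letting ISel
  // store it directly from an FPR instead of via fmov + str.
  static SDValue combineStoreOfLane0Extract(StoreSDNode *ST, SelectionDAG &DAG) {
    SDValue Val = ST->getValue();
    if (ST->isTruncatingStore() || Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isNullConstant(Val.getOperand(1)))
      return SDValue();

    EVT VecVT = Val.getOperand(0).getValueType();
    unsigned EltBits = VecVT.getScalarSizeInBits();
    if (!VecVT.isScalableVector() || !VecVT.getVectorElementType().isInteger() ||
        (EltBits != 16 && EltBits != 32 && EltBits != 64))
      return SDValue();

    // Reinterpret e.g. nxv4i32 as nxv4f32 and extract lane 0 as an f32.
    EVT FPEltVT = EVT::getFloatingPointVT(EltBits);
    EVT FPVecVT = VecVT.changeVectorElementType(FPEltVT);
    SDLoc DL(ST);
    SDValue FPVec = DAG.getBitcast(FPVecVT, Val.getOperand(0));
    SDValue FPElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPEltVT, FPVec,
                                Val.getOperand(1));
    return DAG.getStore(ST->getChain(), DL, FPElt, ST->getBasePtr(),
                        ST->getMemOperand());
  }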
// TODO: Consider allowing Neon (a lot of churn, not necessarily better).
if (!VectorVT.isScalableVector())
I think this could be enabled for Neon too, but it causes a huge amount of churn, and more code in some cases. I've not looked into it in detail yet -- so for now it's disabled.