[AArch64] Override canCombineStoreAndExtract #145825
Conversation
ST1 (lane) instructions extract a vector element and store it in a single instruction. We can map them to both 64-bit and 128-bit vector registers (V and Q regs).
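As a rough sketch (not taken from the patch itself; it mirrors the simpleOneInstructionPromotion test added below), this hook lets CodeGenPrepare promote a scalar operation on an extracted lane to the whole vector, so that the remaining extract + store pair can be selected as a single ST1 lane store:

  ; Before CodeGenPrepare:
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4

  ; After promotion (sketch; in practice the unused lane's constant may be
  ; filled differently):
  %promoted = or <2 x i32> %in1, <i32 1, i32 1>
  %extract = extractelement <2 x i32> %promoted, i32 1
  store i32 %extract, i32* %dest, align 4
  ; ...which is selected as: orr.2s v0, #1 / st1.s { v0 }[1], [x1]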
@llvm/pr-subscribers-backend-aarch64

Author: AZero13 (AZero13)

Changes

ST1 instructions extract and store in one instruction. We can map them to 64-bit and 128-bit registers (V and Q regs).

Full diff: https://github.com/llvm/llvm-project/pull/145825.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 13835747c91e5..85ad7b59236c7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28261,6 +28261,38 @@ Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
return TargetLowering::getSSPStackGuardCheck(M);
}
+bool AArch64TargetLowering::canCombineStoreAndExtract(Type *VectorTy,
+ Value *Idx,
+ unsigned &Cost) const {
+ // Floating point values and vector values map to the same register file.
+ // Therefore, although we could do a store extract of a vector type, this is
+ // better to leave at float as we have more freedom in the addressing mode for
+ // those.
+ if (VectorTy->isFPOrFPVectorTy())
+ return false;
+
+ // If the index is unknown at compile time, this is very expensive to lower
+ // and it is not possible to combine the store with the extract.
+ if (!isa<ConstantInt>(Idx))
+ return false;
+
+ assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+
+ // Reject scalable vectors - ST1 lane indexing only works with fixed-size NEON
+ // vectors
+ if (cast<VectorType>(VectorTy)->isScalableTy())
+ return false;
+
+ unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
+ // We can do a store + vector extract on any vector that fits perfectly in a V
+ // or Q register.
+ if (BitWidth == 64 || BitWidth == 128) {
+ Cost = 0;
+ return true;
+ }
+ return false;
+}
+
Value *
AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 89f90ee2b7707..8f7e9c53a5e5e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -354,6 +354,9 @@ class AArch64TargetLowering : public TargetLowering {
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+ bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+ unsigned &Cost) const override;
+
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/AArch64/vector-promotion.ll b/llvm/test/CodeGen/AArch64/vector-promotion.ll
new file mode 100644
index 0000000000000..6e5ac8c2abf52
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-promotion.ll
@@ -0,0 +1,434 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=arm64-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: st1.s { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+;
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: cmp w8, w1
+; ASM-NEXT: cset w8, eq
+; ASM-NEXT: strb w8, [x2]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out = icmp eq i32 %extract, %in2
+ store i1 %out, i1* %dest, align 4
+ ret void
+}
+
+
+; BB2
+;
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: ; %bb.0: ; %bb1
+; ASM-NEXT: tbz w2, #0, LBB2_2
+; ASM-NEXT: ; %bb.1: ; %bb2
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov w8, s0
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: LBB2_2: ; %end
+; ASM-NEXT: ret
+bb1:
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ br i1 %bool, label %bb2, label %end
+bb2:
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ br label %end
+end:
+ ret void
+}
+
+;
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: orr.2s v0, #1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 0
+ %out1 = or i32 %extract, 1
+ %out2 = or i32 %out1, 1
+ %out3 = or i32 %out2, 1
+ %out4 = or i32 %out3, 1
+ %out5 = or i32 %out4, 1
+ %out6 = or i32 %out5, 1
+ %out7 = or i32 %out6, 1
+ store i32 %out7, i32* %dest, align 4
+ ret void
+}
+
+;
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: orr w0, w8, #0x1
+; ASM-NEXT: str w0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret i32 %out
+}
+
+; Check that we promote with a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; Scalar version:
+; Vector version:
+;
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: udivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w8, w8, w9
+; ASM-NEXT: add w8, w9, w8, lsr #1
+; ASM-NEXT: lsr w8, w8, #2
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: uremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #18725 ; =0x4925
+; ASM-NEXT: movk w9, #9362, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: umull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: sub w10, w8, w9
+; ASM-NEXT: add w9, w9, w10, lsr #1
+; ASM-NEXT: lsr w9, w9, #2
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = urem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w8, w9, w8
+; ASM-NEXT: asr w9, w8, #2
+; ASM-NEXT: add w8, w9, w8, lsr #31
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = sdiv i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: sremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #9363 ; =0x2493
+; ASM-NEXT: movk w9, #37449, lsl #16
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: smull x9, w8, w9
+; ASM-NEXT: lsr x9, x9, #32
+; ASM-NEXT: add w9, w9, w8
+; ASM-NEXT: asr w10, w9, #2
+; ASM-NEXT: add w9, w10, w9, lsr #31
+; ASM-NEXT: sub w9, w9, w9, lsl #3
+; ASM-NEXT: add w8, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 %extract, 7
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fdivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fdiv s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fdiv float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Scalar version:
+; Vector version:
+;
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: fremCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefDivCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: udiv w8, w9, w8
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = udiv i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: undefRemCase:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov w9, #7 ; =0x7
+; ASM-NEXT: mov.s w8, v0[1]
+; ASM-NEXT: sdiv w10, w9, w8
+; ASM-NEXT: msub w8, w10, w8, w9
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 1
+ %out = srem i32 7, %extract
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefConstantFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #7.00000000
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float %extract, 7.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; Scalar version:
+; Vector version:
+;
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: undefVectorFRemCaseWithFastMath:
+; ASM: ; %bb.0:
+; ASM-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; ASM-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; ASM-NEXT: .cfi_def_cfa_offset 32
+; ASM-NEXT: .cfi_offset w30, -8
+; ASM-NEXT: .cfi_offset w29, -16
+; ASM-NEXT: .cfi_offset w19, -24
+; ASM-NEXT: .cfi_offset w20, -32
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: mov x19, x1
+; ASM-NEXT: mov s1, v0[1]
+; ASM-NEXT: fmov s0, #7.00000000
+; ASM-NEXT: bl _fmodf
+; ASM-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ASM-NEXT: str s0, [x19]
+; ASM-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = frem nnan float 7.0, %extract
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we are able to promote a floating point value.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotionFloat:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr d0, [x0]
+; ASM-NEXT: fmov s1, #1.00000000
+; ASM-NEXT: mov s0, v0[1]
+; ASM-NEXT: fadd s0, s0, s1
+; ASM-NEXT: str s0, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x float>, <2 x float>* %addr1, align 8
+ %extract = extractelement <2 x float> %in1, i32 1
+ %out = fadd float %extract, 1.0
+ store float %out, float* %dest, align 4
+ ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS modes, as variable index are expensive
+; to lower.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+; ASM-LABEL: simpleOneInstructionPromotionVariableIdx:
+; ASM: ; %bb.0:
+; ASM-NEXT: ; kill: def $w2 killed $w2 def $x2
+; ASM-NEXT: and x8, x2, #0x1
+; ASM-NEXT: ldr w8, [x0, x8, lsl #2]
+; ASM-NEXT: orr w8, w8, #0x1
+; ASM-NEXT: str w8, [x1]
+; ASM-NEXT: ret
+ %in1 = load <2 x i32>, <2 x i32>* %addr1, align 8
+ %extract = extractelement <2 x i32> %in1, i32 %idx
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 4
+ ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, although the actual assembly is better if we were
+; promoting it.
+; Scalar version:
+; Vector version:
+;
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion8x8:
+; ASM: ; %bb.0:
+; ASM-NEXT: movi.8b v0, #1
+; ASM-NEXT: ldr d1, [x0]
+; ASM-NEXT: orr.8b v0, v1, v0
+; ASM-NEXT: st1.b { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <8 x i8>, <8 x i8>* %addr1, align 8
+ %extract = extractelement <8 x i8> %in1, i32 1
+ %out = or i8 %extract, 1
+ store i8 %out, i8* %dest, align 4
+ ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
+define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
+; ASM-LABEL: simpleOneInstructionPromotion4x32:
+; ASM: ; %bb.0:
+; ASM-NEXT: ldr q0, [x0]
+; ASM-NEXT: orr.4s v0, #1
+; ASM-NEXT: st1.s { v0 }[1], [x1]
+; ASM-NEXT: ret
+ %in1 = load <4 x i32>, <4 x i32>* %addr1, align 8
+ %extract = extractelement <4 x i32> %in1, i32 1
+ %out = or i32 %extract, 1
+ store i32 %out, i32* %dest, align 1
+ ret void
+}
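For reference, a minimal sketch (not part of the patch or its tests) of the inputs the new hook declines to combine, mirroring the checks in canCombineStoreAndExtract above:

  ; Rejected: floating-point vector; FP extracts keep the more flexible FP addressing modes.
  %f = extractelement <2 x float> %v1, i32 1

  ; Rejected: the lane index is not a compile-time constant.
  %d = extractelement <2 x i32> %v2, i32 %idx

  ; Rejected: scalable vector; ST1 lane indexing needs a fixed-width NEON vector.
  %s = extractelement <vscale x 4 x i32> %v3, i32 0

  ; Accepted with Cost = 0: fixed-width integer vectors that are exactly 64 or
  ; 128 bits wide, e.g. <2 x i32>, <8 x i8>, and <4 x i32>.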
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AArch64/vector-promotion.ll llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/lib/Target/AArch64/AArch64ISelLowering.h

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
; ASM: ; %bb.0:
; ASM-NEXT: ldr q0, [x0]
; ASM-NEXT: orr.4s v0, #1
; ASM-NEXT: st1.s { v0 }[1], [x1]
; ASM-NEXT: ret
Currently, this generates:
ldr w8, [x0, #4]
orr w8, w8, #0x1
str w8, [x1]
ret
This seems like it could be preferred over the vector operations. Similar for simpleOneInstructionPromotion8x8.
; Check that we optimized the sequence correctly when it can be
; lowered on a Q register.
;
; Make sure we got rid of any expensive vmov.32 instructions. |
These comments refer to ARM, not AArch64, instructions.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
; RUN: opt -codegenprepare -mtriple=arm64-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s |
The opt RUN lines are never checked.
ret void
}

;
It seems like quite a few of these test cases don't change with this patch.