-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 #144152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Use 0-cycle reg2reg MOVs for FPR32, FPR16, FPR8 #144152
Conversation
98b8048
to
29bd77f
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
ad98f61
to
2f47266
Compare
@llvm/pr-subscribers-backend-aarch64 Author: Tomer Shafir (tomershafir) ChangesThis change emits optimized copy instructions for FPR32, FPR16, FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It adds 2 regression tests for FPR32 and FPR16. Depends on: #143680 to resolve the test structure. Full diff: https://github.com/llvm/llvm-project/pull/144152.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..70d8e918acbfa 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,73 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
-
if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
- DestReg =
- RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
- SrcReg =
- RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
- BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR64RegClass);
+ // This instruction is reading and writing D registers. This may upset
+ // the register scavenger and machine verifier, so we need to indicate
+ // that we are reading an undefined value from SrcRegD, but a proper
+ // value from SrcReg.
+ BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+ .addReg(SrcRegD, RegState::Undef)
+ .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+ } else {
+ DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+ &AArch64::FPR32RegClass);
+ BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ }
return;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
new file mode 100644
index 0000000000000..1c9b06f20b6f9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr16.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define void @t(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: fmov s0, s2
+; NOTCPU: fmov s1, s3
+; NOTCPU: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-NEXT: bl {{_?foo}}
+; NOTCPU: fmov s0, [[REG1]]
+; NOTCPU: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call half @foo(half %c, half %d)
+ %call1 = call half @foo(half %c, half %d)
+ unreachable
+}
+
+declare half @foo(half, half)
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
new file mode 100644
index 0000000000000..9d9b58aa0e0ae
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr32.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s -check-prefixes=NOTCPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm | FileCheck %s -check-prefixes=ATTR --match-full-lines
+
+define void @t(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: t:
+; NOTCPU: fmov s0, s2
+; NOTCPU: fmov s1, s3
+; NOTCPU: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-NEXT: bl {{_?foo}}
+; NOTCPU: fmov s0, [[REG1]]
+; NOTCPU: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+ %call = call float @foo(float %c, float %d)
+ %call1 = call float @foo(float %c, float %d)
+ unreachable
+}
+
+declare float @foo(float, float)
|
@@ -5302,30 +5302,73 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, | |||
|
|||
if (AArch64::FPR32RegClass.contains(DestReg) && | |||
AArch64::FPR32RegClass.contains(SrcReg)) { | |||
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) | |||
.addReg(SrcReg, getKillRegState(KillSrc)); | |||
if (Subtarget.isTargetDarwin() && Subtarget.hasZeroCycleRegMove()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ISTM this should be a new subtarget feature, maybe hasZeroCycleFPRMove()
or something similar. It's weird to conflate per-cpu hardware tuning details with platform details like isTargetDarwin()
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The zero-latency instructions on the Arm side are listed in the SWOG's and are often a little different. As far as I could tell from the code, the existing HasZeroCycleRegMove might better be described as "HasZeroCycleXRegMoveButNotWRegMove". We tend to have both if we have any, so don't prefer the xreg version over the wreg. For FPR regs it looks like if we have D we have S as free too.
My point is - what do you think of representing it as the individual instructions that are expected to be free? So something like if (Subtarget.hasZeroCycleDRegMove() && !Subtarget.hasZeroCycleSRegMove()) ...
. Hopefully that is still simple enough and not too over-engineered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree. Ill add new subtarget features, though per register class and not underlying register names, like hasZeroCycleRegMoveFPR64
.
…asses This change adds 2 new subtarget features to AArch64 to model 0-cycle copy execution for FPR64 and FPR32 register classes. It also adds the new `HasZeroCycleRegMovFPR64` feature to Apple processors.
d45817b
to
a1439b4
Compare
v2: commit 1: bb47589
commit 2: 7c0a526
I will fix the existing |
@jroelofs @davemgreen Ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for adding the two new features. LGTM if there are not other comments from @jroelofs
return; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's keep these blank lines after the return; }
's. They help draw your eye to their early-return-ness.
return; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One small nit, but otherwise LGTM.
This change emits zero cycle copy instructions for FPR32, FPR16, FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It adds a regression test with 2 variants for FPR32 and FPR16.
a1439b4
to
17743ba
Compare
This change emits optimized copy instructions for FPR32, FPR16, FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It adds 2 regression tests for FPR32 and FPR16. Depends on: llvm#143680 to resolve the test structure.
…lasses Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mattr=zcm` directly, so I think it should be ok to make an immediate switch without a release note.
…46051) Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: #144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
…classes (#146051) Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm/llvm-project#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
This change emits optimized copy instructions for FPR32, FPR16, FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It adds 2 regression tests for FPR32 and FPR16. Depends on: llvm#143680 to resolve the test structure.
…vm#146051) Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
…vm#146051) Aligns 0-cycle register MOV model of GPR64 and GPR32 register classes to that of FPR64 and FPR32 resolved in: llvm#144152. - Splits `FeatureZCRegMove` into `FeatureZCRegMoveGPR64` and `FeatureZCRegMove32` and fix Apple processors and `AArch64InstrInfo` accordingly - Aligns the test `arm64-zero-cycle-regmov-gpr.ll` to the FPR one The target feature name change is effectively a breaking change. The absolute most of users shouldn't use `-mzcm` directly, so I think it should be ok to make an immediate switch, unless this doesn't align with the conventions in this project. The patch adds a release note for that.
This change emits optimized copy instructions for FPR32, FPR16, FPR8 register classes on targets that support it. The implementation is similar to what has been done for GPR32. It adds 2 regression tests for FPR32 and FPR16.
Depends on: #143680 to resolve the test structure.