[WIP][AMDGPU] Improve the handling of inreg arguments #133614
base: main
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

Changes

When SGPRs available for inreg argument passing run out, the compiler silently falls back to using whole VGPRs to pass those arguments. Ideally, instead of using whole VGPRs, we should pack inreg arguments into individual lanes of VGPRs.

This PR introduces InregVGPRSpiller, which handles this packing. It uses v_writelane at the call site to place inreg arguments into specific VGPR lanes, and then extracts them in the callee using v_readlane.

Fixes #130443 and #129071.

Full diff: https://github.com/llvm/llvm-project/pull/133614.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c8645850fe111..628ea2515482d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2841,6 +2841,86 @@ void SITargetLowering::insertCopiesSplitCSR(
}
}
+class InregVGPRSpiller {
+ CCState &State;
+ const unsigned WaveFrontSize;
+
+ Register CurReg;
+ unsigned CurLane = 0;
+
+protected:
+ SelectionDAG &DAG;
+ MachineFunction &MF;
+
+ Register getCurReg() const { return CurReg; }
+ unsigned getCurLane() const { return CurLane; }
+
+  InregVGPRSpiller(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+ : State(State),
+ WaveFrontSize(MF.getSubtarget<GCNSubtarget>().getWavefrontSize()),
+ DAG(DAG), MF(MF) {}
+
+ void setReg(Register &Reg) {
+ if (CurReg.isValid()) {
+ State.DeallocateReg(Reg);
+ Reg = CurReg;
+ } else {
+ CurReg = Reg;
+ }
+ }
+
+ void forward() {
+    // Once all lanes of the current VGPR have been used, reset it so that a
+    // new VGPR is picked up on the next access.
+ if (++CurLane % WaveFrontSize == 0)
+ CurReg = 0;
+ }
+};
+
+class InregVGPRSpillerCallee final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+ SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+ setReg(Reg);
+
+ MF.addLiveIn(getCurReg(), &AMDGPU::VGPR_32RegClass);
+
+ // TODO: Do we need the chain here?
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+ DAG.getRegister(getCurReg(), VT),
+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+ forward();
+
+ return Res;
+ }
+};
+
+class InregVGPRSpillerCallSite final : private InregVGPRSpiller {
+public:
+  InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
+                           CCState &State)
+      : InregVGPRSpiller(DAG, MF, State) {}
+
+ SDValue write(const SDLoc &SL, Register &Reg, SDValue V, EVT VT) {
+ setReg(Reg);
+
+ SmallVector<SDValue, 4> Operands{
+ DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
+ DAG.getRegister(getCurReg(), VT), V,
+ DAG.getTargetConstant(getCurLane(), SL, MVT::i32)};
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+
+ forward();
+
+ return Res;
+ }
+};
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3043,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
+  InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3211,17 @@ SDValue SITargetLowering::LowerFormalArguments(
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
EVT ValVT = VA.getValVT();
- Reg = MF.addLiveIn(Reg, RC);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ SDValue Val;
+ // If an argument is marked inreg but gets pushed to a VGPR, it indicates
+ // we've run out of SGPRs for argument passing. In such cases, we'd prefer
+ // to start packing inreg arguments into individual lanes of VGPRs, rather
+ // than placing them directly into VGPRs.
+ if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+ Val = Spiller.read(Chain, DL, Reg, VT);
+ } else {
+ Reg = MF.addLiveIn(Reg, RC);
+ Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ }
if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
@@ -3875,6 +3965,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MVT PtrVT = MVT::i32;
+  InregVGPRSpillerCallSite Spiller(DAG, MF, CCInfo);
+
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3904,6 +3996,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (VA.isRegLoc()) {
+ Register Reg = VA.getLocReg();
+ if (Outs[i].Flags.isInReg() && AMDGPU::VGPR_32RegClass.contains(Reg))
+ Arg = Spiller.write(DL, Reg, Arg, VA.getLocVT());
RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
diff --git a/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll
new file mode 100644
index 0000000000000..3ec4c86fa87ad
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inreg-vgpr-spill.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -o - %s | FileCheck %s
+
+; arg3 is in v0 and arg4 is in v1. These should be packed into lanes of a single VGPR and extracted with v_readlane.
+define i32 @callee(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <2 x i32> inreg %arg2, i32 inreg %arg3, i32 inreg %arg4) {
+; CHECK-LABEL: callee:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_readlane_b32 s0, v0, 1
+; CHECK-NEXT: v_readlane_b32 s1, v0, 0
+; CHECK-NEXT: s_add_i32 s1, s1, s0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b32_e32 v0, s1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %add = add i32 %arg3, %arg4
+ ret i32 %add
+}
+
+define amdgpu_kernel void @kernel(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p) {
+ %arg0 = load <8 x i32>, ptr %p0
+ %arg1 = load <8 x i32>, ptr %p1
+ %arg2 = load <2 x i32>, ptr %p2
+ %arg3 = load i32, ptr %p3
+ %arg4 = load i32, ptr %p4
+ %ret = call i32 @callee(<8 x i32> %arg0, <8 x i32> %arg1, <2 x i32> %arg2, i32 %arg3, i32 %arg4)
+ store i32 %ret, ptr %p
+ ret void
+}
I'm still working on this, but I'd like to get some early feedback. The newly added test case currently crashes in the verifier with the following error:
Looks like I can't just use an immediate operand for the lane selection. I'll fix that in a later update.
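For reference, here is a minimal, purely illustrative sketch of one way the lane operand could be built, assuming the verifier complaint is about the immediate (TargetConstant) lane selector; the helper name buildReadLane and the switch to DAG.getConstant are this sketch's assumptions, not the actual follow-up fix:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Hypothetical helper: build the readlane node with the lane index as a normal
// constant node (getConstant) rather than a target constant, so selection can
// materialize it into a register if the instruction requires one.
static SDValue buildReadLane(SelectionDAG &DAG, const SDLoc &SL,
                             Register SrcReg, unsigned Lane, EVT VT) {
  SDValue Ops[] = {
      DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
      DAG.getRegister(SrcReg, VT),
      // Lane selector as a regular constant instead of a target constant.
      DAG.getConstant(Lane, SL, MVT::i32)};
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Ops);
}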
Force-pushed from d823879 to ff4fc41
You can test this locally with the following command:

git-clang-format --diff HEAD~1 HEAD --extensions h,cpp -- llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.h

View the diff from clang-format here.

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a12ff8d2e..9d1e2a0e7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4114,7 +4114,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Outs[ArgIdx - NumSpecialInputs].Flags.isInReg() &&
AMDGPU::VGPR_32RegClass.contains(Reg)) {
Spiller.writeLane(DL, Reg, Val,
- ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
+ ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
} else {
Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
InGlue = Chain.getValue(1);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 0990d8187..07348611d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -402,13 +402,11 @@ public:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- void passSpecialInputs(
- CallLoweringInfo &CLI,
- CCState &CCInfo,
- const SIMachineFunctionInfo &Info,
- SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass,
- SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain) const;
+ void
+ passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
Force-pushed from 1a27c5f to bf0af25
}

define i32 @tail_caller(<8 x i32> inreg %arg0, <8 x i32> inreg %arg1, <2 x i32> inreg %arg2, i32 inreg %arg3, i32 inreg %arg4) {
  %ret = tail call i32 @callee(<8 x i32> %arg0, <8 x i32> %arg1, <2 x i32> %arg2, i32 %arg3, i32 %arg4)
For some reason, this tail call version emits the following error: error: <unknown>:0:0: ran out of registers during register allocation in function 'tail_caller'.
; CHECK-NEXT: s_addc_u32 s9, s5, 0
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v1, s30, 0
There is some sort of mismatch here. It writes to v1 at the call site, but reads v0 in the callee.
When SGPRs available for `inreg` argument passing run out, the compiler silently falls back to using whole VGPRs to pass those arguments. Ideally, instead of using whole VGPRs, we should pack `inreg` arguments into individual lanes of VGPRs. This PR introduces `InregVGPRSpiller`, which handles this packing. It uses `v_writelane` at the call site to place `inreg` arguments into specific VGPR lanes, and then extracts them in the callee using `v_readlane`. Fixes #130443 and #129071.
Force-pushed from bf0af25 to bd81ac3
        DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32), Val,
        DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
    if (!LastWrite) {
      Register VReg = MF.getRegInfo().getLiveInVirtReg(DstReg);
The mismatch comes from here: the first tied-input operand of the series of writelane intrinsics. Basically, we want the intrinsic to write directly to a specific register. The input Reg here is actually an MCRegister, but we need a virtual register for this intrinsic. To achieve this, we first call getLiveInVirtReg to get a virtual register corresponding to the MCRegister, and then call getRegister to obtain an SDValue. This generates MIR code similar to the following:
%0:vgpr_32(s32) = COPY $vgpr0
...
%33:vgpr_32 = V_WRITELANE_B32 killed %28:sreg_32, 0, %0:vgpr_32(tied-def 0)(s32)
%34:vgpr_32 = V_WRITELANE_B32 killed %29:sreg_32, 1, %33:vgpr_32(tied-def 0)
Later, this sequence is lowered to something like:
%34:vgpr_32 = COPY %0:vgpr_32(s32)
%34:vgpr_32 = V_WRITELANE_B32 $sgpr30, 0, %34:vgpr_32(tied-def 0)
%34:vgpr_32 = V_WRITELANE_B32 $sgpr31, 1, %34:vgpr_32(tied-def 0)
And after register allocation, it becomes:
renamable $vgpr1 = COPY renamable $vgpr0
renamable $vgpr1 = V_WRITELANE_B32 $sgpr30, 0, killed $vgpr1(tied-def 0)
renamable $vgpr1 = V_WRITELANE_B32 $sgpr31, 1, killed $vgpr1(tied-def 0)
As we can see, the intended behavior is writing directly into $vgpr0. However, due to the COPY, the intrinsic instead writes to $vgpr1.
We probably shouldn't be using virtual registers as operands directly in the intrinsic, but doing so crashes the two-address instruction pass later, which expects operands to be virtual registers.
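To make the construction described above concrete, here is a condensed, illustrative sketch of it (the helper name and parameters are invented for the example; DstReg stands for the physical VGPR picked by the calling convention). It is the COPY generated for that live-in virtual register which ends up redirecting the writes to $vgpr1:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Hypothetical condensation of the construction discussed above: look up the
// virtual register created for the live-in physical VGPR and feed it to the
// writelane intrinsic as the tied "old value" operand.
static SDValue buildWriteLane(SelectionDAG &DAG, MachineFunction &MF,
                              const SDLoc &SL, MCRegister DstReg, SDValue Val,
                              unsigned Lane, EVT VT) {
  Register VReg = MF.getRegInfo().getLiveInVirtReg(DstReg);
  SDValue Ops[] = {
      DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
      Val,                                        // value written into the lane
      DAG.getTargetConstant(Lane, SL, MVT::i32),  // lane index
      DAG.getRegister(VReg, VT)};                 // tied input: previous contents
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Ops);
}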