[AMDGPU][Scheduler] Support for rematerializing SGPRs and AGPRs #140036
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Lucas Ramirez (lucas-rami)

Changes

This adds the ability to rematerialize SGPRs and AGPRs to the scheduler's PreRARematStage, which can currently only rematerialize ArchVGPRs. This also fixes a small potential issue in the stage where, in case of spilling, the target occupancy could be set to a lower than expected value when the function had either one of the "amdgpu-num-sgpr" or "amdgpu-num-vgpr" attributes set.

Patch is 95.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140036.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a9c891a1f1dd1..6f6abf5a20930 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1702,6 +1702,8 @@ namespace {
/// Models excess register pressure in a region and tracks our progress as we
/// identify rematerialization opportunities.
struct ExcessRP {
+ /// Number of excess SGPRs.
+ unsigned SGPRs = 0;
/// Number of excess ArchVGPRs.
unsigned ArchVGPRs = 0;
/// Number of excess AGPRs.
@@ -1717,8 +1719,13 @@ struct ExcessRP {
bool UnifiedRF;
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
- /// maximum number of allowed VGPRs.
- ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
+ /// maximum number of allowed SGPRs/VGPRs.
+ ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxSGPRs,
+ unsigned MaxVGPRs);
+
+ /// Accounts for \p NumRegs saved SGPRs in the model. Returns whether saving
+ /// these SGPRs helped reduce excess pressure.
+ bool saveSGPRs(unsigned NumRegs) { return saveRegs(SGPRs, NumRegs); }
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
@@ -1726,17 +1733,20 @@ struct ExcessRP {
/// saving these ArchVGPRs helped reduce excess pressure.
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
- /// Accounts for \p NumRegs saved AGPRS in the model. Returns whether saving
- /// these ArchVGPRs helped reduce excess pressure.
- bool saveAGPRs(unsigned NumRegs);
+ /// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
+ /// these AGPRs helped reduce excess pressure.
+ bool saveAGPRs(unsigned NumRegs) {
+ return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
+ }
/// Returns whether there is any excess register pressure.
- operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
+ operator bool() const { return SGPRs || ArchVGPRs || AGPRs || VGPRs; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
- OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
- << Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
+ OS << Excess.SGPRs << " SGPRs, " << Excess.ArchVGPRs << " ArchVGPRs, and "
+ << Excess.AGPRs << " AGPRs, (" << Excess.VGPRs
+ << " VGPRs in total, next ArchVGPR aligment in "
<< Excess.ArchVGPRsToAlignment << " registers)\n";
return OS;
}
@@ -1753,12 +1763,17 @@ struct ExcessRP {
} // namespace
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
- unsigned MaxVGPRs)
+ unsigned MaxSGPRs, unsigned MaxVGPRs)
: UnifiedRF(ST.hasGFX90AInsts()) {
+ // Compute excess SGPR pressure.
+ unsigned NumSGPRs = RP.getSGPRNum();
+ if (NumSGPRs > MaxSGPRs)
+ SGPRs = NumSGPRs - MaxSGPRs;
+
+ // Compute excess ArchVGPR/AGPR pressure.
unsigned NumArchVGPRs = RP.getArchVGPRNum();
unsigned NumAGPRs = RP.getAGPRNum();
HasAGPRs = NumAGPRs;
-
if (!UnifiedRF) {
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
// independently.
@@ -1839,10 +1854,6 @@ bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
return Progress;
}
-bool ExcessRP::saveAGPRs(unsigned NumRegs) {
- return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
-}
-
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
@@ -1865,46 +1876,19 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
- auto ClearOptRegionsIf = [&](bool Cond) -> bool {
- if (Cond) {
- // We won't try to increase occupancy.
- IncreaseOccupancy = false;
- OptRegions.clear();
- }
- return Cond;
- };
-
// Collect optimizable regions. If there is spilling in any region we will
- // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
- // occupancy by one in the whole function.
+ // just try to reduce spilling. Otherwise we will try to increase occupancy by
+ // one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
-
- // Check whether SGPR pressures prevents us from eliminating spilling.
- unsigned NumSGPRs = RP.getSGPRNum();
- if (NumSGPRs > MaxSGPRsNoSpill)
- ClearOptRegionsIf(IncreaseOccupancy);
-
- ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
- if (Excess) {
- ClearOptRegionsIf(IncreaseOccupancy);
+ ExcessRP Excess(ST, RP, MaxSGPRsNoSpill, MaxVGPRsNoSpill);
+ if (Excess && IncreaseOccupancy) {
+ // There is spilling in the region and we were so far trying to increase
+ // occupancy. Stop trying that and focus on reducing spilling.
+ IncreaseOccupancy = false;
+ OptRegions.clear();
} else if (IncreaseOccupancy) {
- // Check whether SGPR pressure prevents us from increasing occupancy.
- if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
- if (DAG.MinOccupancy >= WavesPerEU.first)
- return false;
- continue;
- }
- if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
- // We can only rematerialize ArchVGPRs at this point.
- unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
- bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
- if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
- if (DAG.MinOccupancy >= WavesPerEU.first)
- return false;
- continue;
- }
- }
+ Excess = ExcessRP(ST, RP, MaxSGPRsIncOcc, MaxVGPRsIncOcc);
}
if (Excess)
OptRegions.insert({I, Excess});
@@ -1924,23 +1908,34 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
#endif
// When we are reducing spilling, the target is the minimum target number of
- // waves/EU determined by the subtarget.
- TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
+ // waves/EU determined by the subtarget. In cases where either one of
+ // "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
+ // minimum region occupancy may be higher than the latter.
+ TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
+ : std::max(DAG.MinOccupancy, WavesPerEU.first);
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// achieve our goal, and sets Progress to true when this particular reduction
// in pressure was helpful toward that goal.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
+ const TargetRegisterClass *RC,
bool &Progress) -> bool {
ExcessRP &Excess = OptIt->getSecond();
- // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
- // only when we are just trying to eliminate spilling to memory. At this
- // point we err on the conservative side and do not increase
- // register-to-register spilling for the sake of increasing occupancy.
- Progress |=
- Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
- /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
+ unsigned NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+ if (SRI->isSGPRClass(RC)) {
+ Progress |= Excess.saveSGPRs(NumRegs);
+ } else if (SRI->isAGPRClass(RC)) {
+ Progress |= Excess.saveAGPRs(NumRegs);
+ } else {
+ // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
+ // only when we are just trying to eliminate spilling to memory. At this
+ // point we err on the conservative side and do not increase
+ // register-to-register spilling for the sake of increasing occupancy.
+ Progress |=
+ Excess.saveArchVGPRs(NumRegs,
+ /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
+ }
if (!Excess)
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
@@ -1962,10 +1957,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
if (!isTriviallyReMaterializable(DefMI))
continue;
- // We only support rematerializing virtual VGPRs with one definition.
+ // We only support rematerializing virtual registers with one definition.
Register Reg = DefMI.getOperand(0).getReg();
- if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg))
+ if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
@@ -2003,6 +1997,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
Rematerializations.try_emplace(&DefMI, UseMI).first->second;
bool RematUseful = false;
+ const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg);
if (auto It = OptRegions.find(I); It != OptRegions.end()) {
// Optimistically consider that moving the instruction out of its
// defining region will reduce RP in the latter; this assumes that
@@ -2010,7 +2005,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// instruction and the end of the region.
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, Mask, RematUseful))
+ if (ReduceRPInRegion(It, Mask, RC, RematUseful))
return true;
}
@@ -2030,7 +2025,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// instruction's use.
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
- if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
+ if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RC, RematUseful))
return true;
}
}
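As a reading aid for the hunks above, here is a minimal, standalone model of the per-class bookkeeping that ReduceRPInRegion now dispatches on. SimpleExcessRP, RegKind, save, and the counts in main are illustrative stand-ins, not the LLVM types; the real ExcessRP additionally models the unified register file, ArchVGPR alignment, and the option of counting ArchVGPR savings against AGPR excess.

// Standalone sketch only: mirrors the shape of the new per-class dispatch,
// not the actual GCNSchedStrategy.cpp implementation.
#include <algorithm>
#include <cstdio>

enum class RegKind { SGPR, ArchVGPR, AGPR };

struct SimpleExcessRP {
  unsigned SGPRs = 0, ArchVGPRs = 0, AGPRs = 0, VGPRs = 0;

  // Consume up to NumRegs of excess; report whether any excess was reduced.
  static bool saveRegs(unsigned &LeftToSave, unsigned NumRegs) {
    unsigned NumSaved = std::min(LeftToSave, NumRegs);
    LeftToSave -= NumSaved;
    return NumSaved != 0;
  }

  // SGPR savings are tracked on their own; AGPR and ArchVGPR savings may also
  // count against the combined VGPR excess (simplified here).
  bool save(RegKind Kind, unsigned NumRegs) {
    switch (Kind) {
    case RegKind::SGPR:
      return saveRegs(SGPRs, NumRegs);
    case RegKind::AGPR:
      return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
    case RegKind::ArchVGPR:
      return saveRegs(ArchVGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
    }
    return false;
  }

  explicit operator bool() const {
    return SGPRs || ArchVGPRs || AGPRs || VGPRs;
  }
};

int main() {
  SimpleExcessRP Excess;
  Excess.SGPRs = 3;     // e.g. 3 SGPRs over the no-spill limit
  Excess.ArchVGPRs = 2; // and 2 ArchVGPRs over
  Excess.save(RegKind::SGPR, 2);     // two SGPR remat opportunities found
  Excess.save(RegKind::ArchVGPR, 2); // two ArchVGPR remat opportunities found
  std::printf("still over budget: %s\n", Excess ? "yes" : "no"); // "yes"
  return 0;
}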
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index ca4ab4a2c560f..e20ad34797541 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -444,8 +444,6 @@ class ClusteredLowOccStage : public GCNSchedStage {
/// estimates reducing spilling or increasing occupancy is possible, as few
/// instructions as possible are rematerialized to reduce potential negative
/// effects on function latency.
-///
-/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage {
private:
/// Useful information about a rematerializable instruction.
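The deleted TODO is what this patch implements: the stage's candidate filter no longer restricts itself to VGPR-class registers. As a rough, self-contained sketch of the per-instruction checks (Candidate and its fields are illustrative stand-ins, not the LLVM API; the real code queries MachineRegisterInfo and the target's isTriviallyReMaterializable hook):

// Illustrative only: models the filter in canIncreaseOccupancyOrReduceSpill()
// over a toy struct. Before this patch the filter also required the defined
// register to be VGPR-class; SGPR- and AGPR-class registers now qualify too.
struct Candidate {
  bool TriviallyRematerializable; // instruction can be recomputed at its use
  bool DefinesVirtualReg;         // result is a virtual register
  unsigned NumDefs;               // definitions of that register
  unsigned NumNonDebugUses;       // non-debug uses of that register
  bool UseIsInDefiningRegion;     // the single use lives in the defining region
};

bool isRematCandidate(const Candidate &C) {
  return C.TriviallyRematerializable && C.DefinesVirtualReg &&
         C.NumDefs == 1 && C.NumNonDebugUses == 1 && !C.UseIsInDefiningRegion;
}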
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index f5558964d2707..ba42f9f1d1860 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -3,6 +3,9 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s
--- |
+ define void @small_num_sgprs_as_spill() "amdgpu-num-sgpr"="85" {
+ ret void
+ }
define void @small_num_vgprs_as_spill() "amdgpu-num-vgpr"="28" {
ret void
}
@@ -15,13 +18,333 @@
define void @reduce_arch_and_acc_vgrp_spill() "amdgpu-waves-per-eu"="8,8" {
ret void
}
- define void @reduce_spill_archvgpr_above_addressable_limit() "amdgpu-waves-per-eu"="1,10" {
+ define void @reduce_spill_archvgpr_above_addressable_limit() "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" {
ret void
}
- define void @reduce_spill_agpr_above_addressable_limit() "amdgpu-waves-per-eu"="1,10" {
+ define void @reduce_spill_agpr_above_addressable_limit() "amdgpu-flat-work-group-size"="1,64" "amdgpu-waves-per-eu"="1,2" {
ret void
}
---
+# The user-requested maximum number of SGPRs needs to be taken into account by
+# the scheduler's rematerialization stage. Register usage above that number
+# is considered as spill.
+name: small_num_sgprs_as_spill
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: small_num_sgprs_as_spill
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
+ ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 2
+ ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 3
+ ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 4
+ ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 5
+ ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 6
+ ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 7
+ ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
+ ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 9
+ ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 10
+ ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 11
+ ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 12
+ ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 13
+ ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 14
+ ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 15
+ ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 16
+ ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 17
+ ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 18
+ ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 19
+ ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 20
+ ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 21
+ ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 22
+ ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 23
+ ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 24
+ ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 25
+ ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 26
+ ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 27
+ ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 28
+ ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 29
+ ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 30
+ ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 31
+ ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 32
+ ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 33
+ ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 34
+ ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 35
+ ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 36
+ ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 37
+ ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 38
+ ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 39
+ ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 40
+ ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 41
+ ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 42
+ ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 43
+ ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 44
+ ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 45
+ ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 46
+ ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 47
+ ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 48
+ ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 49
+ ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 50
+ ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 51
+ ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 52
+ ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 53
+ ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 54
+ ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 55
+ ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 56
+ ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 57
+ ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 58
+ ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 59
+ ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 60
+ ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 61
+ ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 62
+ ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 63
+ ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 64
+ ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 65
+ ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 66
+ ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 67
+ ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 68
+ ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
+ ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
+ ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
+ ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
+ ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
+ ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
+ ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
+ ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
+ ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
+ ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 78
+ ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 79
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]]
+ ; GFX908-NEX...
[truncated]
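As context for the small_num_sgprs_as_spill expectations above: the "amdgpu-num-sgpr"="85" attribute constrains the SGPR limit the stage works against, and any SGPR pressure above that limit is modeled as excess to be rematerialized away. A standalone sketch of that term (the function and the numbers are illustrative, not the LLVM code; how the cap is derived from the attribute is outside this sketch):

// Usage above the cap becomes "excess", which the stage then tries to
// eliminate through rematerialization.
#include <cstdio>

unsigned excessSGPRs(unsigned NumSGPRs, unsigned MaxSGPRs) {
  return NumSGPRs > MaxSGPRs ? NumSGPRs - MaxSGPRs : 0;
}

int main() {
  // Illustrative numbers only: a region with 90 SGPRs live against a cap of 85.
  std::printf("%u excess SGPRs\n", excessSGPRs(90, 85)); // prints "5 excess SGPRs"
  return 0;
}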
Force-pushed from 53d7202 to b5cd537.
ping

/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
/// these AGPRs helped reduce excess pressure.
bool saveAGPRs(unsigned NumRegs) {
  return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
Should we model that we can spill ArchVGPR into AGPR?
Should UnifiedRF be a precondition to saving VGPRs?
You're right, this is more consistent.
In saveVGPRs, it looks like we do allow saving VGPRs even in the non-unified RF case -- I'm not sure why this is, but it looks like a bug. We can probably look into this separately though.
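For illustration, here is what the gating discussed above could look like in a simplified, standalone model. SketchExcess and its members are stand-ins, not the patch's code; whether the real saveAGPRs/saveVGPRs paths need this guard is exactly what the thread is debating.

// Sketch of the suggestion: only fall back to the combined VGPR counter when
// the subtarget actually has a unified register file, since that counter is
// only meaningful there.
struct SketchExcess {
  bool UnifiedRF = false;
  unsigned AGPRs = 0, VGPRs = 0;

  static bool saveRegs(unsigned &LeftToSave, unsigned NumRegs) {
    unsigned Saved = NumRegs < LeftToSave ? NumRegs : LeftToSave;
    LeftToSave -= Saved;
    return Saved != 0;
  }

  bool saveAGPRs(unsigned NumRegs) {
    if (saveRegs(AGPRs, NumRegs))
      return true;
    // Gate the unified-VGPR fallback on UnifiedRF, per the review suggestion.
    return UnifiedRF && saveRegs(VGPRs, NumRegs);
  }
};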
The last commit is somewhat incorrect; I'm currently working on a fix.
This should be correct now. These RP excess calculations are a bit messy; my next PR will make things much simpler by using […]. Edit: the CI failure appears spurious (fails on […]).
LGTM with nit
@@ -1705,28 +1707,35 @@ struct ExcessRP {
  bool HasAGPRs = false;
  /// Whether the subtarget has a unified RF.
  bool UnifiedRF;
  /// Whether we consider that ArchVGPRs can be spilled to AGPRs and the other
  /// way around.
  bool AllowVGPRToVGPRSpill;
nit: naming.
VGPRToVGPR is potentially confusing -- maybe SpillToVecReg?
The name is inaccurate and this is always true. It is not spilling. The register allocator will create a combined AV_ class during live range splitting in some cases. Copy to a compatible super register class is not a spill
Thanks for the context, I wasn't clear on how these ArchVGPR/AGPR swaps actually happen. Changed the name to CombineVGPRSavings and updated comments where necessary.
Force-pushed from 2edbe3f to cf76e4a.
…#140036) This adds the ability to rematerialize SGPRs and AGPRs to the scheduler's PreRARematStage, which can currently only rematerialize ArchVGPRs. This also fixes a small potential issue in the stage where, in case of spilling, the target occupancy could be set to a lower than expected value when the function had either one of the "amdgpu-num-sgpr" or "amdgpu-num-vgpr" attributes set.
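On the occupancy fix mentioned in that description: when the stage is only trying to eliminate spilling, the target occupancy used to be set to the subtarget's minimum waves/EU even if the current minimum region occupancy was already higher, which can happen when "amdgpu-num-sgpr" or "amdgpu-num-vgpr" is set. A minimal illustration with made-up numbers (MinOccupancy and WavesPerEUMin stand in for DAG.MinOccupancy and WavesPerEU.first in the patch):

// Made-up numbers for illustration of the std::max change in the patch.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned MinOccupancy = 6;      // current minimum occupancy across regions
  unsigned WavesPerEUMin = 4;     // subtarget/attribute minimum waves per EU
  bool IncreaseOccupancy = false; // spilling was detected, so just reduce it

  unsigned OldTarget = IncreaseOccupancy ? MinOccupancy + 1 : WavesPerEUMin;
  unsigned NewTarget = IncreaseOccupancy
                           ? MinOccupancy + 1
                           : std::max(MinOccupancy, WavesPerEUMin);
  std::printf("old target %u, new target %u\n", OldTarget, NewTarget); // 4 vs 6
  return 0;
}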