-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize #145911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/petar-avramovic/rbl-ral-tests
Are you sure you want to change the base?
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize #145911
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Petar Avramovic (petar-avramovic). Changes — Patch is 32.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145911.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba661348ca5b5..733b5d0865d7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -115,126 +117,233 @@ class AMDGPURegBankLegalizeCombiner {
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
- bool isLaneMask(Register Reg) {
- const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
- if (RB && RB->getID() == AMDGPU::VCCRegBankID)
- return true;
+ bool isLaneMask(Register Reg);
+ std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode);
+ std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src);
+ Register getReadAnyLaneSrc(Register Src);
+ void replaceRegWithOrBuildCopy(Register Dst, Register Src);
- const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
- return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
- }
+ bool tryEliminateReadAnyLane(MachineInstr &Copy);
+ void tryCombineCopy(MachineInstr &MI);
+ void tryCombineS1AnyExt(MachineInstr &MI);
+};
- void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
- MI.eraseFromParent();
- if (Optional0 && isTriviallyDead(*Optional0, MRI))
- Optional0->eraseFromParent();
- }
+bool AMDGPURegBankLegalizeCombiner::isLaneMask(Register Reg) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == AMDGPU::VCCRegBankID)
+ return true;
- std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
- MachineInstr *MatchMI = MRI.getVRegDef(Src);
- if (MatchMI->getOpcode() != Opcode)
- return {nullptr, Register()};
- return {MatchMI, MatchMI->getOperand(1).getReg()};
- }
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
+}
- void tryCombineCopy(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- // Skip copies of physical registers.
- if (!Dst.isVirtual() || !Src.isVirtual())
- return;
-
- // This is a cross bank copy, sgpr S1 to lane mask.
- //
- // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
- // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
- // ->
- // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
- if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
- assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
- "sgpr S1 must be result of G_TRUNC of sgpr S32");
-
- B.setInstr(MI);
- // Ensure that truncated bits in BoolSrc are 0.
- auto One = B.buildConstant({SgprRB, S32}, 1);
- auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
- B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+std::pair<MachineInstr *, Register>
+AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) {
+ MachineInstr *MatchMI = MRI.getVRegDef(Src);
+ if (MatchMI->getOpcode() != Opcode)
+ return {nullptr, Register()};
+ return {MatchMI, MatchMI->getOperand(1).getReg()};
+}
+
+std::pair<GUnmerge *, int>
+AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {
+ MachineInstr *ReadAnyLane = MRI.getVRegDef(Src);
+ if (ReadAnyLane->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+ return {nullptr, -1};
+
+ Register RALSrc = ReadAnyLane->getOperand(1).getReg();
+ if (auto *UnMerge = getOpcodeDef<GUnmerge>(RALSrc, MRI))
+ return {UnMerge, UnMerge->findRegisterDefOperandIdx(RALSrc, nullptr)};
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ return {nullptr, -1};
+}
+
+Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
+ // Src = G_AMDGPU_READANYLANE RALSrc
+ auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RAL)
+ return RALSrc;
+
+ // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+ // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+ // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+ // Src G_MERGE_VALUES LoSgpr, HiSgpr
+ auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+ if (Merge) {
+ unsigned NumElts = Merge->getNumSources();
+ auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+ if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+ return {};
+
+ // Check if all elements are from same unmerge and there is no shuffling.
+ for (unsigned i = 1; i < NumElts; ++i) {
+ auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+ if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+ return {};
}
+ return Unmerge->getSourceReg();
}
- void tryCombineS1AnyExt(MachineInstr &MI) {
- // %Src:sgpr(S1) = G_TRUNC %TruncSrc
- // %Dst = G_ANYEXT %Src:sgpr(S1)
- // ->
- // %Dst = G_... %TruncSrc
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != S1)
- return;
-
- auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
- if (!Trunc)
- return;
-
- LLT DstTy = MRI.getType(Dst);
- LLT TruncSrcTy = MRI.getType(TruncSrc);
-
- if (DstTy == TruncSrcTy) {
- MRI.replaceRegWith(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
+ // SgprI = G_AMDGPU_READANYLANE VgprI
+ // SgprLarge G_MERGE_VALUES ..., SgprI, ...
+ // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
+ auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+ if (!UnMerge)
+ return {};
+
+ int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+ Merge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+ if (!Merge || UnMerge->getNumDefs() != Merge->getNumSources())
+ return {};
+
+ Register SrcRegIdx = Merge->getSourceReg(Idx);
+ if (MRI.getType(Src) != MRI.getType(SrcRegIdx))
+ return {};
+
+ auto [RALEl, RALElSrc] = tryMatch(SrcRegIdx, AMDGPU::G_AMDGPU_READANYLANE);
+ if (RALEl)
+ return RALElSrc;
+
+ return {};
+}
+
+void AMDGPURegBankLegalizeCombiner::replaceRegWithOrBuildCopy(Register Dst,
+ Register Src) {
+ if (Dst.isVirtual())
+ MRI.replaceRegWith(Dst, Src);
+ else
+ B.buildCopy(Dst, Src);
+}
+
+bool AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(
+ MachineInstr &Copy) {
+ Register Dst = Copy.getOperand(0).getReg();
+ Register Src = Copy.getOperand(1).getReg();
+
+ // Skip non-vgpr Dst
+ if (Dst.isVirtual() ? (MRI.getRegBankOrNull(Dst) != VgprRB)
+ : !TRI.isVGPR(MRI, Dst))
+ return false;
+
+ // Skip physical source registers and source registers with register class
+ if (!Src.isVirtual() || MRI.getRegClassOrNull(Src))
+ return false;
+
+ Register RALDst = Src;
+ MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+ if (SrcMI.getOpcode() == AMDGPU::G_BITCAST)
+ RALDst = SrcMI.getOperand(1).getReg();
+
+ Register RALSrc = getReadAnyLaneSrc(RALDst);
+ if (!RALSrc)
+ return false;
+
+ B.setInstr(Copy);
+ if (SrcMI.getOpcode() != AMDGPU::G_BITCAST) {
+ // Src = READANYLANE RALSrc Src = READANYLANE RALSrc
+ // Dst = Copy Src $Dst = Copy Src
+ // -> ->
+ // Dst = RALSrc $Dst = Copy RALSrc
+ replaceRegWithOrBuildCopy(Dst, RALSrc);
+ } else {
+ // RALDst = READANYLANE RALSrc RALDst = READANYLANE RALSrc
+ // Src = G_BITCAST RALDst Src = G_BITCAST RALDst
+ // Dst = Copy Src Dst = Copy Src
+ // -> ->
+ // NewVgpr = G_BITCAST RALDst NewVgpr = G_BITCAST RALDst
+ // Dst = NewVgpr $Dst = Copy NewVgpr
+ auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+ replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
+ }
+
+ eraseInstr(Copy, MRI);
+ return true;
+}
+
+void AMDGPURegBankLegalizeCombiner::tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ // Skip copies of physical registers.
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return;
+
+ // This is a cross bank copy, sgpr S1 to lane mask.
+ //
+ // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
+ // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
+ // ->
+ // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
+ if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
+ auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
+ assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
+ "sgpr S1 must be result of G_TRUNC of sgpr S32");
B.setInstr(MI);
+ // Ensure that truncated bits in BoolSrc are 0.
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
+ B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
+ eraseInstr(MI, MRI);
+ }
+}
- if (DstTy == S32 && TruncSrcTy == S64) {
- auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
- MRI.replaceRegWith(Dst, Unmerge.getReg(0));
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+void AMDGPURegBankLegalizeCombiner::tryCombineS1AnyExt(MachineInstr &MI) {
+ // %Src:sgpr(S1) = G_TRUNC %TruncSrc
+ // %Dst = G_ANYEXT %Src:sgpr(S1)
+ // ->
+ // %Dst = G_... %TruncSrc
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (MRI.getType(Src) != S1)
+ return;
+
+ auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
+ if (!Trunc)
+ return;
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT TruncSrcTy = MRI.getType(TruncSrc);
+
+ if (DstTy == TruncSrcTy) {
+ MRI.replaceRegWith(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S64 && TruncSrcTy == S32) {
- B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
- {TruncSrc, B.buildUndef({SgprRB, S32})});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ B.setInstr(MI);
- if (DstTy == S32 && TruncSrcTy == S16) {
- B.buildAnyExt(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S32 && TruncSrcTy == S64) {
+ auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
+ MRI.replaceRegWith(Dst, Unmerge.getReg(0));
+ eraseInstr(MI, MRI);
+ return;
+ }
- if (DstTy == S16 && TruncSrcTy == S32) {
- B.buildTrunc(Dst, TruncSrc);
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
+ if (DstTy == S64 && TruncSrcTy == S32) {
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
+ {TruncSrc, B.buildUndef({SgprRB, S32})});
+ eraseInstr(MI, MRI);
+ return;
+ }
- llvm_unreachable("missing anyext + trunc combine");
+ if (DstTy == S32 && TruncSrcTy == S16) {
+ B.buildAnyExt(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
}
-};
+
+ if (DstTy == S16 && TruncSrcTy == S32) {
+ B.buildTrunc(Dst, TruncSrc);
+ eraseInstr(MI, MRI);
+ return;
+ }
+
+ llvm_unreachable("missing anyext + trunc combine");
+}
// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
index 51b473f2d8994..5f72d3e2ab161 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
@@ -20,8 +20,6 @@ define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile float, ptr addrspace(1) %ptr
ret float %load
@@ -33,8 +31,6 @@ define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, s0
; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
@@ -49,8 +45,6 @@ define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
%bitcast = bitcast <2 x i16> %load to float
@@ -63,10 +57,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile i64, ptr addrspace(1) %ptr0
@@ -85,10 +75,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
@@ -109,9 +95,7 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
+; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
%extracted = extractelement <2 x i32> %load, i32 1
@@ -125,8 +109,7 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr a
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <2 x float>, ptr addrspace(1) %ptr0
%extracted = extractelement <2 x float> %load, i32 1
@@ -139,8 +122,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
@@ -156,8 +137,6 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vg
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
%extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
index 673cf1696e5e0..dd7a3ebeab471 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
@@ -46,8 +46,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_READANYLANE]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -74,11 +73,9 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[AMDGPU_READANYLANE]](<2 x s16>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[BITCAST]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY5]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -106,8 +103,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[AMDGPU_READANYLANE]](<2 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%0:sgpr(s32) = COPY $sgpr0
@@ -136,13 +132,8 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_R...
[truncated]
|
: !TRI.isVGPR(MRI, Dst)) | ||
return false; | ||
|
||
// Skip physical source registers and source registers with register class |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This shouldn't happen?
It is quite common
Physical source registers come from lowering formal arguments for a function, e.g. %0:_(s32) = COPY $vgpr0.
Source registers with a register class come from amdgpu-regbankselect (the pass) isolating registers that have register classes from being used directly in G_ instructions, e.g. %1:sgpr(s32) = COPY %0:sreg_32(s32);
the register class comes from selecting an instruction and constraining the register classes of all its operands.
ad17135
to
7c2efb4
Compare
57e9aec
to
7f9708f
Compare
7c2efb4
to
8711400
Compare
std::pair<MachineInstr *, Register> | ||
AMDGPURegBankLegalizeCombiner::tryMatch(Register Src, unsigned Opcode) { | ||
MachineInstr *MatchMI = MRI.getVRegDef(Src); | ||
if (MatchMI->getOpcode() != Opcode) | ||
return {nullptr, Register()}; | ||
return {MatchMI, MatchMI->getOperand(1).getReg()}; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a better name for this than tryMatch
? Come to think of it, can the generic matching infrastructure be used for this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could use mi_match; this version is shorter because we use auto instead of declaring what we want to capture. To me at least, this has nicer formatting.
How about matchInstAndGetSrc?
auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); | ||
assert(Trunc && MRI.getType(TruncS32Src) == S32 && | ||
"sgpr S1 must be result of G_TRUNC of sgpr S32"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this really guaranteed? Couldn't the source program have an LLVM IR trunc
from i64 to i1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are correct, it is not guaranteed. I was trying to construct a case with a trunc from i64 to i1 but have not found one yet. Here is one test that actually hits that assert.
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a) {
entry:
br label %A
A:
%counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
%a.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %a, i32 %counter
%a.val = load i32, ptr addrspace(1) %a.plus.counter
%a.cond = icmp eq i32 %a.val, 0
br i1 %a.cond, label %exit, label %loop.body
loop.body:
%x.plus.counter = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %counter
%x.val = load i32, ptr addrspace(1) %x.plus.counter
%x.val.plus.1 = add i32 %x.val, 1
store i32 %x.val.plus.1, ptr addrspace(1) %x.plus.counter
%counter.plus.1 = add i32 %counter, 1
%x.cond = trunc i32 %counter to i1
br i1 %x.cond, label %exit, label %A
exit:
ret void
}
The original idea behind that assert was that the trunc is created by regbanklegalize itself: a legal i1 is used by something that is lowered by divergence lowering, and a uniform i1 is lowered as an sgpr S32 that is then truncated to S1.
As this pass is still partially implemented can we deal with this in later patch?
8711400
to
046418f
Compare
No description provided.