-
Notifications
You must be signed in to change notification settings - Fork 14.4k
AMDGPU: Implement intrinsic/builtins for gfx1250 load transpose instructions #146289
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang
Author: Changpeng Fang (changpeng)
Changes: Patch is 37.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146289.diff
11 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1d1f5a4ee3f9f..4e28f3bb7ef81 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
+
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 982e5cd37ffd1..f09b3b92c4ea0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
@@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_global_load_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_global_load_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_load_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_ds_load_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_load_tr8_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
+ IID = Intrinsic::amdgcn_ds_load_tr16_b128;
+ break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
break;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
new file mode 100644
index 0000000000000..1e3a88a41f90e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
@@ -0,0 +1,130 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v3i __attribute__((ext_vector_type(3)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6f974c97361de..ce37702b91486 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_ds_load_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr16_b128 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b20760c356263..6874657a4ffe7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 3b62dcf3c92cd..1f6002a3c6a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 445ba9a26d336..f824253ce0f35 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in {
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
-} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
-} // let WaveSizePredicate = isWave32, mayStore = 0
+} // End WaveSizePredicate = isWave32, mayStore = 0
-} // let SubtargetPredicate = isGFX1250Plus
+} // End SubtargetPredicate = isGFX1250Plus
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
@@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa
(inst $ptr, Offset:$offset, (i1 0))
>;
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
+let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+ def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>;
+ def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>;
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
+
+ def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>;
+ foreach vt = [v8i16, v8f16, v8bf16] in
+ def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>;
+} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus
+
let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c4db88b6e5105..dc6dbcef1f033 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>;
}
+let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8d7dcf8c4a064..bb1de58e04fbc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
@@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 1028cc9ebb342..bd7464577b7db 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,70 @@ bb:
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr4_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr6_b96_v3i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+ store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr8_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ r...
[truncated]
|
shiltian
approved these changes
Jun 29, 2025
rlavaee
pushed a commit
to rlavaee/llvm-project
that referenced
this pull request
Jul 1, 2025
rlavaee
pushed a commit
to rlavaee/llvm-project
that referenced
this pull request
Jul 1, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:codegen
IR generation bugs: mangling, exceptions, etc.
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
llvm:analysis
Includes value tracking, cost tables and constant folding
llvm:ir
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.