Skip to content

AMDGPU: Implement intrinsic/builtins for gfx1250 load transpose instructions #146289

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 29, 2025

Conversation

changpeng
Copy link
Contributor

No description provided.

@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen IR generation bugs: mangling, exceptions, etc. llvm:ir llvm:analysis Includes value tracking, cost tables and constant folding labels Jun 29, 2025
@changpeng changpeng requested a review from rampitec June 29, 2025 19:42
@llvmbot
Copy link
Member

llvmbot commented Jun 29, 2025

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-clang

Author: Changpeng Fang (changpeng)

Changes

Patch is 37.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146289.diff

11 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+13)
  • (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+36)
  • (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl (+130)
  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+6)
  • (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+14-3)
  • (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+5)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+12)
  • (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+72)
  • (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll (+322)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1d1f5a4ee3f9f..4e28f3bb7ef81 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
 // GFX1250+ only builtins.
 //===----------------------------------------------------------------------===//
 
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
+
 TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
 
 TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 982e5cd37ffd1..f09b3b92c4ea0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
   case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
   case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
@@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
       IID = Intrinsic::amdgcn_global_load_tr_b64;
       break;
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
     case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
       IID = Intrinsic::amdgcn_global_load_tr_b128;
       break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+      IID = Intrinsic::amdgcn_global_load_tr4_b64;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+      IID = Intrinsic::amdgcn_global_load_tr6_b96;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+      IID = Intrinsic::amdgcn_ds_load_tr4_b64;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+      IID = Intrinsic::amdgcn_ds_load_tr6_b96;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+      IID = Intrinsic::amdgcn_ds_load_tr8_b64;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+    case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
+      IID = Intrinsic::amdgcn_ds_load_tr16_b128;
+      break;
     case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
       IID = Intrinsic::amdgcn_ds_read_tr4_b64;
       break;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
new file mode 100644
index 0000000000000..1e3a88a41f90e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
@@ -0,0 +1,130 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef int    v3i   __attribute__((ext_vector_type(3)));
+typedef int    v4i   __attribute__((ext_vector_type(4)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y  __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr)
+{
+  return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr)
+{
+  return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr)
+{
+  return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr)
+{
+  return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr)
+{
+  return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr)
+{
+  return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr)
+{
+  return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6f974c97361de..ce37702b91486 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
 
 def int_amdgcn_global_load_tr_b64  : AMDGPULoadIntrinsic<global_ptr_ty>;
 def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_ds_load_tr8_b64     : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr16_b128   : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr4_b64     : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr6_b96     : AMDGPULoadIntrinsic<local_ptr_ty>;
 def int_amdgcn_ds_read_tr4_b64     : AMDGPULoadIntrinsic<local_ptr_ty>;
 def int_amdgcn_ds_read_tr6_b96     : AMDGPULoadIntrinsic<local_ptr_ty>;
 def int_amdgcn_ds_read_tr8_b64     : AMDGPULoadIntrinsic<local_ptr_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b20760c356263..6874657a4ffe7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
     case Intrinsic::amdgcn_global_load_tr_b64:
     case Intrinsic::amdgcn_global_load_tr_b128:
+    case Intrinsic::amdgcn_global_load_tr4_b64:
+    case Intrinsic::amdgcn_global_load_tr6_b96:
+    case Intrinsic::amdgcn_ds_load_tr8_b64:
+    case Intrinsic::amdgcn_ds_load_tr16_b128:
+    case Intrinsic::amdgcn_ds_load_tr4_b64:
+    case Intrinsic::amdgcn_ds_load_tr6_b96:
     case Intrinsic::amdgcn_ds_read_tr4_b64:
     case Intrinsic::amdgcn_ds_read_tr6_b96:
     case Intrinsic::amdgcn_ds_read_tr8_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 3b62dcf3c92cd..1f6002a3c6a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>;
 
 def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
 def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>;
 
 def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
 def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 445ba9a26d336..f824253ce0f35 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in {
 let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
 defm DS_LOAD_TR4_B64   : DS_1A_RET_NoM0<"ds_load_tr4_b64",   VReg_64>;
 defm DS_LOAD_TR6_B96   : DS_1A_RET_NoM0<"ds_load_tr6_b96",   VReg_96>;
-} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
 defm DS_LOAD_TR8_B64   : DS_1A_RET_NoM0<"ds_load_tr8_b64",   VReg_64>;
 defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
-} // let WaveSizePredicate = isWave32, mayStore = 0
+} // End WaveSizePredicate = isWave32, mayStore = 0
 
-} // let SubtargetPredicate = isGFX1250Plus
+} // End SubtargetPredicate = isGFX1250Plus
 
 let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
   defm DS_READ_B64_TR_B4  : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
@@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa
   (inst $ptr, Offset:$offset, (i1 0))
 >;
 
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
+let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+  def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>;
+  def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>;
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
+
+  def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>;
+  foreach vt = [v8i16, v8f16, v8bf16] in
+    def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>;
+} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus
+
 let SubtargetPredicate = HasGFX950Insts in {
   def : DSLoadTrPat <DS_READ_B64_TR_B4,  v2i32, int_amdgcn_ds_read_tr4_b64>;
   def : DSLoadTrPat <DS_READ_B64_TR_B8,  v2i32, int_amdgcn_ds_read_tr8_b64>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c4db88b6e5105..dc6dbcef1f033 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in {
     defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>;
 }
 
+let WaveSizePredicate = isWave32,  OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>;
+  defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
+}
+
 let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8d7dcf8c4a064..bb1de58e04fbc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                   MachineMemOperand::MOVolatile;
     return true;
   }
+  case Intrinsic::amdgcn_ds_load_tr6_b96:
+  case Intrinsic::amdgcn_ds_load_tr4_b64:
+  case Intrinsic::amdgcn_ds_load_tr8_b64:
+  case Intrinsic::amdgcn_ds_load_tr16_b128:
+  case Intrinsic::amdgcn_global_load_tr6_b96:
+  case Intrinsic::amdgcn_global_load_tr4_b64:
   case Intrinsic::amdgcn_global_load_tr_b64:
   case Intrinsic::amdgcn_global_load_tr_b128:
   case Intrinsic::amdgcn_ds_read_tr4_b64:
@@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_atomic_cond_sub_u32:
   case Intrinsic::amdgcn_ds_append:
   case Intrinsic::amdgcn_ds_consume:
+  case Intrinsic::amdgcn_ds_load_tr8_b64:
+  case Intrinsic::amdgcn_ds_load_tr16_b128:
+  case Intrinsic::amdgcn_ds_load_tr4_b64:
+  case Intrinsic::amdgcn_ds_load_tr6_b96:
   case Intrinsic::amdgcn_ds_read_tr4_b64:
   case Intrinsic::amdgcn_ds_read_tr6_b96:
   case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
   case Intrinsic::amdgcn_global_load_tr_b64:
   case Intrinsic::amdgcn_global_load_tr_b128:
+  case Intrinsic::amdgcn_global_load_tr4_b64:
+  case Intrinsic::amdgcn_global_load_tr6_b96:
     Ptr = II->getArgOperand(0);
     break;
   case Intrinsic::amdgcn_load_to_lds:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 1028cc9ebb342..bd7464577b7db 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,70 @@ bb:
   ret void
 }
 
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr4_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+  store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr6_b96_v3i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+  store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr8_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+  %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+  store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+  r...
[truncated]

@changpeng changpeng merged commit 1f5f381 into llvm:main Jun 29, 2025
14 checks passed
@changpeng changpeng deleted the tr1 branch June 29, 2025 21:33
rlavaee pushed a commit to rlavaee/llvm-project that referenced this pull request Jul 1, 2025
rlavaee pushed a commit to rlavaee/llvm-project that referenced this pull request Jul 1, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU clang:codegen IR generation bugs: mangling, exceptions, etc. clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category llvm:analysis Includes value tracking, cost tables and constant folding llvm:ir
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants