[MLIR][NVGPU] Use gpu.dynamic_shared_memory in tests #133051
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Author: Guray Ozen (grypp)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/133051.diff

3 Files Affected:
- mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
- mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
- mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
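In substance, the tests drop the zero-sized-global idiom for modeling dynamic shared memory and use the dedicated `gpu.dynamic_shared_memory` op instead. A minimal before/after sketch, distilled from the diff below (`%c0` and `@dynamicShmem` are as defined in the tests):

```mlir
// Before: dynamic shared memory modeled as a zero-sized global that is
// reinterpreted (and later subviewed) at fixed element offsets.
%dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
%lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0],
    sizes: [2, 128, 64], strides: [8192, 64, 1]
    : memref<0xf16, 3> to memref<2x128x64xf16, 3>

// After: the dedicated op yields a byte buffer in workgroup memory;
// tiles are carved out with memref.view at explicit byte offsets.
%dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
%lhsSlice = memref.view %dynsmem[%c0][]
    : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
```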
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index 1c5cf73db6eba..07324c603012a 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -141,14 +141,18 @@ func.func @main() {
%c16 = arith.constant 16 : index
%c4096 = arith.constant 4096 : index
%c8 = arith.constant 8 : index
- %txcount = arith.constant 32768 : index
+ %txcount = arith.constant 32768 : index
+ %c24576 = arith.constant 24576 : index
+ %c16384 = arith.constant 16384 : index
+ %c49152 = arith.constant 49152 : index
+ %c57344 = arith.constant 57344 : index
%tidx = gpu.thread_id x
%dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
%lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [2, 128, 64], strides: [8192, 64, 1] : memref<0xf16, 3> to memref<2x128x64xf16, 3>
%rhsShmem2 = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [4, 64, 128], strides: [8192,128,1] : memref<0xf16, 3> to memref<4x64x128xf16,3>
%rhsShmem = memref.subview %rhsShmem2[2, 0, 0][2, 64, 128][1, 1, 1] : memref<4x64x128xf16,3> to memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3>
-
+ %dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
// Step 1. [GPU] Create Async Transactional Barriers (mbarriers)
%barrier = nvgpu.mbarrier.create -> !barrierType
%cnd = arith.cmpi eq, %tidx, %c0 : index
@@ -161,31 +165,29 @@ func.func @main() {
nvgpu.tma.prefetch.descriptor %descA : !lhsTensorMap
nvgpu.tma.prefetch.descriptor %descB : !rhsTensorMap
- // Step 4.1 [GPU] TMA Load Pipeline 1
+ // Step 4.1 [GPU] TMA Load Pipeline 1
scf.if %cnd {
%pipe = arith.constant 0 : index
- %lhsSlice = memref.subview %lhsShmem[0, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, 3>
- %rhsSlice = memref.subview %rhsShmem[0, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: 16384>, 3>
- %halfFirst = memref.subview %rhsSlice[0, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 16384>, 3> to memref<64x64xf16, strided<[128, 1], offset: 16384>, 3>
- %halfSecond = memref.subview %rhsSlice[32, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 16384>, 3> to memref<64x64xf16, strided<[128, 1], offset: 20480>, 3>
+ %lhsSlice = memref.view %dynsmem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+ %halfFirst = memref.view %dynsmem[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+ %halfSecond = memref.view %dynsmem[%c24576][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %barrier[%pipe], %txcount : !barrierType
%dim = arith.muli %pipe, %c64 : index
- nvgpu.tma.async.load %descA[%dim, %c0], %barrier[%pipe] to %lhsSlice : !lhsTensorMap, !barrierType -> memref<128x64xf16, 3>
- nvgpu.tma.async.load %descB[%c0, %dim], %barrier[%pipe] to %halfFirst : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 16384>, 3>
- nvgpu.tma.async.load %descB[%c64, %dim], %barrier[%pipe] to %halfSecond : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 20480>, 3>
+ nvgpu.tma.async.load %descA[%dim, %c0], %barrier[%pipe] to %lhsSlice : !lhsTensorMap, !barrierType -> memref<128x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c0, %dim], %barrier[%pipe] to %halfFirst : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c64, %dim], %barrier[%pipe] to %halfSecond : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
}
// Step 4.2 [GPU] TMA Load Pipeline 2
scf.if %cnd {
%pipe = arith.constant 1 : index
- %lhsSlice = memref.subview %lhsShmem[1, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: 8192>, 3>
- %rhsSlice = memref.subview %rhsShmem[1, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: 24576>, 3>
- %halfFirst = memref.subview %rhsSlice[0, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 24576>, 3> to memref<64x64xf16, strided<[128, 1], offset: 24576>, 3>
- %halfSecond = memref.subview %rhsSlice[32, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 24576>, 3> to memref<64x64xf16, strided<[128, 1], offset: 28672>, 3>
+ %lhsSlice = memref.view %dynsmem[%c32768][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+ %halfFirst = memref.view %dynsmem[%c49152][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+ %halfSecond = memref.view %dynsmem[%c57344][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %barrier[%pipe], %txcount : !barrierType
%dim = arith.muli %pipe, %c64 : index
- nvgpu.tma.async.load %descA[%dim, %c0], %barrier[%pipe] to %lhsSlice : !lhsTensorMap, !barrierType -> memref<128x64xf16, strided<[64, 1], offset: 8192>, 3>
- nvgpu.tma.async.load %descB[%c0, %dim], %barrier[%pipe] to %halfFirst : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 24576>, 3>
- nvgpu.tma.async.load %descB[%c64, %dim], %barrier[%pipe] to %halfSecond : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 28672>, 3>
+ nvgpu.tma.async.load %descA[%dim, %c0], %barrier[%pipe] to %lhsSlice : !lhsTensorMap, !barrierType -> memref<128x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c0, %dim], %barrier[%pipe] to %halfFirst : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c64, %dim], %barrier[%pipe] to %halfSecond : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
}
// Step 5. [GPU] Initiliaze accumulator matrix
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index 6e8ef2b75eae6..48ddef41e286a 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -142,13 +142,17 @@ func.func @main() {
%c4096 = arith.constant 4096 : index
%c8 = arith.constant 8 : index
%txcount = arith.constant 32768 : index
+ %c24576 = arith.constant 24576 : index
+ %c16384 = arith.constant 16384 : index
+ %c49152 = arith.constant 49152 : index
+ %c57344 = arith.constant 57344 : index
%tidx = gpu.thread_id x
%dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
%lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [2, 128, 64], strides: [8192, 64, 1] : memref<0xf16, 3> to memref<2x128x64xf16, 3>
%rhsShmem2 = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [4, 64, 128], strides: [8192,128,1] : memref<0xf16, 3> to memref<4x64x128xf16,3>
%rhsShmem = memref.subview %rhsShmem2[2, 0, 0][2, 64, 128][1, 1, 1] : memref<4x64x128xf16,3> to memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3>
-
+ %dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
// Step 1. [GPU] Create Async Transactional Barriers (mbarriers)
%barrier = nvgpu.mbarrier.create -> !barrierType
@@ -175,28 +179,25 @@ func.func @main() {
// Step 4.2 [GPU] TMA Load Pipeline 1 (predicated)
%pipe1 = arith.constant 0 : index
- %p1lhsSlice = memref.subview %lhsShmem[0, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, 3>
- %p1rhsSlice = memref.subview %rhsShmem[0, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: 16384>, 3>
- %p1halfFirst = memref.subview %p1rhsSlice[0, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 16384>, 3> to memref<64x64xf16, strided<[128, 1], offset: 16384>, 3>
- %p1halfSecond = memref.subview %p1rhsSlice[32, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 16384>, 3> to memref<64x64xf16, strided<[128, 1], offset: 20480>, 3>
+ %lhsSlice1 = memref.view %dynsmem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+ %halfFirst1 = memref.view %dynsmem[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+ %halfSecond1 = memref.view %dynsmem[%c24576][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %barrier[%pipe1], %txcount, predicate = %cnd : !barrierType
%dim1 = arith.muli %pipe1, %c64 : index
- nvgpu.tma.async.load %descA[%dim1, %c0], %barrier[%pipe1] to %p1lhsSlice, predicate = %cnd : !lhsTensorMap, !barrierType -> memref<128x64xf16, 3>
- nvgpu.tma.async.load %descB[%c0, %dim1], %barrier[%pipe1] to %p1halfFirst, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 16384>, 3>
- nvgpu.tma.async.load %descB[%c64, %dim1], %barrier[%pipe1] to %p1halfSecond, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 20480>, 3>
+ nvgpu.tma.async.load %descA[%dim1, %c0], %barrier[%pipe1] to %lhsSlice1, predicate = %cnd : !lhsTensorMap, !barrierType -> memref<128x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c0, %dim1], %barrier[%pipe1] to %halfFirst1, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c64, %dim1], %barrier[%pipe1] to %halfSecond1, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
// Step 5. [GPU] TMA Load Pipeline 2 (predicated)
%pipe2 = arith.constant 1 : index
- %p2lhsSlice = memref.subview %lhsShmem[1, 0, 0][1, 128, 64][1, 1, 1] : memref<2x128x64xf16, 3> to memref<128x64xf16, strided<[64, 1], offset: 8192>, 3>
- %p2rhsSlice = memref.subview %rhsShmem[1, 0, 0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16, strided<[8192, 128, 1], offset: 16384>, 3> to memref<64x128xf16, strided<[128, 1], offset: 24576>, 3>
- %p2halfFirst = memref.subview %p2rhsSlice[0, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 24576>, 3> to memref<64x64xf16, strided<[128, 1], offset: 24576>, 3>
- %p2halfSecond = memref.subview %p2rhsSlice[32, 0][64, 64][1, 1] : memref<64x128xf16, strided<[128, 1], offset: 24576>, 3> to memref<64x64xf16, strided<[128, 1], offset: 28672>, 3>
+ %lhsSlice2 = memref.view %dynsmem[%c32768][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+ %halfFirst2 = memref.view %dynsmem[%c49152][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+ %halfSecond2 = memref.view %dynsmem[%c57344][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %barrier[%pipe2], %txcount, predicate = %cnd : !barrierType
%dim2 = arith.muli %pipe2, %c64 : index
- nvgpu.tma.async.load %descA[%dim2, %c0], %barrier[%pipe2] to %p2lhsSlice, predicate = %cnd : !lhsTensorMap, !barrierType -> memref<128x64xf16, strided<[64, 1], offset: 8192>, 3>
- nvgpu.tma.async.load %descB[%c0, %dim2], %barrier[%pipe2] to %p2halfFirst, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 24576>, 3>
- nvgpu.tma.async.load %descB[%c64, %dim2], %barrier[%pipe2] to %p2halfSecond, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 28672>, 3>
-
+ nvgpu.tma.async.load %descA[%dim2, %c0], %barrier[%pipe2] to %lhsSlice2, predicate = %cnd : !lhsTensorMap, !barrierType -> memref<128x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c0, %dim2], %barrier[%pipe2] to %halfFirst2, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %descB[%c64, %dim2], %barrier[%pipe2] to %halfSecond2, predicate = %cnd : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
// Step 6. [GPU] Initiliaze accumulator matrix
%14 = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index 462040cd04a3d..00c5bf2e49cdb 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -39,8 +39,6 @@
module @mymod {
func.func private @printMemrefF32(memref<*xf32>)
- memref.global "private" @bufferLhsGlobal : !shmemlhs
- memref.global "private" @bufferRhsGlobal : !shmemrhs
llvm.func @printf(!llvm.ptr, ...) -> i32
func.func @main() {
%c32768 = arith.constant 32768 : index
@@ -49,7 +47,7 @@ module @mymod {
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
- %c0 = arith.constant 0 : index
+ %c01 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c8 = arith.constant 8 : index
@@ -58,8 +56,8 @@ module @mymod {
%rhs = memref.alloc() : !rhs
%lhs32 = memref.alloc() : memref<128x64xf32>
%rhs32 = memref.alloc() : memref<64x128xf32>
- scf.for %i = %c0 to %c64 step %c1 {
- scf.for %j = %c0 to %c128 step %c1 {
+ scf.for %i = %c01 to %c64 step %c1 {
+ scf.for %j = %c01 to %c128 step %c1 {
%v0 = arith.muli %i, %c128 : index
%v00 = arith.addi %v0, %j : index
%v01 = arith.divui %v00, %c8 : index
@@ -92,15 +90,21 @@ module @mymod {
%d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
%d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap
+ %c32768_i32 = arith.constant 32768 : i32
// Step 4. Launch a GPU kernel
- gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
+ gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) dynamic_shared_memory_size %c32768_i32 {
%5 = gpu.block_dim x
%6 = gpu.thread_id x
- %lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs
- %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
- %rhsShmem1 = memref.subview %rhsShmem[0, 0][64, 64][1, 1] : !shmemrhs to memref<64x64xf16, strided<[128, 1]>, 3>
- %rhsShmem2 = memref.subview %rhsShmem[32, 0][64, 64][1, 1] : !shmemrhs to memref<64x64xf16, strided<[128, 1], offset: 4096>, 3>
+ %c0 = arith.constant 0 : index
+ %txcount = arith.constant 32768 : index
+ %c24576 = arith.constant 24576 : index
+ %c16384 = arith.constant 16384 : index
+ %dynsmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+ %lhsSlice = memref.view %dynsmem[%c01][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+ %rhsSlice = memref.view %dynsmem[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
+ %halfFirst = memref.view %dynsmem[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+ %halfSecond = memref.view %dynsmem[%c24576][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
// Step 5. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
@@ -110,9 +114,9 @@ module @mymod {
// Step 6. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A", %c32768 : index
- nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
- nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem1 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1]>, 3>
- nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 4096>, 3>
+ nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsSlice : !lhsTensorMap, !barrierType -> memref<128x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %halfFirst : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
+ nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %halfSecond : !rhsTensorMap, !barrierType -> memref<64x64xf16, #gpu.address_space<workgroup>>
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
} else {
nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
@@ -127,7 +131,7 @@ module @mymod {
gpu.printf "===--- Matrix B ---=== %d \n", %c-1_i32 : i32
scf.for %ii = %c0 to %c64 step %c1 {
scf.for %j = %c0 to %c128 step %c1 {
- %lhs0 = memref.load %rhsShmem[%ii, %j] : !shmemrhs
+ %lhs0 = memref.load %rhsSlice[%ii, %j] : memref<64x128xf16, #gpu.address_space<workgroup>>
%lhs032 = arith.extf %lhs0: f16 to f32
gpu.printf "%.0f, ", %lhs032 : f32
}
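For reference, the byte offsets fed to `memref.view` in the gemm tests follow directly from the f16 (2 bytes per element) tile sizes; a quick check of the constants introduced above:

```mlir
// Pipeline 1: lhsSlice   128x64xf16 = 16384 B at offset     0
//             halfFirst   64x64xf16 =  8192 B at offset 16384
//             halfSecond  64x64xf16 =  8192 B at offset 24576
// Pipeline 2: same layout shifted by 32768 B (= 16384 + 8192 + 8192,
//             one full stage): offsets 32768, 49152, 57344.
```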
…)" This reverts commit 15f5a7a.
) Reverts #133051 due to failing integration tests
…ests" (#133103) Reverts llvm/llvm-project#133051 due to failing integration tests
* Improve the verifier of `memref.subview` to detect out-of-bounds extractions.
* Improve the documentation of `memref.subview` to make clear that out-of-bounds extractions are not allowed. Rewrite examples to use the new `strided<>` notation instead of `affine_map` layout maps. Also remove all unrelated operations (`memref.alloc`) from the examples.
* Fix various test cases where `memref.subview` ops ran out-of-bounds.
* Update canonicalization patterns to ensure that they do not fold IR if doing so would generate IR that no longer verifies.

Related discussion on Discourse: https://discourse.llvm.org/t/out-of-bounds-semantics-of-memref-subview/85293

This is a re-upload of #131876, which was reverted due to failing GPU tests. Those tests were faulty and were fixed in #133051.
The `memref.subview` ops in the test case were incorrect: they extracted out-of-bounds.
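Concretely, the removed `%halfSecond` subview took offsets [32, 0] and sizes [64, 64] from a 64x128 source, and 32 + 64 = 96 overruns the 64-row bound of dim 0. A minimal, hypothetical reproduction of the pattern the improved verifier now rejects:

```mlir
func.func @oob_subview(%src: memref<64x128xf16>) {
  // Out-of-bounds along dim 0: offset 32 + size 64 = 96 > 64.
  %oob = memref.subview %src[32, 0][64, 64][1, 1]
      : memref<64x128xf16> to memref<64x64xf16, strided<[128, 1], offset: 4096>>
  return
}
```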