Incorrect GPU grid dimension

I tiled an n x m matrix into 32 x 32 tiles, with each tile mapped to a GPU block, and each element in a tile mapped to a GPU thread. There should be `ceil(n / 32) * ceil(m / 32) = ((n + 31) / 32) * ((m + 31) / 32)` blocks (using integer division). However, TACO generates `(n + 31) / 32 * (m + 31) / 32` blocks, which, because `*` and `/` associate left to right, is evaluated as `(((n + 31) / 32) * (m + 31)) / 32`. Please note the parentheses: `((1024 + 31) / 32) * ((1024 + 31) / 32) = 1024`, while `(1024 + 31) / 32 * (1024 + 31) / 32 = 1055`.

The following code is generated using the latest commit (ed3488f5c1c4c8c3b6a7b1dfaa613a2443853eb5).

```cuda
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)

// Device kernel for C(i,j) += sum over k of A(i,k) * B(k,j), with all three
// value arrays indexed as row-major dense buffers (row * row_length + column).
//
// Thread mapping as written: blockIdx.x is the fused tile index `f`, and
// threadIdx.x (0..1023) is the fused in-tile index `g`, decoded into a
// 32 x 32 (i1, j1) offset. Each thread produces at most one element of C.
//
// NOTE(review): the decoding of `f` into (i0, j0) below has the same
// missing-parentheses pattern as the grid size in the host launch; see the
// comments on those two lines.
__global__
void computeDeviceKernel0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ B, taco_tensor_t * __restrict__ C){
  // Unpack the dimensions and value pointers that the loop nest reads.
  int A1_dimension = (int)(A->dimensions[0]);
  int A2_dimension = (int)(A->dimensions[1]);
  double* __restrict__ A_vals = (double*)(A->vals);
  int B1_dimension = (int)(B->dimensions[0]);
  int B2_dimension = (int)(B->dimensions[1]);
  double* __restrict__ B_vals = (double*)(B->vals);
  int C2_dimension = (int)(C->dimensions[1]);
  double* __restrict__ C_vals = (double*)(C->vals);

  // f: fused block-level index; g: fused thread-level index within a tile.
  int32_t f = blockIdx.x;
  int32_t g = (threadIdx.x % (1024));
  // Threads beyond the 1024 tile elements have no work.
  if (threadIdx.x >= 1024) {
    return;
  }

  // NOTE(review): `/` and `%` share precedence and associate left to right,
  // so these evaluate as (f / (B2_dimension + 31)) / 32 and
  // (f % (B2_dimension + 31)) / 32. Given the fuse(i0,j0,f) schedule, the
  // intended divisor is presumably the tile count ((B2_dimension + 31) / 32)
  // -- confirm against the code generator.
  int32_t i0 = f / (B2_dimension + 31) / 32;
  int32_t j0 = f % (B2_dimension + 31) / 32;
  // Row of C handled by this thread: tile row * 32 + row inside the tile.
  int32_t i1 = g / 32;
  int32_t i = i0 * 32 + i1;
  // Guard for the tail tile when the row count is not a multiple of 32.
  if (i >= A1_dimension)
    return;

  // Column of C handled by this thread: tile column * 32 + column in tile.
  int32_t j1 = g % 32;
  int32_t j = j0 * 32 + j1;
  // Guard for the tail tile when the column count is not a multiple of 32.
  if (j >= B2_dimension)
    return;

  // Accumulate the dot product of row i of A and column j of B in a local
  // scalar, then add it into C with a single read-modify-write.
  int32_t jC = i * C2_dimension + j;
  double tkC_val = 0.0;
  for (int32_t k = 0; k < B1_dimension; k++) {
    int32_t kA = i * A2_dimension + k;
    int32_t jB = k * B2_dimension + j;
    tkC_val = tkC_val + A_vals[kA] * B_vals[jB];
  }
  C_vals[jC] = C_vals[jC] + tkC_val;
}

// Host entry point: zeroes every element of C, launches computeDeviceKernel0
// with 1024 threads per block, waits for the device to finish, and returns 0.
//
// Parameters:
//   C - output tensor; its vals buffer is cleared here, then accumulated into
//       by the kernel.
//   A - left operand; only dimensions[0] is read in this function.
//   B - right operand; only dimensions[1] is read in this function.
// Returns: always 0. The result of cudaDeviceSynchronize() is discarded and
// the launch is not followed by an error query, so failures are not reported.
int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) {
  int C1_dimension = (int)(C->dimensions[0]);
  int C2_dimension = (int)(C->dimensions[1]);
  double* __restrict__ C_vals = (double*)(C->vals);
  int A1_dimension = (int)(A->dimensions[0]);
  int B2_dimension = (int)(B->dimensions[1]);

  // Clear the output from host code before the kernel accumulates into it.
  // NOTE(review): this dereferences C->vals on the host and the kernel
  // dereferences it on the device -- presumably the buffer is host- and
  // device-accessible (e.g. managed memory); confirm with the allocator.
  for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) {
    C_vals[pC] = 0.0;
  }

  // Grid size as emitted: `*` and `/` associate left to right, so this is
  // (((A1_dimension + 31) / 32) * (B2_dimension + 31)) / 32, not the product
  // of the two tile counts ((A1_dimension + 31) / 32) * ((B2_dimension + 31) / 32).
  // For 1024 x 1024 that is 1055 blocks instead of 1024.
  computeDeviceKernel0<<<(A1_dimension + 31) / 32 * (B2_dimension + 31) / 32, 1024>>>(A, B, C);  // <----- LOOK AT HERE
  // Block until the kernel has completed so C is ready when this returns.
  cudaDeviceSynchronize();
  return 0;
}
```

Using the following command:

```sh
taco \
    "C(i, j) = A(i, k) * B(k, j)" \
    -s="split(i,i0,i1,32)" \
    -s="split(j,j0,j1,32)" \
    -s="reorder(i0,j0,i1,j1,k)" \
    -s="fuse(i0,j0,f)" \
    -s="fuse(i1,j1,g)" \
    -s="parallelize(f,GPUBlock,NoRaces)" \
    -s="parallelize(g,GPUThread,NoRaces)"
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Incorrect GPU grid dimension #338

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Incorrect GPU grid dimension #338

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions