Skip to content

Incorrect GPU grid dimension #338

@roastduck

Description

@roastduck

I tiled a n x m matrix into 32 x 32 tiles, with each tile mapped to a GPU block, and each element in a tile mapped to a GPU thread. There should be ceil(n / 32) * ceil(m / 32) = ((n + 31) / 32) * (m + 31) / 32)) blocks. However, TACO generates (n + 31) / 32) * (m + 31) / 32) blocks. Please note those parentheses: ((1024 + 31) / 32) * (1024 + 31) / 32)) = 1024, while (1024 + 31) / 32 * (1024 + 31) / 32 = 1055.

The following code is generated using the latest commit (ed3488f).

// Generated by the Tensor Algebra Compiler (tensor-compiler.org)

__global__
void computeDeviceKernel0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ B, taco_tensor_t * __restrict__ C){
  int A1_dimension = (int)(A->dimensions[0]);
  int A2_dimension = (int)(A->dimensions[1]);
  double* __restrict__ A_vals = (double*)(A->vals);
  int B1_dimension = (int)(B->dimensions[0]);
  int B2_dimension = (int)(B->dimensions[1]);
  double* __restrict__ B_vals = (double*)(B->vals);
  int C2_dimension = (int)(C->dimensions[1]);
  double* __restrict__ C_vals = (double*)(C->vals);

  int32_t f = blockIdx.x;
  int32_t g = (threadIdx.x % (1024));
  if (threadIdx.x >= 1024) {
    return;
  }

  int32_t i0 = f / (B2_dimension + 31) / 32;
  int32_t j0 = f % (B2_dimension + 31) / 32;
  int32_t i1 = g / 32;
  int32_t i = i0 * 32 + i1;
  if (i >= A1_dimension)
    return;

  int32_t j1 = g % 32;
  int32_t j = j0 * 32 + j1;
  if (j >= B2_dimension)
    return;

  int32_t jC = i * C2_dimension + j;
  double tkC_val = 0.0;
  for (int32_t k = 0; k < B1_dimension; k++) {
    int32_t kA = i * A2_dimension + k;
    int32_t jB = k * B2_dimension + j;
    tkC_val = tkC_val + A_vals[kA] * B_vals[jB];
  }
  C_vals[jC] = C_vals[jC] + tkC_val;
}

int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) {
  int C1_dimension = (int)(C->dimensions[0]);
  int C2_dimension = (int)(C->dimensions[1]);
  double* __restrict__ C_vals = (double*)(C->vals);
  int A1_dimension = (int)(A->dimensions[0]);
  int B2_dimension = (int)(B->dimensions[1]);

  for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) {
    C_vals[pC] = 0.0;
  }

  computeDeviceKernel0<<<(A1_dimension + 31) / 32 * (B2_dimension + 31) / 32, 1024>>>(A, B, C);  // <----- LOOK AT HERE
  cudaDeviceSynchronize();
  return 0;
}

Using the following command:

taco \
    "C(i, j) = A(i, k) * B(k, j)" \
    -s="split(i,i0,i1,32)" \
    -s="split(j,j0,j1,32)" \
    -s="reorder(i0,j0,i1,j1,k)" \
    -s="fuse(i0,j0,f)" \
    -s="fuse(i1,j1,g)" \
    -s="parallelize(f,GPUBlock,NoRaces)" \
    -s="parallelize(g,GPUThread,NoRaces)"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions