-
Notifications
You must be signed in to change notification settings - Fork 196
Closed
Description
I tiled an n x m matrix into 32 x 32 tiles, with each tile mapped to a GPU block and each element of a tile mapped to a GPU thread. There should be ceil(n / 32) * ceil(m / 32) = ((n + 31) / 32) * ((m + 31) / 32) blocks. However, TACO generates (n + 31) / 32 * (m + 31) / 32 blocks. Please note the parentheses: integer / and * are evaluated left to right, so the generated expression is computed as (((n + 31) / 32) * (m + 31)) / 32. For example, ((1024 + 31) / 32) * ((1024 + 31) / 32) = 1024, while (1024 + 31) / 32 * (1024 + 31) / 32 = 1055.
The following code is generated using the latest commit (ed3488f).
// Generated by the Tensor Algebra Compiler (tensor-compiler.org)
// Device kernel for the scheduled computation C(i, j) = A(i, k) * B(k, j).
//
// Each thread derives one (i, j) pair from blockIdx.x and threadIdx.x, sums
// A_vals[i * A2_dimension + k] * B_vals[k * B2_dimension + j] over
// k in [0, B1_dimension), and adds the sum to C_vals[i * C2_dimension + j].
//
// Parameters:
//   A, B - tensors whose `dimensions` and `vals` fields are read; `vals` is
//          accessed as an array of double.
//   C    - tensor whose `dimensions[1]` is read and whose `vals` (double) is
//          read and written.
//
// Launch layout: only blockIdx.x and threadIdx.x are used. Threads with
// threadIdx.x >= 1024 return immediately.
__global__
void computeDeviceKernel0(taco_tensor_t * __restrict__ A, taco_tensor_t * __restrict__ B, taco_tensor_t * __restrict__ C){
int A1_dimension = (int)(A->dimensions[0]);
int A2_dimension = (int)(A->dimensions[1]);
double* __restrict__ A_vals = (double*)(A->vals);
int B1_dimension = (int)(B->dimensions[0]);
int B2_dimension = (int)(B->dimensions[1]);
double* __restrict__ B_vals = (double*)(B->vals);
int C2_dimension = (int)(C->dimensions[1]);
double* __restrict__ C_vals = (double*)(C->vals);
// f is the fused block index, g the fused in-block thread index.
int32_t f = blockIdx.x;
int32_t g = (threadIdx.x % (1024));
// Guard against a launch with more than 1024 threads per block.
if (threadIdx.x >= 1024) {
return;
}
// NOTE(review): `/` and `%` associate left to right, so the two lines below
// evaluate as (f / (B2_dimension + 31)) / 32 and (f % (B2_dimension + 31)) / 32.
// Since f fuses the tile indices i0 and j0, the intended decomposition is
// presumably f / ((B2_dimension + 31) / 32) and f % ((B2_dimension + 31) / 32),
// i.e. the same missing-parentheses pattern as the grid size in compute().
// Confirm against the code generator.
int32_t i0 = f / (B2_dimension + 31) / 32;
int32_t j0 = f % (B2_dimension + 31) / 32;
// Split g into a (row, column) position within a 32-wide tile.
int32_t i1 = g / 32;
int32_t i = i0 * 32 + i1;
// Threads that fall past the last row do no work.
if (i >= A1_dimension)
return;
int32_t j1 = g % 32;
int32_t j = j0 * 32 + j1;
// Threads that fall past the last column do no work.
if (j >= B2_dimension)
return;
int32_t jC = i * C2_dimension + j;
// Accumulate the k-reduction in a local before touching C_vals.
double tkC_val = 0.0;
for (int32_t k = 0; k < B1_dimension; k++) {
int32_t kA = i * A2_dimension + k;
int32_t jB = k * B2_dimension + j;
tkC_val = tkC_val + A_vals[kA] * B_vals[jB];
}
// Adds to the existing value; compute() sets C_vals to 0.0 before the launch.
C_vals[jC] = C_vals[jC] + tkC_val;
}
// Host entry point for C(i, j) = A(i, k) * B(k, j).
//
// Sets every element of C's values array to 0.0, launches
// computeDeviceKernel0 with 1024 threads per block, waits for the device to
// finish, and returns 0.
//
// Parameters:
//   C - output tensor; `dimensions[0..1]` and `vals` (as double) are used.
//   A - input tensor; `dimensions[0]` is read here to size the grid.
//   B - input tensor; `dimensions[1]` is read here to size the grid.
// Returns: always 0.
int compute(taco_tensor_t *C, taco_tensor_t *A, taco_tensor_t *B) {
int C1_dimension = (int)(C->dimensions[0]);
int C2_dimension = (int)(C->dimensions[1]);
double* __restrict__ C_vals = (double*)(C->vals);
int A1_dimension = (int)(A->dimensions[0]);
int B2_dimension = (int)(B->dimensions[1]);
// Zero the output on the host side before the kernel accumulates into it.
for (int32_t pC = 0; pC < (C1_dimension * C2_dimension); pC++) {
C_vals[pC] = 0.0;
}
// NOTE(review): the grid size below evaluates left to right as
// (((A1_dimension + 31) / 32) * (B2_dimension + 31)) / 32, not as
// ((A1_dimension + 31) / 32) * ((B2_dimension + 31) / 32). For 1024 x 1024
// this gives 1055 blocks instead of 1024. This is the defect being reported.
// NOTE(review): the kernel is called with (A, B, C) while compute() takes
// (C, A, B); this matches the kernel's own parameter order.
computeDeviceKernel0<<<(A1_dimension + 31) / 32 * (B2_dimension + 31) / 32, 1024>>>(A, B, C); // <----- LOOK AT HERE
// NOTE(review): the result of cudaDeviceSynchronize() is discarded and there
// is no cudaGetLastError() after the launch, so failures are not reported.
cudaDeviceSynchronize();
return 0;
}Using the following command:
taco \
"C(i, j) = A(i, k) * B(k, j)" \
-s="split(i,i0,i1,32)" \
-s="split(j,j0,j1,32)" \
-s="reorder(i0,j0,i1,j1,k)" \
-s="fuse(i0,j0,f)" \
-s="fuse(i1,j1,g)" \
-s="parallelize(f,GPUBlock,NoRaces)" \
-s="parallelize(g,GPUThread,NoRaces)"
Metadata
Metadata
Assignees
Labels
No labels