Skip to content

Commit

Permalink
Allowing larger grids for THCApply shows improved performance.
Browse files Browse the repository at this point in the history
  • Loading branch information
csarofeen authored and soumith committed Aug 27, 2017
1 parent 34cb262 commit d891ff3
Showing 1 changed file with 9 additions and 9 deletions.
18 changes: 9 additions & 9 deletions lib/THC/THCApply.cuh
Expand Up @@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
return false;
}

// Assume a reasonable number of SMs if no state is available
int numSM =
state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;

// 16 warps per block * 4 blocks per SM gives 64 warps per SM at maximum,
// which seems to be a good sweet spot for latency hiding
grid = dim3(min((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
4LL * numSM));
if(THCState_getCurrentDeviceProperties(state)->major < 3){
grid = dim3(min((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
return true;
}

grid = dim3((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
return true;

}

template <typename TensorTypeA,
Expand Down

0 comments on commit d891ff3

Please sign in to comment.