Allow larger grids in THCApply by removing the 4-blocks-per-SM cap in getApplyGrid; this shows improved performance.

torch · Aug 27, 2017 · d891ff3
1 parent 34cb262
commit d891ff3
Showing 1 changed file with 9 additions and 9 deletions.
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh
@@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
     return false;
   }
 
-  // Assume a reasonable number of SMs if no state is available
-  int numSM =
-    state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
-
-  // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
-  // which seems to be a good sweetspot for latency hiding
-  grid = dim3(min((long long) THCCeilDiv(totalElements,
-                                         (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
-                  4LL * numSM));
+  if(THCState_getCurrentDeviceProperties(state)->major < 3){
+    grid = dim3(min((long long) THCCeilDiv(totalElements,
+               (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
+    return true;
+  }
+
+  grid = dim3((long long) THCCeilDiv(totalElements,
+              (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
   return true;
+
 }
 
 template <typename TensorTypeA,