wavefunction91 · wavefunction91 · Nov 10, 2020 · Oct 13, 2020 · Oct 13, 2020 · Oct 14, 2020
diff --git a/include/gauxc/xc_integrator/xc_cuda_data.hpp b/include/gauxc/xc_integrator/xc_cuda_data.hpp
@@ -79,7 +79,8 @@ struct XCCudaData {
   F*           weights_device_buffer    = nullptr;
   size_t*      shell_list_device_buffer = nullptr;
   size_t*      shell_offs_device_buffer = nullptr;
-  int64_t*     submat_cut_device_buffer = nullptr;
+  int32_t*     submat_cut_device_buffer = nullptr;
+  int32_t*     submat_block_device_buffer = nullptr;
   int32_t*     iparent_device_buffer    = nullptr;
   F*           dist_nearest_buffer      = nullptr;
 

diff --git a/include/gauxc/xc_task.hpp b/include/gauxc/xc_task.hpp
@@ -80,13 +80,15 @@ struct XCTaskDevice {
   size_t nbe;
   size_t npts;
   size_t ncut;
+  size_t nblock;
   size_t nshells;
 
   double* points;
   double* weights;
   size_t* shell_list;
   size_t* shell_offs;
-  int64_t* submat_cut;
+  int32_t* submat_cut;
+  int32_t* submat_block;
 
   Shell<T>* shells;
   double*   nbe_scr;

diff --git a/src/integrator/cuda/cuda_device_properties.cxx b/src/integrator/cuda/cuda_device_properties.cxx
@@ -0,0 +1,26 @@
+#include <cmath>
+#include <algorithm>
+
+#include "cuda_runtime.h"
+
+#include "cuda_device_properties.hpp"
+
+namespace GauXC {
+namespace cuda  {
+
+
+uint32_t get_submat_cut_block(int32_t LDA) {
+  int l2_cache_size;
+  cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, 0);
+
+  int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8));
+  int min_block_size = LDA / max_submat_blocks;
+
+  int block_size = std::max(l2_block_size, min_block_size);
+  block_size = std::min(block_size, LDA);
+
+  return block_size;
+}
+
+}
+}
diff --git a/src/integrator/cuda/cuda_device_properties.hpp b/src/integrator/cuda/cuda_device_properties.hpp
@@ -9,5 +9,7 @@ static constexpr uint32_t max_threads_per_thread_block = 1024;
 static constexpr uint32_t max_warps_per_thread_block = 
   max_threads_per_thread_block / warp_size;
 
+static constexpr uint32_t max_submat_blocks = 10;
+uint32_t get_submat_cut_block(int32_t LDA);
 }
 }
diff --git a/src/integrator/cuda/cuda_inc_potential.cu b/src/integrator/cuda/cuda_inc_potential.cu
@@ -1,4 +1,7 @@
 #include "cuda_inc_potential.hpp"
+#include "cuda_device_properties.hpp"
+#include <gauxc/util/div_ceil.hpp>
+
 #include "cuda_device_properties.hpp"
 
 namespace GauXC      {
@@ -7,55 +10,85 @@ namespace cuda       {
 
 using namespace GauXC::cuda;
 
-template <typename T>
-__global__ void inc_by_submat_combined_kernel( size_t           ntasks,
-                                               XCTaskDevice<T>* device_tasks,
-                                               T*               A,
-                                               size_t           LDA ) {
 
-  const int batch_id = blockIdx.z;
+#define WARP_X 16
+#define WARP_Y 1
+#define UNROLL_FACTOR 4
+#define EFF_UNROLL 4
+#define CUT_X 8
+#define CUT_Y 8
 
-  if( batch_id < ntasks ) {
 
+template <typename T>
+__global__ __launch_bounds__(1024, 1)
+void inc_by_submat_combined_kernel( size_t           ntasks,
+                                    XCTaskDevice<T>* device_tasks,
+                                    T*               A,
+                                    size_t           LDA, 
+				    const int block_y,
+				    const int block_x ) {
+
+  const int batch_id = blockIdx.z;
   auto& task = device_tasks[ batch_id ];
 
-  const auto  ncut              = task.ncut;
   const auto* submat_cut_device = task.submat_cut;
+  const auto* submat_block_device = task.submat_block;
   const auto  LDAS              = task.nbe;
         auto* ASmall_device     = task.nbe_scr;
 
   //if( LDAS == LDAB ) return;
+  const int tid_xx = threadIdx.x % WARP_X;
+  const int tid_xy = threadIdx.x / WARP_X;
+
+  const int tid_yx = threadIdx.y % CUT_X;
+  const int tid_yy = threadIdx.y / CUT_X;
+
+  const int start_cut_y = submat_block_device[block_y];
+  const int end_cut_y   = submat_block_device[block_y+1];
+  const int start_cut_x = submat_block_device[block_x];
+  const int end_cut_x   = submat_block_device[block_x+1];
+
+  for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) {
+    const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut));
+    const int i_cut_first  = i_data.x;
+    const int delta_i      = i_data.y;
+    const int i_cut_small  = i_data.z;
+
+  for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) {
+    const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut));
+    const int j_cut_first  = j_data.x; 
+    const int delta_j      = j_data.y;
+    const int j_cut_small  = j_data.z;
+
+    auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS;
+    auto* ABig_begin   = A   + i_cut_first + j_cut_first*LDA;
+
+    int J;
+    for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) {
+      for( int I = tid_xx; I < delta_i; I += WARP_X ) {
+
+        double val[UNROLL_FACTOR];
+        double* address[UNROLL_FACTOR];
+#pragma unroll
+        for (int k = 0; k < UNROLL_FACTOR; k++) {
+          val[k] = ASmall_begin[I + (J+k*WARP_Y)*LDAS];
+          address[k] = ABig_begin + I + (J+k*WARP_Y)*LDA;
+        }
+#pragma unroll
+        for (int k = 0; k < UNROLL_FACTOR; k++) {
+          atomicAdd(address[k], val[k] );
+        }
+      }
+    }
+
+    for ( ; J < delta_j; J += WARP_Y) {
+      for( int I = tid_xx; I < delta_i; I += WARP_X ) {
+        atomicAdd(ABig_begin + I + J*LDA, ASmall_begin[I + J*LDAS] );
+      }
+    }
 
-
-  const int tid_x = blockDim.x * blockIdx.x + threadIdx.x;
-  const int tid_y = blockDim.y * blockIdx.y + threadIdx.y;
-
-  int64_t i(0);
-  for( size_t i_cut = 0; i_cut < ncut; ++i_cut ) {
-    const int64_t i_cut_first  = submat_cut_device[ 2*i_cut ];
-    const int64_t i_cut_second = submat_cut_device[ 2*i_cut + 1 ];
-    const int64_t delta_i      = i_cut_second - i_cut_first;
-
-    int64_t j(0);
-  for( size_t j_cut = 0; j_cut < ncut; ++j_cut ) {
-    const int64_t j_cut_first  = submat_cut_device[ 2*j_cut ];
-    const int64_t j_cut_second = submat_cut_device[ 2*j_cut + 1 ];
-    const int64_t delta_j      = j_cut_second - j_cut_first;
-
-    auto* ASmall_begin = ASmall_device + i           + j          *LDAS;
-    auto* ABig_begin   = A             + i_cut_first + j_cut_first*LDA ;
-
-    for( size_t J = tid_y; J < delta_j; J += blockDim.y )      
-    for( size_t I = tid_x; I < delta_i; I += blockDim.x )
-      //ABig_begin[I + J*LDA] += ASmall_begin[I + J*LDAS];
-      atomicAdd( ABig_begin + I + J*LDA, ASmall_begin[I+J*LDAS] );
-
-    j += delta_j;
   }
-    i += delta_i;
   }
-
-  } // batch_id check
 }
 
 
@@ -65,13 +98,16 @@ void task_inc_potential( size_t           ntasks,
                          T*               V_device,
                          size_t           LDV,
                          cudaStream_t     stream ) {
-
-  dim3 threads(warp_size,max_warps_per_thread_block,1), blocks(1,1,ntasks);
-  inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>(
-    ntasks, device_tasks, V_device, LDV
-  );
-
-
+  dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks);
+
+  const int submat_block_size = get_submat_cut_block(LDV);
+  for (int i = 0; i < util::div_ceil(LDV, submat_block_size); i++) {
+    for (int j = 0; j < util::div_ceil(LDV, submat_block_size); j++) {
+      inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>(
+        ntasks, device_tasks, V_device, LDV, i, j
+      );
+    }
+  }
 }
 
 template 

diff --git a/src/integrator/cuda/cuda_pack_density.cu b/src/integrator/cuda/cuda_pack_density.cu
@@ -1,4 +1,7 @@
 #include "cuda_pack_density.hpp"
+#include "cuda_device_properties.hpp"
+#include <gauxc/util/div_ceil.hpp>
+
 #include "cuda_device_properties.hpp"
 
 namespace GauXC      {
@@ -7,54 +10,89 @@ namespace cuda       {
 
 using namespace GauXC::cuda;
 
-template <typename T>
-__global__ void submat_set_combined_kernel( size_t           ntasks,
-                                            XCTaskDevice<T>* device_tasks,
-                                            T*               A,
-                                            size_t           LDA ) {
+#define WARP_X 16
+#define WARP_Y 1
+#define UNROLL_FACTOR 4
+#define EFF_UNROLL 4
+#define CUT_X 8
+#define CUT_Y 8
 
-  const int batch_id = blockIdx.z;
+template <typename T>
+__global__ __launch_bounds__(1024, 1)
+void submat_set_combined_kernel( size_t           ntasks,
+                                 XCTaskDevice<T>* device_tasks,
+                                 T*               A,
+                                 size_t           LDA,
+				 const int block_y,
+				 const int block_x) {
 
-  if( batch_id < ntasks ) {
 
+  const int batch_id = blockIdx.z;
   auto& task = device_tasks[ batch_id ];
 
-  const auto  ncut              = task.ncut;
   const auto* submat_cut_device = task.submat_cut;
+  const auto* submat_block_device = task.submat_block;
   const auto  LDAS              = task.nbe;
         auto* ASmall_device     = task.nbe_scr;
 
   //if( LDAS == LDAB ) return;
 
-
-  const int tid_x = blockDim.x * blockIdx.x + threadIdx.x;
-  const int tid_y = blockDim.y * blockIdx.y + threadIdx.y;
-
-  int64_t i(0);
-  for( size_t i_cut = 0; i_cut < ncut; ++i_cut ) {
-    const int64_t i_cut_first  = submat_cut_device[ 2*i_cut ];
-    const int64_t i_cut_second = submat_cut_device[ 2*i_cut + 1 ];
-    const int64_t delta_i      = i_cut_second - i_cut_first;
-
-    int64_t j(0);
-  for( size_t j_cut = 0; j_cut < ncut; ++j_cut ) {
-    const int64_t j_cut_first  = submat_cut_device[ 2*j_cut ];
-    const int64_t j_cut_second = submat_cut_device[ 2*j_cut + 1 ];
-    const int64_t delta_j      = j_cut_second - j_cut_first;
-
-    auto* ASmall_begin = ASmall_device + i           + j          *LDAS;
-    auto* ABig_begin   = A             + i_cut_first + j_cut_first*LDA ;
-
-    for( size_t J = tid_y; J < delta_j; J += blockDim.y )      
-    for( size_t I = tid_x; I < delta_i; I += blockDim.x )
-      ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA];
-
-    j += delta_j;
+  const int tid_xx = threadIdx.x % WARP_X;
+  const int tid_xy = threadIdx.x / WARP_X;
+
+  const int tid_yx = threadIdx.y % CUT_X;
+  const int tid_yy = threadIdx.y / CUT_X;
+
+  const int start_cut_y = submat_block_device[block_y];
+  const int end_cut_y   = submat_block_device[block_y+1];
+  const int start_cut_x = submat_block_device[block_x];
+  const int end_cut_x   = submat_block_device[block_x+1];
+
+  for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) {
+    const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut));
+    const int i_cut_first  = i_data.x;
+    const int delta_i      = i_data.y;
+    const int i_cut_small  = i_data.z;
+
+  for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) {
+    const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut));
+    const int j_cut_first  = j_data.x; 
+    const int delta_j      = j_data.y;
+    const int j_cut_small  = j_data.z;
+
+    auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS;
+    auto* ABig_begin   = A   + i_cut_first + j_cut_first*LDA;
+
+    int J;
+    for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) {
+      for( int I = tid_xx; I < delta_i; I += WARP_X ) {
+
+        double val[UNROLL_FACTOR];
+        double* address[UNROLL_FACTOR];
+#pragma unroll
+        for (int k = 0; k < UNROLL_FACTOR; k++) {
+          val[k] = ABig_begin[I + (J + k*WARP_Y)*LDA];
+          address[k] = ASmall_begin + I + (J + k*WARP_Y) * LDAS;
+        }
+#pragma unroll
+        for (int k = 0; k < UNROLL_FACTOR; k++) {
+	  // Suggest that the result be evicted first.
+#if (CUDART_VERSION >= 11000)
+	  __stcs(address[k], val[k]);
+#else
+          asm ("st.global.cs.f64 [%0], %1;" :: "l"(address[k]), "d"(val[k]));
+#endif
+        }
+      }
+    }
+
+    for ( ; J < delta_j; J += WARP_Y) {
+      for( int I = tid_xx; I < delta_i; I += WARP_X ) {
+        ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA];
+      }
+    }
   }
-    i += delta_i;
   }
-
-  } // batch_id check
 }
 
 
@@ -65,12 +103,16 @@ void task_pack_density_matrix( size_t           ntasks,
                                size_t           LDP,
                                cudaStream_t     stream ) {
 
-  dim3 threads(warp_size,max_warps_per_thread_block,1), blocks(1,1,ntasks);
-  submat_set_combined_kernel<<< blocks, threads, 0, stream >>>(
-    ntasks, device_tasks, P_device, LDP
-  );
-
+  dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks);
 
+  const int submat_block_size = get_submat_cut_block(LDP);
+  for (int i = 0; i < util::div_ceil(LDP, submat_block_size); i++) {
+    for (int j = 0; j < util::div_ceil(LDP, submat_block_size); j++) {
+      submat_set_combined_kernel<<< blocks, threads, 0, stream >>>(
+        ntasks, device_tasks, P_device, LDP, i, j
+      );
+    }
+  }
 }
 
 template 

diff --git a/src/integrator/cuda/gauxc-cuda_integrator.cmake b/src/integrator/cuda/gauxc-cuda_integrator.cmake
@@ -12,6 +12,7 @@ target_sources( gauxc PRIVATE cuda/collocation_device.cu
                               cuda/cublas_extensions.cu
                               cuda/cuda_zmat.cu
                               cuda/cuda_inc_potential.cu
+			      cuda/cuda_device_properties.cxx
 )
 
 target_compile_features( gauxc PRIVATE cuda_std_14 )