Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/gauxc/xc_integrator/xc_cuda_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ struct XCCudaData {
F* weights_device_buffer = nullptr;
size_t* shell_list_device_buffer = nullptr;
size_t* shell_offs_device_buffer = nullptr;
int64_t* submat_cut_device_buffer = nullptr;
int32_t* submat_cut_device_buffer = nullptr;
int32_t* submat_block_device_buffer = nullptr;
int32_t* iparent_device_buffer = nullptr;
F* dist_nearest_buffer = nullptr;

Expand Down
4 changes: 3 additions & 1 deletion include/gauxc/xc_task.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,15 @@ struct XCTaskDevice {
size_t nbe;
size_t npts;
size_t ncut;
size_t nblock;
size_t nshells;

double* points;
double* weights;
size_t* shell_list;
size_t* shell_offs;
int64_t* submat_cut;
int32_t* submat_cut;
int32_t* submat_block;

Shell<T>* shells;
double* nbe_scr;
Expand Down
26 changes: 26 additions & 0 deletions src/integrator/cuda/cuda_device_properties.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include <cmath>
#include <algorithm>

#include "cuda_runtime.h"

#include "cuda_device_properties.hpp"

namespace GauXC {
namespace cuda {


uint32_t get_submat_cut_block(int32_t LDA) {
int l2_cache_size;
cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, 0);

int l2_block_size = (int) sqrt(0.75 * ((double) l2_cache_size / 8));
int min_block_size = LDA / max_submat_blocks;

int block_size = std::max(l2_block_size, min_block_size);
block_size = std::min(block_size, LDA);

return block_size;
}

}
}
2 changes: 2 additions & 0 deletions src/integrator/cuda/cuda_device_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@ static constexpr uint32_t max_threads_per_thread_block = 1024;
static constexpr uint32_t max_warps_per_thread_block =
max_threads_per_thread_block / warp_size;

static constexpr uint32_t max_submat_blocks = 10;
uint32_t get_submat_cut_block(int32_t LDA);
}
}
122 changes: 79 additions & 43 deletions src/integrator/cuda/cuda_inc_potential.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "cuda_inc_potential.hpp"
#include "cuda_device_properties.hpp"
#include <gauxc/util/div_ceil.hpp>

#include "cuda_device_properties.hpp"

namespace GauXC {
Expand All @@ -7,55 +10,85 @@ namespace cuda {

using namespace GauXC::cuda;

template <typename T>
__global__ void inc_by_submat_combined_kernel( size_t ntasks,
XCTaskDevice<T>* device_tasks,
T* A,
size_t LDA ) {

const int batch_id = blockIdx.z;
#define WARP_X 16
#define WARP_Y 1
#define UNROLL_FACTOR 4
#define EFF_UNROLL 4
#define CUT_X 8
#define CUT_Y 8

if( batch_id < ntasks ) {

template <typename T>
__global__ __launch_bounds__(1024, 1)
void inc_by_submat_combined_kernel( size_t ntasks,
XCTaskDevice<T>* device_tasks,
T* A,
size_t LDA,
const int block_y,
const int block_x ) {

const int batch_id = blockIdx.z;
auto& task = device_tasks[ batch_id ];

const auto ncut = task.ncut;
const auto* submat_cut_device = task.submat_cut;
const auto* submat_block_device = task.submat_block;
const auto LDAS = task.nbe;
auto* ASmall_device = task.nbe_scr;

//if( LDAS == LDAB ) return;
const int tid_xx = threadIdx.x % WARP_X;
const int tid_xy = threadIdx.x / WARP_X;

const int tid_yx = threadIdx.y % CUT_X;
const int tid_yy = threadIdx.y / CUT_X;

const int start_cut_y = submat_block_device[block_y];
const int end_cut_y = submat_block_device[block_y+1];
const int start_cut_x = submat_block_device[block_x];
const int end_cut_x = submat_block_device[block_x+1];

for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) {
const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut));
const int i_cut_first = i_data.x;
const int delta_i = i_data.y;
const int i_cut_small = i_data.z;

for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) {
const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut));
const int j_cut_first = j_data.x;
const int delta_j = j_data.y;
const int j_cut_small = j_data.z;

auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS;
auto* ABig_begin = A + i_cut_first + j_cut_first*LDA;

int J;
for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) {
for( int I = tid_xx; I < delta_i; I += WARP_X ) {

double val[UNROLL_FACTOR];
double* address[UNROLL_FACTOR];
#pragma unroll
for (int k = 0; k < UNROLL_FACTOR; k++) {
val[k] = ASmall_begin[I + (J+k*WARP_Y)*LDAS];
address[k] = ABig_begin + I + (J+k*WARP_Y)*LDA;
}
#pragma unroll
for (int k = 0; k < UNROLL_FACTOR; k++) {
atomicAdd(address[k], val[k] );
}
}
}

for ( ; J < delta_j; J += WARP_Y) {
for( int I = tid_xx; I < delta_i; I += WARP_X ) {
atomicAdd(ABig_begin + I + J*LDA, ASmall_begin[I + J*LDAS] );
}
}


const int tid_x = blockDim.x * blockIdx.x + threadIdx.x;
const int tid_y = blockDim.y * blockIdx.y + threadIdx.y;

int64_t i(0);
for( size_t i_cut = 0; i_cut < ncut; ++i_cut ) {
const int64_t i_cut_first = submat_cut_device[ 2*i_cut ];
const int64_t i_cut_second = submat_cut_device[ 2*i_cut + 1 ];
const int64_t delta_i = i_cut_second - i_cut_first;

int64_t j(0);
for( size_t j_cut = 0; j_cut < ncut; ++j_cut ) {
const int64_t j_cut_first = submat_cut_device[ 2*j_cut ];
const int64_t j_cut_second = submat_cut_device[ 2*j_cut + 1 ];
const int64_t delta_j = j_cut_second - j_cut_first;

auto* ASmall_begin = ASmall_device + i + j *LDAS;
auto* ABig_begin = A + i_cut_first + j_cut_first*LDA ;

for( size_t J = tid_y; J < delta_j; J += blockDim.y )
for( size_t I = tid_x; I < delta_i; I += blockDim.x )
//ABig_begin[I + J*LDA] += ASmall_begin[I + J*LDAS];
atomicAdd( ABig_begin + I + J*LDA, ASmall_begin[I+J*LDAS] );

j += delta_j;
}
i += delta_i;
}

} // batch_id check
}


Expand All @@ -65,13 +98,16 @@ void task_inc_potential( size_t ntasks,
T* V_device,
size_t LDV,
cudaStream_t stream ) {

dim3 threads(warp_size,max_warps_per_thread_block,1), blocks(1,1,ntasks);
inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>(
ntasks, device_tasks, V_device, LDV
);


dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks);

const int submat_block_size = get_submat_cut_block(LDV);
for (int i = 0; i < util::div_ceil(LDV, submat_block_size); i++) {
for (int j = 0; j < util::div_ceil(LDV, submat_block_size); j++) {
inc_by_submat_combined_kernel<<< blocks, threads, 0, stream >>>(
ntasks, device_tasks, V_device, LDV, i, j
);
}
}
}

template
Expand Down
122 changes: 82 additions & 40 deletions src/integrator/cuda/cuda_pack_density.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include "cuda_pack_density.hpp"
#include "cuda_device_properties.hpp"
#include <gauxc/util/div_ceil.hpp>

#include "cuda_device_properties.hpp"

namespace GauXC {
Expand All @@ -7,54 +10,89 @@ namespace cuda {

using namespace GauXC::cuda;

template <typename T>
__global__ void submat_set_combined_kernel( size_t ntasks,
XCTaskDevice<T>* device_tasks,
T* A,
size_t LDA ) {
#define WARP_X 16
#define WARP_Y 1
#define UNROLL_FACTOR 4
#define EFF_UNROLL 4
#define CUT_X 8
#define CUT_Y 8

const int batch_id = blockIdx.z;
template <typename T>
__global__ __launch_bounds__(1024, 1)
void submat_set_combined_kernel( size_t ntasks,
XCTaskDevice<T>* device_tasks,
T* A,
size_t LDA,
const int block_y,
const int block_x) {

if( batch_id < ntasks ) {

const int batch_id = blockIdx.z;
auto& task = device_tasks[ batch_id ];

const auto ncut = task.ncut;
const auto* submat_cut_device = task.submat_cut;
const auto* submat_block_device = task.submat_block;
const auto LDAS = task.nbe;
auto* ASmall_device = task.nbe_scr;

//if( LDAS == LDAB ) return;


const int tid_x = blockDim.x * blockIdx.x + threadIdx.x;
const int tid_y = blockDim.y * blockIdx.y + threadIdx.y;

int64_t i(0);
for( size_t i_cut = 0; i_cut < ncut; ++i_cut ) {
const int64_t i_cut_first = submat_cut_device[ 2*i_cut ];
const int64_t i_cut_second = submat_cut_device[ 2*i_cut + 1 ];
const int64_t delta_i = i_cut_second - i_cut_first;

int64_t j(0);
for( size_t j_cut = 0; j_cut < ncut; ++j_cut ) {
const int64_t j_cut_first = submat_cut_device[ 2*j_cut ];
const int64_t j_cut_second = submat_cut_device[ 2*j_cut + 1 ];
const int64_t delta_j = j_cut_second - j_cut_first;

auto* ASmall_begin = ASmall_device + i + j *LDAS;
auto* ABig_begin = A + i_cut_first + j_cut_first*LDA ;

for( size_t J = tid_y; J < delta_j; J += blockDim.y )
for( size_t I = tid_x; I < delta_i; I += blockDim.x )
ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA];

j += delta_j;
const int tid_xx = threadIdx.x % WARP_X;
const int tid_xy = threadIdx.x / WARP_X;

const int tid_yx = threadIdx.y % CUT_X;
const int tid_yy = threadIdx.y / CUT_X;

const int start_cut_y = submat_block_device[block_y];
const int end_cut_y = submat_block_device[block_y+1];
const int start_cut_x = submat_block_device[block_x];
const int end_cut_x = submat_block_device[block_x+1];

for( int i_cut = tid_yy + start_cut_y; i_cut < end_cut_y; i_cut += CUT_Y ) {
const int3 i_data = *((int3*)(submat_cut_device + 3*i_cut));
const int i_cut_first = i_data.x;
const int delta_i = i_data.y;
const int i_cut_small = i_data.z;

for( int j_cut = tid_yx + start_cut_x; j_cut < end_cut_x; j_cut += CUT_X ) {
const int3 j_data = *((int3*)(submat_cut_device + 3*j_cut));
const int j_cut_first = j_data.x;
const int delta_j = j_data.y;
const int j_cut_small = j_data.z;

auto* ASmall_begin = ASmall_device + i_cut_small + j_cut_small*LDAS;
auto* ABig_begin = A + i_cut_first + j_cut_first*LDA;

int J;
for( J = tid_xy; J < (delta_j / EFF_UNROLL) * EFF_UNROLL; J += EFF_UNROLL ) {
for( int I = tid_xx; I < delta_i; I += WARP_X ) {

double val[UNROLL_FACTOR];
double* address[UNROLL_FACTOR];
#pragma unroll
for (int k = 0; k < UNROLL_FACTOR; k++) {
val[k] = ABig_begin[I + (J + k*WARP_Y)*LDA];
address[k] = ASmall_begin + I + (J + k*WARP_Y) * LDAS;
}
#pragma unroll
for (int k = 0; k < UNROLL_FACTOR; k++) {
// Suggest that the result be evicted first.
#if (CUDART_VERSION >= 11000)
__stcs(address[k], val[k]);
#else
asm ("st.global.cs.f64 [%0], %1;" :: "l"(address[k]), "d"(val[k]));
#endif
}
}
}

for ( ; J < delta_j; J += WARP_Y) {
for( int I = tid_xx; I < delta_i; I += WARP_X ) {
ASmall_begin[I + J*LDAS] = ABig_begin[I + J*LDA];
}
}
}
i += delta_i;
}

} // batch_id check
}


Expand All @@ -65,12 +103,16 @@ void task_pack_density_matrix( size_t ntasks,
size_t LDP,
cudaStream_t stream ) {

dim3 threads(warp_size,max_warps_per_thread_block,1), blocks(1,1,ntasks);
submat_set_combined_kernel<<< blocks, threads, 0, stream >>>(
ntasks, device_tasks, P_device, LDP
);

dim3 threads(warp_size / 2, max_warps_per_thread_block * 2, 1), blocks(1,1,ntasks);

const int submat_block_size = get_submat_cut_block(LDP);
for (int i = 0; i < util::div_ceil(LDP, submat_block_size); i++) {
for (int j = 0; j < util::div_ceil(LDP, submat_block_size); j++) {
submat_set_combined_kernel<<< blocks, threads, 0, stream >>>(
ntasks, device_tasks, P_device, LDP, i, j
);
}
}
}

template
Expand Down
1 change: 1 addition & 0 deletions src/integrator/cuda/gauxc-cuda_integrator.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ target_sources( gauxc PRIVATE cuda/collocation_device.cu
cuda/cublas_extensions.cu
cuda/cuda_zmat.cu
cuda/cuda_inc_potential.cu
cuda/cuda_device_properties.cxx
)

target_compile_features( gauxc PRIVATE cuda_std_14 )
Expand Down
Loading