Skip to content

Commit

Permalink
feat(gpu): implement CompressedCudaCiphertextList, and public functional packing keyswitch

Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves committed Aug 5, 2024
1 parent c772944 commit 006255d
Show file tree
Hide file tree
Showing 53 changed files with 4,145 additions and 362 deletions.
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
# Release flags: -O3 as stated in the comment above. Debug flags
# (-O0 -G -g) must not be committed — device debug (-G) disables most
# optimizations and cripples kernel performance.
set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

Expand Down
146 changes: 138 additions & 8 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ enum SHIFT_OR_ROTATE_TYPE {
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
Expand All @@ -36,6 +35,11 @@ enum COMPARISON_TYPE {
MIN = 7,
};

enum COMPRESSION_MODE {
COMPRESS = 0,
DECOMPRESS = 1,
};

enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
Expand Down Expand Up @@ -202,6 +206,30 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
COMPRESSION_MODE mode, bool allocate_gpu_memory);

void cuda_compression_compress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr);

void cuda_compression_decompress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);

void cleanup_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -424,6 +452,7 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,

struct int_radix_params {
PBS_TYPE pbs_type;
PBS_EXECUTION_MODE pbs_execution_mode;
uint32_t glwe_dimension;
uint32_t polynomial_size;
uint32_t big_lwe_dimension;
Expand All @@ -438,28 +467,45 @@ struct int_radix_params {

int_radix_params(){};

int_radix_params(PBS_TYPE pbs_type, PBS_EXECUTION_MODE pbs_execution_mode,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus)
: pbs_type(pbs_type), pbs_execution_mode(pbs_execution_mode),
glwe_dimension(glwe_dimension), polynomial_size(polynomial_size),
big_lwe_dimension(big_lwe_dimension),
small_lwe_dimension(small_lwe_dimension), ks_level(ks_level),
ks_base_log(ks_base_log), pbs_level(pbs_level),
pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
message_modulus(message_modulus), carry_modulus(carry_modulus){};

int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus)
: pbs_type(pbs_type), glwe_dimension(glwe_dimension),
polynomial_size(polynomial_size), big_lwe_dimension(big_lwe_dimension),
: pbs_type(pbs_type), pbs_execution_mode(PBS_EXECUTION_MODE::FULL_PBS),
glwe_dimension(glwe_dimension), polynomial_size(polynomial_size),
big_lwe_dimension(big_lwe_dimension),
small_lwe_dimension(small_lwe_dimension), ks_level(ks_level),
ks_base_log(ks_base_log), pbs_level(pbs_level),
pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
message_modulus(message_modulus), carry_modulus(carry_modulus){};

void print() {
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
printf("pbs_type: %u, pbs_execution_mode: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
"big_lwe_dimension: %u, "
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
"%u, pbs_base_log: "
"%u, grouping_factor: %u, message_modulus: %u, carry_modulus: %u\n",
pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
pbs_type, pbs_execution_mode, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
pbs_level, pbs_base_log, grouping_factor, message_modulus,
carry_modulus);
};
};

Expand Down Expand Up @@ -526,7 +572,7 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory);
params.pbs_execution_mode, allocate_gpu_memory);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
Expand Down Expand Up @@ -790,6 +836,90 @@ template <typename Torus> struct int_radix_lut {
}
};

/// Scratch/state buffer for GLWE (de)compression of radix ciphertexts.
///
/// Holds the temporary device buffers used by the packing-keyswitch based
/// compression path (COMPRESS) and the PBS based decompression path
/// (DECOMPRESS). All allocations are stream-ordered on streams[0] /
/// gpu_indexes[0].
template <typename Torus> struct int_compression {
  COMPRESSION_MODE mode;
  int_radix_params encryption_params;
  int_radix_params compression_params;
  // Log2 of the storage modulus used when packing GLWE bodies
  // (presumably for bit-packing on the host side — confirm with caller).
  uint32_t storage_log_modulus;
  // Number of LWE ciphertexts packed into a single GLWE.
  uint32_t lwe_per_glwe;

  // Number of radix blocks handled by this buffer.
  uint32_t body_count;

  // Compression scratch. Initialized to nullptr so that release() and
  // inspection are safe even when allocate_gpu_memory == false.
  Torus *tmp_lwe = nullptr;
  Torus *tmp_glwe_array_out = nullptr;

  // Decompression scratch (only allocated when mode == DECOMPRESS).
  Torus *tmp_extracted_glwe = nullptr;
  Torus *tmp_extracted_lwe = nullptr;
  int_radix_lut<Torus> *carry_extract_lut = nullptr;

  /// Allocates (when allocate_gpu_memory) the scratch buffers required by
  /// the selected mode. For DECOMPRESS, also builds the carry-extract LUT
  /// (x -> x / message_modulus) and broadcasts it to all GPUs.
  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
                  uint32_t gpu_count, int_radix_params encryption_params,
                  int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, COMPRESSION_MODE mode,
                  bool allocate_gpu_memory) {
    this->mode = mode;
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
    this->storage_log_modulus = storage_log_modulus;
    this->body_count = num_radix_blocks;

    if (allocate_gpu_memory) {
      // Size of one GLWE ciphertext, in Torus elements.
      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                    compression_params.polynomial_size;

      // One small-LWE ciphertext per radix block.
      tmp_lwe = (Torus *)cuda_malloc_async(
          num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
      tmp_glwe_array_out = (Torus *)cuda_malloc_async(
          glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);

      if (mode == COMPRESSION_MODE::DECOMPRESS) {
        carry_extract_lut = new int_radix_lut<Torus>(
            streams, gpu_indexes, gpu_count, encryption_params, 1,
            num_radix_blocks, allocate_gpu_memory);

        tmp_extracted_glwe = (Torus *)cuda_malloc_async(
            glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
        // One big-LWE (sample-extracted from the compression GLWE) per block.
        tmp_extracted_lwe = (Torus *)cuda_malloc_async(
            num_radix_blocks *
                (compression_params.glwe_dimension *
                     compression_params.polynomial_size +
                 1) *
                sizeof(Torus),
            streams[0], gpu_indexes[0]);

        // Carry extract LUT: drops the message part, keeping the carry.
        auto carry_extract_f = [encryption_params](Torus x) -> Torus {
          return x / encryption_params.message_modulus;
        };

        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0],
            carry_extract_lut->get_lut(gpu_indexes[0], 0),
            encryption_params.glwe_dimension, encryption_params.polynomial_size,
            encryption_params.message_modulus, encryption_params.carry_modulus,
            carry_extract_f);

        carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
      }
    }
  }

  /// Frees all device buffers (stream-ordered). Must only be called once;
  /// safe when construction was done with allocate_gpu_memory == false only
  /// because pointers are null-initialized (cuda_drop_async on nullptr is
  /// assumed to be a no-op — confirm against the device API).
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
    cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
    if (mode == COMPRESSION_MODE::DECOMPRESS) {
      carry_extract_lut->release(streams, gpu_indexes, gpu_count);
      // Fix: the LUT object is heap-allocated in the constructor with `new`
      // but was never deleted, leaking the host-side object.
      delete carry_extract_lut;
      carry_extract_lut = nullptr;
      cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
      cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
    }
  }
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
Expand Down
14 changes: 14 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);

void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index,
void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array,
uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension,
uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count);

void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
}

#endif // CNCRT_KS_H_
18 changes: 13 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
enum PBS_EXECUTION_MODE { FULL_PBS = 0, PBS_WITHOUT_MS = 1 };

extern "C" {
void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
Expand Down Expand Up @@ -54,12 +55,14 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
int8_t **pbs_buffer);

void scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
void *stream, uint32_t gpu_index, int8_t **buffer,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
void *stream, uint32_t gpu_index, int8_t **buffer,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

Expand Down Expand Up @@ -163,6 +166,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

PBS_VARIANT pbs_variant;

PBS_EXECUTION_MODE pbs_execution_mode = PBS_EXECUTION_MODE::FULL_PBS;

pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
Expand Down Expand Up @@ -368,20 +373,23 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
template <typename Torus>
void scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
#endif

template <typename Torus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
void scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(

void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
Expand Down Expand Up @@ -65,14 +66,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory);

template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

template <typename Torus>
Expand All @@ -87,9 +90,10 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
PBS_EXECUTION_MODE pbs_execution_mode, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory);

template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Expand Down Expand Up @@ -150,6 +154,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

PBS_VARIANT pbs_variant;

PBS_EXECUTION_MODE pbs_execution_mode = PBS_EXECUTION_MODE::FULL_PBS;

pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
Expand Down
85 changes: 85 additions & 0 deletions backends/tfhe-cuda-backend/cuda/nohup.out

Large diffs are not rendered by default.

14 changes: 0 additions & 14 deletions backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Expand Down
Loading

0 comments on commit 006255d

Please sign in to comment.