refactor(gpu): fix sample extraction when nth > 0 and keep input unchanged #1425

Merged: 1 commit, Aug 2, 2024
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,6 +1,7 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H

#include "device.h"
#include <cstdint>

extern "C" {
@@ -14,5 +15,11 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void *dest, void *src,
uint32_t number_of_cts,
uint32_t lwe_dimension);

void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t glwe_dimension,
uint32_t polynomial_size);
};
#endif
56 changes: 56 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,4 +1,5 @@
#include "ciphertext.cuh"
#include "polynomial/parameters.cuh"

void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
uint32_t gpu_index,
@@ -19,3 +20,58 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
(uint64_t *)src, number_of_cts, lwe_dimension);
}

void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t glwe_dimension,
uint32_t polynomial_size) {

switch (polynomial_size) {
case 256:
host_sample_extract<uint64_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 512:
host_sample_extract<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 1024:
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 2048:
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 4096:
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 8192:
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
case 16384:
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
glwe_dimension);
break;
default:
PANIC("Cuda error: unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
}
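The dispatch above only accepts power-of-two polynomial sizes between 256 and 16384; any other value reaches the PANIC branch. A caller-side guard, sketched here in Rust as a hypothetical helper (not part of the PR), keeps invalid sizes from ever hitting the CUDA backend:

```rust
// Hypothetical check mirroring the switch in cuda_glwe_sample_extract_64:
// only powers of two in [256, 16384] have a kernel instantiation.
fn is_supported_polynomial_size(n: u32) -> bool {
    n.is_power_of_two() && (256..=16384).contains(&n)
}
// e.g. is_supported_polynomial_size(2048) == true, is_supported_polynomial_size(3000) == false
```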
36 changes: 36 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -3,6 +3,7 @@

#include "ciphertext.h"
#include "device.h"
#include "polynomial/functions.cuh"
#include <cstdint>

template <typename T>
@@ -25,4 +26,39 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}

template <typename Torus, class params>
__global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t glwe_dimension) {

const int input_id = blockIdx.x;

const int glwe_input_size = (glwe_dimension + 1) * params::degree;
const int lwe_output_size = glwe_dimension * params::degree + 1;

auto lwe_out = lwe_array_out + input_id * lwe_output_size;

// We assume each GLWE covers the first polynomial_size extractions, i.e.
// consecutive blocks of polynomial_size input_ids read from the same GLWE
uint32_t nth_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;

auto nth = nth_array[input_id];

sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
}

template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);

dim3 grid(num_glwes);
dim3 thds(params::degree / params::opt);
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
check_cuda_error(cudaGetLastError());
}

#endif
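The sample_extract kernel launches one block per extracted LWE (input_id = blockIdx.x) and derives both the source GLWE and the output offset from that flat index. As a cross-check of the index arithmetic, here is a host-side mirror in Rust; it is illustrative only and not part of the PR:

```rust
// Illustrative mirror of the kernel's index math: for each extraction, the
// offset of the GLWE it reads in the flat input buffer and the offset of the
// LWE it writes in the flat output buffer.
fn plan_extractions(
    num_extractions: usize,
    glwe_dimension: usize,
    polynomial_size: usize,
) -> Vec<(usize, usize)> {
    let glwe_input_size = (glwe_dimension + 1) * polynomial_size;
    let lwe_output_size = glwe_dimension * polynomial_size + 1;
    (0..num_extractions)
        .map(|input_id| {
            // Consecutive blocks of `polynomial_size` extractions read the same GLWE.
            let glwe_offset = (input_id / polynomial_size) * glwe_input_size;
            let lwe_offset = input_id * lwe_output_size;
            (glwe_offset, lwe_offset)
        })
        .collect()
}
```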
58 changes: 24 additions & 34 deletions backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -191,65 +191,55 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,

// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
// Set first coefficient of the accumulator as the body of the LWE sample
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
accumulator[glwe_dimension * params::degree + nth];
glwe[glwe_dimension * params::degree + nth];
}

// Extracts the mask from the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t num_poly = 1, uint32_t nth = 0) {
for (int z = 0; z < num_poly; z++) {
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension = 1,
uint32_t nth = 0) {
for (int z = 0; z < glwe_dimension; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
Torus *glwe_slice = (Torus *)glwe + (ptrdiff_t)(z * params::degree);

synchronize_threads_in_block();
// Reverse the accumulator
// Reverse the glwe
// Set ACC = -ACC
int tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();

// Set ACC = -ACC
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] =
SEL(-result[i], result[i], tid >= params::degree - nth);
auto x = glwe_slice[params::degree - tid - 1];
result[i] = SEL(-x, x, tid >= params::degree - nth);
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();

// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
// Copy to the mask of the LWE sample
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
// if (tid < 1)
// result[i] = -accumulator_slice[tid - 1 + params::degree];
// result[i] = -glwe_slice[tid - 1 + params::degree];
// else
// result[i] = accumulator_slice[tid - 1];
int x = tid - 1 + SEL(0, params::degree - nth, tid < 1);
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// result[i] = glwe_slice[tid - 1];
uint32_t dst_idx = tid + 1 + nth;
if (dst_idx == params::degree)
lwe_array_out_slice[0] = -result[i];
else {
dst_idx =
SEL(dst_idx, dst_idx - params::degree, dst_idx >= params::degree);
lwe_array_out_slice[dst_idx] = result[i];
}

// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
tid += params::degree / params::opt;
}
}
}
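The rewritten sample_extract_mask reads the GLWE in place instead of writing the reversed and negated coefficients back into it, which is what keeps the input unchanged for nth > 0. For reference, its output should agree with the textbook sample-extraction formula: for each mask polynomial a of degree N, LWE mask entry i is a[nth - i] when i <= nth and -a[N + nth - i] otherwise, and the LWE body is coefficient nth of the GLWE body. A scalar CPU reference in Rust (an illustrative sketch, not the tfhe-rs implementation) is:

```rust
// Scalar reference for GLWE sample extraction (illustrative sketch).
// `glwe` holds (glwe_dimension + 1) * n coefficients: the glwe_dimension mask
// polynomials followed by the body polynomial.
fn sample_extract_reference(glwe: &[u64], glwe_dimension: usize, n: usize, nth: usize) -> Vec<u64> {
    assert!(nth < n);
    let mut lwe = vec![0u64; glwe_dimension * n + 1];
    for z in 0..glwe_dimension {
        let a = &glwe[z * n..(z + 1) * n];
        for i in 0..n {
            // Mask entries follow the standard sample-extraction indexing,
            // with a sign flip on the negacyclic wrap-around.
            lwe[z * n + i] = if i <= nth {
                a[nth - i]
            } else {
                a[n + nth - i].wrapping_neg()
            };
        }
    }
    // The LWE body is the nth coefficient of the GLWE body polynomial.
    lwe[glwe_dimension * n] = glwe[glwe_dimension * n + nth];
    lwe
}
```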
10 changes: 10 additions & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -598,6 +598,16 @@ extern "C" {
gpu_count: u32,
mem_ptr: *mut *mut i8,
);
pub fn cuda_glwe_sample_extract_64(
stream: *mut c_void,
gpu_index: u32,
lwe_array_out: *mut c_void,
glwe_array_in: *const c_void,
nth_array: *const u32,
num_glwes: u32,
glwe_dimension: u32,
polynomial_size: u32,
);

pub fn scratch_cuda_integer_radix_comparison_kb_64(
streams: *const *mut c_void,
60 changes: 60 additions & 0 deletions tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -0,0 +1,60 @@
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_async, CudaStreams};
use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
use itertools::Itertools;

/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
/// GPU-accelerated.
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
vec_nth: &[MonomialDegree],
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus,
{
let in_lwe_dim = input_glwe_list
.glwe_dimension()
.to_equivalent_lwe_dimension(input_glwe_list.polynomial_size());

let out_lwe_dim = output_lwe_list.lwe_dimension();

assert_eq!(
in_lwe_dim, out_lwe_dim,
"Mismatch between equivalent LweDimension of input ciphertext and output ciphertext. \
Got {in_lwe_dim:?} for input and {out_lwe_dim:?} for output.",
);

assert_eq!(
vec_nth.len(),
input_glwe_list.glwe_ciphertext_count().0 * input_glwe_list.polynomial_size().0,
"Mismatch between number of nths and number of GLWEs provided.",
);

assert_eq!(
input_glwe_list.ciphertext_modulus(),
output_lwe_list.ciphertext_modulus(),
"Mismatched moduli between input_glwe ({:?}) and output_lwe ({:?})",
input_glwe_list.ciphertext_modulus(),
output_lwe_list.ciphertext_modulus()
);

let nth_array: Vec<u32> = vec_nth.iter().map(|x| x.0 as u32).collect_vec();
let gpu_indexes = &streams.gpu_indexes;
unsafe {
let d_nth_array = CudaVec::from_cpu_async(&nth_array, streams, gpu_indexes[0]);
extract_lwe_samples_from_glwe_ciphertext_list_async(
streams,
&mut output_lwe_list.0.d_vec,
&input_glwe_list.0.d_vec,
&d_nth_array,
vec_nth.len() as u32,
input_glwe_list.glwe_dimension(),
input_glwe_list.polynomial_size(),
);
}
}
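The vec_nth slice carries one MonomialDegree per extracted LWE, and per the length assertion and the kernel's indexing, the first polynomial_size entries address the first GLWE in the list, the next polynomial_size the second, and so on. As a usage-shaped sketch (the helper and import path below are assumptions, not part of the PR), an nth array that extracts every coefficient of every GLWE could be built like this:

```rust
use tfhe::core_crypto::prelude::MonomialDegree;

// Hypothetical helper: one MonomialDegree per extracted LWE, laid out so that
// each block of `polynomial_size` entries targets one GLWE of the input list.
fn all_coefficients(num_glwes: usize, polynomial_size: usize) -> Vec<MonomialDegree> {
    (0..num_glwes)
        .flat_map(|_| (0..polynomial_size).map(MonomialDegree))
        .collect()
}
```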
1 change: 1 addition & 0 deletions tfhe/src/core_crypto/gpu/algorithms/mod.rs
@@ -2,6 +2,7 @@ pub mod lwe_linear_algebra;
pub mod lwe_multi_bit_programmable_bootstrapping;
pub mod lwe_programmable_bootstrapping;

pub mod glwe_sample_extraction;
mod lwe_keyswitch;
#[cfg(test)]
mod test;