diff --git a/.gitignore b/.gitignore index bcc8ef746ee..8bac7abce15 100644 --- a/.gitignore +++ b/.gitignore @@ -194,6 +194,9 @@ benchmarks/.out *.parquet !docs/_static/example.parquet +# Generated CUDA kernel artifacts +*.fatbin + # TPC-H benchmarking data /data/ diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs index e6024d3c275..82d46e80837 100644 --- a/vortex-cuda/build.rs +++ b/vortex-cuda/build.rs @@ -6,7 +6,7 @@ #![expect(clippy::use_debug)] use std::env; -use std::fs::File; +use std::fs; use std::io; use std::path::Path; use std::path::PathBuf; @@ -27,13 +27,13 @@ fn main() { // Source directory for kernels (hand-written and generated .cu/.cuh files) let kernels_src = Path::new(&manifest_dir).join("kernels/src"); - // Output directory for compiled .ptx files - separate by profile. + // Output directory for compiled CUDA module files - separate by profile. let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile); - std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory"); + fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory"); // Always emit the kernels output directory path as a compile-time env var so any binary - // linking against vortex-cuda can find the PTX files. This must be set regardless + // linking against vortex-cuda can find the CUDA module files. This must be set regardless // of CUDA availability since the code using env!() is always compiled. // At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path. println!( @@ -64,8 +64,8 @@ fn main() { return; } - // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen - if let Ok(entries) = std::fs::read_dir(&kernels_src) { + // Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen + if let Ok(entries) = fs::read_dir(&kernels_src) { for path in entries.flatten().map(|entry| entry.path()) { let is_generated = path .file_name() @@ -86,8 +86,8 @@ fn main() { if !is_generated { println!("cargo:rerun-if-changed={}", path.display()); } - // Compile all .cu files to PTX in gen directory - nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile) + // Compile all .cu files to CUDA fatbins in gen directory + nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile) .map_err(|e| { format!("Failed to compile CUDA kernel {}: {}", path.display(), e) }) @@ -103,19 +103,19 @@ fn generate_unpack(output_dir: &Path, thread_count: usize) -> io:: // Generate the lanes header (.cuh) — device functions only, no __global__ kernels. // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh). let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T)); - let mut cuh_file = File::create(&cuh_path)?; + let mut cuh_file = fs::File::create(&cuh_path)?; generate_cuda_unpack_lanes::(&mut cuh_file)?; // Generate the standalone kernels (.cu) — includes the lanes header, - // adds _device template + __global__ wrappers. Compiled to its own PTX. + // adds _device template + __global__ wrappers. Compiled to its own CUDA module. let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T)); - let mut cu_file = File::create(&cu_path)?; + let mut cu_file = fs::File::create(&cu_path)?; generate_cuda_unpack_kernels::(&mut cu_file, thread_count)?; Ok(cu_path) } -fn nvcc_compile_ptx( +fn nvcc_compile_fatbin( include_dir: &Path, output_dir: &Path, cu_path: &Path, @@ -148,23 +148,24 @@ fn nvcc_compile_ptx( cmd.arg("-O3"); } - // Output PTX file goes to output_dir with same base name - let ptx_path = output_dir + // Output CUDA fatbin file goes to output_dir with same base name. + let fatbin_path = output_dir .join(cu_path.file_name().unwrap()) - .with_extension("ptx"); + .with_extension("fatbin"); + // Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles + // PTX to the target GPU's SASS at runtime. cmd.arg("-std=c++20") - .arg("-arch=native") + .arg("-gencode=arch=compute_80,code=compute_80") // Flags forwarded to Clang. .arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror") .arg("--restrict") - .arg("--ptx") + .arg("--fatbin") .arg("--include-path") .arg(include_dir) - .arg("-c") .arg(cu_path) .arg("-o") - .arg(&ptx_path); + .arg(&fatbin_path); let res = cmd.output()?; diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 5a4e826da81..8682cbb40b6 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -187,7 +187,7 @@ impl CudaExecutionCtx { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `ptypes` - List of ptype strings for the kernel name /// /// # Errors @@ -212,7 +212,7 @@ impl CudaExecutionCtx { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings for the kernel name /// /// # Errors diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 5665bc6c3f3..28d12a39433 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config( } } -/// Loader for CUDA kernels with PTX caching. +/// Loader for CUDA kernels with module caching. /// -/// Handles loading PTX files, compiling modules, and loading functions. +/// Handles loading CUDA module files and functions. #[derive(Debug)] pub(crate) struct KernelLoader { /// Cache of loaded CUDA modules, keyed by module name @@ -215,7 +215,7 @@ impl KernelLoader { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`) /// * `cuda_context` - CUDA context for loading the module pub fn load_function( @@ -235,16 +235,16 @@ impl KernelLoader { let module = if let Some(entry) = self.modules.get(module_name) { Arc::clone(entry.value()) } else { - let ptx_path = Self::ptx_path_for_module(module_name); + let module_path = Self::path_for_module(module_name); - // Compile and load the CUDA module. + // Load the CUDA module. let module = cuda_context - .load_module(Ptx::from_file(&ptx_path)) + .load_module(Ptx::from_file(&module_path)) .map_err(|e| { vortex_err!( - "Failed to load CUDA module {}, ptx path {}: {}", + "Failed to load CUDA module {}, module path {}: {}", module_name, - ptx_path.display(), + module_path.display(), e ) })?; @@ -262,7 +262,7 @@ impl KernelLoader { .map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e)) } - /// Returns the PTX file path for a given module name. + /// Returns the CUDA module file path for a given module name. /// /// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first, /// falling back to the path baked in at compile time by build.rs. @@ -273,11 +273,11 @@ impl KernelLoader { /// /// # Returns /// - /// The full path to the PTX file - fn ptx_path_for_module(module_name: &str) -> PathBuf { + /// The full path to the CUDA module file + fn path_for_module(module_name: &str) -> PathBuf { let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR") .unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string()); - Path::new(&kernels_dir).join(format!("{}.ptx", module_name)) + Path::new(&kernels_dir).join(format!("{}.fatbin", module_name)) } } diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 4ef1736021c..5bcb4100723 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4; /// CUDA session for GPU accelerated execution. /// /// Maintains a registry of CUDA kernel implementations for array encodings. -/// Holds the CUDA context for all GPU operations and caches compiled PTX modules. +/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules. #[derive(Clone, Debug)] pub struct CudaSession { context: Arc, @@ -113,12 +113,12 @@ impl CudaSession { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings to generate kernel name /// /// # Errors /// - /// Returns an error if PTX file cannot be read or kernel cannot be loaded. + /// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded. pub fn load_function_with_suffixes( &self, module_name: &str,