From 6847cbe6d661b5756a409c70e3824333f0564892 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Thu, 21 May 2026 13:15:55 +0000 Subject: [PATCH 1/2] fix: rebuild CUDA PTX when GPU config changes Signed-off-by: Alexander Droste --- vortex-cuda/build.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs index e6024d3c275..04f1533969f 100644 --- a/vortex-cuda/build.rs +++ b/vortex-cuda/build.rs @@ -6,6 +6,7 @@ #![expect(clippy::use_debug)] use std::env; +use std::fs; use std::fs::File; use std::io; use std::path::Path; @@ -20,6 +21,8 @@ use crate::bit_unpack_gen::generate_cuda_unpack_lanes; #[path = "src/bit_unpack_gen.rs"] pub mod bit_unpack_gen; +const NVIDIA_GPU_INFO_DIR: &str = "/proc/driver/nvidia/gpus"; + fn main() { let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir"); // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts @@ -30,7 +33,7 @@ fn main() { // Output directory for compiled .ptx files - separate by profile. let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile); - std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory"); + fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory"); // Always emit the kernels output directory path as a compile-time env var so any binary // linking against vortex-cuda can find the PTX files. This must be set regardless @@ -42,6 +45,11 @@ fn main() { ); println!("cargo:rerun-if-env-changed=PROFILE"); + // Re-run if the user changes which GPUs are visible to CUDA between builds. + println!("cargo:rerun-if-env-changed=CUDA_VISIBLE_DEVICES"); + // CI stale-rebuild checks require deterministic Cargo inputs, so skip volatile + // NVIDIA procfs watches when the standard CI marker is set. + println!("cargo:rerun-if-env-changed=CI"); // Regenerate bit_unpack kernels only when the generator changes println!( @@ -64,8 +72,12 @@ fn main() { return; } + if env::var_os("CI").is_none() { + watch_nvidia_gpu_info_files(); + } + // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen - if let Ok(entries) = std::fs::read_dir(&kernels_src) { + if let Ok(entries) = fs::read_dir(&kernels_src) { for path in entries.flatten().map(|entry| entry.path()) { let is_generated = path .file_name() @@ -99,6 +111,19 @@ fn main() { } } +fn watch_nvidia_gpu_info_files() { + let Ok(entries) = fs::read_dir(NVIDIA_GPU_INFO_DIR) else { + return; + }; + + for entry in entries.flatten() { + let info_path = entry.path().join("information"); + if info_path.is_file() { + println!("cargo:rerun-if-changed={}", info_path.display()); + } + } +} + fn generate_unpack(output_dir: &Path, thread_count: usize) -> io::Result { // Generate the lanes header (.cuh) — device functions only, no __global__ kernels. // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh). From b89019259bed1f5df2943e069cea7a3ff3a070e6 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Thu, 21 May 2026 14:55:09 +0000 Subject: [PATCH 2/2] compile cuda kernels as fatbin Signed-off-by: Alexander Droste --- .gitignore | 3 ++ vortex-cuda/build.rs | 58 ++++++++++------------------------- vortex-cuda/src/executor.rs | 4 +-- vortex-cuda/src/kernel/mod.rs | 24 +++++++-------- vortex-cuda/src/session.rs | 6 ++-- 5 files changed, 37 insertions(+), 58 deletions(-) diff --git a/.gitignore b/.gitignore index bcc8ef746ee..8bac7abce15 100644 --- a/.gitignore +++ b/.gitignore @@ -194,6 +194,9 @@ benchmarks/.out *.parquet !docs/_static/example.parquet +# Generated CUDA kernel artifacts +*.fatbin + # TPC-H benchmarking data /data/ diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs index 04f1533969f..82d46e80837 100644 --- a/vortex-cuda/build.rs +++ b/vortex-cuda/build.rs @@ -7,7 +7,6 @@ use std::env; use std::fs; -use std::fs::File; use std::io; use std::path::Path; use std::path::PathBuf; @@ -21,8 +20,6 @@ use crate::bit_unpack_gen::generate_cuda_unpack_lanes; #[path = "src/bit_unpack_gen.rs"] pub mod bit_unpack_gen; -const NVIDIA_GPU_INFO_DIR: &str = "/proc/driver/nvidia/gpus"; - fn main() { let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir"); // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts @@ -30,13 +27,13 @@ fn main() { // Source directory for kernels (hand-written and generated .cu/.cuh files) let kernels_src = Path::new(&manifest_dir).join("kernels/src"); - // Output directory for compiled .ptx files - separate by profile. + // Output directory for compiled CUDA module files - separate by profile. let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile); fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory"); // Always emit the kernels output directory path as a compile-time env var so any binary - // linking against vortex-cuda can find the PTX files. This must be set regardless + // linking against vortex-cuda can find the CUDA module files. This must be set regardless // of CUDA availability since the code using env!() is always compiled. // At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path. println!( @@ -45,11 +42,6 @@ fn main() { ); println!("cargo:rerun-if-env-changed=PROFILE"); - // Re-run if the user changes which GPUs are visible to CUDA between builds. - println!("cargo:rerun-if-env-changed=CUDA_VISIBLE_DEVICES"); - // CI stale-rebuild checks require deterministic Cargo inputs, so skip volatile - // NVIDIA procfs watches when the standard CI marker is set. - println!("cargo:rerun-if-env-changed=CI"); // Regenerate bit_unpack kernels only when the generator changes println!( @@ -72,11 +64,7 @@ fn main() { return; } - if env::var_os("CI").is_none() { - watch_nvidia_gpu_info_files(); - } - - // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen + // Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen if let Ok(entries) = fs::read_dir(&kernels_src) { for path in entries.flatten().map(|entry| entry.path()) { let is_generated = path @@ -98,8 +86,8 @@ fn main() { if !is_generated { println!("cargo:rerun-if-changed={}", path.display()); } - // Compile all .cu files to PTX in gen directory - nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile) + // Compile all .cu files to CUDA fatbins in gen directory + nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile) .map_err(|e| { format!("Failed to compile CUDA kernel {}: {}", path.display(), e) }) @@ -111,36 +99,23 @@ fn main() { } } -fn watch_nvidia_gpu_info_files() { - let Ok(entries) = fs::read_dir(NVIDIA_GPU_INFO_DIR) else { - return; - }; - - for entry in entries.flatten() { - let info_path = entry.path().join("information"); - if info_path.is_file() { - println!("cargo:rerun-if-changed={}", info_path.display()); - } - } -} - fn generate_unpack(output_dir: &Path, thread_count: usize) -> io::Result { // Generate the lanes header (.cuh) — device functions only, no __global__ kernels. // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh). let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T)); - let mut cuh_file = File::create(&cuh_path)?; + let mut cuh_file = fs::File::create(&cuh_path)?; generate_cuda_unpack_lanes::(&mut cuh_file)?; // Generate the standalone kernels (.cu) — includes the lanes header, - // adds _device template + __global__ wrappers. Compiled to its own PTX. + // adds _device template + __global__ wrappers. Compiled to its own CUDA module. let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T)); - let mut cu_file = File::create(&cu_path)?; + let mut cu_file = fs::File::create(&cu_path)?; generate_cuda_unpack_kernels::(&mut cu_file, thread_count)?; Ok(cu_path) } -fn nvcc_compile_ptx( +fn nvcc_compile_fatbin( include_dir: &Path, output_dir: &Path, cu_path: &Path, @@ -173,23 +148,24 @@ fn nvcc_compile_ptx( cmd.arg("-O3"); } - // Output PTX file goes to output_dir with same base name - let ptx_path = output_dir + // Output CUDA fatbin file goes to output_dir with same base name. + let fatbin_path = output_dir .join(cu_path.file_name().unwrap()) - .with_extension("ptx"); + .with_extension("fatbin"); + // Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles + // PTX to the target GPU's SASS at runtime. cmd.arg("-std=c++20") - .arg("-arch=native") + .arg("-gencode=arch=compute_80,code=compute_80") // Flags forwarded to Clang. .arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror") .arg("--restrict") - .arg("--ptx") + .arg("--fatbin") .arg("--include-path") .arg(include_dir) - .arg("-c") .arg(cu_path) .arg("-o") - .arg(&ptx_path); + .arg(&fatbin_path); let res = cmd.output()?; diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index 5a4e826da81..8682cbb40b6 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -187,7 +187,7 @@ impl CudaExecutionCtx { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `ptypes` - List of ptype strings for the kernel name /// /// # Errors @@ -212,7 +212,7 @@ impl CudaExecutionCtx { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings for the kernel name /// /// # Errors diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 5665bc6c3f3..28d12a39433 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config( } } -/// Loader for CUDA kernels with PTX caching. +/// Loader for CUDA kernels with module caching. /// -/// Handles loading PTX files, compiling modules, and loading functions. +/// Handles loading CUDA module files and functions. #[derive(Debug)] pub(crate) struct KernelLoader { /// Cache of loaded CUDA modules, keyed by module name @@ -215,7 +215,7 @@ impl KernelLoader { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`) /// * `cuda_context` - CUDA context for loading the module pub fn load_function( @@ -235,16 +235,16 @@ impl KernelLoader { let module = if let Some(entry) = self.modules.get(module_name) { Arc::clone(entry.value()) } else { - let ptx_path = Self::ptx_path_for_module(module_name); + let module_path = Self::path_for_module(module_name); - // Compile and load the CUDA module. + // Load the CUDA module. let module = cuda_context - .load_module(Ptx::from_file(&ptx_path)) + .load_module(Ptx::from_file(&module_path)) .map_err(|e| { vortex_err!( - "Failed to load CUDA module {}, ptx path {}: {}", + "Failed to load CUDA module {}, module path {}: {}", module_name, - ptx_path.display(), + module_path.display(), e ) })?; @@ -262,7 +262,7 @@ impl KernelLoader { .map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e)) } - /// Returns the PTX file path for a given module name. + /// Returns the CUDA module file path for a given module name. /// /// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first, /// falling back to the path baked in at compile time by build.rs. @@ -273,11 +273,11 @@ impl KernelLoader { /// /// # Returns /// - /// The full path to the PTX file - fn ptx_path_for_module(module_name: &str) -> PathBuf { + /// The full path to the CUDA module file + fn path_for_module(module_name: &str) -> PathBuf { let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR") .unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string()); - Path::new(&kernels_dir).join(format!("{}.ptx", module_name)) + Path::new(&kernels_dir).join(format!("{}.fatbin", module_name)) } } diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 4ef1736021c..5bcb4100723 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4; /// CUDA session for GPU accelerated execution. /// /// Maintains a registry of CUDA kernel implementations for array encodings. -/// Holds the CUDA context for all GPU operations and caches compiled PTX modules. +/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules. #[derive(Clone, Debug)] pub struct CudaSession { context: Arc, @@ -113,12 +113,12 @@ impl CudaSession { /// /// # Arguments /// - /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`) + /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`) /// * `type_suffixes` - List of type suffix strings to generate kernel name /// /// # Errors /// - /// Returns an error if PTX file cannot be read or kernel cannot be loaded. + /// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded. pub fn load_function_with_suffixes( &self, module_name: &str,