vortex-data · 0ax1 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -194,6 +194,9 @@ benchmarks/.out
 *.parquet
 !docs/_static/example.parquet
 
+# Generated CUDA kernel artifacts
+*.fatbin
+
 # TPC-H benchmarking data
 /data/
 

diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
@@ -6,7 +6,7 @@
 #![expect(clippy::use_debug)]
 
 use std::env;
-use std::fs::File;
+use std::fs;
 use std::io;
 use std::path::Path;
 use std::path::PathBuf;
@@ -27,13 +27,13 @@ fn main() {
 
     // Source directory for kernels (hand-written and generated .cu/.cuh files)
     let kernels_src = Path::new(&manifest_dir).join("kernels/src");
-    // Output directory for compiled .ptx files - separate by profile.
+    // Output directory for compiled CUDA module files - separate by profile.
     let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);
 
-    std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
+    fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
 
     // Always emit the kernels output directory path as a compile-time env var so any binary
-    // linking against vortex-cuda can find the PTX files. This must be set regardless
+    // linking against vortex-cuda can find the CUDA module files. This must be set regardless
     // of CUDA availability since the code using env!() is always compiled.
     // At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path.
     println!(
@@ -64,8 +64,8 @@ fn main() {
         return;
     }
 
-    // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
-    if let Ok(entries) = std::fs::read_dir(&kernels_src) {
+    // Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen
+    if let Ok(entries) = fs::read_dir(&kernels_src) {
         for path in entries.flatten().map(|entry| entry.path()) {
             let is_generated = path
                 .file_name()
@@ -86,8 +86,8 @@ fn main() {
                     if !is_generated {
                         println!("cargo:rerun-if-changed={}", path.display());
                     }
-                    // Compile all .cu files to PTX in gen directory
-                    nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile)
+                    // Compile all .cu files to CUDA fatbins in gen directory
+                    nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile)
                         .map_err(|e| {
                             format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
                         })
@@ -103,19 +103,19 @@ fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::
     // Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
     // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
     let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
-    let mut cuh_file = File::create(&cuh_path)?;
+    let mut cuh_file = fs::File::create(&cuh_path)?;
     generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
 
     // Generate the standalone kernels (.cu) — includes the lanes header,
-    // adds _device template + __global__ wrappers. Compiled to its own PTX.
+    // adds _device template + __global__ wrappers. Compiled to its own CUDA module.
     let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
-    let mut cu_file = File::create(&cu_path)?;
+    let mut cu_file = fs::File::create(&cu_path)?;
     generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
 
     Ok(cu_path)
 }
 
-fn nvcc_compile_ptx(
+fn nvcc_compile_fatbin(
     include_dir: &Path,
     output_dir: &Path,
     cu_path: &Path,
@@ -148,23 +148,24 @@ fn nvcc_compile_ptx(
         cmd.arg("-O3");
     }
 
-    // Output PTX file goes to output_dir with same base name
-    let ptx_path = output_dir
+    // Output CUDA fatbin file goes to output_dir with same base name.
+    let fatbin_path = output_dir
         .join(cu_path.file_name().unwrap())
-        .with_extension("ptx");
+        .with_extension("fatbin");
 
+    // Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles
+    // PTX to the target GPU's SASS at runtime.
     cmd.arg("-std=c++20")
-        .arg("-arch=native")
+        .arg("-gencode=arch=compute_80,code=compute_80")
         // Flags forwarded to Clang.
         .arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror")
         .arg("--restrict")
-        .arg("--ptx")
+        .arg("--fatbin")
         .arg("--include-path")
         .arg(include_dir)
-        .arg("-c")
         .arg(cu_path)
         .arg("-o")
-        .arg(&ptx_path);
+        .arg(&fatbin_path);
 
     let res = cmd.output()?;
 

diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
@@ -187,7 +187,7 @@ impl CudaExecutionCtx {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `ptypes` - List of ptype strings for the kernel name
     ///
     /// # Errors
@@ -212,7 +212,7 @@ impl CudaExecutionCtx {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings for the kernel name
     ///
     /// # Errors

diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs
@@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config(
     }
 }
 
-/// Loader for CUDA kernels with PTX caching.
+/// Loader for CUDA kernels with module caching.
 ///
-/// Handles loading PTX files, compiling modules, and loading functions.
+/// Handles loading CUDA module files and functions.
 #[derive(Debug)]
 pub(crate) struct KernelLoader {
     /// Cache of loaded CUDA modules, keyed by module name
@@ -215,7 +215,7 @@ impl KernelLoader {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`)
     /// * `cuda_context` - CUDA context for loading the module
     pub fn load_function(
@@ -235,16 +235,16 @@ impl KernelLoader {
         let module = if let Some(entry) = self.modules.get(module_name) {
             Arc::clone(entry.value())
         } else {
-            let ptx_path = Self::ptx_path_for_module(module_name);
+            let module_path = Self::path_for_module(module_name);
 
-            // Compile and load the CUDA module.
+            // Load the CUDA module.
             let module = cuda_context
-                .load_module(Ptx::from_file(&ptx_path))
+                .load_module(Ptx::from_file(&module_path))
                 .map_err(|e| {
                     vortex_err!(
-                        "Failed to load CUDA module {}, ptx path {}: {}",
+                        "Failed to load CUDA module {}, module path {}: {}",
                         module_name,
-                        ptx_path.display(),
+                        module_path.display(),
                         e
                     )
                 })?;
@@ -262,7 +262,7 @@ impl KernelLoader {
             .map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e))
     }
 
-    /// Returns the PTX file path for a given module name.
+    /// Returns the CUDA module file path for a given module name.
     ///
     /// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first,
     /// falling back to the path baked in at compile time by build.rs.
@@ -273,11 +273,11 @@ impl KernelLoader {
     ///
     /// # Returns
     ///
-    /// The full path to the PTX file
-    fn ptx_path_for_module(module_name: &str) -> PathBuf {
+    /// The full path to the CUDA module file
+    fn path_for_module(module_name: &str) -> PathBuf {
         let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR")
             .unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string());
-        Path::new(&kernels_dir).join(format!("{}.ptx", module_name))
+        Path::new(&kernels_dir).join(format!("{}.fatbin", module_name))
     }
 }
 

diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs
@@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4;
 /// CUDA session for GPU accelerated execution.
 ///
 /// Maintains a registry of CUDA kernel implementations for array encodings.
-/// Holds the CUDA context for all GPU operations and caches compiled PTX modules.
+/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules.
 #[derive(Clone, Debug)]
 pub struct CudaSession {
     context: Arc<CudaContext>,
@@ -113,12 +113,12 @@ impl CudaSession {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings to generate kernel name
     ///
     /// # Errors
     ///
-    /// Returns an error if PTX file cannot be read or kernel cannot be loaded.
+    /// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded.
     pub fn load_function_with_suffixes(
         &self,
         module_name: &str,