From 6847cbe6d661b5756a409c70e3824333f0564892 Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Thu, 21 May 2026 13:15:55 +0000
Subject: [PATCH 1/2] fix: rebuild CUDA PTX when GPU config changes

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 vortex-cuda/build.rs | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
index e6024d3c275..04f1533969f 100644
--- a/vortex-cuda/build.rs
+++ b/vortex-cuda/build.rs
@@ -6,6 +6,7 @@
 #![expect(clippy::use_debug)]
 
 use std::env;
+use std::fs;
 use std::fs::File;
 use std::io;
 use std::path::Path;
@@ -20,6 +21,8 @@ use crate::bit_unpack_gen::generate_cuda_unpack_lanes;
 #[path = "src/bit_unpack_gen.rs"]
 pub mod bit_unpack_gen;
 
+const NVIDIA_GPU_INFO_DIR: &str = "/proc/driver/nvidia/gpus";
+
 fn main() {
     let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
     // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
@@ -30,7 +33,7 @@ fn main() {
     // Output directory for compiled .ptx files - separate by profile.
     let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);
 
-    std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
+    fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
 
     // Always emit the kernels output directory path as a compile-time env var so any binary
     // linking against vortex-cuda can find the PTX files. This must be set regardless
@@ -42,6 +45,11 @@ fn main() {
     );
 
     println!("cargo:rerun-if-env-changed=PROFILE");
+    // Re-run if the user changes which GPUs are visible to CUDA between builds.
+    println!("cargo:rerun-if-env-changed=CUDA_VISIBLE_DEVICES");
+    // CI stale-rebuild checks require deterministic Cargo inputs, so skip volatile
+    // NVIDIA procfs watches when the standard CI marker is set.
+    println!("cargo:rerun-if-env-changed=CI");
 
     // Regenerate bit_unpack kernels only when the generator changes
     println!(
@@ -64,8 +72,12 @@ fn main() {
         return;
     }
 
+    if env::var_os("CI").is_none() {
+        watch_nvidia_gpu_info_files();
+    }
+
     // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
-    if let Ok(entries) = std::fs::read_dir(&kernels_src) {
+    if let Ok(entries) = fs::read_dir(&kernels_src) {
         for path in entries.flatten().map(|entry| entry.path()) {
             let is_generated = path
                 .file_name()
@@ -99,6 +111,19 @@ fn main() {
     }
 }
 
+fn watch_nvidia_gpu_info_files() {
+    let Ok(entries) = fs::read_dir(NVIDIA_GPU_INFO_DIR) else {
+        return;
+    };
+
+    for entry in entries.flatten() {
+        let info_path = entry.path().join("information");
+        if info_path.is_file() {
+            println!("cargo:rerun-if-changed={}", info_path.display());
+        }
+    }
+}
+
 fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
     // Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
     // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).

From b89019259bed1f5df2943e069cea7a3ff3a070e6 Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Thu, 21 May 2026 14:55:09 +0000
Subject: [PATCH 2/2] compile cuda kernels as fatbin

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
---
 .gitignore                    |  3 ++
 vortex-cuda/build.rs          | 58 ++++++++++-------------------------
 vortex-cuda/src/executor.rs   |  4 +--
 vortex-cuda/src/kernel/mod.rs | 24 +++++++--------
 vortex-cuda/src/session.rs    |  6 ++--
 5 files changed, 37 insertions(+), 58 deletions(-)

diff --git a/.gitignore b/.gitignore
index bcc8ef746ee..8bac7abce15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -194,6 +194,9 @@ benchmarks/.out
 *.parquet
 !docs/_static/example.parquet
 
+# Generated CUDA kernel artifacts
+*.fatbin
+
 # TPC-H benchmarking data
 /data/
 
diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
index 04f1533969f..82d46e80837 100644
--- a/vortex-cuda/build.rs
+++ b/vortex-cuda/build.rs
@@ -7,7 +7,6 @@
 
 use std::env;
 use std::fs;
-use std::fs::File;
 use std::io;
 use std::path::Path;
 use std::path::PathBuf;
@@ -21,8 +20,6 @@ use crate::bit_unpack_gen::generate_cuda_unpack_lanes;
 #[path = "src/bit_unpack_gen.rs"]
 pub mod bit_unpack_gen;
 
-const NVIDIA_GPU_INFO_DIR: &str = "/proc/driver/nvidia/gpus";
-
 fn main() {
     let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
     // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
@@ -30,13 +27,13 @@ fn main() {
 
     // Source directory for kernels (hand-written and generated .cu/.cuh files)
     let kernels_src = Path::new(&manifest_dir).join("kernels/src");
-    // Output directory for compiled .ptx files - separate by profile.
+    // Output directory for compiled CUDA module files - separate by profile.
     let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);
 
     fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
 
     // Always emit the kernels output directory path as a compile-time env var so any binary
-    // linking against vortex-cuda can find the PTX files. This must be set regardless
+    // linking against vortex-cuda can find the CUDA module files. This must be set regardless
     // of CUDA availability since the code using env!() is always compiled.
     // At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path.
     println!(
@@ -45,11 +42,6 @@ fn main() {
     );
 
     println!("cargo:rerun-if-env-changed=PROFILE");
-    // Re-run if the user changes which GPUs are visible to CUDA between builds.
-    println!("cargo:rerun-if-env-changed=CUDA_VISIBLE_DEVICES");
-    // CI stale-rebuild checks require deterministic Cargo inputs, so skip volatile
-    // NVIDIA procfs watches when the standard CI marker is set.
-    println!("cargo:rerun-if-env-changed=CI");
 
     // Regenerate bit_unpack kernels only when the generator changes
     println!(
@@ -72,11 +64,7 @@ fn main() {
         return;
     }
 
-    if env::var_os("CI").is_none() {
-        watch_nvidia_gpu_info_files();
-    }
-
-    // Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
+    // Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen
     if let Ok(entries) = fs::read_dir(&kernels_src) {
         for path in entries.flatten().map(|entry| entry.path()) {
             let is_generated = path
@@ -98,8 +86,8 @@ fn main() {
                     if !is_generated {
                         println!("cargo:rerun-if-changed={}", path.display());
                     }
-                    // Compile all .cu files to PTX in gen directory
-                    nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile)
+                    // Compile all .cu files to CUDA fatbins in gen directory
+                    nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile)
                         .map_err(|e| {
                             format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
                         })
@@ -111,36 +99,23 @@ fn main() {
     }
 }
 
-fn watch_nvidia_gpu_info_files() {
-    let Ok(entries) = fs::read_dir(NVIDIA_GPU_INFO_DIR) else {
-        return;
-    };
-
-    for entry in entries.flatten() {
-        let info_path = entry.path().join("information");
-        if info_path.is_file() {
-            println!("cargo:rerun-if-changed={}", info_path.display());
-        }
-    }
-}
-
 fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<PathBuf> {
     // Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
     // This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
     let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
-    let mut cuh_file = File::create(&cuh_path)?;
+    let mut cuh_file = fs::File::create(&cuh_path)?;
     generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;
 
     // Generate the standalone kernels (.cu) — includes the lanes header,
-    // adds _device template + __global__ wrappers. Compiled to its own PTX.
+    // adds _device template + __global__ wrappers. Compiled to its own CUDA module.
     let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
-    let mut cu_file = File::create(&cu_path)?;
+    let mut cu_file = fs::File::create(&cu_path)?;
     generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;
 
     Ok(cu_path)
 }
 
-fn nvcc_compile_ptx(
+fn nvcc_compile_fatbin(
     include_dir: &Path,
     output_dir: &Path,
     cu_path: &Path,
@@ -173,23 +148,24 @@ fn nvcc_compile_ptx(
         cmd.arg("-O3");
     }
 
-    // Output PTX file goes to output_dir with same base name
-    let ptx_path = output_dir
+    // Output CUDA fatbin file goes to output_dir with same base name.
+    let fatbin_path = output_dir
         .join(cu_path.file_name().unwrap())
-        .with_extension("ptx");
+        .with_extension("fatbin");
 
+    // Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles
+    // PTX to the target GPU's SASS at runtime.
     cmd.arg("-std=c++20")
-        .arg("-arch=native")
+        .arg("-gencode=arch=compute_80,code=compute_80")
         // Flags forwarded to Clang.
         .arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror")
         .arg("--restrict")
-        .arg("--ptx")
+        .arg("--fatbin")
         .arg("--include-path")
         .arg(include_dir)
-        .arg("-c")
         .arg(cu_path)
         .arg("-o")
-        .arg(&ptx_path);
+        .arg(&fatbin_path);
 
     let res = cmd.output()?;
 
diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs
index 5a4e826da81..8682cbb40b6 100644
--- a/vortex-cuda/src/executor.rs
+++ b/vortex-cuda/src/executor.rs
@@ -187,7 +187,7 @@ impl CudaExecutionCtx {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `ptypes` - List of ptype strings for the kernel name
     ///
     /// # Errors
@@ -212,7 +212,7 @@ impl CudaExecutionCtx {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings for the kernel name
     ///
     /// # Errors
diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs
index 5665bc6c3f3..28d12a39433 100644
--- a/vortex-cuda/src/kernel/mod.rs
+++ b/vortex-cuda/src/kernel/mod.rs
@@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config(
     }
 }
 
-/// Loader for CUDA kernels with PTX caching.
+/// Loader for CUDA kernels with module caching.
 ///
-/// Handles loading PTX files, compiling modules, and loading functions.
+/// Handles loading CUDA module files and functions.
 #[derive(Debug)]
 pub(crate) struct KernelLoader {
     /// Cache of loaded CUDA modules, keyed by module name
@@ -215,7 +215,7 @@ impl KernelLoader {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`)
     /// * `cuda_context` - CUDA context for loading the module
     pub fn load_function(
@@ -235,16 +235,16 @@ impl KernelLoader {
         let module = if let Some(entry) = self.modules.get(module_name) {
             Arc::clone(entry.value())
         } else {
-            let ptx_path = Self::ptx_path_for_module(module_name);
+            let module_path = Self::path_for_module(module_name);
 
-            // Compile and load the CUDA module.
+            // Load the CUDA module.
             let module = cuda_context
-                .load_module(Ptx::from_file(&ptx_path))
+                .load_module(Ptx::from_file(&module_path))
                 .map_err(|e| {
                     vortex_err!(
-                        "Failed to load CUDA module {}, ptx path {}: {}",
+                        "Failed to load CUDA module {}, module path {}: {}",
                         module_name,
-                        ptx_path.display(),
+                        module_path.display(),
                         e
                     )
                 })?;
@@ -262,7 +262,7 @@ impl KernelLoader {
             .map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e))
     }
 
-    /// Returns the PTX file path for a given module name.
+    /// Returns the CUDA module file path for a given module name.
     ///
     /// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first,
     /// falling back to the path baked in at compile time by build.rs.
@@ -273,11 +273,11 @@ impl KernelLoader {
     ///
     /// # Returns
     ///
-    /// The full path to the PTX file
-    fn ptx_path_for_module(module_name: &str) -> PathBuf {
+    /// The full path to the CUDA module file
+    fn path_for_module(module_name: &str) -> PathBuf {
         let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR")
             .unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string());
-        Path::new(&kernels_dir).join(format!("{}.ptx", module_name))
+        Path::new(&kernels_dir).join(format!("{}.fatbin", module_name))
     }
 }
 
diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs
index 4ef1736021c..5bcb4100723 100644
--- a/vortex-cuda/src/session.rs
+++ b/vortex-cuda/src/session.rs
@@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4;
 /// CUDA session for GPU accelerated execution.
 ///
 /// Maintains a registry of CUDA kernel implementations for array encodings.
-/// Holds the CUDA context for all GPU operations and caches compiled PTX modules.
+/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules.
 #[derive(Clone, Debug)]
 pub struct CudaSession {
     context: Arc<CudaContext>,
@@ -113,12 +113,12 @@ impl CudaSession {
     ///
     /// # Arguments
     ///
-    /// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
+    /// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
     /// * `type_suffixes` - List of type suffix strings to generate kernel name
     ///
     /// # Errors
     ///
-    /// Returns an error if PTX file cannot be read or kernel cannot be loaded.
+    /// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded.
     pub fn load_function_with_suffixes(
         &self,
         module_name: &str,