Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ benchmarks/.out
*.parquet
!docs/_static/example.parquet

# Generated CUDA kernel artifacts
*.fatbin

# TPC-H benchmarking data
/data/

Expand Down
39 changes: 20 additions & 19 deletions vortex-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#![expect(clippy::use_debug)]

use std::env;
use std::fs::File;
use std::fs;
use std::io;
use std::path::Path;
use std::path::PathBuf;
Expand All @@ -27,13 +27,13 @@ fn main() {

// Source directory for kernels (hand-written and generated .cu/.cuh files)
let kernels_src = Path::new(&manifest_dir).join("kernels/src");
// Output directory for compiled .ptx files - separate by profile.
// Output directory for compiled CUDA module files - separate by profile.
let kernels_gen = Path::new(&manifest_dir).join("kernels/gen").join(&profile);

std::fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");
fs::create_dir_all(&kernels_gen).expect("Failed to create kernels/gen directory");

// Always emit the kernels output directory path as a compile-time env var so any binary
// linking against vortex-cuda can find the PTX files. This must be set regardless
// linking against vortex-cuda can find the CUDA module files. This must be set regardless
// of CUDA availability since the code using env!() is always compiled.
// At runtime, VORTEX_CUDA_KERNELS_DIR can be set to override this path.
println!(
Expand Down Expand Up @@ -64,8 +64,8 @@ fn main() {
return;
}

// Watch and compile .cu and .cuh files from kernels/src to PTX in kernels/gen
if let Ok(entries) = std::fs::read_dir(&kernels_src) {
// Watch and compile .cu and .cuh files from kernels/src to CUDA modules in kernels/gen
if let Ok(entries) = fs::read_dir(&kernels_src) {
for path in entries.flatten().map(|entry| entry.path()) {
let is_generated = path
.file_name()
Expand All @@ -86,8 +86,8 @@ fn main() {
if !is_generated {
println!("cargo:rerun-if-changed={}", path.display());
}
// Compile all .cu files to PTX in gen directory
nvcc_compile_ptx(&kernels_src, &kernels_gen, &path, &profile)
// Compile all .cu files to CUDA fatbins in gen directory
nvcc_compile_fatbin(&kernels_src, &kernels_gen, &path, &profile)
.map_err(|e| {
format!("Failed to compile CUDA kernel {}: {}", path.display(), e)
})
Expand All @@ -103,19 +103,19 @@ fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::
// Generate the lanes header (.cuh) — device functions only, no __global__ kernels.
// This is what dynamic_dispatch.cu includes (via bit_unpack.cuh).
let cuh_path = output_dir.join(format!("bit_unpack_{}_lanes.cuh", T::T));
let mut cuh_file = File::create(&cuh_path)?;
let mut cuh_file = fs::File::create(&cuh_path)?;
generate_cuda_unpack_lanes::<T>(&mut cuh_file)?;

// Generate the standalone kernels (.cu) — includes the lanes header,
// adds _device template + __global__ wrappers. Compiled to its own PTX.
// adds _device template + __global__ wrappers. Compiled to its own CUDA module.
let cu_path = output_dir.join(format!("bit_unpack_{}.cu", T::T));
let mut cu_file = File::create(&cu_path)?;
let mut cu_file = fs::File::create(&cu_path)?;
generate_cuda_unpack_kernels::<T>(&mut cu_file, thread_count)?;

Ok(cu_path)
}

fn nvcc_compile_ptx(
fn nvcc_compile_fatbin(
include_dir: &Path,
output_dir: &Path,
cu_path: &Path,
Expand Down Expand Up @@ -148,23 +148,24 @@ fn nvcc_compile_ptx(
cmd.arg("-O3");
}

// Output PTX file goes to output_dir with same base name
let ptx_path = output_dir
// Output CUDA fatbin file goes to output_dir with same base name.
let fatbin_path = output_dir
.join(cu_path.file_name().unwrap())
.with_extension("ptx");
.with_extension("fatbin");

// Embed a single PTX image for Ampere and newer GPUs. The driver JIT-compiles
// PTX to the target GPU's SASS at runtime.
cmd.arg("-std=c++20")
.arg("-arch=native")
.arg("-gencode=arch=compute_80,code=compute_80")
// Flags forwarded to Clang.
.arg("--compiler-options=-Wall -Wextra -Wpedantic -Werror")
.arg("--restrict")
.arg("--ptx")
.arg("--fatbin")
.arg("--include-path")
.arg(include_dir)
.arg("-c")
.arg(cu_path)
.arg("-o")
.arg(&ptx_path);
.arg(&fatbin_path);

let res = cmd.output()?;

Expand Down
4 changes: 2 additions & 2 deletions vortex-cuda/src/executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ impl CudaExecutionCtx {
///
/// # Arguments
///
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
/// * `ptypes` - List of ptype strings for the kernel name
///
/// # Errors
Expand All @@ -212,7 +212,7 @@ impl CudaExecutionCtx {
///
/// # Arguments
///
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
/// * `type_suffixes` - List of type suffix strings for the kernel name
///
/// # Errors
Expand Down
24 changes: 12 additions & 12 deletions vortex-cuda/src/kernel/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,9 @@ pub(crate) fn launch_cuda_kernel_with_config(
}
}

/// Loader for CUDA kernels with PTX caching.
/// Loader for CUDA kernels with module caching.
///
/// Handles loading PTX files, compiling modules, and loading functions.
/// Handles loading CUDA module files and functions.
#[derive(Debug)]
pub(crate) struct KernelLoader {
/// Cache of loaded CUDA modules, keyed by module name
Expand All @@ -215,7 +215,7 @@ impl KernelLoader {
///
/// # Arguments
///
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
/// * `type_suffixes` - List of type suffix strings for the kernel name (`kernel_i128`)
/// * `cuda_context` - CUDA context for loading the module
pub fn load_function(
Expand All @@ -235,16 +235,16 @@ impl KernelLoader {
let module = if let Some(entry) = self.modules.get(module_name) {
Arc::clone(entry.value())
} else {
let ptx_path = Self::ptx_path_for_module(module_name);
let module_path = Self::path_for_module(module_name);

// Compile and load the CUDA module.
// Load the CUDA module.
let module = cuda_context
.load_module(Ptx::from_file(&ptx_path))
.load_module(Ptx::from_file(&module_path))
.map_err(|e| {
vortex_err!(
"Failed to load CUDA module {}, ptx path {}: {}",
"Failed to load CUDA module {}, module path {}: {}",
module_name,
ptx_path.display(),
module_path.display(),
e
)
})?;
Expand All @@ -262,7 +262,7 @@ impl KernelLoader {
.map_err(|e| vortex_err!("Failed to load kernel function '{}': {}", kernel_name, e))
}

/// Returns the PTX file path for a given module name.
/// Returns the CUDA module file path for a given module name.
///
/// Checks for `VORTEX_CUDA_KERNELS_DIR` environment variable at runtime first,
/// falling back to the path baked in at compile time by build.rs.
Expand All @@ -273,11 +273,11 @@ impl KernelLoader {
///
/// # Returns
///
/// The full path to the PTX file
fn ptx_path_for_module(module_name: &str) -> PathBuf {
/// The full path to the CUDA module file
fn path_for_module(module_name: &str) -> PathBuf {
let kernels_dir = std::env::var("VORTEX_CUDA_KERNELS_DIR")
.unwrap_or_else(|_| env!("VORTEX_CUDA_KERNELS_DIR").to_string());
Path::new(&kernels_dir).join(format!("{}.ptx", module_name))
Path::new(&kernels_dir).join(format!("{}.fatbin", module_name))
}
}

Expand Down
6 changes: 3 additions & 3 deletions vortex-cuda/src/session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const DEFAULT_STREAM_POOL_CAPACITY: usize = 4;
/// CUDA session for GPU accelerated execution.
///
/// Maintains a registry of CUDA kernel implementations for array encodings.
/// Holds the CUDA context for all GPU operations and caches compiled PTX modules.
/// Holds the CUDA context for all GPU operations and caches loaded CUDA modules.
#[derive(Clone, Debug)]
pub struct CudaSession {
context: Arc<CudaContext>,
Expand Down Expand Up @@ -113,12 +113,12 @@ impl CudaSession {
///
/// # Arguments
///
/// * `module_name` - Name of the module (`kernels/{module_name}.ptx`)
/// * `module_name` - Name of the module (`kernels/{module_name}.fatbin`)
/// * `type_suffixes` - List of type suffix strings to generate kernel name
///
/// # Errors
///
/// Returns an error if PTX file cannot be read or kernel cannot be loaded.
/// Returns an error if the CUDA module file cannot be read or kernel cannot be loaded.
pub fn load_function_with_suffixes(
&self,
module_name: &str,
Expand Down
Loading