diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock index 4f0ce3df18c..79c2a0c111d 100644 --- a/encodings/fastlanes/public-api.lock +++ b/encodings/fastlanes/public-api.lock @@ -34,10 +34,6 @@ pub fn vortex_fastlanes::bitpack_compress::gather_patches(&vortex_array::arrays: pub mod vortex_fastlanes::bitpack_decompress -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> - -pub fn vortex_fastlanes::bitpack_decompress::apply_patches_to_uninit_range_fn T>(&mut vortex_array::builders::primitive::UninitRange<'_, T>, &vortex_array::patches::Patches, &mut vortex_array::executor::ExecutionCtx, F) -> vortex_error::VortexResult<()> - pub fn vortex_fastlanes::bitpack_decompress::count_exceptions(u8, &[usize]) -> usize pub fn vortex_fastlanes::bitpack_decompress::unpack_array(vortex_array::array::view::ArrayView<'_, vortex_fastlanes::BitPacked>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult diff --git a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs index 6570d26bbc9..5b81580d7a6 100644 --- a/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs +++ b/encodings/fastlanes/src/bitpacking/array/bitpack_decompress.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::mem::MaybeUninit; + use fastlanes::BitPacking; use itertools::Itertools; use num_traits::AsPrimitive; @@ -21,6 +23,7 @@ use vortex_error::VortexResult; use crate::BitPacked; use crate::BitPackedArrayExt; use crate::unpack_iter::BitPacked as BitPackedUnpack; +use crate::unpack_iter::BitUnpackedChunks; /// Unpacks a bit-packed array into a primitive array. pub fn unpack_array( @@ -42,52 +45,96 @@ pub fn unpack_primitive_array( Ok(builder.finish_into_primitive()) } +/// Unpack a bit-packed array directly into a same-typed `PrimitiveBuilder`. +/// +/// This is the fast path for ordinary decompression: full FastLanes chunks are unpacked straight +/// into the final output buffer, avoiding the scratch chunk and copy needed by mapped decode. pub(crate) fn unpack_into_primitive_builder( array: ArrayView<'_, BitPacked>, - // TODO(ngates): do we want to use fastlanes alignment for this buffer? builder: &mut PrimitiveBuilder, ctx: &mut ExecutionCtx, ) -> VortexResult<()> { - // If the array is empty, then we don't need to add anything to the builder. + unpack_into_builder_with( + array, + builder, + ctx, + |v: T| v, + |chunks, output, _| { + chunks.decode_into(output); + }, + ) +} + +/// Unpack a bit-packed array of physical type `F` into a `PrimitiveBuilder`, applying `map` +/// to each value during decompression. +/// +/// Use [`unpack_into_primitive_builder`] for same-type plain decompression. This mapped path is +/// for widening casts or other element-wise transforms: each 1024-element FastLanes chunk is +/// unpacked into a cache-resident scratch buffer and written through `map` directly into the `T` +/// output, so when `F != T` no full-length `F`-typed intermediate is materialized. +/// +/// The caller must ensure that every valid source value is representable in `T` under `map`; no +/// per-value bounds check is performed. +pub(crate) fn unpack_map_into_builder( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, + map: M, +) -> VortexResult<()> +where + F: BitPackedUnpack, + T: NativePType, + M: Fn(F) -> T, +{ + unpack_into_builder_with(array, builder, ctx, map, |chunks, output, map| { + chunks.decode_map_into(output, map); + }) +} + +fn unpack_into_builder_with( + array: ArrayView<'_, BitPacked>, + builder: &mut PrimitiveBuilder, + ctx: &mut ExecutionCtx, + map: M, + decode: D, +) -> VortexResult<()> +where + F: BitPackedUnpack, + T: NativePType, + M: Fn(F) -> T, + D: FnOnce(&mut BitUnpackedChunks, &mut [MaybeUninit], &M), +{ if array.is_empty() { return Ok(()); } - let mut uninit_range = builder.uninit_range(array.len()); + let len = array.len(); + let mut uninit_range = builder.uninit_range(len); - // SAFETY: We later initialize the the uninitialized range of values with `copy_from_slice`. + // SAFETY: We initialize all `len` values below via `decode` and the patch loop. unsafe { - // Append a dense null Mask. - uninit_range.append_mask(array.validity()?.execute_mask(array.as_ref().len(), ctx)?); + uninit_range.append_mask(array.validity()?.execute_mask(len, ctx)?); } - // SAFETY: `decode_into` will initialize all values in this range. - let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, array.len()) }; + // SAFETY: `decode` writes a value to every slot in this range. + let uninit_slice = unsafe { uninit_range.slice_uninit_mut(0, len) }; - let mut bit_packed_iter = array.unpacked_chunks()?; - bit_packed_iter.decode_into(uninit_slice); + let mut chunks = array.unpacked_chunks::()?; + decode(&mut chunks, uninit_slice, &map); if let Some(patches) = array.patches() { - apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx)?; - }; + apply_patches_to_uninit_range(&mut uninit_range, &patches, ctx, &map)?; + } - // SAFETY: We have set a correct validity mask via `append_mask` with `array.len()` values and - // initialized the same number of values needed via `decode_into`. + // SAFETY: A correct validity mask of `len` values was set via `append_mask`, and the same + // number of values was initialized via `decode` (and overwritten by patches). unsafe { uninit_range.finish(); } Ok(()) } -pub fn apply_patches_to_uninit_range( - dst: &mut UninitRange, - patches: &Patches, - ctx: &mut ExecutionCtx, -) -> VortexResult<()> { - apply_patches_to_uninit_range_fn(dst, patches, ctx, |x| x) -} - -pub fn apply_patches_to_uninit_range_fn T>( +pub(crate) fn apply_patches_to_uninit_range T>( dst: &mut UninitRange, patches: &Patches, ctx: &mut ExecutionCtx, @@ -98,7 +145,7 @@ pub fn apply_patches_to_uninit_range_fn T>( let indices = patches.indices().clone().execute::(ctx)?; let values = patches.values().clone().execute::(ctx)?; assert!(values.all_valid(ctx)?, "Patch values must be all valid"); - let values = values.as_slice::(); + let values = values.as_slice::(); match_each_unsigned_integer_ptype!(indices.ptype(), |P| { for (index, &value) in indices.as_slice::

().iter().zip_eq(values) { @@ -335,10 +382,11 @@ mod tests { let bitpacked = encode(&empty, 0); let mut builder = PrimitiveBuilder::::new(Nullability::NonNullable); - unpack_into_primitive_builder( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v: u32| v, )?; let result = builder.finish_into_primitive(); @@ -363,10 +411,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 5); - unpack_into_primitive_builder( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v: u32| v, )?; let result = builder.finish_into_primitive(); @@ -400,10 +449,11 @@ mod tests { // Unpack into a new builder. let mut builder = PrimitiveBuilder::::with_capacity(Nullability::NonNullable, 100); - unpack_into_primitive_builder( + unpack_map_into_builder( bitpacked.as_view(), &mut builder, &mut SESSION.create_execution_ctx(), + |v: u32| v, )?; let result = builder.finish_into_primitive(); diff --git a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs index 2f7187d26f1..a02dfb6b998 100644 --- a/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs +++ b/encodings/fastlanes/src/bitpacking/array/unpack_iter.rs @@ -181,50 +181,96 @@ impl> UnpackedChunks { }) } - /// Decode all chunks (initial, full, and trailer) into the output range. - /// This consolidates the logic for handling all three chunk types in one place. + /// Decode all chunks (initial, full, and trailer) directly into the output range. pub fn decode_into(&mut self, output: &mut [MaybeUninit]) { + debug_assert_eq!(output.len(), self.len); let mut local_idx = 0; - // Handle initial partial chunk if present if let Some(initial) = self.initial() { local_idx = initial.len(); - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 + // TODO(connor): use maybe_uninit_write_slice when it gets stabilized. // SAFETY: &[T] and &[MaybeUninit] have the same layout. let init_initial: &[MaybeUninit] = unsafe { mem::transmute(initial) }; output[..local_idx].copy_from_slice(init_initial); } - // Handle full chunks local_idx = self.decode_full_chunks_into_at(output, local_idx); - // Handle trailing partial chunk if present if let Some(trailer) = self.trailer() { - // TODO(connor): use `maybe_uninit_write_slice` feature when it gets stabilized. - // https://github.com/rust-lang/rust/issues/79995 + // TODO(connor): use maybe_uninit_write_slice when it gets stabilized. // SAFETY: &[T] and &[MaybeUninit] have the same layout. let init_trailer: &[MaybeUninit] = unsafe { mem::transmute(trailer) }; output[local_idx..][..init_trailer.len()].copy_from_slice(init_trailer); + local_idx += init_trailer.len(); } + + debug_assert_eq!(local_idx, self.len); + } + + /// Decode all chunks (initial, full, and trailer), mapping each unpacked value through f. + pub(crate) fn decode_map_into( + &mut self, + output: &mut [MaybeUninit], + mut f: impl FnMut(T) -> U, + ) { + debug_assert_eq!(output.len(), self.len); + let mut local_idx = 0; + + if let Some(initial) = self.initial() { + let chunk_len = initial.len(); + write_map(initial, &mut output[..chunk_len], &mut f); + local_idx += chunk_len; + } + + if self.num_chunks != 1 { + let first_chunk_is_sliced = self.first_chunk_is_sliced(); + let last_chunk_is_sliced = self.last_chunk_is_sliced(); + let full_chunks_range = + (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); + + let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed); + let elems_per_chunk = self.elems_per_chunk(); + for i in full_chunks_range { + let chunk = &packed_slice[i * elems_per_chunk..][..elems_per_chunk]; + unsafe { + let dst: &mut [T::Physical] = mem::transmute(&mut self.buffer[..]); + self.strategy.unpack_chunk(self.bit_width, chunk, dst); + let unpacked: &[T] = mem::transmute(&self.buffer[..]); + write_map( + unpacked, + &mut output[local_idx..local_idx + CHUNK_SIZE], + &mut f, + ); + } + local_idx += CHUNK_SIZE; + } + } + + if let Some(trailer) = self.trailer() { + let chunk_len = trailer.len(); + write_map( + trailer, + &mut output[local_idx..local_idx + chunk_len], + &mut f, + ); + local_idx += chunk_len; + } + + debug_assert_eq!(local_idx, self.len); } /// Unpack full chunks into output range starting at the given index. - /// Returns the next local index to write to. fn decode_full_chunks_into_at( &mut self, output: &mut [MaybeUninit], start_idx: usize, ) -> usize { - // If there's only one chunk it has been handled already by `initial` method if self.num_chunks == 1 { - // Return the start_idx since initial already wrote everything. return start_idx; } let first_chunk_is_sliced = self.first_chunk_is_sliced(); - let last_chunk_is_sliced = self.last_chunk_is_sliced(); let full_chunks_range = (first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize); @@ -238,7 +284,7 @@ impl> UnpackedChunks { unsafe { let uninit_dst = &mut output[local_idx..local_idx + CHUNK_SIZE]; - // SAFETY: &[T] and &[MaybeUninit] have the same layout + // SAFETY: &[T] and &[MaybeUninit] have the same layout. let dst: &mut [T::Physical] = mem::transmute(uninit_dst); self.strategy.unpack_chunk(self.bit_width, chunk, dst); } @@ -340,6 +386,12 @@ fn buffer_as_slice(buffer: &ByteBuffer) -> &[T] { unsafe { std::slice::from_raw_parts(packed_ptr, packed_len) } } +fn write_map(src: &[T], dst: &mut [MaybeUninit], f: &mut impl FnMut(T) -> U) { + for (dst, &src) in dst.iter_mut().zip(src.iter()) { + dst.write(f(src)); + } +} + pub trait BitPacked: PhysicalPType {} impl BitPacked for i8 {} diff --git a/encodings/fastlanes/src/bitpacking/compute/cast.rs b/encodings/fastlanes/src/bitpacking/compute/cast.rs index 3cb810e0442..10060eb57e2 100644 --- a/encodings/fastlanes/src/bitpacking/compute/cast.rs +++ b/encodings/fastlanes/src/bitpacking/compute/cast.rs @@ -1,12 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use num_traits::AsPrimitive; use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::builders::PrimitiveBuilder; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::cast::CastKernel; use vortex_array::scalar_fn::fns::cast::CastReduce; use vortex_array::validity::Validity; @@ -14,6 +18,18 @@ use vortex_error::VortexResult; use crate::bitpacking::BitPacked; use crate::bitpacking::array::BitPackedArrayExt; +use crate::bitpacking::array::bitpack_decompress::unpack_map_into_builder; + +/// Returns `true` if casting `src` to `tgt` is a widening integer cast for which every value a +/// bit-packed array can hold is guaranteed to be representable in `tgt` (so no per-value bounds +/// check is needed). This holds when `tgt` is strictly wider and either the source is unsigned +/// (always non-negative, fits in any wider type) or the target is also signed (sign-extension). +fn is_widening_int_cast(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) +} fn build_with_validity( array: ArrayView<'_, BitPacked>, @@ -56,14 +72,41 @@ impl CastKernel for BitPacked { dtype: &DType, ctx: &mut ExecutionCtx, ) -> VortexResult> { - if !array.dtype().eq_ignore_nullability(dtype) { + // Nullability-only change: keep the values bit-packed, just adjust validity. + if array.dtype().eq_ignore_nullability(dtype) { + let new_validity = + array + .validity()? + .cast_nullability(dtype.nullability(), array.len(), ctx)?; + return build_with_validity(array, dtype, new_validity).map(Some); + } + + // Widening integer cast: unpack each FastLanes chunk into a cache-resident scratch buffer + // and cast-copy straight into the wide output, avoiding a full-length intermediate buffer + // and the generic cast kernel's bounds-check scan (unnecessary when widening). + let DType::Primitive(tgt, tgt_nullability) = dtype else { + return Ok(None); + }; + let (tgt, tgt_nullability) = (*tgt, *tgt_nullability); + let src = array.dtype().as_ptype(); + if !is_widening_int_cast(src, tgt) { return Ok(None); } - let new_validity = - array - .validity()? - .cast_nullability(dtype.nullability(), array.len(), ctx)?; - build_with_validity(array, dtype, new_validity).map(Some) + + // Surface the standard error if a nullable source with nulls is cast to a non-nullable + // type; on success the per-value validity is handled inside the unpack below. + array + .validity()? + .cast_nullability(tgt_nullability, array.len(), ctx)?; + + let result = match_each_integer_ptype!(tgt, |T| { + let mut builder = PrimitiveBuilder::::with_capacity(tgt_nullability, array.len()); + match_each_integer_ptype!(src, |F| { + unpack_map_into_builder::(array, &mut builder, ctx, |v: F| v.as_())?; + }); + builder.finish_into_primitive().into_array() + }); + Ok(Some(result)) } } @@ -79,9 +122,12 @@ mod tests { use vortex_array::builtins::ArrayBuiltins; use vortex_array::compute::conformance::cast::test_cast_conformance; use vortex_array::dtype::DType; + use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; + use vortex_array::match_each_integer_ptype; use vortex_buffer::buffer; + use vortex_error::VortexResult; use crate::BitPackedArray; use crate::BitPackedData; @@ -124,6 +170,87 @@ mod tests { ); } + /// End-to-end check that the real engine path `array.cast(target).execute()` routes through the + /// bit-packed widening pushdown and matches a plain primitive cast over the same values, across + /// every supported integer pair, several chunk-boundary lengths, and a sliced (offset > 0) case. + #[test] + fn test_cast_bitpacked_widening_via_execute() -> VortexResult<()> { + fn values(len: usize) -> PrimitiveArray { + PrimitiveArray::from_iter((0..len).map(|i| { + let value = if i % 17 == 0 { 31 } else { i % 8 }; + ::from_usize(value) + .expect("test values fit every integer ptype") + })) + } + + fn supported(src: PType, tgt: PType) -> bool { + src.is_int() + && tgt.is_int() + && tgt.byte_width() > src.byte_width() + && (src.is_unsigned_int() || tgt.is_signed_int()) + } + + let ptypes = [ + PType::I8, + PType::I16, + PType::I32, + PType::I64, + PType::U8, + PType::U16, + PType::U32, + PType::U64, + ]; + // Lengths exercise empty, sub-chunk, exact chunk, chunk+1, and multi-chunk-with-trailer. + let lengths = [0, 1, 7, 1023, 1024, 1025, 2051]; + + for src in ptypes { + for tgt in ptypes { + if !supported(src, tgt) { + continue; + } + + for len in lengths { + let source = match_each_integer_ptype!(src, |S| { values::(len) }); + let source_ref = source.into_array(); + let target = DType::Primitive(tgt, Nullability::NonNullable); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + // Reference: plain primitive cast of the same values. + let reference = source_ref + .clone() + .cast(target.clone())? + .execute::(&mut ctx)?; + + // Candidate: bit-pack, then cast through the real engine. This dispatches to + // `BitPacked`'s `CastKernel` widening pushdown. + let packed = bp(&source_ref, 3).into_array(); + let casted = packed + .cast(target.clone())? + .execute::(&mut ctx)?; + assert_arrays_eq!(casted, reference); + + // Also exercise the sliced/offset path (offset > 0, trailer present). + if len >= 4 { + let lo = len / 4; + let hi = len - len / 4; + let sliced = bp(&source_ref, 3).into_array().slice(lo..hi)?; + let casted = sliced + .cast(target.clone())? + .execute::(&mut ctx)?; + let reference = source_ref + .clone() + .slice(lo..hi)? + .cast(target.clone())? + .execute::(&mut ctx)?; + assert_arrays_eq!(casted, reference); + } + } + } + } + + Ok(()) + } + #[rstest] #[case(bp(&buffer![0u8, 10, 20, 30, 40, 50, 60, 63].into_array(), 6))] #[case(bp(&buffer![0u16, 100, 200, 300, 400, 500].into_array(), 9))] diff --git a/encodings/fastlanes/src/for/array/for_decompress.rs b/encodings/fastlanes/src/for/array/for_decompress.rs index a26d6e9053e..073864411fb 100644 --- a/encodings/fastlanes/src/for/array/for_decompress.rs +++ b/encodings/fastlanes/src/for/array/for_decompress.rs @@ -119,11 +119,11 @@ pub(crate) fn fused_decompress< unpacked.decode_into(uninit_slice); if let Some(patches) = bp.patches() { - bitpack_decompress::apply_patches_to_uninit_range_fn( + bitpack_decompress::apply_patches_to_uninit_range( &mut uninit_range, &patches, ctx, - |v| v.wrapping_add(&ref_), + |v: T| v.wrapping_add(&ref_), )?; };