Skip to content
Merged
24 changes: 15 additions & 9 deletions encodings/fastlanes/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,18 @@ pub mod vortex_fastlanes

pub mod vortex_fastlanes::bit_transpose

pub fn vortex_fastlanes::bit_transpose::transpose_bitbuffer(bits: vortex_buffer::bit::buf::BitBuffer) -> vortex_buffer::bit::buf::BitBuffer

pub fn vortex_fastlanes::bit_transpose::transpose_bits(input: &[u8; 128], output: &mut [u8; 128])

pub fn vortex_fastlanes::bit_transpose::transpose_validity(validity: &vortex_array::validity::Validity, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::validity::Validity>

pub fn vortex_fastlanes::bit_transpose::untranspose_bitbuffer(bits: vortex_buffer::bit::buf::BitBuffer) -> vortex_buffer::bit::buf::BitBuffer

pub fn vortex_fastlanes::bit_transpose::untranspose_bits(input: &[u8; 128], output: &mut [u8; 128])

pub fn vortex_fastlanes::bit_transpose::untranspose_validity(validity: &vortex_array::validity::Validity, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::validity::Validity>

pub mod vortex_fastlanes::bitpack_compress

pub fn vortex_fastlanes::bitpack_compress::bit_width_histogram(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<alloc::vec::Vec<usize>>
Expand Down Expand Up @@ -296,7 +304,7 @@ pub type vortex_fastlanes::Delta::Metadata = vortex_array::metadata::ProstMetada

pub type vortex_fastlanes::Delta::OperationsVTable = vortex_fastlanes::Delta

pub type vortex_fastlanes::Delta::ValidityVTable = vortex_array::vtable::validity::ValidityVTableFromChildSliceHelper
pub type vortex_fastlanes::Delta::ValidityVTable = vortex_fastlanes::Delta

pub fn vortex_fastlanes::Delta::array_eq(array: &vortex_fastlanes::DeltaArray, other: &vortex_fastlanes::DeltaArray, precision: vortex_array::hash::Precision) -> bool

Expand Down Expand Up @@ -340,6 +348,10 @@ impl vortex_array::vtable::operations::OperationsVTable<vortex_fastlanes::Delta>

pub fn vortex_fastlanes::Delta::scalar_at(array: &vortex_fastlanes::DeltaArray, index: usize) -> vortex_error::VortexResult<vortex_array::scalar::Scalar>

impl vortex_array::vtable::validity::ValidityVTable<vortex_fastlanes::Delta> for vortex_fastlanes::Delta

pub fn vortex_fastlanes::Delta::validity(array: &vortex_fastlanes::DeltaArray) -> vortex_error::VortexResult<vortex_array::validity::Validity>

pub struct vortex_fastlanes::DeltaArray

impl vortex_fastlanes::DeltaArray
Expand All @@ -358,9 +370,7 @@ pub fn vortex_fastlanes::DeltaArray::offset(&self) -> usize

pub fn vortex_fastlanes::DeltaArray::try_from_delta_compress_parts(bases: vortex_array::array::ArrayRef, deltas: vortex_array::array::ArrayRef) -> vortex_error::VortexResult<Self>

pub fn vortex_fastlanes::DeltaArray::try_from_primitive_array(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<Self>

pub fn vortex_fastlanes::DeltaArray::try_from_vec<T: vortex_array::dtype::ptype::NativePType>(vec: alloc::vec::Vec<T>) -> vortex_error::VortexResult<Self>
pub fn vortex_fastlanes::DeltaArray::try_from_primitive_array(array: &vortex_array::arrays::primitive::array::PrimitiveArray, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<Self>

pub fn vortex_fastlanes::DeltaArray::try_new(bases: vortex_array::array::ArrayRef, deltas: vortex_array::array::ArrayRef, offset: usize, logical_len: usize) -> vortex_error::VortexResult<Self>

Expand Down Expand Up @@ -394,10 +404,6 @@ impl vortex_array::array::IntoArray for vortex_fastlanes::DeltaArray

pub fn vortex_fastlanes::DeltaArray::into_array(self) -> vortex_array::array::ArrayRef

impl vortex_array::vtable::validity::ValidityChildSliceHelper for vortex_fastlanes::DeltaArray

pub fn vortex_fastlanes::DeltaArray::unsliced_child_and_slice(&self) -> (&vortex_array::array::ArrayRef, usize, usize)

pub struct vortex_fastlanes::FoR

impl vortex_fastlanes::FoR
Expand Down Expand Up @@ -668,6 +674,6 @@ impl vortex_array::vtable::validity::ValidityChildSliceHelper for vortex_fastlan

pub fn vortex_fastlanes::RLEArray::unsliced_child_and_slice(&self) -> (&vortex_array::array::ArrayRef, usize, usize)

pub fn vortex_fastlanes::delta_compress(array: &vortex_array::arrays::primitive::array::PrimitiveArray) -> vortex_error::VortexResult<(vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::arrays::primitive::array::PrimitiveArray)>
pub fn vortex_fastlanes::delta_compress(array: &vortex_array::arrays::primitive::array::PrimitiveArray, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<(vortex_array::arrays::primitive::array::PrimitiveArray, vortex_array::arrays::primitive::array::PrimitiveArray)>

pub fn vortex_fastlanes::initialize(session: &mut vortex_session::VortexSession)
8 changes: 8 additions & 0 deletions encodings/fastlanes/src/bit_transpose/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ mod scalar;
#[cfg(not(feature = "_test-harness"))]
mod x86;

mod validity;

pub use validity::*;

/// Base indices for the first 64 output bytes (lanes 0-7).
/// Each entry indicates the starting input byte index for that output byte group.
/// Pattern: [0*2, 4*2, 2*2, 6*2, 1*2, 5*2, 3*2, 7*2] = [0, 8, 4, 12, 2, 10, 6, 14]
Expand All @@ -39,6 +43,8 @@ const TRANSPOSE_2X2: u64 = 0x00AA_00AA_00AA_00AA;
const TRANSPOSE_4X4: u64 = 0x0000_CCCC_0000_CCCC;
const TRANSPOSE_8X8: u64 = 0x0000_0000_F0F0_F0F0;

/// Transpose 1024-bits into FastLanes layout.
///
/// Dispatch to the best available implementation at runtime.
#[inline]
pub fn transpose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
Expand All @@ -64,6 +70,8 @@ pub fn transpose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
scalar::transpose_bits_scalar(input, output);
}

/// Untranspose 1024-bits from FastLanes layout.
///
/// Dispatch untranspose to the best available implementation at runtime.
#[inline]
pub fn untranspose_bits(input: &[u8; 128], output: &mut [u8; 128]) {
Expand Down
139 changes: 139 additions & 0 deletions encodings/fastlanes/src/bit_transpose/validity.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::mem;
use std::mem::MaybeUninit;

use vortex_array::Canonical;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::BoolArray;
use vortex_array::validity::Validity;
use vortex_buffer::BitBuffer;
use vortex_buffer::ByteBuffer;
use vortex_buffer::ByteBufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;

use crate::bit_transpose::transpose_bits;
use crate::bit_transpose::untranspose_bits;

pub fn transpose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> VortexResult<Validity> {
match validity {
Validity::Array(mask) => {
let bools = mask
.clone()
.execute::<Canonical>(ctx)?
.into_bool()
.into_bit_buffer();

Ok(Validity::Array(
BoolArray::new(transpose_bitbuffer(bools), Validity::NonNullable).into_array(),
))
}
v @ Validity::AllValid | v @ Validity::AllInvalid | v @ Validity::NonNullable => {
Ok(v.clone())
}
}
}

#[inline]
pub fn transpose_bitbuffer(bits: BitBuffer) -> BitBuffer {
let (offset, len, bytes) = bits.into_inner();

if bytes.len().is_multiple_of(128) {
match bytes.try_into_mut() {
Ok(mut bytes_mut) => {
// We can ignore the spare trailer capacity that can be an artifact of allocator as we requested 128 multiple chunks
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
let mut tmp = [0u8; 128];
for chunk in chunks {
transpose_bits(chunk, &mut tmp);
chunk.copy_from_slice(&tmp);
}
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
}
Err(bytes) => bits_op_with_copy(bytes, len, offset, transpose_bits),
}
} else {
bits_op_with_copy(bytes, len, offset, transpose_bits)
}
}

pub fn untranspose_validity(validity: &Validity, ctx: &mut ExecutionCtx) -> VortexResult<Validity> {
match validity {
Validity::Array(mask) => {
let bools = mask
.clone()
.execute::<Canonical>(ctx)?
.into_bool()
.into_bit_buffer();

Ok(Validity::Array(
BoolArray::new(untranspose_bitbuffer(bools), Validity::NonNullable).into_array(),
))
}
v @ Validity::AllValid | v @ Validity::AllInvalid | v @ Validity::NonNullable => {
Ok(v.clone())
}
}
}

#[inline]
pub fn untranspose_bitbuffer(bits: BitBuffer) -> BitBuffer {
assert!(
bits.inner().len().is_multiple_of(128),
"Transpose BitBuffer must be 128-byte aligned"
);
let (offset, len, bytes) = bits.into_inner();
match bytes.try_into_mut() {
Ok(mut bytes_mut) => {
let (chunks, _) = bytes_mut.as_chunks_mut::<128>();
let mut tmp = [0u8; 128];
for chunk in chunks {
untranspose_bits(chunk, &mut tmp);
chunk.copy_from_slice(&tmp);
}
BitBuffer::new_with_offset(bytes_mut.freeze().into_byte_buffer(), len, offset)
}
Err(bytes) => bits_op_with_copy(bytes, len, offset, untranspose_bits),
}
}

fn bits_op_with_copy<F: Fn(&[u8; 128], &mut [u8; 128])>(
bytes: ByteBuffer,
len: usize,
offset: usize,
op: F,
) -> BitBuffer {
let output_len = bytes.len().next_multiple_of(128);
let mut output = ByteBufferMut::with_capacity(output_len);
let (input_chunks, input_trailer) = bytes.as_chunks::<128>();
// We can ignore the spare trailer capacity that can be an artifact of allocator as we requested 128 multiple chunks
let (output_chunks, _) = output.spare_capacity_mut().as_chunks_mut::<128>();

for (input, output) in input_chunks.iter().zip(output_chunks.iter_mut()) {
op(input, unsafe {
mem::transmute::<&mut [MaybeUninit<u8>; 128], &mut [u8; 128]>(output)
});
}

if !input_trailer.is_empty() {
let mut padded_input = [0u8; 128];
padded_input[0..input_trailer.len()].clone_from_slice(input_trailer);
op(&padded_input, unsafe {
mem::transmute::<&mut [MaybeUninit<u8>; 128], &mut [u8; 128]>(
output_chunks
.last_mut()
.vortex_expect("Output wasn't a multiple of 128 bytes"),
)
});
}

unsafe { output.set_len(output_len) };
BitBuffer::new_with_offset(
output.freeze().into_byte_buffer(),
len.next_multiple_of(1024),
offset,
)
}
Loading
Loading