diff --git a/encodings/dict/src/builders/bytes.rs b/encodings/dict/src/builders/bytes.rs index 883301797b7..3037f8bbf10 100644 --- a/encodings/dict/src/builders/bytes.rs +++ b/encodings/dict/src/builders/bytes.rs @@ -6,9 +6,8 @@ use std::sync::Arc; use arrow_buffer::NullBufferBuilder; use vortex_array::accessor::ArrayAccessor; -use vortex_array::arrays::{ - BinaryView, PrimitiveArray, VarBinVTable, VarBinViewArray, VarBinViewVTable, -}; +use vortex_array::arrays::binary_view::BinaryView; +use vortex_array::arrays::{PrimitiveArray, VarBinVTable, VarBinViewArray, VarBinViewVTable}; use vortex_array::validity::Validity; use vortex_array::{Array, ArrayRef, IntoArray}; use vortex_buffer::{BufferMut, ByteBufferMut}; @@ -65,7 +64,7 @@ impl BytesDictBuilder { if bin_view.is_inlined() { bin_view.as_inlined().value() } else { - &self.values[bin_view.as_view().to_range()] + &self.values[bin_view.as_view().as_range()] } }) } diff --git a/encodings/fsst/src/canonical.rs b/encodings/fsst/src/canonical.rs index bc067b6c8e3..73c36405403 100644 --- a/encodings/fsst/src/canonical.rs +++ b/encodings/fsst/src/canonical.rs @@ -3,7 +3,8 @@ use std::sync::Arc; -use vortex_array::arrays::{BinaryView, VarBinViewArray}; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::binary_view::BinaryView; use vortex_array::builders::{ArrayBuilder, VarBinViewBuilder}; use vortex_array::vtable::{CanonicalVTable, ValidityHelper}; use vortex_array::{Canonical, IntoArray, ToCanonical}; diff --git a/encodings/sparse/src/canonical.rs b/encodings/sparse/src/canonical.rs index 0ae910bdd5a..2eeeace8428 100644 --- a/encodings/sparse/src/canonical.rs +++ b/encodings/sparse/src/canonical.rs @@ -5,9 +5,10 @@ use std::sync::Arc; use itertools::Itertools; use num_traits::NumCast; +use vortex_array::arrays::binary_view::BinaryView; use vortex_array::arrays::{ - BinaryView, BoolArray, BooleanBuffer, ConstantArray, FixedSizeListArray, ListArray, NullArray, - PrimitiveArray, StructArray, 
VarBinViewArray, smallest_storage_type, + BoolArray, BooleanBuffer, ConstantArray, FixedSizeListArray, ListArray, NullArray, + PrimitiveArray, StructArray, VarBinViewArray, smallest_decimal_value_type, }; use vortex_array::builders::{ArrayBuilder, DecimalBuilder, ListBuilder, builder_with_capacity}; use vortex_array::patches::Patches; @@ -56,7 +57,7 @@ impl CanonicalVTable for SparseVTable { array.len(), ), DType::Decimal(decimal_dtype, nullability) => { - let canonical_decimal_value_type = smallest_storage_type(decimal_dtype); + let canonical_decimal_value_type = smallest_decimal_value_type(decimal_dtype); let fill_value = array.fill_scalar().as_decimal(); match_each_decimal_value_type!(canonical_decimal_value_type, |D| { canonicalize_sparse_decimal::( diff --git a/encodings/zstd/src/array.rs b/encodings/zstd/src/array.rs index 5131e020499..29c35a6eba5 100644 --- a/encodings/zstd/src/array.rs +++ b/encodings/zstd/src/array.rs @@ -7,7 +7,8 @@ use std::sync::Arc; use itertools::Itertools as _; use vortex_array::accessor::ArrayAccessor; -use vortex_array::arrays::{BinaryView, ConstantArray, PrimitiveArray, VarBinViewArray}; +use vortex_array::arrays::binary_view::BinaryView; +use vortex_array::arrays::{ConstantArray, PrimitiveArray, VarBinViewArray}; use vortex_array::compute::filter; use vortex_array::stats::{ArrayStats, StatsSetRef}; use vortex_array::validity::Validity; diff --git a/vortex-array/src/arrays/arbitrary.rs b/vortex-array/src/arrays/arbitrary.rs index cb295932611..031fc96c23f 100644 --- a/vortex-array/src/arrays/arbitrary.rs +++ b/vortex-array/src/arrays/arbitrary.rs @@ -13,10 +13,8 @@ use vortex_error::{VortexExpect, VortexUnwrap}; use vortex_scalar::arbitrary::random_scalar; use vortex_scalar::{Scalar, match_each_decimal_value_type}; -use super::{ - BoolArray, ChunkedArray, NullArray, PrimitiveArray, StructArray, smallest_storage_type, -}; -use crate::arrays::{VarBinArray, VarBinViewArray}; +use super::{BoolArray, ChunkedArray, NullArray, 
PrimitiveArray, StructArray}; +use crate::arrays::{VarBinArray, VarBinViewArray, smallest_decimal_value_type}; use crate::builders::{ArrayBuilder, DecimalBuilder, FixedSizeListBuilder}; use crate::validity::Validity; use crate::{Array, ArrayRef, IntoArray, ToCanonical, builders}; @@ -99,7 +97,7 @@ fn random_array_chunk( }, DType::Decimal(decimal, n) => { let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?); - match_each_decimal_value_type!(smallest_storage_type(decimal), |DVT| { + match_each_decimal_value_type!(smallest_decimal_value_type(decimal), |DVT| { let mut builder = DecimalBuilder::new::(decimal.precision(), decimal.scale(), *n); for _i in 0..elem_len { diff --git a/vortex-array/src/arrays/bool/array.rs b/vortex-array/src/arrays/bool/array.rs index 051acdcfede..4f8df83066b 100644 --- a/vortex-array/src/arrays/bool/array.rs +++ b/vortex-array/src/arrays/bool/array.rs @@ -5,17 +5,35 @@ use std::ops::BitAnd; use arrow_array::BooleanArray; use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer}; +use itertools::Itertools; use vortex_buffer::ByteBuffer; -use vortex_dtype::DType; +use vortex_dtype::{DType, match_each_integer_ptype}; use vortex_error::{VortexExpect, VortexResult, vortex_ensure}; use vortex_mask::Mask; -use crate::Canonical; -use crate::arrays::{BoolVTable, bool}; -use crate::builders::ArrayBuilder; -use crate::stats::{ArrayStats, StatsSetRef}; +use crate::ToCanonical; +use crate::arrays::bool; +use crate::patches::Patches; +use crate::stats::ArrayStats; use crate::validity::Validity; -use crate::vtable::{ArrayVTable, CanonicalVTable, ValidityHelper}; +use crate::vtable::ValidityHelper; + +pub trait BooleanBufferExt { + /// Slice any full bytes from the buffer, leaving the offset < 8. 
+ fn shrink_offset(self) -> Self; +} + +impl BooleanBufferExt for BooleanBuffer { + fn shrink_offset(self) -> Self { + let byte_offset = self.offset() / 8; + let bit_offset = self.offset() % 8; + let len = self.len(); + let buffer = self + .into_inner() + .slice_with_length(byte_offset, (len + bit_offset).div_ceil(8)); + BooleanBuffer::new(buffer, bit_offset, len) + } +} /// A boolean array that stores true/false values in a compact bit-packed format. /// @@ -48,10 +66,10 @@ use crate::vtable::{ArrayVTable, CanonicalVTable, ValidityHelper}; /// ``` #[derive(Clone, Debug)] pub struct BoolArray { - dtype: DType, - buffer: BooleanBuffer, - pub(crate) validity: Validity, - pub(crate) stats_set: ArrayStats, + pub(super) dtype: DType, + pub(super) buffer: BooleanBuffer, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, } impl BoolArray { @@ -96,6 +114,10 @@ impl BoolArray { len: usize, validity: Validity, ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&buffer, offset, len, &validity) + .vortex_expect("[Debug Assertion]: Invalid `BoolArray` parameters"); + let buffer = BooleanBuffer::new(buffer.into_arrow_buffer(), offset, len); let buffer = buffer.shrink_offset(); Self { @@ -243,6 +265,32 @@ impl BoolArray { }; Mask::from_buffer(buffer) } + + pub fn patch(self, patches: &Patches) -> Self { + let len = self.len(); + let offset = patches.offset(); + let indices = patches.indices().to_primitive(); + let values = patches.values().to_bool(); + + let patched_validity = + self.validity() + .clone() + .patch(len, offset, indices.as_ref(), values.validity()); + + let (mut own_values, bit_offset) = self.into_boolean_builder(); + match_each_integer_ptype!(indices.ptype(), |I| { + for (idx, value) in indices + .as_slice::() + .iter() + .zip_eq(values.boolean_buffer().iter()) + { + #[allow(clippy::cast_possible_truncation)] + own_values.set_bit(*idx as usize - offset + bit_offset, value); + } + }); + + 
Self::from_bool_buffer(own_values.finish().slice(bit_offset, len), patched_validity) + } } impl From for BoolArray { @@ -268,53 +316,6 @@ impl FromIterator> for BoolArray { } } -impl ValidityHelper for BoolArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl ArrayVTable for BoolVTable { - fn len(array: &BoolArray) -> usize { - array.buffer.len() - } - - fn dtype(array: &BoolArray) -> &DType { - &array.dtype - } - - fn stats(array: &BoolArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl CanonicalVTable for BoolVTable { - fn canonicalize(array: &BoolArray) -> Canonical { - Canonical::Bool(array.clone()) - } - - fn append_to_builder(array: &BoolArray, builder: &mut dyn ArrayBuilder) { - builder.extend_from_array(array.as_ref()) - } -} - -pub trait BooleanBufferExt { - /// Slice any full bytes from the buffer, leaving the offset < 8. - fn shrink_offset(self) -> Self; -} - -impl BooleanBufferExt for BooleanBuffer { - fn shrink_offset(self) -> Self { - let byte_offset = self.offset() / 8; - let bit_offset = self.offset() % 8; - let len = self.len(); - let buffer = self - .into_inner() - .slice_with_length(byte_offset, (len + bit_offset).div_ceil(8)); - BooleanBuffer::new(buffer, bit_offset, len) - } -} - #[cfg(test)] mod tests { use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; @@ -367,6 +368,13 @@ mod tests { #[test] fn patch_sliced_bools() { + let arr = BoolArray::from(BooleanBuffer::new_set(12)); + let sliced = arr.slice(4..12); + let (values, offset) = sliced.to_bool().into_boolean_builder(); + assert_eq!(offset, 4); + assert_eq!(values.len(), 12); + assert_eq!(values.as_slice(), &[255, 15]); + let arr = { let mut builder = BooleanBufferBuilder::new(12); builder.append(false); @@ -431,4 +439,13 @@ mod tests { let (values, _byte_bit_offset) = arr.to_bool().into_boolean_builder(); assert_eq!(values.as_slice(), &[254, 127]); } + + #[test] + fn patch_sliced_bools_offset() { + let arr = 
BoolArray::from(BooleanBuffer::new_set(15)); + let sliced = arr.slice(4..15); + let (values, offset) = sliced.to_bool().into_boolean_builder(); + assert_eq!(offset, 4); + assert_eq!(values.as_slice(), &[255, 127]); + } } diff --git a/vortex-array/src/arrays/bool/mod.rs b/vortex-array/src/arrays/bool/mod.rs index d8b905e2ee3..7c168a1bfb3 100644 --- a/vortex-array/src/arrays/bool/mod.rs +++ b/vortex-array/src/arrays/bool/mod.rs @@ -2,45 +2,14 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors mod array; -pub mod compute; -mod ops; -mod patch; -mod serde; -#[cfg(feature = "test-harness")] -mod test; - -pub use array::*; -// Re-export the BooleanBuffer type on our API surface. +pub use array::{BoolArray, BooleanBufferExt}; +// Re-export Arrow's `BooleanBuffer` type on our API surface. pub use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; -use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; -use crate::{EncodingId, EncodingRef, vtable}; - -vtable!(Bool); - -impl VTable for BoolVTable { - type Array = BoolArray; - type Encoding = BoolEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - // Enable serde for this encoding - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.bool") - } +pub mod compute; - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(BoolEncoding.as_ref()) - } -} +mod vtable; +pub use vtable::{BoolEncoding, BoolVTable}; -#[derive(Clone, Debug)] -pub struct BoolEncoding; +#[cfg(feature = "test-harness")] +mod test_harness; diff --git a/vortex-array/src/arrays/bool/patch.rs b/vortex-array/src/arrays/bool/patch.rs deleted file mode 100644 index d3fe1a7a050..00000000000 --- 
a/vortex-array/src/arrays/bool/patch.rs +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use itertools::Itertools; -use vortex_dtype::match_each_integer_ptype; - -use crate::ToCanonical; -use crate::arrays::BoolArray; -use crate::patches::Patches; -use crate::vtable::ValidityHelper; - -impl BoolArray { - pub fn patch(self, patches: &Patches) -> Self { - let len = self.len(); - let offset = patches.offset(); - let indices = patches.indices().to_primitive(); - let values = patches.values().to_bool(); - - let patched_validity = - self.validity() - .clone() - .patch(len, offset, indices.as_ref(), values.validity()); - - let (mut own_values, bit_offset) = self.into_boolean_builder(); - match_each_integer_ptype!(indices.ptype(), |I| { - for (idx, value) in indices - .as_slice::() - .iter() - .zip_eq(values.boolean_buffer().iter()) - { - #[allow(clippy::cast_possible_truncation)] - own_values.set_bit(*idx as usize - offset + bit_offset, value); - } - }); - - Self::from_bool_buffer(own_values.finish().slice(bit_offset, len), patched_validity) - } -} - -#[cfg(test)] -mod tests { - use arrow_buffer::BooleanBuffer; - - use crate::ToCanonical; - use crate::arrays::BoolArray; - - #[test] - fn patch_sliced_bools() { - let arr = BoolArray::from(BooleanBuffer::new_set(12)); - let sliced = arr.slice(4..12); - let (values, offset) = sliced.to_bool().into_boolean_builder(); - assert_eq!(offset, 4); - assert_eq!(values.len(), 12); - assert_eq!(values.as_slice(), &[255, 15]); - } - - #[test] - fn patch_sliced_bools_offset() { - let arr = BoolArray::from(BooleanBuffer::new_set(15)); - let sliced = arr.slice(4..15); - let (values, offset) = sliced.to_bool().into_boolean_builder(); - assert_eq!(offset, 4); - assert_eq!(values.as_slice(), &[255, 127]); - } -} diff --git a/vortex-array/src/arrays/bool/test.rs b/vortex-array/src/arrays/bool/test_harness.rs similarity index 100% rename from 
vortex-array/src/arrays/bool/test.rs rename to vortex-array/src/arrays/bool/test_harness.rs diff --git a/vortex-array/src/arrays/bool/vtable/array.rs b/vortex-array/src/arrays/bool/vtable/array.rs new file mode 100644 index 00000000000..416b6e89fcd --- /dev/null +++ b/vortex-array/src/arrays/bool/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::{BoolArray, BoolVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for BoolVTable { + fn len(array: &BoolArray) -> usize { + array.buffer.len() + } + + fn dtype(array: &BoolArray) -> &DType { + &array.dtype + } + + fn stats(array: &BoolArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/bool/vtable/canonical.rs b/vortex-array/src/arrays/bool/vtable/canonical.rs new file mode 100644 index 00000000000..b80e43378e4 --- /dev/null +++ b/vortex-array/src/arrays/bool/vtable/canonical.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::{BoolArray, BoolVTable}; +use crate::builders::ArrayBuilder; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for BoolVTable { + fn canonicalize(array: &BoolArray) -> Canonical { + Canonical::Bool(array.clone()) + } + + fn append_to_builder(array: &BoolArray, builder: &mut dyn ArrayBuilder) { + builder.extend_from_array(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/bool/vtable/mod.rs b/vortex-array/src/arrays/bool/vtable/mod.rs new file mode 100644 index 00000000000..56fda71891e --- /dev/null +++ b/vortex-array/src/arrays/bool/vtable/mod.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::BoolArray; +use crate::vtable::{NotSupported, 
VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; +mod visitor; + +vtable!(Bool); + +impl VTable for BoolVTable { + type Array = BoolArray; + type Encoding = BoolEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.bool") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(BoolEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct BoolEncoding; diff --git a/vortex-array/src/arrays/bool/ops.rs b/vortex-array/src/arrays/bool/vtable/operations.rs similarity index 100% rename from vortex-array/src/arrays/bool/ops.rs rename to vortex-array/src/arrays/bool/vtable/operations.rs diff --git a/vortex-array/src/arrays/bool/serde.rs b/vortex-array/src/arrays/bool/vtable/serde.rs similarity index 73% rename from vortex-array/src/arrays/bool/serde.rs rename to vortex-array/src/arrays/bool/vtable/serde.rs index 589fac34fae..b2e284976e6 100644 --- a/vortex-array/src/arrays/bool/serde.rs +++ b/vortex-array/src/arrays/bool/vtable/serde.rs @@ -1,16 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use vortex_buffer::{Alignment, ByteBuffer}; +use vortex_buffer::ByteBuffer; use vortex_dtype::DType; use vortex_error::{VortexResult, vortex_bail, vortex_err}; use super::BoolArray; +use crate::ProstMetadata; use crate::arrays::BoolVTable; use crate::serde::ArrayChildren; use crate::validity::Validity; -use crate::vtable::{SerdeVTable, VTable, VisitorVTable}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, ProstMetadata}; 
+use crate::vtable::{SerdeVTable, VTable}; #[derive(prost::Message)] pub struct BoolMetadata { @@ -53,16 +53,3 @@ impl SerdeVTable for BoolVTable { BoolArray::try_new(buffers[0].clone(), metadata.offset as usize, len, validity) } } - -impl VisitorVTable for BoolVTable { - fn visit_buffers(array: &BoolArray, visitor: &mut dyn ArrayBufferVisitor) { - visitor.visit_buffer(&ByteBuffer::from_arrow_buffer( - array.boolean_buffer().clone().into_inner(), - Alignment::none(), - )) - } - - fn visit_children(array: &BoolArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_validity(&array.validity, array.len()); - } -} diff --git a/vortex-array/src/arrays/bool/vtable/validity.rs b/vortex-array/src/arrays/bool/vtable/validity.rs new file mode 100644 index 00000000000..5aec7fbddb5 --- /dev/null +++ b/vortex-array/src/arrays/bool/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::BoolArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for BoolArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/bool/vtable/visitor.rs b/vortex-array/src/arrays/bool/vtable/visitor.rs new file mode 100644 index 00000000000..6ca6b309bd0 --- /dev/null +++ b/vortex-array/src/arrays/bool/vtable/visitor.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_buffer::{Alignment, ByteBuffer}; + +use crate::arrays::{BoolArray, BoolVTable}; +use crate::vtable::VisitorVTable; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for BoolVTable { + fn visit_buffers(array: &BoolArray, visitor: &mut dyn ArrayBufferVisitor) { + visitor.visit_buffer(&ByteBuffer::from_arrow_buffer( + array.boolean_buffer().clone().into_inner(), + Alignment::none(), + )) + } + + fn visit_children(array: &BoolArray, 
visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_validity(&array.validity, array.len()); + } +} diff --git a/vortex-array/src/arrays/chunked/array.rs b/vortex-array/src/arrays/chunked/array.rs index 36cfac34ab7..3b76d009c50 100644 --- a/vortex-array/src/arrays/chunked/array.rs +++ b/vortex-array/src/arrays/chunked/array.rs @@ -11,23 +11,20 @@ use futures::stream; use vortex_buffer::{Buffer, BufferMut}; use vortex_dtype::DType; use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap, vortex_bail}; -use vortex_mask::Mask; -use crate::arrays::ChunkedVTable; use crate::iter::{ArrayIterator, ArrayIteratorAdapter}; use crate::search_sorted::{SearchSorted, SearchSortedSide}; -use crate::stats::{ArrayStats, StatsSetRef}; +use crate::stats::ArrayStats; use crate::stream::{ArrayStream, ArrayStreamAdapter}; -use crate::vtable::{ArrayVTable, ValidityVTable}; use crate::{Array, ArrayRef, IntoArray}; #[derive(Clone, Debug)] pub struct ChunkedArray { - dtype: DType, - len: usize, - chunk_offsets: Buffer, - chunks: Vec, - stats_set: ArrayStats, + pub(super) dtype: DType, + pub(super) len: usize, + pub(super) chunk_offsets: Buffer, + pub(super) chunks: Vec, + pub(super) stats_set: ArrayStats, } impl ChunkedArray { @@ -55,6 +52,10 @@ impl ChunkedArray { /// /// All chunks must have exactly the same [`DType`] as the provided `dtype`. 
pub unsafe fn new_unchecked(chunks: Vec, dtype: DType) -> Self { + #[cfg(debug_assertions)] + Self::validate(&chunks, &dtype) + .vortex_expect("[Debug Assertion]: Invalid `ChunkedArray` parameters"); + let nchunks = chunks.len(); let mut chunk_offsets = BufferMut::::with_capacity(nchunks + 1); @@ -206,58 +207,6 @@ impl FromIterator for ChunkedArray { } } -impl ArrayVTable for ChunkedVTable { - fn len(array: &ChunkedArray) -> usize { - array.len - } - - fn dtype(array: &ChunkedArray) -> &DType { - &array.dtype - } - - fn stats(array: &ChunkedArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl ValidityVTable for ChunkedVTable { - fn is_valid(array: &ChunkedArray, index: usize) -> bool { - if !array.dtype.is_nullable() { - return true; - } - let (chunk, offset_in_chunk) = array.find_chunk_idx(index); - array.chunk(chunk).is_valid(offset_in_chunk) - } - - fn all_valid(array: &ChunkedArray) -> bool { - if !array.dtype().is_nullable() { - return true; - } - for chunk in array.non_empty_chunks() { - if !chunk.all_valid() { - return false; - } - } - true - } - - fn all_invalid(array: &ChunkedArray) -> bool { - if !array.dtype().is_nullable() { - return false; - } - for chunk in array.non_empty_chunks() { - if !chunk.all_invalid() { - return false; - } - } - true - } - - fn validity_mask(array: &ChunkedArray) -> Mask { - array.chunks().iter().map(|a| a.validity_mask()).collect() - } -} - #[cfg(test)] mod test { use vortex_buffer::buffer; diff --git a/vortex-array/src/arrays/chunked/compute/mod.rs b/vortex-array/src/arrays/chunked/compute/mod.rs index bd7e77ec736..fa9b2030613 100644 --- a/vortex-array/src/arrays/chunked/compute/mod.rs +++ b/vortex-array/src/arrays/chunked/compute/mod.rs @@ -3,7 +3,6 @@ mod cast; mod compare; -mod elementwise; mod fill_null; mod filter; mod invert; diff --git a/vortex-array/src/arrays/chunked/mod.rs b/vortex-array/src/arrays/chunked/mod.rs index 092556d2d95..4bc68bd6bbe 100644 --- 
a/vortex-array/src/arrays/chunked/mod.rs +++ b/vortex-array/src/arrays/chunked/mod.rs @@ -2,40 +2,12 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors mod array; -mod compute; -mod decode; -mod ops; -mod serde; - -pub use array::*; - -use crate::vtable::{NotSupported, VTable}; -use crate::{EncodingId, EncodingRef, vtable}; - -vtable!(Chunked); +pub use array::ChunkedArray; -impl VTable for ChunkedVTable { - type Array = ChunkedArray; - type Encoding = ChunkedEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = Self; - type VisitorVTable = Self; - type ComputeVTable = Self; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.chunked") - } +mod compute; - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(ChunkedEncoding.as_ref()) - } -} +mod vtable; +pub use vtable::{ChunkedEncoding, ChunkedVTable}; -#[derive(Clone, Debug)] -pub struct ChunkedEncoding; +#[cfg(test)] +mod tests; diff --git a/vortex-array/src/arrays/chunked/tests.rs b/vortex-array/src/arrays/chunked/tests.rs new file mode 100644 index 00000000000..0950d2acfd3 --- /dev/null +++ b/vortex-array/src/arrays/chunked/tests.rs @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_buffer::{Buffer, buffer}; +use vortex_dtype::PType::I32; +use vortex_dtype::{DType, NativePType, Nullability, PType}; + +use crate::IntoArray; +use crate::accessor::ArrayAccessor; +use crate::array::Array; +use crate::arrays::{ChunkedArray, ChunkedVTable, ListArray, StructArray, VarBinViewArray}; +use crate::canonical::ToCanonical; +use crate::validity::Validity; + +fn chunked_array() -> ChunkedArray { + ChunkedArray::try_new( + vec![ + buffer![1u64, 2, 3].into_array(), + buffer![4u64, 5, 
6].into_array(), + buffer![7u64, 8, 9].into_array(), + ], + DType::Primitive(PType::U64, Nullability::NonNullable), + ) + .unwrap() +} + +fn assert_equal_slices(arr: &dyn Array, slice: &[T]) { + let mut values = Vec::with_capacity(arr.len()); + if let Some(arr) = arr.as_opt::() { + arr.chunks() + .iter() + .map(|a| a.to_primitive()) + .for_each(|a| values.extend_from_slice(a.as_slice::())); + } else { + values.extend_from_slice(arr.to_primitive().as_slice::()); + } + assert_eq!(values, slice); +} + +#[test] +fn slice_middle() { + assert_equal_slices(&chunked_array().slice(2..5), &[3u64, 4, 5]) +} + +#[test] +fn slice_begin() { + assert_equal_slices(&chunked_array().slice(1..3), &[2u64, 3]); +} + +#[test] +fn slice_aligned() { + assert_equal_slices(&chunked_array().slice(3..6), &[4u64, 5, 6]); +} + +#[test] +fn slice_many_aligned() { + assert_equal_slices(&chunked_array().slice(0..6), &[1u64, 2, 3, 4, 5, 6]); +} + +#[test] +fn slice_end() { + assert_equal_slices(&chunked_array().slice(7..8), &[8u64]); +} + +#[test] +fn slice_exactly_end() { + assert_equal_slices(&chunked_array().slice(6..9), &[7u64, 8, 9]); +} + +#[test] +fn slice_empty() { + let chunked = ChunkedArray::try_new(vec![], PType::U32.into()).unwrap(); + let sliced = chunked.slice(0..0); + + assert!(sliced.is_empty()); +} + +#[test] +fn scalar_at_empty_children_both_sides() { + let array = ChunkedArray::try_new( + vec![ + Buffer::::empty().into_array(), + Buffer::::empty().into_array(), + buffer![1u64, 2].into_array(), + Buffer::::empty().into_array(), + Buffer::::empty().into_array(), + ], + DType::Primitive(PType::U64, Nullability::NonNullable), + ) + .unwrap(); + assert_eq!(array.scalar_at(0), 1u64.into()); + assert_eq!(array.scalar_at(1), 2u64.into()); +} + +#[test] +fn scalar_at_empty_children_trailing() { + let array = ChunkedArray::try_new( + vec![ + buffer![1u64, 2].into_array(), + Buffer::::empty().into_array(), + Buffer::::empty().into_array(), + buffer![3u64, 4].into_array(), + ], + 
DType::Primitive(PType::U64, Nullability::NonNullable), + ) + .unwrap(); + assert_eq!(array.scalar_at(0), 1u64.into()); + assert_eq!(array.scalar_at(1), 2u64.into()); + assert_eq!(array.scalar_at(2), 3u64.into()); + assert_eq!(array.scalar_at(3), 4u64.into()); +} + +#[test] +fn scalar_at_empty_children_leading() { + let array = ChunkedArray::try_new( + vec![ + Buffer::::empty().into_array(), + Buffer::::empty().into_array(), + buffer![1u64, 2].into_array(), + buffer![3u64, 4].into_array(), + ], + DType::Primitive(PType::U64, Nullability::NonNullable), + ) + .unwrap(); + assert_eq!(array.scalar_at(0), 1u64.into()); + assert_eq!(array.scalar_at(1), 2u64.into()); + assert_eq!(array.scalar_at(2), 3u64.into()); + assert_eq!(array.scalar_at(3), 4u64.into()); +} + +#[test] +pub fn pack_nested_structs() { + let struct_array = StructArray::try_new( + ["a"].into(), + vec![VarBinViewArray::from_iter_str(["foo", "bar", "baz", "quak"]).into_array()], + 4, + Validity::NonNullable, + ) + .unwrap(); + let dtype = struct_array.dtype().clone(); + let chunked = ChunkedArray::try_new( + vec![ + ChunkedArray::try_new(vec![struct_array.to_array()], dtype.clone()) + .unwrap() + .into_array(), + ], + dtype, + ) + .unwrap() + .into_array(); + let canonical_struct = chunked.to_struct(); + let canonical_varbin = canonical_struct.fields()[0].to_varbinview(); + let original_varbin = struct_array.fields()[0].to_varbinview(); + let orig_values = original_varbin + .with_iterator(|it| it.map(|a| a.map(|v| v.to_vec())).collect::>()) + .unwrap(); + let canon_values = canonical_varbin + .with_iterator(|it| it.map(|a| a.map(|v| v.to_vec())).collect::>()) + .unwrap(); + assert_eq!(orig_values, canon_values); +} + +#[test] +pub fn pack_nested_lists() { + let l1 = ListArray::try_new( + buffer![1, 2, 3, 4].into_array(), + buffer![0, 3].into_array(), + Validity::NonNullable, + ) + .unwrap(); + + let l2 = ListArray::try_new( + buffer![5, 6].into_array(), + buffer![0, 2].into_array(), + 
Validity::NonNullable, + ) + .unwrap(); + + let chunked_list = ChunkedArray::try_new( + vec![l1.clone().into_array(), l2.clone().into_array()], + DType::List( + Arc::new(DType::Primitive(I32, Nullability::NonNullable)), + Nullability::NonNullable, + ), + ); + + let canon_values = chunked_list.unwrap().to_list(); + + assert_eq!(l1.scalar_at(0), canon_values.scalar_at(0)); + assert_eq!(l2.scalar_at(0), canon_values.scalar_at(1)); +} diff --git a/vortex-array/src/arrays/chunked/vtable/array.rs b/vortex-array/src/arrays/chunked/vtable/array.rs new file mode 100644 index 00000000000..fd55478241c --- /dev/null +++ b/vortex-array/src/arrays/chunked/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::{ChunkedArray, ChunkedVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for ChunkedVTable { + fn len(array: &ChunkedArray) -> usize { + array.len + } + + fn dtype(array: &ChunkedArray) -> &DType { + &array.dtype + } + + fn stats(array: &ChunkedArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/chunked/decode.rs b/vortex-array/src/arrays/chunked/vtable/canonical.rs similarity index 97% rename from vortex-array/src/arrays/chunked/decode.rs rename to vortex-array/src/arrays/chunked/vtable/canonical.rs index a57f44ec936..7a8c0133e5a 100644 --- a/vortex-array/src/arrays/chunked/decode.rs +++ b/vortex-array/src/arrays/chunked/vtable/canonical.rs @@ -5,13 +5,12 @@ use vortex_buffer::BufferMut; use vortex_dtype::{DType, Nullability, PType, StructFields}; use vortex_error::{VortexExpect, VortexUnwrap, vortex_err}; -use super::ChunkedArray; -use crate::arrays::{ChunkedVTable, ListArray, PrimitiveArray, StructArray}; +use crate::arrays::{ChunkedArray, ChunkedVTable, ListArray, PrimitiveArray, StructArray}; use crate::builders::{ArrayBuilder, 
builder_with_capacity}; use crate::compute::cast; use crate::validity::Validity; use crate::vtable::CanonicalVTable; -use crate::{Array as _, ArrayRef, Canonical, IntoArray, ToCanonical}; +use crate::{Array, ArrayRef, Canonical, IntoArray, ToCanonical}; impl CanonicalVTable for ChunkedVTable { fn canonicalize(array: &ChunkedArray) -> Canonical { diff --git a/vortex-array/src/arrays/chunked/compute/elementwise.rs b/vortex-array/src/arrays/chunked/vtable/compute.rs similarity index 100% rename from vortex-array/src/arrays/chunked/compute/elementwise.rs rename to vortex-array/src/arrays/chunked/vtable/compute.rs diff --git a/vortex-array/src/arrays/chunked/vtable/mod.rs b/vortex-array/src/arrays/chunked/vtable/mod.rs new file mode 100644 index 00000000000..d91f92b8630 --- /dev/null +++ b/vortex-array/src/arrays/chunked/vtable/mod.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::ChunkedArray; +use crate::vtable::{NotSupported, VTable}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod compute; +mod operations; +mod serde; +mod validity; +mod visitor; + +vtable!(Chunked); + +impl VTable for ChunkedVTable { + type Array = ChunkedArray; + type Encoding = ChunkedEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = Self; + type VisitorVTable = Self; + type ComputeVTable = Self; + type EncodeVTable = NotSupported; + type PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.chunked") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(ChunkedEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct ChunkedEncoding; diff --git a/vortex-array/src/arrays/chunked/ops.rs b/vortex-array/src/arrays/chunked/vtable/operations.rs similarity index 98% rename from 
vortex-array/src/arrays/chunked/ops.rs rename to vortex-array/src/arrays/chunked/vtable/operations.rs index 948c39dfb2f..5ffc206d5c4 100644 --- a/vortex-array/src/arrays/chunked/ops.rs +++ b/vortex-array/src/arrays/chunked/vtable/operations.rs @@ -6,8 +6,7 @@ use std::ops::Range; use itertools::Itertools; use vortex_scalar::Scalar; -use crate::arrays::ChunkedVTable; -use crate::arrays::chunked::ChunkedArray; +use crate::arrays::{ChunkedArray, ChunkedVTable}; use crate::vtable::OperationsVTable; use crate::{Array, ArrayRef, IntoArray}; diff --git a/vortex-array/src/arrays/chunked/serde.rs b/vortex-array/src/arrays/chunked/vtable/serde.rs similarity index 71% rename from vortex-array/src/arrays/chunked/serde.rs rename to vortex-array/src/arrays/chunked/vtable/serde.rs index a88aaacb6e3..e919b812ce4 100644 --- a/vortex-array/src/arrays/chunked/serde.rs +++ b/vortex-array/src/arrays/chunked/vtable/serde.rs @@ -6,12 +6,10 @@ use vortex_buffer::ByteBuffer; use vortex_dtype::{DType, Nullability, PType}; use vortex_error::{VortexResult, vortex_bail, vortex_err}; -use super::ChunkedEncoding; -use crate::arrays::{ChunkedArray, ChunkedVTable, PrimitiveArray}; +use crate::arrays::{ChunkedArray, ChunkedEncoding, ChunkedVTable}; use crate::serde::ArrayChildren; -use crate::validity::Validity; -use crate::vtable::{SerdeVTable, VisitorVTable}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, EmptyMetadata, ToCanonical}; +use crate::vtable::SerdeVTable; +use crate::{EmptyMetadata, ToCanonical}; impl SerdeVTable for ChunkedVTable { type Metadata = EmptyMetadata; @@ -62,17 +60,3 @@ impl SerdeVTable for ChunkedVTable { unsafe { Ok(ChunkedArray::new_unchecked(chunks, dtype.clone())) } } } - -impl VisitorVTable for ChunkedVTable { - fn visit_buffers(_array: &ChunkedArray, _visitor: &mut dyn ArrayBufferVisitor) {} - - fn visit_children(array: &ChunkedArray, visitor: &mut dyn ArrayChildVisitor) { - let chunk_offsets = - PrimitiveArray::new(array.chunk_offsets().clone(), 
Validity::NonNullable); - visitor.visit_child("chunk_offsets", chunk_offsets.as_ref()); - - for (idx, chunk) in array.chunks().iter().enumerate() { - visitor.visit_child(format!("chunks[{idx}]").as_str(), chunk); - } - } -} diff --git a/vortex-array/src/arrays/chunked/vtable/validity.rs b/vortex-array/src/arrays/chunked/vtable/validity.rs new file mode 100644 index 00000000000..8d579ab3218 --- /dev/null +++ b/vortex-array/src/arrays/chunked/vtable/validity.rs @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_mask::Mask; + +use crate::Array; +use crate::arrays::{ChunkedArray, ChunkedVTable}; +use crate::vtable::ValidityVTable; + +impl ValidityVTable for ChunkedVTable { + fn is_valid(array: &ChunkedArray, index: usize) -> bool { + if !array.dtype.is_nullable() { + return true; + } + let (chunk, offset_in_chunk) = array.find_chunk_idx(index); + array.chunk(chunk).is_valid(offset_in_chunk) + } + + fn all_valid(array: &ChunkedArray) -> bool { + if !array.dtype().is_nullable() { + return true; + } + for chunk in array.non_empty_chunks() { + if !chunk.all_valid() { + return false; + } + } + true + } + + fn all_invalid(array: &ChunkedArray) -> bool { + if !array.dtype().is_nullable() { + return false; + } + for chunk in array.non_empty_chunks() { + if !chunk.all_invalid() { + return false; + } + } + true + } + + fn validity_mask(array: &ChunkedArray) -> Mask { + array.chunks().iter().map(|a| a.validity_mask()).collect() + } +} diff --git a/vortex-array/src/arrays/chunked/vtable/visitor.rs b/vortex-array/src/arrays/chunked/vtable/visitor.rs new file mode 100644 index 00000000000..8582f9a20a1 --- /dev/null +++ b/vortex-array/src/arrays/chunked/vtable/visitor.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::{ChunkedArray, ChunkedVTable, PrimitiveArray}; +use crate::validity::Validity; +use 
crate::vtable::VisitorVTable; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for ChunkedVTable { + fn visit_buffers(_array: &ChunkedArray, _visitor: &mut dyn ArrayBufferVisitor) {} + + fn visit_children(array: &ChunkedArray, visitor: &mut dyn ArrayChildVisitor) { + let chunk_offsets = + PrimitiveArray::new(array.chunk_offsets().clone(), Validity::NonNullable); + visitor.visit_child("chunk_offsets", chunk_offsets.as_ref()); + + for (idx, chunk) in array.chunks().iter().enumerate() { + visitor.visit_child(format!("chunks[{idx}]").as_str(), chunk); + } + } +} diff --git a/vortex-array/src/arrays/constant/array.rs b/vortex-array/src/arrays/constant/array.rs new file mode 100644 index 00000000000..0828170d842 --- /dev/null +++ b/vortex-array/src/arrays/constant/array.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_scalar::Scalar; + +use crate::stats::ArrayStats; + +#[derive(Clone, Debug)] +pub struct ConstantArray { + pub(super) scalar: Scalar, + pub(super) len: usize, + pub(super) stats_set: ArrayStats, +} + +impl ConstantArray { + pub fn new(scalar: S, len: usize) -> Self + where + S: Into, + { + let scalar = scalar.into(); + Self { + scalar, + len, + stats_set: Default::default(), + } + } + + /// Returns the [`Scalar`] value of this constant array. 
+ pub fn scalar(&self) -> &Scalar { + &self.scalar + } +} diff --git a/vortex-array/src/arrays/constant/mod.rs b/vortex-array/src/arrays/constant/mod.rs index 0d79a64de37..ab523cb0341 100644 --- a/vortex-array/src/arrays/constant/mod.rs +++ b/vortex-array/src/arrays/constant/mod.rs @@ -1,136 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::ops::Range; +mod array; +pub use array::ConstantArray; -use vortex_buffer::ByteBufferMut; -use vortex_dtype::DType; -use vortex_mask::Mask; -use vortex_scalar::Scalar; - -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::vtable::{ - ArrayVTable, NotSupported, OperationsVTable, VTable, ValidityVTable, VisitorVTable, -}; -use crate::{ - ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, EncodingId, EncodingRef, IntoArray, vtable, -}; - -mod canonical; mod compute; -mod encode; -mod operator; -mod serde; - -vtable!(Constant); - -#[derive(Clone, Debug)] -pub struct ConstantArray { - scalar: Scalar, - len: usize, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct ConstantEncoding; - -impl VTable for ConstantVTable { - type Array = ConstantArray; - type Encoding = ConstantEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = Self; - type VisitorVTable = Self; - // TODO(ngates): implement a compute kernel for elementwise operations - type ComputeVTable = NotSupported; - type EncodeVTable = Self; - type PipelineVTable = Self; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.constant") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(ConstantEncoding.as_ref()) - } -} - -impl ConstantArray { - pub fn new(scalar: S, len: usize) -> Self - where - S: Into, - { - let scalar = scalar.into(); - Self { - scalar, - len, - stats_set: Default::default(), - } - } - - /// Returns the [`Scalar`] 
value of this constant array. - pub fn scalar(&self) -> &Scalar { - &self.scalar - } -} - -impl ArrayVTable for ConstantVTable { - fn len(array: &ConstantArray) -> usize { - array.len - } - - fn dtype(array: &ConstantArray) -> &DType { - array.scalar.dtype() - } - - fn stats(array: &ConstantArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl OperationsVTable for ConstantVTable { - fn slice(array: &ConstantArray, range: Range) -> ArrayRef { - ConstantArray::new(array.scalar.clone(), range.len()).into_array() - } - - fn scalar_at(array: &ConstantArray, _index: usize) -> Scalar { - array.scalar.clone() - } -} - -impl ValidityVTable for ConstantVTable { - fn is_valid(array: &ConstantArray, _index: usize) -> bool { - !array.scalar().is_null() - } - - fn all_valid(array: &ConstantArray) -> bool { - !array.scalar().is_null() - } - - fn all_invalid(array: &ConstantArray) -> bool { - array.scalar().is_null() - } - - fn validity_mask(array: &ConstantArray) -> Mask { - match array.scalar().is_null() { - true => Mask::AllFalse(array.len()), - false => Mask::AllTrue(array.len()), - } - } -} - -impl VisitorVTable for ConstantVTable { - fn visit_buffers(array: &ConstantArray, visitor: &mut dyn ArrayBufferVisitor) { - let buffer = array - .scalar - .value() - .to_protobytes::() - .freeze(); - visitor.visit_buffer(&buffer); - } - fn visit_children(_array: &ConstantArray, _visitor: &mut dyn ArrayChildVisitor) {} -} +mod vtable; +pub use vtable::{ConstantEncoding, ConstantVTable}; diff --git a/vortex-array/src/arrays/constant/vtable/array.rs b/vortex-array/src/arrays/constant/vtable/array.rs new file mode 100644 index 00000000000..7bf80ab3d21 --- /dev/null +++ b/vortex-array/src/arrays/constant/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::{ConstantArray, ConstantVTable}; +use crate::stats::StatsSetRef; +use 
crate::vtable::ArrayVTable; + +impl ArrayVTable for ConstantVTable { + fn len(array: &ConstantArray) -> usize { + array.len + } + + fn dtype(array: &ConstantArray) -> &DType { + array.scalar.dtype() + } + + fn stats(array: &ConstantArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/constant/canonical.rs b/vortex-array/src/arrays/constant/vtable/canonical.rs similarity index 98% rename from vortex-array/src/arrays/constant/canonical.rs rename to vortex-array/src/arrays/constant/vtable/canonical.rs index 8b98e89f2e3..c01fdf92c7d 100644 --- a/vortex-array/src/arrays/constant/canonical.rs +++ b/vortex-array/src/arrays/constant/vtable/canonical.rs @@ -12,11 +12,12 @@ use vortex_scalar::{ StructScalar, Utf8Scalar, match_each_decimal_value, match_each_decimal_value_type, }; +use crate::arrays::binary_view::BinaryView; use crate::arrays::constant::ConstantArray; use crate::arrays::primitive::PrimitiveArray; use crate::arrays::{ - BinaryView, BoolArray, ConstantVTable, DecimalArray, ExtensionArray, FixedSizeListArray, - ListArray, NullArray, StructArray, VarBinViewArray, smallest_storage_type, + BoolArray, ConstantVTable, DecimalArray, ExtensionArray, FixedSizeListArray, ListArray, + NullArray, StructArray, VarBinViewArray, smallest_decimal_value_type, }; use crate::builders::builder_with_capacity; use crate::validity::Validity; @@ -66,7 +67,7 @@ impl CanonicalVTable for ConstantVTable { }) } DType::Decimal(decimal_type, ..) 
=> { - let size = smallest_storage_type(decimal_type); + let size = smallest_decimal_value_type(decimal_type); let decimal = scalar.as_decimal(); let Some(value) = decimal.decimal_value() else { let all_null = match_each_decimal_value_type!(size, |D| { diff --git a/vortex-array/src/arrays/constant/encode.rs b/vortex-array/src/arrays/constant/vtable/encode.rs similarity index 100% rename from vortex-array/src/arrays/constant/encode.rs rename to vortex-array/src/arrays/constant/vtable/encode.rs diff --git a/vortex-array/src/arrays/constant/vtable/mod.rs b/vortex-array/src/arrays/constant/vtable/mod.rs new file mode 100644 index 00000000000..9ac5a1071fb --- /dev/null +++ b/vortex-array/src/arrays/constant/vtable/mod.rs @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::ConstantArray; +use crate::vtable::{NotSupported, VTable}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod encode; +mod operations; +mod pipeline; +mod serde; +mod validity; +mod visitor; + +vtable!(Constant); + +#[derive(Clone, Debug)] +pub struct ConstantEncoding; + +impl VTable for ConstantVTable { + type Array = ConstantArray; + type Encoding = ConstantEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = Self; + type VisitorVTable = Self; + // TODO(ngates): implement a compute kernel for elementwise operations + type ComputeVTable = NotSupported; + type EncodeVTable = Self; + type PipelineVTable = Self; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.constant") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(ConstantEncoding.as_ref()) + } +} diff --git a/vortex-array/src/arrays/constant/vtable/operations.rs b/vortex-array/src/arrays/constant/vtable/operations.rs new file mode 100644 index 
00000000000..193f8b1a733 --- /dev/null +++ b/vortex-array/src/arrays/constant/vtable/operations.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; + +use vortex_scalar::Scalar; + +use crate::arrays::{ConstantArray, ConstantVTable}; +use crate::vtable::OperationsVTable; +use crate::{ArrayRef, IntoArray}; + +impl OperationsVTable for ConstantVTable { + fn slice(array: &ConstantArray, range: Range) -> ArrayRef { + ConstantArray::new(array.scalar.clone(), range.len()).into_array() + } + + fn scalar_at(array: &ConstantArray, _index: usize) -> Scalar { + array.scalar.clone() + } +} diff --git a/vortex-array/src/arrays/constant/operator.rs b/vortex-array/src/arrays/constant/vtable/pipeline.rs similarity index 100% rename from vortex-array/src/arrays/constant/operator.rs rename to vortex-array/src/arrays/constant/vtable/pipeline.rs diff --git a/vortex-array/src/arrays/constant/serde.rs b/vortex-array/src/arrays/constant/vtable/serde.rs similarity index 100% rename from vortex-array/src/arrays/constant/serde.rs rename to vortex-array/src/arrays/constant/vtable/serde.rs diff --git a/vortex-array/src/arrays/constant/vtable/validity.rs b/vortex-array/src/arrays/constant/vtable/validity.rs new file mode 100644 index 00000000000..3fe29089813 --- /dev/null +++ b/vortex-array/src/arrays/constant/vtable/validity.rs @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_mask::Mask; + +use crate::arrays::{ConstantArray, ConstantVTable}; +use crate::vtable::ValidityVTable; + +impl ValidityVTable for ConstantVTable { + fn is_valid(array: &ConstantArray, _index: usize) -> bool { + !array.scalar().is_null() + } + + fn all_valid(array: &ConstantArray) -> bool { + !array.scalar().is_null() + } + + fn all_invalid(array: &ConstantArray) -> bool { + array.scalar().is_null() + } + + fn validity_mask(array: &ConstantArray) 
-> Mask { + match array.scalar().is_null() { + true => Mask::AllFalse(array.len), + false => Mask::AllTrue(array.len), + } + } +} diff --git a/vortex-array/src/arrays/constant/vtable/visitor.rs b/vortex-array/src/arrays/constant/vtable/visitor.rs new file mode 100644 index 00000000000..197e6a48787 --- /dev/null +++ b/vortex-array/src/arrays/constant/vtable/visitor.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_buffer::ByteBufferMut; + +use crate::arrays::{ConstantArray, ConstantVTable}; +use crate::vtable::VisitorVTable; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for ConstantVTable { + fn visit_buffers(array: &ConstantArray, visitor: &mut dyn ArrayBufferVisitor) { + let buffer = array + .scalar + .value() + .to_protobytes::() + .freeze(); + visitor.visit_buffer(&buffer); + } + + fn visit_children(_array: &ConstantArray, _visitor: &mut dyn ArrayChildVisitor) {} +} diff --git a/vortex-array/src/arrays/decimal/array.rs b/vortex-array/src/arrays/decimal/array.rs new file mode 100644 index 00000000000..654aca2b64d --- /dev/null +++ b/vortex-array/src/arrays/decimal/array.rs @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use arrow_buffer::BooleanBufferBuilder; +use itertools::Itertools; +use vortex_buffer::{Buffer, BufferMut, ByteBuffer}; +use vortex_dtype::{DType, DecimalDType, IntegerPType, match_each_integer_ptype}; +use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_panic}; +use vortex_scalar::{BigCast, DecimalValueType, NativeDecimalType, match_each_decimal_value_type}; + +use crate::ToCanonical; +use crate::arrays::is_compatible_decimal_value_type; +use crate::patches::Patches; +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +/// A decimal array that stores fixed-precision decimal numbers with 
configurable scale. +/// +/// This mirrors the Apache Arrow Decimal encoding and provides exact arithmetic for +/// financial and scientific computations where floating-point precision loss is unacceptable. +/// +/// ## Storage Format +/// +/// Decimals are stored as scaled integers in a supported scalar value type. +/// +/// The precisions supported for each scalar type are: +/// - **i8**: precision 1-2 digits +/// - **i16**: precision 3-4 digits +/// - **i32**: precision 5-9 digits +/// - **i64**: precision 10-18 digits +/// - **i128**: precision 19-38 digits +/// - **i256**: precision 39-76 digits +/// +/// These are just the maximal ranges for each scalar type, but it is perfectly legal to store +/// values with precision that does not match this exactly. For example, a valid DecimalArray with +/// precision=39 may store its values in an `i8` if all of the actual values fit into it. +/// +/// Similarly, a `DecimalArray` can be built that stores a set of precision=2 values in a +/// `Buffer`. 
+/// +/// ## Precision and Scale +/// +/// - **Precision**: Total number of significant digits (1-76, u8 range) +/// - **Scale**: Number of digits after the decimal point (-128 to 127, i8 range) +/// - **Value**: `stored_integer / 10^scale` +/// +/// For example, with precision=5 and scale=2: +/// - Stored value 12345 represents 123.45 +/// - Range: -999.99 to 999.99 +/// +/// ## Valid Scalar Types +/// +/// The underlying storage uses these native types based on precision: +/// - `DecimalValueType::I8`, `I16`, `I32`, `I64`, `I128`, `I256` +/// - Type selection is automatic based on the required precision +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::DecimalArray; +/// use vortex_dtype::DecimalDType; +/// use vortex_buffer::{buffer, Buffer}; +/// use vortex_array::validity::Validity; +/// +/// // Create a decimal array with precision=5, scale=2 (e.g., 123.45) +/// let decimal_dtype = DecimalDType::new(5, 2); +/// let values = buffer![12345i32, 67890i32, -12300i32]; // 123.45, 678.90, -123.00 +/// let array = DecimalArray::new(values, decimal_dtype, Validity::NonNullable); +/// +/// assert_eq!(array.precision(), 5); +/// assert_eq!(array.scale(), 2); +/// assert_eq!(array.len(), 3); +/// ``` +#[derive(Clone, Debug)] +pub struct DecimalArray { + pub(super) dtype: DType, + pub(super) values: ByteBuffer, + pub(super) values_type: DecimalValueType, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, +} + +impl DecimalArray { + /// Creates a new [`DecimalArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented in + /// [`DecimalArray::new_unchecked`]. + pub fn new( + buffer: Buffer, + decimal_dtype: DecimalDType, + validity: Validity, + ) -> Self { + Self::try_new(buffer, decimal_dtype, validity) + .vortex_expect("DecimalArray construction failed") + } + + /// Constructs a new `DecimalArray`. + /// + /// See [`DecimalArray::new_unchecked`] for more information. 
+ /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`DecimalArray::new_unchecked`]. + pub fn try_new( + buffer: Buffer, + decimal_dtype: DecimalDType, + validity: Validity, + ) -> VortexResult { + Self::validate(&buffer, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(buffer, decimal_dtype, validity) }) + } + + /// Creates a new [`DecimalArray`] without validation from these components: + /// + /// * `buffer` is a typed buffer containing the decimal values. + /// * `decimal_dtype` specifies the decimal precision and scale. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// - All non-null values in `buffer` must be representable within the specified precision. + /// - For example, with precision=5 and scale=2, all values must be in range [-999.99, 999.99]. + /// - If `validity` is [`Validity::Array`], its length must exactly equal `buffer.len()`. + pub unsafe fn new_unchecked( + buffer: Buffer, + decimal_dtype: DecimalDType, + validity: Validity, + ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&buffer, &validity) + .vortex_expect("[Debug Assertion]: Invalid `DecimalArray` parameters"); + + Self { + values: buffer.into_byte_buffer(), + values_type: T::VALUES_TYPE, + dtype: DType::Decimal(decimal_dtype, validity.nullability()), + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`DecimalArray`]. + /// + /// This function checks all the invariants required by [`DecimalArray::new_unchecked`]. 
+ pub(crate) fn validate( + buffer: &Buffer, + validity: &Validity, + ) -> VortexResult<()> { + if let Some(len) = validity.maybe_len() { + vortex_ensure!( + buffer.len() == len, + "Buffer and validity length mismatch: buffer={}, validity={}", + buffer.len(), + len, + ); + } + + Ok(()) + } + + /// Returns the underlying [`ByteBuffer`] of the array. + pub fn byte_buffer(&self) -> ByteBuffer { + self.values.clone() + } + + pub fn buffer(&self) -> Buffer { + if self.values_type != T::VALUES_TYPE { + vortex_panic!( + "Cannot extract Buffer<{:?}> for DecimalArray with values_type {:?}", + T::VALUES_TYPE, + self.values_type, + ); + } + Buffer::::from_byte_buffer(self.values.clone()) + } + + /// Returns the decimal type information + pub fn decimal_dtype(&self) -> DecimalDType { + if let DType::Decimal(decimal_dtype, _) = self.dtype { + decimal_dtype + } else { + vortex_panic!("Expected Decimal dtype, got {:?}", self.dtype) + } + } + + pub fn values_type(&self) -> DecimalValueType { + self.values_type + } + + pub fn precision(&self) -> u8 { + self.decimal_dtype().precision() + } + + pub fn scale(&self) -> i8 { + self.decimal_dtype().scale() + } + + pub fn from_option_iter>>( + iter: I, + decimal_dtype: DecimalDType, + ) -> Self { + let iter = iter.into_iter(); + let mut values = BufferMut::with_capacity(iter.size_hint().0); + let mut validity = BooleanBufferBuilder::new(values.capacity()); + + for i in iter { + match i { + None => { + validity.append(false); + values.push(T::default()); + } + Some(e) => { + validity.append(true); + values.push(e); + } + } + } + Self::new( + values.freeze(), + decimal_dtype, + Validity::from(validity.finish()), + ) + } + + #[allow(clippy::cognitive_complexity)] + pub fn patch(self, patches: &Patches) -> Self { + let offset = patches.offset(); + let patch_indices = patches.indices().to_primitive(); + let patch_values = patches.values().to_decimal(); + + let patched_validity = self.validity().clone().patch( + self.len(), + offset, + 
patch_indices.as_ref(), + patch_values.validity(), + ); + assert_eq!(self.decimal_dtype(), patch_values.decimal_dtype()); + + match_each_integer_ptype!(patch_indices.ptype(), |I| { + let patch_indices = patch_indices.as_slice::(); + match_each_decimal_value_type!(patch_values.values_type(), |PatchDVT| { + let patch_values = patch_values.buffer::(); + match_each_decimal_value_type!(self.values_type(), |ValuesDVT| { + let buffer = self.buffer::().into_mut(); + patch_typed( + buffer, + self.decimal_dtype(), + patch_indices, + offset, + patch_values, + patched_validity, + ) + }) + }) + }) + } +} + +fn patch_typed( + mut buffer: BufferMut, + decimal_dtype: DecimalDType, + patch_indices: &[I], + patch_indices_offset: usize, + patch_values: Buffer, + patched_validity: Validity, +) -> DecimalArray +where + I: IntegerPType, + PatchDVT: NativeDecimalType, + ValuesDVT: NativeDecimalType, +{ + if !is_compatible_decimal_value_type(ValuesDVT::VALUES_TYPE, decimal_dtype) { + vortex_panic!( + "patch_typed: {:?} cannot represent every value in {}.", + ValuesDVT::VALUES_TYPE, + decimal_dtype + ) + } + + for (idx, value) in patch_indices.iter().zip_eq(patch_values.into_iter()) { + buffer[idx.as_() - patch_indices_offset] = ::from(value).vortex_expect( + "values of a given DecimalDType are representable in all compatible NativeDecimalType", + ); + } + + DecimalArray::new(buffer.freeze(), decimal_dtype, patched_validity) +} diff --git a/vortex-array/src/arrays/decimal/mod.rs b/vortex-array/src/arrays/decimal/mod.rs index 5117738c4ed..803ededda81 100644 --- a/vortex-array/src/arrays/decimal/mod.rs +++ b/vortex-array/src/arrays/decimal/mod.rs @@ -1,338 +1,19 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -mod compute; -mod narrow; -mod ops; -mod patch; -mod serde; - -use arrow_buffer::BooleanBufferBuilder; -use vortex_buffer::{Buffer, BufferMut, ByteBuffer}; -use vortex_dtype::{DType, DecimalDType}; -use vortex_error::{VortexExpect, 
VortexResult, vortex_ensure, vortex_panic}; -use vortex_scalar::{DecimalValueType, NativeDecimalType}; - -pub use crate::arrays::decimal::narrow::narrowed_decimal; -use crate::builders::ArrayBuilder; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, VisitorVTable, -}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, Canonical, EncodingId, EncodingRef, vtable}; - -vtable!(Decimal); - -impl VTable for DecimalVTable { - type Array = DecimalArray; - type Encoding = DecimalEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.decimal") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(DecimalEncoding.as_ref()) - } -} - -#[derive(Clone, Debug)] -pub struct DecimalEncoding; - -/// Maps a decimal precision into the smallest type that can represent it. -pub fn smallest_storage_type(decimal_dtype: &DecimalDType) -> DecimalValueType { - match decimal_dtype.precision() { - 1..=2 => DecimalValueType::I8, - 3..=4 => DecimalValueType::I16, - 5..=9 => DecimalValueType::I32, - 10..=18 => DecimalValueType::I64, - 19..=38 => DecimalValueType::I128, - 39..=76 => DecimalValueType::I256, - 0 => unreachable!("precision must be greater than 0"), - p => unreachable!("precision larger than 76 is invalid found precision {p}"), - } -} - -/// True if `value_type` can represent every value of the type `dtype`. 
-pub fn compatible_storage_type(value_type: DecimalValueType, dtype: DecimalDType) -> bool { - value_type >= smallest_storage_type(&dtype) -} - -/// A decimal array that stores fixed-precision decimal numbers with configurable scale. -/// -/// This mirrors the Apache Arrow Decimal encoding and provides exact arithmetic for -/// financial and scientific computations where floating-point precision loss is unacceptable. -/// -/// ## Storage Format -/// -/// Decimals are stored as scaled integers in a supported scalar value type. -/// -/// The precisions supported for each scalar type are: -/// - **i8**: precision 1-2 digits -/// - **i16**: precision 3-4 digits -/// - **i32**: precision 5-9 digits -/// - **i64**: precision 10-18 digits -/// - **i128**: precision 19-38 digits -/// - **i256**: precision 39-76 digits -/// -/// These are just the maximal ranges for each scalar type, but it is perfectly legal to store -/// values with precision that does not match this exactly. For example, a valid DecimalArray with -/// precision=39 may store its values in an `i8` if all of the actual values fit into it. -/// -/// Similarly, a `DecimalArray` can be built that stores a set of precision=2 values in a -/// `Buffer`. 
-/// -/// ## Precision and Scale -/// -/// - **Precision**: Total number of significant digits (1-76, u8 range) -/// - **Scale**: Number of digits after the decimal point (-128 to 127, i8 range) -/// - **Value**: `stored_integer / 10^scale` -/// -/// For example, with precision=5 and scale=2: -/// - Stored value 12345 represents 123.45 -/// - Range: -999.99 to 999.99 -/// -/// ## Valid Scalar Types -/// -/// The underlying storage uses these native types based on precision: -/// - `DecimalValueType::I8`, `I16`, `I32`, `I64`, `I128`, `I256` -/// - Type selection is automatic based on the required precision -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::DecimalArray; -/// use vortex_dtype::DecimalDType; -/// use vortex_buffer::{buffer, Buffer}; -/// use vortex_array::validity::Validity; -/// -/// // Create a decimal array with precision=5, scale=2 (e.g., 123.45) -/// let decimal_dtype = DecimalDType::new(5, 2); -/// let values = buffer![12345i32, 67890i32, -12300i32]; // 123.45, 678.90, -123.00 -/// let array = DecimalArray::new(values, decimal_dtype, Validity::NonNullable); -/// -/// assert_eq!(array.precision(), 5); -/// assert_eq!(array.scale(), 2); -/// assert_eq!(array.len(), 3); -/// ``` -#[derive(Clone, Debug)] -pub struct DecimalArray { - dtype: DType, - values: ByteBuffer, - values_type: DecimalValueType, - validity: Validity, - stats_set: ArrayStats, -} - -impl DecimalArray { - /// Creates a new [`DecimalArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented in - /// [`DecimalArray::new_unchecked`]. - pub fn new( - buffer: Buffer, - decimal_dtype: DecimalDType, - validity: Validity, - ) -> Self { - Self::try_new(buffer, decimal_dtype, validity) - .vortex_expect("DecimalArray construction failed") - } - - /// Constructs a new `DecimalArray`. - /// - /// See [`DecimalArray::new_unchecked`] for more information. 
- /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`DecimalArray::new_unchecked`]. - pub fn try_new( - buffer: Buffer, - decimal_dtype: DecimalDType, - validity: Validity, - ) -> VortexResult { - Self::validate(&buffer, &validity)?; - - // SAFETY: validate ensures all invariants are met. - Ok(unsafe { Self::new_unchecked(buffer, decimal_dtype, validity) }) - } +mod array; +pub use array::DecimalArray; - /// Creates a new [`DecimalArray`] without validation from these components: - /// - /// * `buffer` is a typed buffer containing the decimal values. - /// * `decimal_dtype` specifies the decimal precision and scale. - /// * `validity` holds the null values. - /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// - All non-null values in `buffer` must be representable within the specified precision. - /// - For example, with precision=5 and scale=2, all values must be in range [-999.99, 999.99]. - /// - If `validity` is [`Validity::Array`], its length must exactly equal `buffer.len()`. - pub unsafe fn new_unchecked( - buffer: Buffer, - decimal_dtype: DecimalDType, - validity: Validity, - ) -> Self { - Self { - values: buffer.into_byte_buffer(), - values_type: T::VALUES_TYPE, - dtype: DType::Decimal(decimal_dtype, validity.nullability()), - validity, - stats_set: Default::default(), - } - } - - /// Validates the components that would be used to create a [`DecimalArray`]. - /// - /// This function checks all the invariants required by [`DecimalArray::new_unchecked`]. - pub(crate) fn validate( - buffer: &Buffer, - validity: &Validity, - ) -> VortexResult<()> { - if let Some(len) = validity.maybe_len() { - vortex_ensure!( - buffer.len() == len, - "Buffer and validity length mismatch: buffer={}, validity={}", - buffer.len(), - len, - ); - } - - Ok(()) - } - - /// Returns the underlying [`ByteBuffer`] of the array. 
- pub fn byte_buffer(&self) -> ByteBuffer { - self.values.clone() - } - - pub fn buffer(&self) -> Buffer { - if self.values_type != T::VALUES_TYPE { - vortex_panic!( - "Cannot extract Buffer<{:?}> for DecimalArray with values_type {:?}", - T::VALUES_TYPE, - self.values_type, - ); - } - Buffer::::from_byte_buffer(self.values.clone()) - } - - /// Returns the decimal type information - pub fn decimal_dtype(&self) -> DecimalDType { - if let DType::Decimal(decimal_dtype, _) = self.dtype { - decimal_dtype - } else { - vortex_panic!("Expected Decimal dtype, got {:?}", self.dtype) - } - } - - pub fn values_type(&self) -> DecimalValueType { - self.values_type - } - - pub fn precision(&self) -> u8 { - self.decimal_dtype().precision() - } - - pub fn scale(&self) -> i8 { - self.decimal_dtype().scale() - } - - pub fn from_option_iter>>( - iter: I, - decimal_dtype: DecimalDType, - ) -> Self { - let iter = iter.into_iter(); - let mut values = BufferMut::with_capacity(iter.size_hint().0); - let mut validity = BooleanBufferBuilder::new(values.capacity()); - - for i in iter { - match i { - None => { - validity.append(false); - values.push(T::default()); - } - Some(e) => { - validity.append(true); - values.push(e); - } - } - } - Self::new( - values.freeze(), - decimal_dtype, - Validity::from(validity.finish()), - ) - } -} - -impl ArrayVTable for DecimalVTable { - fn len(array: &DecimalArray) -> usize { - let divisor = match array.values_type { - DecimalValueType::I8 => 1, - DecimalValueType::I16 => 2, - DecimalValueType::I32 => 4, - DecimalValueType::I64 => 8, - DecimalValueType::I128 => 16, - DecimalValueType::I256 => 32, - ty => vortex_panic!("unknown decimal value type {:?}", ty), - }; - array.values.len() / divisor - } - - fn dtype(array: &DecimalArray) -> &DType { - &array.dtype - } - - fn stats(array: &DecimalArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl VisitorVTable for DecimalVTable { - fn visit_buffers(array: &DecimalArray, visitor: 
&mut dyn ArrayBufferVisitor) { - visitor.visit_buffer(&array.values); - } - - fn visit_children(array: &DecimalArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_validity(array.validity(), array.len()) - } -} +mod compute; -impl CanonicalVTable for DecimalVTable { - fn canonicalize(array: &DecimalArray) -> Canonical { - Canonical::Decimal(array.clone()) - } +mod vtable; +pub use vtable::{DecimalEncoding, DecimalVTable}; - fn append_to_builder(array: &DecimalArray, builder: &mut dyn ArrayBuilder) { - builder.extend_from_array(array.as_ref()) - } -} - -impl ValidityHelper for DecimalArray { - fn validity(&self) -> &Validity { - &self.validity - } -} +mod utils; +pub use utils::*; #[cfg(test)] -mod test { +mod tests { use arrow_array::Decimal128Array; #[test] diff --git a/vortex-array/src/arrays/decimal/patch.rs b/vortex-array/src/arrays/decimal/patch.rs deleted file mode 100644 index cc3317f5c2e..00000000000 --- a/vortex-array/src/arrays/decimal/patch.rs +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use itertools::Itertools as _; -use vortex_buffer::{Buffer, BufferMut}; -use vortex_dtype::{DecimalDType, IntegerPType, match_each_integer_ptype}; -use vortex_error::{VortexExpect as _, vortex_panic}; -use vortex_scalar::{BigCast, NativeDecimalType, match_each_decimal_value_type}; - -use super::{DecimalArray, compatible_storage_type}; -use crate::ToCanonical as _; -use crate::patches::Patches; -use crate::validity::Validity; -use crate::vtable::ValidityHelper; - -impl DecimalArray { - #[allow(clippy::cognitive_complexity)] - pub fn patch(self, patches: &Patches) -> Self { - let offset = patches.offset(); - let patch_indices = patches.indices().to_primitive(); - let patch_values = patches.values().to_decimal(); - - let patched_validity = self.validity().clone().patch( - self.len(), - offset, - patch_indices.as_ref(), - patch_values.validity(), - ); - 
assert_eq!(self.decimal_dtype(), patch_values.decimal_dtype()); - - match_each_integer_ptype!(patch_indices.ptype(), |I| { - let patch_indices = patch_indices.as_slice::(); - match_each_decimal_value_type!(patch_values.values_type(), |PatchDVT| { - let patch_values = patch_values.buffer::(); - match_each_decimal_value_type!(self.values_type(), |ValuesDVT| { - let buffer = self.buffer::().into_mut(); - patch_typed( - buffer, - self.decimal_dtype(), - patch_indices, - offset, - patch_values, - patched_validity, - ) - }) - }) - }) - } -} - -fn patch_typed( - mut buffer: BufferMut, - decimal_dtype: DecimalDType, - patch_indices: &[I], - patch_indices_offset: usize, - patch_values: Buffer, - patched_validity: Validity, -) -> DecimalArray -where - I: IntegerPType, - PatchDVT: NativeDecimalType, - ValuesDVT: NativeDecimalType, -{ - if !compatible_storage_type(ValuesDVT::VALUES_TYPE, decimal_dtype) { - vortex_panic!( - "patch_typed: {:?} cannot represent every value in {}.", - ValuesDVT::VALUES_TYPE, - decimal_dtype - ) - } - - for (idx, value) in patch_indices.iter().zip_eq(patch_values.into_iter()) { - buffer[idx.as_() - patch_indices_offset] = ::from(value).vortex_expect( - "values of a given DecimalDType are representable in all compatible NativeDecimalType", - ); - } - - DecimalArray::new(buffer.freeze(), decimal_dtype, patched_validity) -} diff --git a/vortex-array/src/arrays/decimal/narrow.rs b/vortex-array/src/arrays/decimal/utils.rs similarity index 72% rename from vortex-array/src/arrays/decimal/narrow.rs rename to vortex-array/src/arrays/decimal/utils.rs index 7eebecc30d5..3a54b4de371 100644 --- a/vortex-array/src/arrays/decimal/narrow.rs +++ b/vortex-array/src/arrays/decimal/utils.rs @@ -2,12 +2,32 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use itertools::{Itertools, MinMaxResult}; +use vortex_dtype::DecimalDType; use vortex_error::VortexExpect; use vortex_scalar::{BigCast, DecimalValueType, i256}; use crate::arrays::DecimalArray; use 
crate::vtable::ValidityHelper; +/// Maps a decimal precision into the smallest type that can represent it. +pub fn smallest_decimal_value_type(decimal_dtype: &DecimalDType) -> DecimalValueType { + match decimal_dtype.precision() { + 1..=2 => DecimalValueType::I8, + 3..=4 => DecimalValueType::I16, + 5..=9 => DecimalValueType::I32, + 10..=18 => DecimalValueType::I64, + 19..=38 => DecimalValueType::I128, + 39..=76 => DecimalValueType::I256, + 0 => unreachable!("precision must be greater than 0"), + p => unreachable!("precision larger than 76 is invalid found precision {p}"), + } +} + +/// True if `value_type` can represent every value of the type `dtype`. +pub fn is_compatible_decimal_value_type(value_type: DecimalValueType, dtype: DecimalDType) -> bool { + value_type >= smallest_decimal_value_type(&dtype) +} + macro_rules! try_downcast { ($array:expr, from: $src:ty, to: $($dst:ty),*) => {{ // Collect the min/max of the values diff --git a/vortex-array/src/arrays/decimal/vtable/array.rs b/vortex-array/src/arrays/decimal/vtable/array.rs new file mode 100644 index 00000000000..e6af95487b9 --- /dev/null +++ b/vortex-array/src/arrays/decimal/vtable/array.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; +use vortex_error::vortex_panic; +use vortex_scalar::DecimalValueType; + +use crate::arrays::{DecimalArray, DecimalVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for DecimalVTable { + fn len(array: &DecimalArray) -> usize { + let divisor = match array.values_type { + DecimalValueType::I8 => 1, + DecimalValueType::I16 => 2, + DecimalValueType::I32 => 4, + DecimalValueType::I64 => 8, + DecimalValueType::I128 => 16, + DecimalValueType::I256 => 32, + ty => vortex_panic!("unknown decimal value type {:?}", ty), + }; + array.values.len() / divisor + } + + fn dtype(array: &DecimalArray) -> &DType { + &array.dtype + } + + fn stats(array: 
&DecimalArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/decimal/vtable/canonical.rs b/vortex-array/src/arrays/decimal/vtable/canonical.rs new file mode 100644 index 00000000000..8b99f7e9c00 --- /dev/null +++ b/vortex-array/src/arrays/decimal/vtable/canonical.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::{DecimalArray, DecimalVTable}; +use crate::builders::ArrayBuilder; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for DecimalVTable { + fn canonicalize(array: &DecimalArray) -> Canonical { + Canonical::Decimal(array.clone()) + } + + fn append_to_builder(array: &DecimalArray, builder: &mut dyn ArrayBuilder) { + builder.extend_from_array(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/decimal/vtable/mod.rs b/vortex-array/src/arrays/decimal/vtable/mod.rs new file mode 100644 index 00000000000..fa408626053 --- /dev/null +++ b/vortex-array/src/arrays/decimal/vtable/mod.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::DecimalArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; +mod visitor; + +vtable!(Decimal); + +impl VTable for DecimalVTable { + type Array = DecimalArray; + type Encoding = DecimalEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.decimal") + } + + 
fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(DecimalEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct DecimalEncoding; diff --git a/vortex-array/src/arrays/decimal/ops.rs b/vortex-array/src/arrays/decimal/vtable/operations.rs similarity index 100% rename from vortex-array/src/arrays/decimal/ops.rs rename to vortex-array/src/arrays/decimal/vtable/operations.rs diff --git a/vortex-array/src/arrays/decimal/serde.rs b/vortex-array/src/arrays/decimal/vtable/serde.rs similarity index 100% rename from vortex-array/src/arrays/decimal/serde.rs rename to vortex-array/src/arrays/decimal/vtable/serde.rs diff --git a/vortex-array/src/arrays/decimal/vtable/validity.rs b/vortex-array/src/arrays/decimal/vtable/validity.rs new file mode 100644 index 00000000000..fd66742629c --- /dev/null +++ b/vortex-array/src/arrays/decimal/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::DecimalArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for DecimalArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/decimal/vtable/visitor.rs b/vortex-array/src/arrays/decimal/vtable/visitor.rs new file mode 100644 index 00000000000..5e0265b0b9a --- /dev/null +++ b/vortex-array/src/arrays/decimal/vtable/visitor.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::{DecimalArray, DecimalVTable}; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for DecimalVTable { + fn visit_buffers(array: &DecimalArray, visitor: &mut dyn ArrayBufferVisitor) { + visitor.visit_buffer(&array.values); + } + + fn visit_children(array: &DecimalArray, visitor: &mut dyn ArrayChildVisitor) { + 
visitor.visit_validity(array.validity(), array.len()) + } +} diff --git a/vortex-array/src/arrays/extension/array.rs b/vortex-array/src/arrays/extension/array.rs new file mode 100644 index 00000000000..aee1467b8a8 --- /dev/null +++ b/vortex-array/src/arrays/extension/array.rs @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_dtype::{DType, ExtDType, ExtID}; + +use crate::ArrayRef; +use crate::stats::ArrayStats; + +/// An extension array that wraps another array with additional type information. +/// +/// **⚠️ Unstable API**: This is an experimental feature that may change significantly +/// in future versions. The extension type system is still evolving. +/// +/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible +/// mechanism for adding semantic meaning to existing array types without requiring +/// changes to the core type system. +/// +/// ## Design Philosophy +/// +/// Extension arrays serve as a type-safe wrapper that: +/// - Preserves the underlying storage format and operations +/// - Adds semantic type information via `ExtDType` +/// - Enables custom serialization and deserialization logic +/// - Allows domain-specific interpretations of generic data +/// +/// ## Storage and Type Relationship +/// +/// The extension array maintains a strict contract: +/// - **Storage array**: Contains the actual data in a standard Vortex encoding +/// - **Extension type**: Defines how to interpret the storage data semantically +/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype +/// +/// ## Use Cases +/// +/// Extension arrays are ideal for: +/// - **Custom numeric types**: Units of measurement, currencies +/// - **Temporal types**: Custom date/time formats, time zones, calendars +/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates +/// - **Encoded types**: 
Base64 strings, compressed data, encrypted values +/// +/// ## Validity and Operations +/// +/// Extension arrays delegate validity and most operations to their storage array: +/// - Validity is inherited from the underlying storage +/// - Slicing preserves the extension type +/// - Scalar access wraps storage scalars with extension metadata +/// +/// # Examples +/// +/// ``` +/// use std::sync::Arc; +/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray}; +/// use vortex_dtype::{ExtDType, ExtID, DType, Nullability, PType}; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_buffer::buffer; +/// +/// // Define a custom extension type for representing currency values +/// let currency_id = ExtID::from("example.currency"); +/// let currency_dtype = Arc::new(ExtDType::new( +/// currency_id, +/// Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents +/// None, // No additional metadata needed +/// )); +/// +/// // Create storage array with currency values in cents +/// let cents_storage = PrimitiveArray::new( +/// buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99 +/// Validity::NonNullable +/// ); +/// +/// // Wrap with extension type +/// let currency_array = ExtensionArray::new( +/// currency_dtype.clone(), +/// cents_storage.into_array() +/// ); +/// +/// assert_eq!(currency_array.len(), 3); +/// assert_eq!(currency_array.id().as_ref(), "example.currency"); +/// +/// // Access maintains extension type information +/// let first_value = currency_array.scalar_at(0); +/// assert!(first_value.as_extension_opt().is_some()); +/// ``` +#[derive(Clone, Debug)] +pub struct ExtensionArray { + pub(super) dtype: DType, + pub(super) storage: ArrayRef, + pub(super) stats_set: ArrayStats, +} + +impl ExtensionArray { + pub fn new(ext_dtype: Arc, storage: ArrayRef) -> Self { + assert_eq!( + ext_dtype.storage_dtype(), + storage.dtype(), + "ExtensionArray: storage_dtype must match storage 
array DType", + ); + Self { + dtype: DType::Extension(ext_dtype), + storage, + stats_set: ArrayStats::default(), + } + } + + pub fn ext_dtype(&self) -> &Arc { + let DType::Extension(ext) = &self.dtype else { + unreachable!("ExtensionArray: dtype must be an ExtDType") + }; + ext + } + + pub fn storage(&self) -> &ArrayRef { + &self.storage + } + + #[allow(dead_code)] + #[inline] + pub fn id(&self) -> &ExtID { + self.ext_dtype().id() + } +} diff --git a/vortex-array/src/arrays/extension/mod.rs b/vortex-array/src/arrays/extension/mod.rs index 38ec63590ae..c89ec4bc180 100644 --- a/vortex-array/src/arrays/extension/mod.rs +++ b/vortex-array/src/arrays/extension/mod.rs @@ -1,208 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::ops::Range; -use std::sync::Arc; - -use vortex_dtype::{DType, ExtDType, ExtID}; -use vortex_scalar::Scalar; - -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityChild, - ValidityVTableFromChild, VisitorVTable, -}; -use crate::{ - Array, ArrayBufferVisitor, ArrayChildVisitor, ArrayRef, Canonical, EncodingId, EncodingRef, - IntoArray, vtable, -}; +mod array; +pub use array::ExtensionArray; mod compute; -mod serde; - -vtable!(Extension); - -impl VTable for ExtensionVTable { - type Array = ExtensionArray; - type Encoding = ExtensionEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromChild; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.ext") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(ExtensionEncoding.as_ref()) - } -} - -#[derive(Clone, Debug)] -pub struct 
ExtensionEncoding; - -/// An extension array that wraps another array with additional type information. -/// -/// **⚠️ Unstable API**: This is an experimental feature that may change significantly -/// in future versions. The extension type system is still evolving. -/// -/// Unlike Apache Arrow's extension arrays, Vortex extension arrays provide a more flexible -/// mechanism for adding semantic meaning to existing array types without requiring -/// changes to the core type system. -/// -/// ## Design Philosophy -/// -/// Extension arrays serve as a type-safe wrapper that: -/// - Preserves the underlying storage format and operations -/// - Adds semantic type information via `ExtDType` -/// - Enables custom serialization and deserialization logic -/// - Allows domain-specific interpretations of generic data -/// -/// ## Storage and Type Relationship -/// -/// The extension array maintains a strict contract: -/// - **Storage array**: Contains the actual data in a standard Vortex encoding -/// - **Extension type**: Defines how to interpret the storage data semantically -/// - **Type safety**: The storage array's dtype must match the extension type's storage dtype -/// -/// ## Use Cases -/// -/// Extension arrays are ideal for: -/// - **Custom numeric types**: Units of measurement, currencies -/// - **Temporal types**: Custom date/time formats, time zones, calendars -/// - **Domain-specific types**: UUIDs, IP addresses, geographic coordinates -/// - **Encoded types**: Base64 strings, compressed data, encrypted values -/// -/// ## Validity and Operations -/// -/// Extension arrays delegate validity and most operations to their storage array: -/// - Validity is inherited from the underlying storage -/// - Slicing preserves the extension type -/// - Scalar access wraps storage scalars with extension metadata -/// -/// # Examples -/// -/// ``` -/// use std::sync::Arc; -/// use vortex_array::arrays::{ExtensionArray, PrimitiveArray}; -/// use vortex_dtype::{ExtDType, 
ExtID, DType, Nullability, PType}; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_buffer::buffer; -/// -/// // Define a custom extension type for representing currency values -/// let currency_id = ExtID::from("example.currency"); -/// let currency_dtype = Arc::new(ExtDType::new( -/// currency_id, -/// Arc::new(DType::Primitive(PType::I64, Nullability::NonNullable)), // Storage as i64 cents -/// None, // No additional metadata needed -/// )); -/// -/// // Create storage array with currency values in cents -/// let cents_storage = PrimitiveArray::new( -/// buffer![12345i64, 67890, 99999], // $123.45, $678.90, $999.99 -/// Validity::NonNullable -/// ); -/// -/// // Wrap with extension type -/// let currency_array = ExtensionArray::new( -/// currency_dtype.clone(), -/// cents_storage.into_array() -/// ); -/// -/// assert_eq!(currency_array.len(), 3); -/// assert_eq!(currency_array.id().as_ref(), "example.currency"); -/// -/// // Access maintains extension type information -/// let first_value = currency_array.scalar_at(0); -/// assert!(first_value.as_extension_opt().is_some()); -/// ``` -#[derive(Clone, Debug)] -pub struct ExtensionArray { - dtype: DType, - storage: ArrayRef, - stats_set: ArrayStats, -} - -impl ExtensionArray { - pub fn new(ext_dtype: Arc, storage: ArrayRef) -> Self { - assert_eq!( - ext_dtype.storage_dtype(), - storage.dtype(), - "ExtensionArray: storage_dtype must match storage array DType", - ); - Self { - dtype: DType::Extension(ext_dtype), - storage, - stats_set: ArrayStats::default(), - } - } - - pub fn ext_dtype(&self) -> &Arc { - let DType::Extension(ext) = &self.dtype else { - unreachable!("ExtensionArray: dtype must be an ExtDType") - }; - ext - } - - pub fn storage(&self) -> &ArrayRef { - &self.storage - } - - #[allow(dead_code)] - #[inline] - pub fn id(&self) -> &ExtID { - self.ext_dtype().id() - } -} - -impl ArrayVTable for ExtensionVTable { - fn len(array: &ExtensionArray) -> usize { - 
array.storage.len() - } - - fn dtype(array: &ExtensionArray) -> &DType { - &array.dtype - } - - fn stats(array: &ExtensionArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl ValidityChild for ExtensionVTable { - fn validity_child(array: &ExtensionArray) -> &dyn Array { - array.storage.as_ref() - } -} - -impl CanonicalVTable for ExtensionVTable { - fn canonicalize(array: &ExtensionArray) -> Canonical { - Canonical::Extension(array.clone()) - } -} - -impl OperationsVTable for ExtensionVTable { - fn slice(array: &ExtensionArray, range: Range) -> ArrayRef { - ExtensionArray::new(array.ext_dtype().clone(), array.storage().slice(range)).into_array() - } - - fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar { - Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index)) - } -} - -impl VisitorVTable for ExtensionVTable { - fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {} - fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_child("storage", array.storage.as_ref()); - } -} +mod vtable; +pub use vtable::{ExtensionEncoding, ExtensionVTable}; diff --git a/vortex-array/src/arrays/extension/vtable/array.rs b/vortex-array/src/arrays/extension/vtable/array.rs new file mode 100644 index 00000000000..ebd49f97850 --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::extension::{ExtensionArray, ExtensionVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for ExtensionVTable { + fn len(array: &ExtensionArray) -> usize { + array.storage.len() + } + + fn dtype(array: &ExtensionArray) -> &DType { + &array.dtype + } + + fn stats(array: &ExtensionArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff 
--git a/vortex-array/src/arrays/extension/vtable/canonical.rs b/vortex-array/src/arrays/extension/vtable/canonical.rs new file mode 100644 index 00000000000..475335656ca --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/canonical.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::extension::{ExtensionArray, ExtensionVTable}; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for ExtensionVTable { + fn canonicalize(array: &ExtensionArray) -> Canonical { + Canonical::Extension(array.clone()) + } +} diff --git a/vortex-array/src/arrays/extension/vtable/mod.rs b/vortex-array/src/arrays/extension/vtable/mod.rs new file mode 100644 index 00000000000..3a56ba4b1a7 --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/mod.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; +mod visitor; + +use crate::arrays::extension::ExtensionArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromChild}; +use crate::{EncodingId, EncodingRef, vtable}; + +vtable!(Extension); + +impl VTable for ExtensionVTable { + type Array = ExtensionArray; + type Encoding = ExtensionEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromChild; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.ext") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(ExtensionEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct ExtensionEncoding; diff --git 
a/vortex-array/src/arrays/extension/vtable/operations.rs b/vortex-array/src/arrays/extension/vtable/operations.rs new file mode 100644 index 00000000000..3beda9809ee --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/operations.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; + +use vortex_scalar::Scalar; + +use crate::arrays::extension::{ExtensionArray, ExtensionVTable}; +use crate::vtable::OperationsVTable; +use crate::{ArrayRef, IntoArray}; + +impl OperationsVTable for ExtensionVTable { + fn slice(array: &ExtensionArray, range: Range) -> ArrayRef { + ExtensionArray::new(array.ext_dtype().clone(), array.storage().slice(range)).into_array() + } + + fn scalar_at(array: &ExtensionArray, index: usize) -> Scalar { + Scalar::extension(array.ext_dtype().clone(), array.storage().scalar_at(index)) + } +} diff --git a/vortex-array/src/arrays/extension/serde.rs b/vortex-array/src/arrays/extension/vtable/serde.rs similarity index 93% rename from vortex-array/src/arrays/extension/serde.rs rename to vortex-array/src/arrays/extension/vtable/serde.rs index 694480f7222..4299cf9b005 100644 --- a/vortex-array/src/arrays/extension/serde.rs +++ b/vortex-array/src/arrays/extension/vtable/serde.rs @@ -5,9 +5,8 @@ use vortex_buffer::ByteBuffer; use vortex_dtype::DType; use vortex_error::{VortexResult, vortex_bail}; -use super::ExtensionEncoding; use crate::EmptyMetadata; -use crate::arrays::{ExtensionArray, ExtensionVTable}; +use crate::arrays::extension::{ExtensionArray, ExtensionEncoding, ExtensionVTable}; use crate::serde::ArrayChildren; use crate::vtable::SerdeVTable; diff --git a/vortex-array/src/arrays/extension/vtable/validity.rs b/vortex-array/src/arrays/extension/vtable/validity.rs new file mode 100644 index 00000000000..690a13f6377 --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// 
SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Array; +use crate::arrays::extension::{ExtensionArray, ExtensionVTable}; +use crate::vtable::ValidityChild; + +impl ValidityChild for ExtensionVTable { + fn validity_child(array: &ExtensionArray) -> &dyn Array { + array.storage.as_ref() + } +} diff --git a/vortex-array/src/arrays/extension/vtable/visitor.rs b/vortex-array/src/arrays/extension/vtable/visitor.rs new file mode 100644 index 00000000000..03e9603911f --- /dev/null +++ b/vortex-array/src/arrays/extension/vtable/visitor.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::extension::{ExtensionArray, ExtensionVTable}; +use crate::vtable::VisitorVTable; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for ExtensionVTable { + fn visit_buffers(_array: &ExtensionArray, _visitor: &mut dyn ArrayBufferVisitor) {} + + fn visit_children(array: &ExtensionArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_child("storage", array.storage.as_ref()); + } +} diff --git a/vortex-array/src/arrays/fixed_size_list/array.rs b/vortex-array/src/arrays/fixed_size_list/array.rs new file mode 100644 index 00000000000..8165a15e4ce --- /dev/null +++ b/vortex-array/src/arrays/fixed_size_list/array.rs @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_dtype::DType; +use vortex_error::{VortexExpect, VortexResult, vortex_ensure}; + +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::{Array, ArrayRef}; + +/// The canonical encoding for fixed-size list arrays. +/// +/// A fixed-size list array stores lists where each list has the same number of elements. This is +/// similar to a 2D array or matrix where the inner dimension is fixed. 
+/// +/// ## Data Layout +/// +/// Unlike [`ListArray`] which uses offsets, `FixedSizeListArray` stores elements contiguously and +/// uses a fixed `list_size`: +/// +/// - **Elements array**: A flat array containing all list elements concatenated together +/// - **List size**: The fixed number of elements in each list +/// - **Validity**: Optional mask indicating which lists are null +/// +/// The list at index `i` contains elements from `elements[i * list_size..(i + 1) * list_size]`. +/// +/// [`ListArray`]: crate::arrays::ListArray +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::{FixedSizeListArray, PrimitiveArray}; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_buffer::buffer; +/// +/// // Create a fixed-size list array representing [[1, 2], [3, 4], [5, 6], [7, 8]] +/// let elements = buffer![1i32, 2, 3, 4, 5, 6, 7, 8].into_array(); +/// let list_size = 2; +/// +/// let fixed_list_array = FixedSizeListArray::new( +/// elements.into_array(), +/// list_size, +/// Validity::NonNullable, +/// 4, // 4 lists +/// ); +/// +/// assert_eq!(fixed_list_array.len(), 4); +/// assert_eq!(fixed_list_array.list_size(), 2); +/// +/// // Access individual lists +/// let first_list = fixed_list_array.fixed_size_list_elements_at(0); +/// assert_eq!(first_list.len(), 2); +/// ``` +#[derive(Clone, Debug)] +pub struct FixedSizeListArray { + /// The [`DType`] of the fixed-size list. + /// + /// This type **must** be the variant [`DType::FixedSizeList`]. + pub(super) dtype: DType, + + /// The `elements` data array, where each fixed-size list scalar is a _slice_ of the `elements` + /// array, and each inner list element is a _scalar_ of the `elements` array. + /// + /// The fixed-size list scalars (or the elements of the array) are contiguous (regardless of + /// nullability for easy lookups), each with equal size in memory. + elements: ArrayRef, + + /// The size of each fixed-size list scalar in the array. 
+ /// + /// We store the size of each fixed-size list in the array as a field for convenience. + list_size: u32, + + /// The validity / null map of the array. + /// + /// Note that this null map refers to which fixed-size list scalars are null, **not** which + /// sub-elements of fixed-size list scalars are null. The `elements` array will track individual + /// value nullability. + pub(super) validity: Validity, + + /// The length of the array. + /// + /// Note that this is different from the size of each fixed-size list scalar (`list_size`). + /// + /// The main reason we need to store this (rather than calculate it on the fly via `list_size` + /// and `elements.len()`) is because in the degenerate case where `list_size == 0`, we cannot + /// use `0 / 0` to determine the length. + pub(super) len: usize, + + /// The stats for this array. + pub(super) stats_set: ArrayStats, +} + +impl FixedSizeListArray { + /// Creates a new [`FixedSizeListArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`FixedSizeListArray::new_unchecked`]. + pub fn new(elements: ArrayRef, list_size: u32, validity: Validity, len: usize) -> Self { + Self::try_new(elements, list_size, validity, len) + .vortex_expect("FixedSizeListArray construction failed") + } + + /// Constructs a new `FixedSizeListArray`. + /// + /// See [`FixedSizeListArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the inputs are invalid. The inputs are **valid** if: + /// + /// - The `list_size` is 0 and: + /// - The `elements` array is empty. + /// - The `len` is equal to the length of the `validity` map. + /// - The length of the `elements` array is a multiple of the size of the fixed-size lists + /// (`list_size`). 
+ /// - The `Validity` length (if it exists) times the `list_size` is equal to the length of the + /// `elements` (or put another way, the length of the array divided by the size of each + /// fixed-size list is equal to the length of the validity). + pub fn try_new( + elements: ArrayRef, + list_size: u32, + validity: Validity, + len: usize, + ) -> VortexResult { + Self::validate(&elements, len, list_size, &validity)?; + + // SAFETY: we validate that the inputs are valid above. + Ok(unsafe { Self::new_unchecked(elements, list_size, validity, len) }) + } + + /// Creates a new [`FixedSizeListArray`] without validation from these components: + /// + /// * `elements` is the data array where each fixed-size list is a slice. + /// * `list_size` is the fixed number of elements in each list. + /// * `validity` holds the null values. + /// * `len` is the number of lists in the array. + /// + /// # Safety + /// + /// The inputs are **valid** if: + /// + /// - The `list_size` is 0 and: + /// - The `elements` array is empty. + /// - The `len` is equal to the length of the `validity` map. + /// - The length of the `elements` array is a multiple of the size of the fixed-size lists + /// (`list_size`). + /// - The `Validity` length (if it exists) times the `list_size` is equal to the length of the + /// `elements` (or put another way, the length of the array divided by the size of each + /// fixed-size list is equal to the length of the validity). 
+ pub unsafe fn new_unchecked( + elements: ArrayRef, + list_size: u32, + validity: Validity, + len: usize, + ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&elements, len, list_size, &validity) + .vortex_expect("[Debug Assertion]: Invalid `FixedSizeListArray` parameters"); + + let nullability = validity.nullability(); + + Self { + dtype: DType::FixedSizeList(Arc::new(elements.dtype().clone()), list_size, nullability), + elements, + list_size, + validity, + len, + stats_set: Default::default(), + } + } + + /// Returns the elements array. + pub fn elements(&self) -> &ArrayRef { + &self.elements + } + + /// The size of each fixed-size list scalar in the array. + pub const fn list_size(&self) -> u32 { + self.list_size + } + + /// Returns the elements of the fixed-size list scalar at the given index of the list array. + /// + /// # Panics + /// + /// Panics if the index is out of bounds. + pub fn fixed_size_list_elements_at(&self, index: usize) -> ArrayRef { + debug_assert!( + index < self.len, + "index out of bounds: the len is {} but the index is {index}", + self.len + ); + debug_assert!(self.validity.is_valid(index)); + + let start = self.list_size as usize * index; + let end = self.list_size as usize * (index + 1); + self.elements().slice(start..end) + } + + /// Validates the components that would be used to create a [`FixedSizeListArray`]. + /// + /// This function checks all the invariants required by [`FixedSizeListArray::new_unchecked`]. + pub(crate) fn validate( + elements: &dyn Array, + len: usize, + list_size: u32, + validity: &Validity, + ) -> VortexResult<()> { + // A fixed-size list array where each list scalar is empty is completely useless, but we can + // support it regardless. 
+ if list_size == 0 { + vortex_ensure!( + elements.is_empty() && validity.maybe_len().is_none_or(|vlen| vlen == len), + "an empty `FixedSizeList` should have no elements" + ); + return Ok(()); + } + + let num_elements = elements.len(); + + vortex_ensure!( + len * list_size as usize == num_elements, + "the `elements` array has the incorrect number of elements to construct a \ + `FixedSizeList[{list_size}] array of length {len}", + ); + + // If a validity array is present, it must be the same length as the fixed-size list array. + if let Some(validity_len) = validity.maybe_len() { + vortex_ensure!( + len == validity_len, + "validity with size {validity_len} does not match fixed-size list array size {len}", + ); + } + + Ok(()) + } +} diff --git a/vortex-array/src/arrays/fixed_size_list/mod.rs b/vortex-array/src/arrays/fixed_size_list/mod.rs index 4b86d265464..9ec30181a4b 100644 --- a/vortex-array/src/arrays/fixed_size_list/mod.rs +++ b/vortex-array/src/arrays/fixed_size_list/mod.rs @@ -1,246 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::sync::Arc; +mod array; +pub use array::FixedSizeListArray; -use vortex_dtype::DType; -use vortex_error::{VortexExpect, VortexResult, vortex_ensure}; - -use crate::stats::ArrayStats; -use crate::validity::Validity; -use crate::{Array, ArrayRef}; +mod compute; mod vtable; pub use vtable::{FixedSizeListEncoding, FixedSizeListVTable}; #[cfg(test)] mod tests; - -mod compute; - -/// The canonical encoding for fixed-size list arrays. -/// -/// A fixed-size list array stores lists where each list has the same number of elements. This is -/// similar to a 2D array or matrix where the inner dimension is fixed. 
-/// -/// ## Data Layout -/// -/// Unlike [`ListArray`] which uses offsets, `FixedSizeListArray` stores elements contiguously and -/// uses a fixed `list_size`: -/// -/// - **Elements array**: A flat array containing all list elements concatenated together -/// - **List size**: The fixed number of elements in each list -/// - **Validity**: Optional mask indicating which lists are null -/// -/// The list at index `i` contains elements from `elements[i * list_size..(i + 1) * list_size]`. -/// -/// [`ListArray`]: crate::arrays::ListArray -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::{FixedSizeListArray, PrimitiveArray}; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_buffer::buffer; -/// -/// // Create a fixed-size list array representing [[1, 2] [3, 4], [5, 6], [7, 8]] -/// let elements = buffer![1i32, 2, 3, 4, 5, 6, 7, 8].into_array(); -/// let list_size = 2; -/// -/// let fixed_list_array = FixedSizeListArray::new( -/// elements.into_array(), -/// list_size, -/// Validity::NonNullable, -/// 4, // 4 lists -/// ); -/// -/// assert_eq!(fixed_list_array.len(), 4); -/// assert_eq!(fixed_list_array.list_size(), 2); -/// -/// // Access individual lists -/// let first_list = fixed_list_array.fixed_size_list_elements_at(0); -/// assert_eq!(first_list.len(), 2); -/// ``` -#[derive(Clone, Debug)] -pub struct FixedSizeListArray { - /// The [`DType`] of the fixed-size list. - /// - /// This type **must** be the variant [`DType::FixedSizeList`]. - dtype: DType, - - /// The `elements` data array, where each fixed-size list scalar is a _slice_ of the `elements` - /// array, and each inner list element is a _scalar_ of the `elements` array. - /// - /// The fixed-size list scalars (or the elements of the array) are contiguous (regardless of - /// nullability for easy lookups), each with equal size in memory. - elements: ArrayRef, - - /// The size of each fixed-size list scalar in the array. 
- /// - /// We store the size of each fixed-size list in the array as a field for convenience. - list_size: u32, - - /// The validity / null map of the array. - /// - /// Note that this null map refers to which fixed-size list scalars are null, **not** which - /// sub-elements of fixed-size list scalars are null. The `elements` array will track individual - /// value nullability. - validity: Validity, - - /// The length of the array. - /// - /// Note that this is different from the size of each fixed-size list scalar (`list_size`). - /// - /// The main reason we need to store this (rather than calculate it on the fly via `list_size` - /// and `elements.len()`) is because in the degenerate case where `list_size == 0`, we cannot - /// use `0 / 0` to determine the length. - len: usize, - - /// The stats for this array. - stats_set: ArrayStats, -} - -impl FixedSizeListArray { - /// Creates a new [`FixedSizeListArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`FixedSizeListArray::new_unchecked`]. - pub fn new(elements: ArrayRef, list_size: u32, validity: Validity, len: usize) -> Self { - Self::try_new(elements, list_size, validity, len) - .vortex_expect("FixedSizeListArray construction failed") - } - - /// Constructs a new `FixedSizeListArray`. - /// - /// See [`FixedSizeListArray::new_unchecked`] for more information. - /// - /// # Errors - /// - /// Returns an error if the inputs are invalid. The inputs are **valid** if: - /// - /// - The `list_size` is 0 and: - /// - The `elements` array is empty. - /// - The `len` is equal to the length of the `validity` map. - /// - The length of the `elements` array is a multiple of the size of the fixed-size lists - /// (`list_size`). 
- /// - The `Validity` length (if it exists) times the `list_size` is equal to the length of the - /// `elements` (or put another way, the length of the array divided by the size of each - /// fixed-size list is equal to the length of the validity). - pub fn try_new( - elements: ArrayRef, - list_size: u32, - validity: Validity, - len: usize, - ) -> VortexResult { - Self::validate(&elements, len, list_size, &validity)?; - - // SAFETY: we validate that the inputs are valid above. - Ok(unsafe { Self::new_unchecked(elements, list_size, validity, len) }) - } - - /// Creates a new [`FixedSizeListArray`] without validation from these components: - /// - /// * `elements` is the data array where each fixed-size list is a slice. - /// * `list_size` is the fixed number of elements in each list. - /// * `validity` holds the null values. - /// * `len` is the number of lists in the array. - /// - /// # Safety - /// - /// The inputs are **valid** if: - /// - /// - The `list_size` is 0 and: - /// - The `elements` array is empty. - /// - The `len` is equal to the length of the `validity` map. - /// - The length of the `elements` array is a multiple of the size of the fixed-size lists - /// (`list_size`). - /// - The `Validity` length (if it exists) times the `list_size` is equal to the length of the - /// `elements` (or put another way, the length of the array divided by the size of each - /// fixed-size list is equal to the length of the validity). - pub unsafe fn new_unchecked( - elements: ArrayRef, - list_size: u32, - validity: Validity, - len: usize, - ) -> Self { - let nullability = validity.nullability(); - - Self { - dtype: DType::FixedSizeList(Arc::new(elements.dtype().clone()), list_size, nullability), - elements, - list_size, - validity, - len, - stats_set: Default::default(), - } - } - - /// Returns the elements array. - pub fn elements(&self) -> &ArrayRef { - &self.elements - } - - /// The size of each fixed-size list scalar in the array. 
- pub const fn list_size(&self) -> u32 { - self.list_size - } - - /// Returns the elements of the fixed-size list scalar at the given index of the list array. - /// - /// # Panics - /// - /// Panics if the index is out of bounds. - pub fn fixed_size_list_elements_at(&self, index: usize) -> ArrayRef { - debug_assert!( - index < self.len, - "index out of bounds: the len is {} but the index is {index}", - self.len - ); - debug_assert!(self.validity.is_valid(index)); - - let start = self.list_size as usize * index; - let end = self.list_size as usize * (index + 1); - self.elements().slice(start..end) - } - - /// Validates the components that would be used to create a [`FixedSizeListArray`]. - /// - /// This function checks all the invariants required by [`FixedSizeListArray::new_unchecked`]. - pub(crate) fn validate( - elements: &dyn Array, - len: usize, - list_size: u32, - validity: &Validity, - ) -> VortexResult<()> { - // A fixed-size list array where each list scalar is empty is completely useless, but we can - // support it regardless. - if list_size == 0 { - vortex_ensure!( - elements.is_empty() && validity.maybe_len().is_none_or(|vlen| vlen == len), - "an empty `FixedSizeList` should have no elements" - ); - return Ok(()); - } - - let num_elements = elements.len(); - - vortex_ensure!( - len * list_size as usize == num_elements, - "the `elements` array has the incorrect number of elements to construct a \ - `FixedSizeList[{list_size}] array of length {len}", - ); - - // If a validity array is present, it must be the same length as the fixed-size list array. 
- if let Some(validity_len) = validity.maybe_len() { - vortex_ensure!( - len == validity_len, - "validity with size {validity_len} does not match fixed-size list array size {len}", - ); - } - - Ok(()) - } -} diff --git a/vortex-array/src/arrays/fixed_size_list/vtable/operations.rs b/vortex-array/src/arrays/fixed_size_list/vtable/operations.rs index 296ac37bd66..9291e5789d6 100644 --- a/vortex-array/src/arrays/fixed_size_list/vtable/operations.rs +++ b/vortex-array/src/arrays/fixed_size_list/vtable/operations.rs @@ -49,7 +49,7 @@ impl OperationsVTable for FixedSizeListVTable { Scalar::fixed_size_list( list.dtype().clone(), children_elements, - array.dtype.nullability(), + array.dtype().nullability(), ) } } diff --git a/vortex-array/src/arrays/list/array.rs b/vortex-array/src/arrays/list/array.rs new file mode 100644 index 00000000000..d21e0076889 --- /dev/null +++ b/vortex-array/src/arrays/list/array.rs @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use num_traits::AsPrimitive; +use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype}; +use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure}; + +use crate::arrays::{ListVTable, PrimitiveVTable}; +use crate::compute::{min_max, sub_scalar}; +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::{Array, ArrayRef, IntoArray}; + +/// A list array that stores variable-length lists of elements, similar to `Vec>`. +/// +/// This mirrors the Apache Arrow List array encoding and provides efficient storage +/// for nested data where each row contains a list of elements of the same type. 
+/// +/// ## Data Layout +/// +/// The list array uses an offset-based encoding: +/// - **Elements array**: A flat array containing all list elements concatenated together +/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into +/// the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list. +/// - **Validity**: Optional mask indicating which lists are null +/// +/// This allows for excellent cascading compression of the elements and offsets, as similar values +/// are clustered together and the offsets have a predictable pattern and small deltas between +/// consecutive elements. +/// +/// ## Offset Semantics +/// +/// - Offsets must be non-nullable integers (i32, i64, etc.) +/// - Offsets array has length `n+1` where `n` is the number of lists +/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]` +/// - Offsets must be monotonically increasing +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::{ListArray, PrimitiveArray}; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_buffer::buffer; +/// use std::sync::Arc; +/// +/// // Create a list array representing [[1, 2], [3, 4, 5], []] +/// let elements = buffer![1i32, 2, 3, 4, 5].into_array(); +/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists +/// +/// let list_array = ListArray::try_new( +/// elements.into_array(), +/// offsets.into_array(), +/// Validity::NonNullable, +/// ).unwrap(); +/// +/// assert_eq!(list_array.len(), 3); +/// +/// // Access individual lists +/// let first_list = list_array.list_elements_at(0); +/// assert_eq!(first_list.len(), 2); // [1, 2] +/// +/// let third_list = list_array.list_elements_at(2); +/// assert!(third_list.is_empty()); // [] +/// ``` +#[derive(Clone, Debug)] +pub struct ListArray { + pub(super) dtype: DType, + pub(super) elements: ArrayRef, + pub(super) offsets: ArrayRef, + pub(super) validity: Validity, + pub(super) 
stats_set: ArrayStats, +} + +impl ListArray { + /// Creates a new [`ListArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`ListArray::new_unchecked`]. + pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self { + Self::try_new(elements, offsets, validity).vortex_expect("ListArray new") + } + + /// Constructs a new `ListArray`. + /// + /// See [`ListArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`ListArray::new_unchecked`]. + pub fn try_new( + elements: ArrayRef, + offsets: ArrayRef, + validity: Validity, + ) -> VortexResult { + Self::validate(&elements, &offsets, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(elements, offsets, validity) }) + } + + /// Creates a new [`ListArray`] without validation from these components: + /// + /// * `elements` is a flat array containing all list elements concatenated. + /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// - Offsets must be a non-nullable integer array. + /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]). + /// - Offsets must be sorted (monotonically increasing). + /// - All offset values must be non-negative. + /// - The maximum offset must not exceed `elements.len()`. + /// - If validity is an array, its length must equal `offsets.len() - 1`. 
+ pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self { + #[cfg(debug_assertions)] + Self::validate(&elements, &offsets, &validity) + .vortex_expect("[Debug Assertion]: Invalid `ListViewArray` parameters"); + + Self { + dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()), + elements, + offsets, + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`ListArray`]. + /// + /// This function checks all the invariants required by [`ListArray::new_unchecked`]. + pub(crate) fn validate( + elements: &dyn Array, + offsets: &dyn Array, + validity: &Validity, + ) -> VortexResult<()> { + // Offsets must have at least one element + vortex_ensure!( + !offsets.is_empty(), + "Offsets must have at least one element, [0] for an empty list" + ); + + // Offsets must be of integer type, and cannot go lower than 0. + vortex_ensure!( + offsets.dtype().is_int() && !offsets.dtype().is_nullable(), + "offsets have invalid type {}", + offsets.dtype() + ); + + // We can safely unwrap the DType as primitive now + let offsets_ptype = offsets.dtype().as_ptype(); + + // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed) + if let Some(is_sorted) = offsets.statistics().compute_is_sorted() { + vortex_ensure!(is_sorted, "offsets must be sorted"); + } else { + vortex_bail!("offsets must report is_sorted statistic"); + } + + // Validate that offsets min is non-negative, and max does not exceed the length of + // the elements array. + if let Some(min_max) = min_max(offsets)? { + match_each_integer_ptype!(offsets_ptype, |P| { + let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1)) + .vortex_expect("Offsets type must fit offsets values"); + + #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)] + { + if let Some(min) = min_max.min.as_primitive().as_::

() { + vortex_ensure!( + min >= 0 && min <= max_offset, + "offsets minimum {min} outside valid range [0, {max_offset}]" + ); + } + + if let Some(max) = min_max.max.as_primitive().as_::

() { + vortex_ensure!( + max >= 0 && max <= max_offset, + "offsets maximum {max} outside valid range [0, {max_offset}]" + ) + } + } + + vortex_ensure!( + max_offset + <= P::try_from(elements.len()) + .vortex_expect("Offsets type must be able to fit elements length"), + "Max offset {max_offset} is beyond the length of the elements array {}", + elements.len() + ); + }) + } else { + // TODO(aduffy): fallback to slower validation pathway? + vortex_bail!( + "offsets array with encoding {} must support min_max compute function", + offsets.encoding_id() + ); + }; + + // If a validity array is present, it must be the same length as the ListArray + if let Some(validity_len) = validity.maybe_len() { + vortex_ensure!( + validity_len == offsets.len() - 1, + "validity with size {validity_len} does not match array size {}", + offsets.len() - 1 + ); + } + + Ok(()) + } + + /// Returns the offset at the given index from the list array. + /// + /// Panics if the index is out of bounds. + pub fn offset_at(&self, index: usize) -> usize { + assert!( + index <= self.len(), + "Index {index} out of bounds 0..={}", + self.len() + ); + + self.offsets() + .as_opt::() + .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::

()[index].as_() })) + .unwrap_or_else(|| { + self.offsets() + .scalar_at(index) + .as_primitive() + .as_::() + .vortex_expect("index must fit in usize") + }) + } + + /// Returns the elements of the list scalar at the given index of the list array. + pub fn list_elements_at(&self, index: usize) -> ArrayRef { + let start = self.offset_at(index); + let end = self.offset_at(index + 1); + self.elements().slice(start..end) + } + + /// Returns elements of the list array referenced by the offsets array. + /// + /// This is useful for discarding any potentially unused parts of the underlying `elements` + /// child array. + pub fn sliced_elements(&self) -> ArrayRef { + let start = self.offset_at(0); + let end = self.offset_at(self.len()); + self.elements().slice(start..end) + } + + /// Returns the offsets array. + pub fn offsets(&self) -> &ArrayRef { + &self.offsets + } + + /// Returns the elements array. + pub fn elements(&self) -> &ArrayRef { + &self.elements + } + + /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not + /// referenced by the `offsets`. 
+ pub fn reset_offsets(&self, recurse: bool) -> VortexResult { + let mut elements = self.sliced_elements(); + if recurse && elements.is_canonical() { + elements = elements.to_canonical().compact()?.into_array(); + } else if recurse && let Some(child_list_array) = elements.as_opt::() { + elements = child_list_array.reset_offsets(recurse)?.into_array(); + } + + let offsets = self.offsets(); + let first_offset = offsets.scalar_at(0); + let adjusted_offsets = sub_scalar(offsets, first_offset)?; + + Self::try_new(elements, adjusted_offsets, self.validity.clone()) + } +} diff --git a/vortex-array/src/arrays/list/compute/filter.rs b/vortex-array/src/arrays/list/compute/filter.rs index d22b5639435..807cfa28454 100644 --- a/vortex-array/src/arrays/list/compute/filter.rs +++ b/vortex-array/src/arrays/list/compute/filter.rs @@ -23,7 +23,7 @@ const MASK_EXPANSION_DENSITY_THRESHOLD: f64 = 0.05; impl FilterKernel for ListVTable { fn filter(&self, array: &ListArray, selection_mask: &Mask) -> VortexResult { let elements = array.elements(); - let offsets = array.offsets.to_primitive(); + let offsets = array.offsets().to_primitive(); let new_validity = array.validity().filter(selection_mask)?; debug_assert!( diff --git a/vortex-array/src/arrays/list/compute/is_constant.rs b/vortex-array/src/arrays/list/compute/is_constant.rs index bdc46bc9955..96e35d961a2 100644 --- a/vortex-array/src/arrays/list/compute/is_constant.rs +++ b/vortex-array/src/arrays/list/compute/is_constant.rs @@ -37,9 +37,9 @@ impl IsConstantKernel for ListVTable { // If the array is long, do an optimistic check on the remainder of the list lengths. 
if array.len() > SMALL_ARRAY_THRESHOLD { // check the rest of the element lengths - let start_offsets = array.offsets.slice(SMALL_ARRAY_THRESHOLD..array.len()); + let start_offsets = array.offsets().slice(SMALL_ARRAY_THRESHOLD..array.len()); let end_offsets = array - .offsets + .offsets() .slice(SMALL_ARRAY_THRESHOLD + 1..array.len() + 1); let list_lengths = numeric(&end_offsets, &start_offsets, NumericOperator::Sub)?; diff --git a/vortex-array/src/arrays/list/mod.rs b/vortex-array/src/arrays/list/mod.rs index 018ca2ed3b4..3cdb1a24065 100644 --- a/vortex-array/src/arrays/list/mod.rs +++ b/vortex-array/src/arrays/list/mod.rs @@ -1,436 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -mod compute; -mod serde; - -use std::ops::Range; -use std::sync::Arc; - -#[cfg(feature = "test-harness")] -use itertools::Itertools; -use num_traits::AsPrimitive; -#[cfg(feature = "test-harness")] -use vortex_dtype::IntegerPType; -use vortex_dtype::{DType, match_each_integer_ptype, match_each_native_ptype}; -use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_ensure}; -use vortex_scalar::Scalar; - -use crate::arrays::PrimitiveVTable; -#[cfg(feature = "test-harness")] -use crate::builders::{ArrayBuilder, ListBuilder}; -use crate::compute::{min_max, sub_scalar}; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, -}; -use crate::{Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, vtable}; - -vtable!(List); - -impl VTable for ListVTable { - type Array = ListArray; - type Encoding = ListEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = 
NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.list") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(ListEncoding.as_ref()) - } -} - -/// A list array that stores variable-length lists of elements, similar to `Vec>`. -/// -/// This mirrors the Apache Arrow List array encoding and provides efficient storage -/// for nested data where each row contains a list of elements of the same type. -/// -/// ## Data Layout -/// -/// The list array uses an offset-based encoding: -/// - **Elements array**: A flat array containing all list elements concatenated together -/// - **Offsets array**: Integer array where `offsets[i]` is an (inclusive) start index into -/// the **elements** and `offsets[i+1]` is the (exclusive) stop index for the `i`th list. -/// - **Validity**: Optional mask indicating which lists are null -/// -/// This allows for excellent cascading compression of the elements and offsets, as similar values -/// are clustered together and the offsets have a predictable pattern and small deltas between -/// consecutive elements. -/// -/// ## Offset Semantics -/// -/// - Offsets must be non-nullable integers (i32, i64, etc.) 
-/// - Offsets array has length `n+1` where `n` is the number of lists -/// - List `i` contains elements from `elements[offsets[i]..offsets[i+1]]` -/// - Offsets must be monotonically increasing -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::{ListArray, PrimitiveArray}; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_buffer::buffer; -/// use std::sync::Arc; -/// -/// // Create a list array representing [[1, 2], [3, 4, 5], []] -/// let elements = buffer![1i32, 2, 3, 4, 5].into_array(); -/// let offsets = buffer![0u32, 2, 5, 5].into_array(); // 3 lists -/// -/// let list_array = ListArray::try_new( -/// elements.into_array(), -/// offsets.into_array(), -/// Validity::NonNullable, -/// ).unwrap(); -/// -/// assert_eq!(list_array.len(), 3); -/// -/// // Access individual lists -/// let first_list = list_array.list_elements_at(0); -/// assert_eq!(first_list.len(), 2); // [1, 2] -/// -/// let third_list = list_array.list_elements_at(2); -/// assert!(third_list.is_empty()); // [] -/// ``` -#[derive(Clone, Debug)] -pub struct ListArray { - dtype: DType, - elements: ArrayRef, - offsets: ArrayRef, - validity: Validity, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct ListEncoding; - -impl ListArray { - /// Creates a new [`ListArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`ListArray::new_unchecked`]. - pub fn new(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self { - Self::try_new(elements, offsets, validity).vortex_expect("ListArray new") - } - - /// Constructs a new `ListArray`. - /// - /// See [`ListArray::new_unchecked`] for more information. - /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`ListArray::new_unchecked`]. 
- pub fn try_new( - elements: ArrayRef, - offsets: ArrayRef, - validity: Validity, - ) -> VortexResult { - Self::validate(&elements, &offsets, &validity)?; - - // SAFETY: validate ensures all invariants are met. - Ok(unsafe { Self::new_unchecked(elements, offsets, validity) }) - } - - /// Creates a new [`ListArray`] without validation from these components: - /// - /// * `elements` is a flat array containing all list elements concatenated. - /// * `offsets` is an integer array where `offsets[i]` is the start index for list `i`. - /// * `validity` holds the null values. - /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// - Offsets must be a non-nullable integer array. - /// - Offsets must have at least one element (even for empty lists, it should contain \[0\]). - /// - Offsets must be sorted (monotonically increasing). - /// - All offset values must be non-negative. - /// - The maximum offset must not exceed `elements.len()`. - /// - If validity is an array, its length must equal `offsets.len() - 1`. - pub unsafe fn new_unchecked(elements: ArrayRef, offsets: ArrayRef, validity: Validity) -> Self { - Self { - dtype: DType::List(Arc::new(elements.dtype().clone()), validity.nullability()), - elements, - offsets, - validity, - stats_set: Default::default(), - } - } - - /// Validates the components that would be used to create a [`ListArray`]. - /// - /// This function checks all the invariants required by [`ListArray::new_unchecked`]. - pub(crate) fn validate( - elements: &dyn Array, - offsets: &dyn Array, - validity: &Validity, - ) -> VortexResult<()> { - // Offsets must have at least one element - vortex_ensure!( - !offsets.is_empty(), - "Offsets must have at least one element, [0] for an empty list" - ); - - // Offsets must be of integer type, and cannot go lower than 0. 
- vortex_ensure!( - offsets.dtype().is_int() && !offsets.dtype().is_nullable(), - "offsets have invalid type {}", - offsets.dtype() - ); - - // We can safely unwrap the DType as primitive now - let offsets_ptype = offsets.dtype().as_ptype(); - - // Offsets must be sorted (but not strictly sorted, zero-length lists are allowed) - if let Some(is_sorted) = offsets.statistics().compute_is_sorted() { - vortex_ensure!(is_sorted, "offsets must be sorted"); - } else { - vortex_bail!("offsets must report is_sorted statistic"); - } - - // Validate that offsets min is non-negative, and max does not exceed the length of - // the elements array. - if let Some(min_max) = min_max(offsets)? { - match_each_integer_ptype!(offsets_ptype, |P| { - let max_offset = P::try_from(offsets.scalar_at(offsets.len() - 1)) - .vortex_expect("Offsets type must fit offsets values"); - - #[allow(clippy::absurd_extreme_comparisons, unused_comparisons)] - { - if let Some(min) = min_max.min.as_primitive().as_::

() { - vortex_ensure!( - min >= 0 && min <= max_offset, - "offsets minimum {min} outside valid range [0, {max_offset}]" - ); - } +mod array; +pub use array::ListArray; - if let Some(max) = min_max.max.as_primitive().as_::

() { - vortex_ensure!( - max >= 0 && max <= max_offset, - "offsets maximum {max} outside valid range [0, {max_offset}]" - ) - } - } - - vortex_ensure!( - max_offset - <= P::try_from(elements.len()) - .vortex_expect("Offsets type must be able to fit elements length"), - "Max offset {max_offset} is beyond the length of the elements array {}", - elements.len() - ); - }) - } else { - // TODO(aduffy): fallback to slower validation pathway? - vortex_bail!( - "offsets array with encoding {} must support min_max compute function", - offsets.encoding_id() - ); - }; - - // If a validity array is present, it must be the same length as the ListArray - if let Some(validity_len) = validity.maybe_len() { - vortex_ensure!( - validity_len == offsets.len() - 1, - "validity with size {validity_len} does not match array size {}", - offsets.len() - 1 - ); - } - - Ok(()) - } - - /// Returns the offset at the given index from the list array. - /// - /// Panics if the index is out of bounds. - pub fn offset_at(&self, index: usize) -> usize { - assert!( - index <= self.len(), - "Index {index} out of bounds 0..={}", - self.len() - ); - - self.offsets() - .as_opt::() - .map(|p| match_each_native_ptype!(p.ptype(), |P| { p.as_slice::

()[index].as_() })) - .unwrap_or_else(|| { - self.offsets() - .scalar_at(index) - .as_primitive() - .as_::() - .vortex_expect("index must fit in usize") - }) - } - - /// Returns the elements of the list scalar at the given index of the list array. - pub fn list_elements_at(&self, index: usize) -> ArrayRef { - let start = self.offset_at(index); - let end = self.offset_at(index + 1); - self.elements().slice(start..end) - } - - /// Returns elements of the list array referenced by the offsets array. - /// - /// This is useful for discarding any potentially unused parts of the underlying `elements` - /// child array. - pub fn sliced_elements(&self) -> ArrayRef { - let start = self.offset_at(0); - let end = self.offset_at(self.len()); - self.elements().slice(start..end) - } - - /// Returns the offsets array. - pub fn offsets(&self) -> &ArrayRef { - &self.offsets - } - - /// Returns the elements array. - pub fn elements(&self) -> &ArrayRef { - &self.elements - } - - /// Create a copy of this array by adjusting `offsets` to start at `0` and removing elements not - /// referenced by the `offsets`. 
- pub fn reset_offsets(&self, recurse: bool) -> VortexResult { - let mut elements = self.sliced_elements(); - if recurse && elements.is_canonical() { - elements = elements.to_canonical().compact()?.into_array(); - } - - let offsets = self.offsets(); - let first_offset = offsets.scalar_at(0); - let adjusted_offsets = sub_scalar(offsets, first_offset)?; - - Self::try_new(elements, adjusted_offsets, self.validity.clone()) - } -} - -impl ArrayVTable for ListVTable { - fn len(array: &ListArray) -> usize { - array.offsets.len().saturating_sub(1) - } - - fn dtype(array: &ListArray) -> &DType { - &array.dtype - } - - fn stats(array: &ListArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl OperationsVTable for ListVTable { - fn slice(array: &ListArray, range: Range) -> ArrayRef { - ListArray::new( - array.elements().clone(), - array.offsets().slice(range.start..range.end + 1), - array.validity().slice(range), - ) - .into_array() - } - - fn scalar_at(array: &ListArray, index: usize) -> Scalar { - // By the preconditions we know that the list scalar is not null. - let elems = array.list_elements_at(index); - let scalars: Vec = (0..elems.len()).map(|i| elems.scalar_at(i)).collect(); - - Scalar::list( - Arc::new(elems.dtype().clone()), - scalars, - array.dtype().nullability(), - ) - } -} - -impl CanonicalVTable for ListVTable { - fn canonicalize(array: &ListArray) -> Canonical { - Canonical::List(array.clone()) - } -} +mod compute; -impl ValidityHelper for ListArray { - fn validity(&self) -> &Validity { - &self.validity - } -} +mod vtable; +pub use vtable::{ListEncoding, ListVTable}; #[cfg(feature = "test-harness")] -impl ListArray { - /// This is a convenience method to create a list array from an iterator of iterators. - /// This method is slow however since each element is first converted to a scalar and then - /// appended to the array. 
- pub fn from_iter_slow( - iter: I, - dtype: Arc, - ) -> VortexResult - where - I::Item: IntoIterator, - ::Item: Into, - { - let iter = iter.into_iter(); - let mut builder = ListBuilder::::with_capacity( - dtype.clone(), - vortex_dtype::Nullability::NonNullable, - iter.size_hint().0, - ); - - for v in iter { - let elem = Scalar::list( - dtype.clone(), - v.into_iter().map(|x| x.into()).collect_vec(), - dtype.nullability(), - ); - builder.append_value(elem.as_list())? - } - Ok(builder.finish()) - } - - pub fn from_iter_opt_slow>, T>( - iter: I, - dtype: Arc, - ) -> VortexResult - where - T: IntoIterator, - T::Item: Into, - { - let iter = iter.into_iter(); - let mut builder = ListBuilder::::with_capacity( - dtype.clone(), - vortex_dtype::Nullability::Nullable, - iter.size_hint().0, - ); - - for v in iter { - if let Some(v) = v { - let elem = Scalar::list( - dtype.clone(), - v.into_iter().map(|x| x.into()).collect_vec(), - dtype.nullability(), - ); - builder.append_value(elem.as_list())? - } else { - builder.append_null() - } - } - Ok(builder.finish()) - } -} +mod test_harness; #[cfg(test)] mod tests; diff --git a/vortex-array/src/arrays/list/test_harness.rs b/vortex-array/src/arrays/list/test_harness.rs new file mode 100644 index 00000000000..547fbcf1e82 --- /dev/null +++ b/vortex-array/src/arrays/list/test_harness.rs @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use itertools::Itertools; +use vortex_dtype::{DType, IntegerPType}; +use vortex_error::VortexResult; +use vortex_scalar::Scalar; + +use crate::ArrayRef; +use crate::arrays::ListArray; +use crate::builders::{ArrayBuilder, ListBuilder}; + +impl ListArray { + /// This is a convenience method to create a list array from an iterator of iterators. + /// This method is slow however since each element is first converted to a scalar and then + /// appended to the array. 
+ pub fn from_iter_slow( + iter: I, + dtype: Arc, + ) -> VortexResult + where + I::Item: IntoIterator, + ::Item: Into, + { + let iter = iter.into_iter(); + let mut builder = ListBuilder::::with_capacity( + dtype.clone(), + vortex_dtype::Nullability::NonNullable, + iter.size_hint().0, + ); + + for v in iter { + let elem = Scalar::list( + dtype.clone(), + v.into_iter().map(|x| x.into()).collect_vec(), + dtype.nullability(), + ); + builder.append_value(elem.as_list())? + } + Ok(builder.finish()) + } + + pub fn from_iter_opt_slow>, T>( + iter: I, + dtype: Arc, + ) -> VortexResult + where + T: IntoIterator, + T::Item: Into, + { + let iter = iter.into_iter(); + let mut builder = ListBuilder::::with_capacity( + dtype.clone(), + vortex_dtype::Nullability::Nullable, + iter.size_hint().0, + ); + + for v in iter { + if let Some(v) = v { + let elem = Scalar::list( + dtype.clone(), + v.into_iter().map(|x| x.into()).collect_vec(), + dtype.nullability(), + ); + builder.append_value(elem.as_list())? + } else { + builder.append_null() + } + } + Ok(builder.finish()) + } +} diff --git a/vortex-array/src/arrays/list/tests.rs b/vortex-array/src/arrays/list/tests.rs index a0af5ecfa32..3a95fb83b0b 100644 --- a/vortex-array/src/arrays/list/tests.rs +++ b/vortex-array/src/arrays/list/tests.rs @@ -9,10 +9,14 @@ use vortex_dtype::PType::I32; use vortex_dtype::{DType, Nullability}; use vortex_error::VortexUnwrap; use vortex_mask::Mask; +use vortex_scalar::Scalar; use super::*; +use crate::IntoArray; use crate::arrays::PrimitiveArray; +use crate::builders::{ArrayBuilder, ListBuilder}; use crate::compute::filter; +use crate::validity::Validity; #[test] fn test_empty_list_array() { diff --git a/vortex-array/src/arrays/list/vtable/array.rs b/vortex-array/src/arrays/list/vtable/array.rs new file mode 100644 index 00000000000..08c0b7ec2c4 --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright 
the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::{ListArray, ListVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for ListVTable { + fn len(array: &ListArray) -> usize { + array.offsets.len().saturating_sub(1) + } + + fn dtype(array: &ListArray) -> &DType { + &array.dtype + } + + fn stats(array: &ListArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/list/vtable/canonical.rs b/vortex-array/src/arrays/list/vtable/canonical.rs new file mode 100644 index 00000000000..b883b4707c3 --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/canonical.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::{ListArray, ListVTable}; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for ListVTable { + fn canonicalize(array: &ListArray) -> Canonical { + Canonical::List(array.clone()) + } +} diff --git a/vortex-array/src/arrays/list/vtable/mod.rs b/vortex-array/src/arrays/list/vtable/mod.rs new file mode 100644 index 00000000000..00455ef270e --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/mod.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::ListArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; +mod visitor; + +vtable!(List); + +impl VTable for ListVTable { + type Array = ListArray; + type Encoding = ListEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type 
PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.list") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(ListEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct ListEncoding; diff --git a/vortex-array/src/arrays/list/vtable/operations.rs b/vortex-array/src/arrays/list/vtable/operations.rs new file mode 100644 index 00000000000..5fd63f57d8f --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/operations.rs @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; +use std::sync::Arc; + +use vortex_scalar::Scalar; + +use crate::arrays::{ListArray, ListVTable}; +use crate::vtable::{OperationsVTable, ValidityHelper}; +use crate::{ArrayRef, IntoArray}; + +impl OperationsVTable for ListVTable { + fn slice(array: &ListArray, range: Range) -> ArrayRef { + ListArray::new( + array.elements().clone(), + array.offsets().slice(range.start..range.end + 1), + array.validity().slice(range), + ) + .into_array() + } + + fn scalar_at(array: &ListArray, index: usize) -> Scalar { + // By the preconditions we know that the list scalar is not null. 
+ let elems = array.list_elements_at(index); + let scalars: Vec = (0..elems.len()).map(|i| elems.scalar_at(i)).collect(); + + Scalar::list( + Arc::new(elems.dtype().clone()), + scalars, + array.dtype().nullability(), + ) + } +} diff --git a/vortex-array/src/arrays/list/serde.rs b/vortex-array/src/arrays/list/vtable/serde.rs similarity index 76% rename from vortex-array/src/arrays/list/serde.rs rename to vortex-array/src/arrays/list/vtable/serde.rs index ccb46f3dd8c..24ee26df719 100644 --- a/vortex-array/src/arrays/list/serde.rs +++ b/vortex-array/src/arrays/list/vtable/serde.rs @@ -5,12 +5,12 @@ use vortex_buffer::ByteBuffer; use vortex_dtype::{DType, Nullability, PType}; use vortex_error::{VortexResult, vortex_bail}; -use super::{ListArray, ListVTable}; -use crate::arrays::ListEncoding; +use super::ListArray; +use crate::ProstMetadata; +use crate::arrays::{ListEncoding, ListVTable}; use crate::serde::ArrayChildren; use crate::validity::Validity; -use crate::vtable::{SerdeVTable, ValidityHelper, VisitorVTable}; -use crate::{Array, ArrayBufferVisitor, ArrayChildVisitor, ProstMetadata}; +use crate::vtable::SerdeVTable; #[derive(Clone, prost::Message)] pub struct ListMetadata { @@ -65,13 +65,3 @@ impl SerdeVTable for ListVTable { ListArray::try_new(elements, offsets, validity) } } - -impl VisitorVTable for ListVTable { - fn visit_buffers(_array: &ListArray, _visitor: &mut dyn ArrayBufferVisitor) {} - - fn visit_children(array: &ListArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_child("elements", array.elements()); - visitor.visit_child("offsets", array.offsets()); - visitor.visit_validity(array.validity(), array.len()); - } -} diff --git a/vortex-array/src/arrays/list/vtable/validity.rs b/vortex-array/src/arrays/list/vtable/validity.rs new file mode 100644 index 00000000000..3accef4d97c --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the 
Vortex contributors + +use crate::arrays::ListArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for ListArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/list/vtable/visitor.rs b/vortex-array/src/arrays/list/vtable/visitor.rs new file mode 100644 index 00000000000..083a24855ce --- /dev/null +++ b/vortex-array/src/arrays/list/vtable/visitor.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::{ListArray, ListVTable}; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for ListVTable { + fn visit_buffers(_array: &ListArray, _visitor: &mut dyn ArrayBufferVisitor) {} + + fn visit_children(array: &ListArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_child("elements", array.elements()); + visitor.visit_child("offsets", array.offsets()); + visitor.visit_validity(array.validity(), array.len()); + } +} diff --git a/vortex-array/src/arrays/masked/array.rs b/vortex-array/src/arrays/masked/array.rs new file mode 100644 index 00000000000..3bfa66f0d79 --- /dev/null +++ b/vortex-array/src/arrays/masked/array.rs @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; +use vortex_error::{VortexResult, vortex_bail}; + +use crate::ArrayRef; +use crate::compute::mask; +use crate::stats::ArrayStats; +use crate::validity::Validity; + +#[derive(Clone, Debug)] +pub struct MaskedArray { + pub(super) child: ArrayRef, + pub(super) validity: Validity, + pub(super) dtype: DType, + pub(super) stats: ArrayStats, +} + +impl MaskedArray { + pub fn try_new(child: ArrayRef, validity: Validity) -> VortexResult { + if matches!(validity, Validity::NonNullable) { + vortex_bail!("MaskedArray must have nullable validity, got 
{validity:?}") + } + + if !child.all_valid() { + vortex_bail!("MaskedArray children must not have nulls"); + } + + if let Some(validity_len) = validity.maybe_len() + && validity_len != child.len() + { + vortex_bail!("Validity must be the same length as a MaskedArray's child"); + } + + // MaskedArray's nullability is determined solely by its validity, not the child's dtype. + // The child can have nullable dtype but must not have any actual null values. + let dtype = child.dtype().as_nullable(); + + Ok(Self { + child, + validity, + dtype, + stats: ArrayStats::default(), + }) + } + + pub(crate) fn masked_child(&self) -> VortexResult { + // Invert the validity mask - we want to set values to null where validity is false. + let inverted_mask = !self.validity.to_mask(self.len()); + mask(&self.child, &inverted_mask) + } +} diff --git a/vortex-array/src/arrays/masked/mod.rs b/vortex-array/src/arrays/masked/mod.rs index 4eb5afa707e..45aaa87bf20 100644 --- a/vortex-array/src/arrays/masked/mod.rs +++ b/vortex-array/src/arrays/masked/mod.rs @@ -1,241 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::ops::Range; +mod array; +pub use array::MaskedArray; -use vortex_dtype::DType; -use vortex_error::{VortexResult, vortex_bail}; -use vortex_scalar::Scalar; - -use crate::compute::mask; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, NotSupported, OperationsVTable, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, -}; -use crate::{Array, ArrayRef, EncodingId, EncodingRef, IntoArray, vtable}; - -vtable!(Masked); - -mod canonical; mod compute; -mod serde; - -#[derive(Clone, Debug)] -pub struct MaskedEncoding; - -impl VTable for MaskedVTable { - type Array = MaskedArray; - type Encoding = MaskedEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = 
ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type SerdeVTable = Self; - type PipelineVTable = NotSupported; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.masked") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(MaskedEncoding.as_ref()) - } -} - -#[derive(Clone, Debug)] -pub struct MaskedArray { - child: ArrayRef, - validity: Validity, - dtype: DType, - stats: ArrayStats, -} - -impl ArrayVTable for MaskedVTable { - fn len(array: &MaskedArray) -> usize { - array.child.len() - } - - fn dtype(array: &MaskedArray) -> &DType { - &array.dtype - } - - fn stats(array: &MaskedArray) -> StatsSetRef<'_> { - array.stats.to_ref(array.as_ref()) - } -} - -impl ValidityHelper for MaskedArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl MaskedArray { - pub fn try_new(child: ArrayRef, validity: Validity) -> VortexResult { - if matches!(validity, Validity::NonNullable) { - vortex_bail!("MaskedArray must have nullable validity, got {validity:?}") - } - - if !child.all_valid() { - vortex_bail!("MaskedArray children must not have nulls"); - } - - if let Some(validity_len) = validity.maybe_len() - && validity_len != child.len() - { - vortex_bail!("Validity must be the same length as a MaskedArray's child"); - } - - // MaskedArray's nullability is determined solely by its validity, not the child's dtype - // The child can have nullable dtype but must not have any actual null values - let dtype = child.dtype().as_nullable(); - - Ok(Self { - child, - validity, - dtype, - stats: ArrayStats::default(), - }) - } - - fn masked_child(&self) -> VortexResult { - // Invert the validity mask - we want to set values to null where validity is false - let inverted_mask = !self.validity.to_mask(self.len()); - mask(&self.child, &inverted_mask) - } -} -impl OperationsVTable for MaskedVTable { - fn slice(array: 
&MaskedArray, range: Range) -> ArrayRef { - let child = array.child.slice(range.clone()); - let validity = array.validity.slice(range); - - MaskedArray { - child, - validity, - dtype: array.dtype.clone(), - stats: ArrayStats::default(), - } - .into_array() - } - - fn scalar_at(array: &MaskedArray, index: usize) -> Scalar { - // invalid indices are handled by the entrypoint function - array.child.scalar_at(index).into_nullable() - } -} +mod vtable; +pub use vtable::{MaskedEncoding, MaskedVTable}; #[cfg(test)] -mod tests { - use rstest::rstest; - use vortex_dtype::{DType, Nullability}; - - use super::*; - use crate::arrays::PrimitiveArray; - use crate::validity::Validity; - use crate::{Array, IntoArray, ToCanonical as _}; - - #[rstest] - #[case(Validity::AllValid, Nullability::Nullable)] - #[case(Validity::from_iter([true, false, true]), Nullability::Nullable)] - fn test_dtype_nullability(#[case] validity: Validity, #[case] expected: Nullability) { - let child = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); - let array = MaskedArray::try_new(child, validity).unwrap(); - - assert_eq!( - array.dtype(), - &DType::Primitive(vortex_dtype::PType::I32, expected) - ); - } - - #[test] - fn test_dtype_nullability_with_nullable_child() { - // Child can have nullable dtype but no actual nulls - // MaskedArray dtype should be determined by validity, not child's dtype - let child = PrimitiveArray::new(vortex_buffer::buffer![1i32, 2, 3], Validity::AllValid) - .into_array(); - - // Child has nullable dtype - assert!(child.dtype().is_nullable()); - } - - #[test] - fn test_canonical_dtype_matches_array_dtype() { - // The canonical form should have the same nullability as the array's dtype - let child = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); - let array = MaskedArray::try_new(child, Validity::AllValid).unwrap(); - - let canonical = array.to_canonical(); - assert_eq!(canonical.as_ref().dtype(), array.dtype()); - } - - #[test] - fn test_masked_child_with_validity() 
{ - // When validity has nulls, masked_child should apply inverted mask - let child = PrimitiveArray::from_iter([1i32, 2, 3, 4, 5]).into_array(); - let array = - MaskedArray::try_new(child, Validity::from_iter([true, false, true, false, true])) - .unwrap(); - - let masked = array.masked_child().unwrap(); - let prim = masked.to_primitive(); - - // Positions where validity is false should be null in masked_child - assert_eq!(prim.valid_count(), 3); - assert!(prim.is_valid(0)); - assert!(!prim.is_valid(1)); - assert!(prim.is_valid(2)); - assert!(!prim.is_valid(3)); - assert!(prim.is_valid(4)); - - assert_eq!( - array.as_ref().display_values().to_string(), - masked.display_values().to_string() - ); - } - - #[test] - fn test_masked_child_all_valid() { - // When validity is AllValid, masked_child should invert to AllInvalid - let child = PrimitiveArray::from_iter([10i32, 20, 30]).into_array(); - let array = MaskedArray::try_new(child, Validity::AllValid).unwrap(); - - let masked = array.masked_child().unwrap(); - assert_eq!(masked.len(), 3); - assert_eq!(masked.valid_count(), 3); - assert_eq!( - array.as_ref().display_values().to_string(), - masked.display_values().to_string() - ); - } - - #[rstest] - #[case(Validity::AllValid)] - #[case(Validity::from_iter([true, true, true]))] - #[case(Validity::from_iter([false, false, false]))] - #[case(Validity::from_iter([true, false, true, false]))] - fn test_masked_child_preserves_length(#[case] validity: Validity) { - let len = match &validity { - Validity::Array(arr) => arr.len(), - _ => 3, - }; - - #[allow(clippy::cast_possible_truncation)] - let child = PrimitiveArray::from_iter(0..len as i32).into_array(); - let array = MaskedArray::try_new(child, validity.clone()).unwrap(); - - let masked = array.masked_child().unwrap(); - assert_eq!(masked.len(), len); - assert_eq!(masked.validity_mask(), validity.to_mask(len)); - assert_eq!( - array.as_ref().display_values().to_string(), - masked.display_values().to_string() - ); - } -} 
+mod tests; diff --git a/vortex-array/src/arrays/masked/tests.rs b/vortex-array/src/arrays/masked/tests.rs new file mode 100644 index 00000000000..8c661443e89 --- /dev/null +++ b/vortex-array/src/arrays/masked/tests.rs @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use rstest::rstest; +use vortex_dtype::{DType, Nullability}; + +use super::*; +use crate::arrays::PrimitiveArray; +use crate::validity::Validity; +use crate::{Array, IntoArray, ToCanonical as _}; + +#[rstest] +#[case(Validity::AllValid, Nullability::Nullable)] +#[case(Validity::from_iter([true, false, true]), Nullability::Nullable)] +fn test_dtype_nullability(#[case] validity: Validity, #[case] expected: Nullability) { + let child = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); + let array = MaskedArray::try_new(child, validity).unwrap(); + + assert_eq!( + array.dtype(), + &DType::Primitive(vortex_dtype::PType::I32, expected) + ); +} + +#[test] +fn test_dtype_nullability_with_nullable_child() { + // Child can have nullable dtype but no actual nulls. + // MaskedArray dtype should be determined by validity, not child's dtype. + let child = + PrimitiveArray::new(vortex_buffer::buffer![1i32, 2, 3], Validity::AllValid).into_array(); + + // Child has nullable dtype. + assert!(child.dtype().is_nullable()); +} + +#[test] +fn test_canonical_dtype_matches_array_dtype() { + // The canonical form should have the same nullability as the array's dtype. + let child = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); + let array = MaskedArray::try_new(child, Validity::AllValid).unwrap(); + + let canonical = array.to_canonical(); + assert_eq!(canonical.as_ref().dtype(), array.dtype()); +} + +#[test] +fn test_masked_child_with_validity() { + // When validity has nulls, masked_child should apply inverted mask. 
+ let child = PrimitiveArray::from_iter([1i32, 2, 3, 4, 5]).into_array(); + let array = + MaskedArray::try_new(child, Validity::from_iter([true, false, true, false, true])).unwrap(); + + let masked = array.masked_child().unwrap(); + let prim = masked.to_primitive(); + + // Positions where validity is false should be null in masked_child. + assert_eq!(prim.valid_count(), 3); + assert!(prim.is_valid(0)); + assert!(!prim.is_valid(1)); + assert!(prim.is_valid(2)); + assert!(!prim.is_valid(3)); + assert!(prim.is_valid(4)); + + assert_eq!( + array.as_ref().display_values().to_string(), + masked.display_values().to_string() + ); +} + +#[test] +fn test_masked_child_all_valid() { + // When validity is AllValid, masked_child should invert to AllInvalid. + let child = PrimitiveArray::from_iter([10i32, 20, 30]).into_array(); + let array = MaskedArray::try_new(child, Validity::AllValid).unwrap(); + + let masked = array.masked_child().unwrap(); + assert_eq!(masked.len(), 3); + assert_eq!(masked.valid_count(), 3); + assert_eq!( + array.as_ref().display_values().to_string(), + masked.display_values().to_string() + ); +} + +#[rstest] +#[case(Validity::AllValid)] +#[case(Validity::from_iter([true, true, true]))] +#[case(Validity::from_iter([false, false, false]))] +#[case(Validity::from_iter([true, false, true, false]))] +fn test_masked_child_preserves_length(#[case] validity: Validity) { + let len = match &validity { + Validity::Array(arr) => arr.len(), + _ => 3, + }; + + #[allow(clippy::cast_possible_truncation)] + let child = PrimitiveArray::from_iter(0..len as i32).into_array(); + let array = MaskedArray::try_new(child, validity.clone()).unwrap(); + + let masked = array.masked_child().unwrap(); + assert_eq!(masked.len(), len); + assert_eq!(masked.validity_mask(), validity.to_mask(len)); + assert_eq!( + array.as_ref().display_values().to_string(), + masked.display_values().to_string() + ); +} diff --git a/vortex-array/src/arrays/masked/vtable/array.rs 
b/vortex-array/src/arrays/masked/vtable/array.rs new file mode 100644 index 00000000000..d120d87611f --- /dev/null +++ b/vortex-array/src/arrays/masked/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::masked::{MaskedArray, MaskedVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for MaskedVTable { + fn len(array: &MaskedArray) -> usize { + array.child.len() + } + + fn dtype(array: &MaskedArray) -> &DType { + &array.dtype + } + + fn stats(array: &MaskedArray) -> StatsSetRef<'_> { + array.stats.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/masked/canonical.rs b/vortex-array/src/arrays/masked/vtable/canonical.rs similarity index 97% rename from vortex-array/src/arrays/masked/canonical.rs rename to vortex-array/src/arrays/masked/vtable/canonical.rs index d39d56e6ab2..59f0461977b 100644 --- a/vortex-array/src/arrays/masked/canonical.rs +++ b/vortex-array/src/arrays/masked/vtable/canonical.rs @@ -64,7 +64,7 @@ mod tests { let canonical = array.to_canonical(); let prim = canonical.as_ref().to_primitive(); - // Check that null positions match validity + // Check that null positions match validity. 
assert_eq!(prim.valid_count(), 3); assert!(prim.is_valid(0)); assert!(!prim.is_valid(1)); diff --git a/vortex-array/src/arrays/masked/vtable/mod.rs b/vortex-array/src/arrays/masked/vtable/mod.rs new file mode 100644 index 00000000000..1f40a6a4344 --- /dev/null +++ b/vortex-array/src/arrays/masked/vtable/mod.rs @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; + +use crate::arrays::masked::MaskedArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +vtable!(Masked); + +#[derive(Clone, Debug)] +pub struct MaskedEncoding; + +impl VTable for MaskedVTable { + type Array = MaskedArray; + type Encoding = MaskedEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type SerdeVTable = Self; + type PipelineVTable = NotSupported; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.masked") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(MaskedEncoding.as_ref()) + } +} diff --git a/vortex-array/src/arrays/masked/vtable/operations.rs b/vortex-array/src/arrays/masked/vtable/operations.rs new file mode 100644 index 00000000000..c95b9d0d70f --- /dev/null +++ b/vortex-array/src/arrays/masked/vtable/operations.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; + +use vortex_scalar::Scalar; + +use crate::arrays::MaskedVTable; +use crate::arrays::masked::MaskedArray; +use crate::stats::ArrayStats; +use crate::vtable::OperationsVTable; +use crate::{ArrayRef, IntoArray}; + +impl OperationsVTable for 
MaskedVTable { + fn slice(array: &MaskedArray, range: Range) -> ArrayRef { + let child = array.child.slice(range.clone()); + let validity = array.validity.slice(range); + + MaskedArray { + child, + validity, + dtype: array.dtype.clone(), + stats: ArrayStats::default(), + } + .into_array() + } + + fn scalar_at(array: &MaskedArray, index: usize) -> Scalar { + // Invalid indices are handled by the entrypoint function. + array.child.scalar_at(index).into_nullable() + } +} diff --git a/vortex-array/src/arrays/masked/serde.rs b/vortex-array/src/arrays/masked/vtable/serde.rs similarity index 98% rename from vortex-array/src/arrays/masked/serde.rs rename to vortex-array/src/arrays/masked/vtable/serde.rs index 6f7a3cba60c..410e2d53b2a 100644 --- a/vortex-array/src/arrays/masked/serde.rs +++ b/vortex-array/src/arrays/masked/vtable/serde.rs @@ -96,7 +96,7 @@ mod tests { .serialize(&ctx, &SerializeOptions::default()) .unwrap(); - // Concat into a single buffer + // Concat into a single buffer. let mut concat = ByteBufferMut::empty(); for buf in serialized { concat.extend_from_slice(buf.as_ref()); diff --git a/vortex-array/src/arrays/masked/vtable/validity.rs b/vortex-array/src/arrays/masked/vtable/validity.rs new file mode 100644 index 00000000000..96ab2f829c1 --- /dev/null +++ b/vortex-array/src/arrays/masked/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::masked::MaskedArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for MaskedArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/mod.rs b/vortex-array/src/arrays/mod.rs index 6a2dd7f4308..ef5344cee48 100644 --- a/vortex-array/src/arrays/mod.rs +++ b/vortex-array/src/arrays/mod.rs @@ -13,6 +13,7 @@ mod bool; mod chunked; mod constant; mod datetime; +mod decimal; mod extension; mod fixed_size_list; mod list; @@ -26,7 
+27,8 @@ mod varbinview; #[cfg(feature = "arbitrary")] pub mod arbitrary; -mod decimal; + +// TODO(connor): Export exact types, not glob. pub use bool::*; pub use chunked::*; diff --git a/vortex-array/src/arrays/primitive/accessor.rs b/vortex-array/src/arrays/primitive/array/accessor.rs similarity index 100% rename from vortex-array/src/arrays/primitive/accessor.rs rename to vortex-array/src/arrays/primitive/array/accessor.rs diff --git a/vortex-array/src/arrays/primitive/downcast.rs b/vortex-array/src/arrays/primitive/array/cast.rs similarity index 84% rename from vortex-array/src/arrays/primitive/downcast.rs rename to vortex-array/src/arrays/primitive/array/cast.rs index e4ec67c3d5d..55f7f0f47a5 100644 --- a/vortex-array/src/arrays/primitive/downcast.rs +++ b/vortex-array/src/arrays/primitive/array/cast.rs @@ -2,14 +2,47 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex_buffer::Buffer; -use vortex_dtype::{DType, PType}; -use vortex_error::VortexResult; +use vortex_dtype::{DType, NativePType, PType}; +use vortex_error::{VortexResult, vortex_panic}; use crate::ToCanonical; use crate::arrays::PrimitiveArray; use crate::compute::{cast, min_max}; +use crate::vtable::ValidityHelper; impl PrimitiveArray { + /// Return a slice of the array's buffer. + /// + /// NOTE: these values may be nonsense if the validity buffer indicates that the value is null. 
+ pub fn as_slice(&self) -> &[T] { + if T::PTYPE != self.ptype() { + vortex_panic!( + "Attempted to get slice of type {} from array of type {}", + T::PTYPE, + self.ptype() + ) + } + let raw_slice = self.byte_buffer().as_ptr(); + // SAFETY: alignment of Buffer is checked on construction + unsafe { + std::slice::from_raw_parts(raw_slice.cast(), self.byte_buffer().len() / size_of::()) + } + } + + pub fn reinterpret_cast(&self, ptype: PType) -> Self { + if self.ptype() == ptype { + return self.clone(); + } + + assert_eq!( + self.ptype().byte_width(), + ptype.byte_width(), + "can't reinterpret cast between integers of two different widths" + ); + + PrimitiveArray::from_byte_buffer(self.byte_buffer().clone(), ptype, self.validity().clone()) + } + pub fn downcast(&self) -> VortexResult { if !self.ptype().is_int() { return Ok(self.clone()); diff --git a/vortex-array/src/arrays/primitive/array/conversion.rs b/vortex-array/src/arrays/primitive/array/conversion.rs new file mode 100644 index 00000000000..d542938e6eb --- /dev/null +++ b/vortex-array/src/arrays/primitive/array/conversion.rs @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Conversion methods and trait implementations of [`From`] and [`Into`] for [`PrimitiveArray`]. + +use arrow_buffer::BooleanBufferBuilder; +use vortex_buffer::{Buffer, BufferMut}; +use vortex_dtype::NativePType; +use vortex_error::vortex_panic; + +use crate::arrays::PrimitiveArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; +use crate::{ArrayRef, IntoArray}; + +impl PrimitiveArray { + /// Create a PrimitiveArray from an iterator of `T`. + /// NOTE: we cannot impl FromIterator trait since it conflicts with `FromIterator`. 
+ pub fn from_option_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut values = BufferMut::with_capacity(iter.size_hint().0); + let mut validity = BooleanBufferBuilder::new(values.capacity()); + + for i in iter { + match i { + None => { + validity.append(false); + values.push(T::default()); + } + Some(e) => { + validity.append(true); + values.push(e); + } + } + } + Self::new(values.freeze(), Validity::from(validity.finish())) + } + + pub fn buffer(&self) -> Buffer { + if T::PTYPE != self.ptype() { + vortex_panic!( + "Attempted to get buffer of type {} from array of type {}", + T::PTYPE, + self.ptype() + ) + } + Buffer::from_byte_buffer(self.byte_buffer().clone()) + } + + pub fn into_buffer(self) -> Buffer { + if T::PTYPE != self.ptype() { + vortex_panic!( + "Attempted to get buffer of type {} from array of type {}", + T::PTYPE, + self.ptype() + ) + } + Buffer::from_byte_buffer(self.buffer) + } + + /// Extract a mutable buffer from the PrimitiveArray. Attempts to do this with zero-copy + /// if the buffer is uniquely owned, otherwise will make a copy. + pub fn into_buffer_mut(self) -> BufferMut { + if T::PTYPE != self.ptype() { + vortex_panic!( + "Attempted to get buffer_mut of type {} from array of type {}", + T::PTYPE, + self.ptype() + ) + } + self.into_buffer() + .try_into_mut() + .unwrap_or_else(|buffer| BufferMut::::copy_from(&buffer)) + } + + /// Try to extract a mutable buffer from the PrimitiveArray with zero copy. 
+ #[allow(clippy::panic_in_result_fn)] + pub fn try_into_buffer_mut(self) -> Result, PrimitiveArray> { + if T::PTYPE != self.ptype() { + vortex_panic!( + "Attempted to get buffer_mut of type {} from array of type {}", + T::PTYPE, + self.ptype() + ) + } + let validity = self.validity().clone(); + Buffer::::from_byte_buffer(self.into_byte_buffer()) + .try_into_mut() + .map_err(|buffer| PrimitiveArray::new(buffer, validity)) + } +} + +impl FromIterator for PrimitiveArray { + fn from_iter>(iter: I) -> Self { + let values = BufferMut::from_iter(iter); + PrimitiveArray::new(values, Validity::NonNullable) + } +} + +impl IntoArray for Buffer { + fn into_array(self) -> ArrayRef { + PrimitiveArray::new(self, Validity::NonNullable).into_array() + } +} + +impl IntoArray for BufferMut { + fn into_array(self) -> ArrayRef { + self.freeze().into_array() + } +} diff --git a/vortex-array/src/arrays/primitive/array/mod.rs b/vortex-array/src/arrays/primitive/array/mod.rs new file mode 100644 index 00000000000..7f49a7e4541 --- /dev/null +++ b/vortex-array/src/arrays/primitive/array/mod.rs @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::iter; + +use vortex_buffer::{Alignment, Buffer, BufferMut, ByteBuffer, ByteBufferMut}; +use vortex_dtype::{DType, NativePType, Nullability, PType, match_each_native_ptype}; +use vortex_error::{VortexExpect, VortexResult, vortex_err}; + +use crate::ToCanonical; +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +mod accessor; +mod cast; +mod conversion; +mod patch; +mod top_value; + +/// A primitive array that stores [native types][vortex_dtype::NativePType] in a contiguous buffer +/// of memory, along with an optional validity child. +/// +/// This mirrors the Apache Arrow Primitive layout and can be converted into and out of one +/// without allocations or copies. 
+/// +/// The underlying buffer must be natively aligned to the primitive type they are representing. +/// +/// Values are stored in their native representation with proper alignment. +/// Null values still occupy space in the buffer but are marked invalid in the validity mask. +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::PrimitiveArray; +/// use vortex_array::compute::sum; +/// /// +/// // Create from iterator using FromIterator impl +/// let array: PrimitiveArray = [1i32, 2, 3, 4, 5].into_iter().collect(); +/// +/// // Slice the array +/// let sliced = array.slice(1..3); +/// +/// // Access individual values +/// let value = sliced.scalar_at(0); +/// assert_eq!(value, 2i32.into()); +/// +/// // Convert into a type-erased array that can be passed to compute functions. +/// let summed = sum(sliced.as_ref()).unwrap().as_primitive().typed_value::().unwrap(); +/// assert_eq!(summed, 5i64); +/// ``` +#[derive(Clone, Debug)] +pub struct PrimitiveArray { + pub(super) dtype: DType, + pub(super) buffer: ByteBuffer, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, +} + +// TODO(connor): There are a lot of places where we could be using `new_unchecked` in the codebase. +impl PrimitiveArray { + /// Creates a new [`PrimitiveArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`PrimitiveArray::new_unchecked`]. + pub fn new(buffer: impl Into>, validity: Validity) -> Self { + let buffer = buffer.into(); + Self::try_new(buffer, validity).vortex_expect("PrimitiveArray construction failed") + } + + /// Constructs a new `PrimitiveArray`. + /// + /// See [`PrimitiveArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`PrimitiveArray::new_unchecked`]. 
+ #[inline] + pub fn try_new(buffer: Buffer, validity: Validity) -> VortexResult { + Self::validate(&buffer, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(buffer, validity) }) + } + + /// Creates a new [`PrimitiveArray`] without validation from these components: + /// + /// * `buffer` is a typed buffer containing the primitive values. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// ## Validity Requirements + /// + /// - If `validity` is [`Validity::Array`], its length must exactly equal `buffer.len()`. + #[inline] + pub unsafe fn new_unchecked(buffer: Buffer, validity: Validity) -> Self { + #[cfg(debug_assertions)] + Self::validate(&buffer, &validity) + .vortex_expect("[Debug Assertion]: Invalid `PrimitiveArray` parameters"); + + Self { + dtype: DType::Primitive(T::PTYPE, validity.nullability()), + buffer: buffer.into_byte_buffer(), + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`PrimitiveArray`]. + /// + /// This function checks all the invariants required by [`PrimitiveArray::new_unchecked`]. 
+ #[inline] + pub(crate) fn validate( + buffer: &Buffer, + validity: &Validity, + ) -> VortexResult<()> { + if let Some(len) = validity.maybe_len() + && buffer.len() != len + { + return Err(vortex_err!( + "Buffer and validity length mismatch: buffer={}, validity={}", + buffer.len(), + len + )); + } + Ok(()) + } + + pub fn empty(nullability: Nullability) -> Self { + Self::new(Buffer::::empty(), nullability.into()) + } + + pub fn ptype(&self) -> PType { + self.dtype().as_ptype() + } + + pub fn byte_buffer(&self) -> &ByteBuffer { + &self.buffer + } + + pub fn into_byte_buffer(self) -> ByteBuffer { + self.buffer + } + + pub fn from_byte_buffer(buffer: ByteBuffer, ptype: PType, validity: Validity) -> Self { + match_each_native_ptype!(ptype, |T| { + Self::new::(Buffer::from_byte_buffer(buffer), validity) + }) + } + + /// Create a PrimitiveArray from a byte buffer containing only the valid elements. + pub fn from_values_byte_buffer( + valid_elems_buffer: ByteBuffer, + ptype: PType, + validity: Validity, + n_rows: usize, + ) -> Self { + let byte_width = ptype.byte_width(); + let alignment = Alignment::new(byte_width); + let buffer = match &validity { + Validity::AllValid | Validity::NonNullable => valid_elems_buffer.aligned(alignment), + Validity::AllInvalid => ByteBuffer::zeroed_aligned(n_rows * byte_width, alignment), + Validity::Array(is_valid) => { + let bool_array = is_valid.to_bool(); + let bool_buffer = bool_array.boolean_buffer(); + let mut bytes = ByteBufferMut::zeroed_aligned(n_rows * byte_width, alignment); + for (i, valid_i) in bool_buffer.set_indices().enumerate() { + bytes[valid_i * byte_width..(valid_i + 1) * byte_width] + .copy_from_slice(&valid_elems_buffer[i * byte_width..(i + 1) * byte_width]) + } + bytes.freeze() + } + }; + + Self::from_byte_buffer(buffer, ptype, validity) + } + + /// Map each element in the array to a new value. + /// + /// This ignores validity and maps over all maybe-null elements. 
+ /// + /// TODO(ngates): we could be smarter here if validity is sparse and only run the function + /// over the valid elements. + pub fn map_each(self, f: F) -> PrimitiveArray + where + T: NativePType, + R: NativePType, + F: FnMut(T) -> R, + { + let validity = self.validity().clone(); + let buffer = match self.try_into_buffer_mut() { + Ok(buffer_mut) => buffer_mut.map_each(f), + Err(parray) => BufferMut::::from_iter(parray.buffer::().iter().copied().map(f)), + }; + PrimitiveArray::new(buffer.freeze(), validity) + } + + /// Map each element in the array to a new value. + /// + /// This doesn't ignore validity and maps over all maybe-null elements, with a bool true if + /// valid and false otherwise. + pub fn map_each_with_validity(self, f: F) -> VortexResult + where + T: NativePType, + R: NativePType, + F: FnMut((T, bool)) -> R, + { + let validity = self.validity(); + + let buf_iter = self.buffer::().into_iter(); + + let buffer = match &validity { + Validity::NonNullable | Validity::AllValid => { + BufferMut::::from_iter(buf_iter.zip(iter::repeat(true)).map(f)) + } + Validity::AllInvalid => { + BufferMut::::from_iter(buf_iter.zip(iter::repeat(false)).map(f)) + } + Validity::Array(val) => { + let val = val.to_bool(); + BufferMut::::from_iter(buf_iter.zip(val.boolean_buffer()).map(f)) + } + }; + Ok(PrimitiveArray::new(buffer.freeze(), validity.clone())) + } +} diff --git a/vortex-array/src/arrays/primitive/patch.rs b/vortex-array/src/arrays/primitive/array/patch.rs similarity index 100% rename from vortex-array/src/arrays/primitive/patch.rs rename to vortex-array/src/arrays/primitive/array/patch.rs diff --git a/vortex-array/src/arrays/primitive/top_value.rs b/vortex-array/src/arrays/primitive/array/top_value.rs similarity index 100% rename from vortex-array/src/arrays/primitive/top_value.rs rename to vortex-array/src/arrays/primitive/array/top_value.rs diff --git a/vortex-array/src/arrays/primitive/mod.rs b/vortex-array/src/arrays/primitive/mod.rs index 
ca4735f8050..bd2359bef09 100644 --- a/vortex-array/src/arrays/primitive/mod.rs +++ b/vortex-array/src/arrays/primitive/mod.rs @@ -1,506 +1,17 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt::Debug; -use std::iter; - -mod accessor; - -use arrow_buffer::BooleanBufferBuilder; -use vortex_buffer::{Alignment, Buffer, BufferMut, ByteBuffer, ByteBufferMut}; -use vortex_dtype::{DType, NativePType, Nullability, PType, match_each_native_ptype}; -use vortex_error::{VortexExpect, VortexResult, vortex_err, vortex_panic}; - -use crate::builders::ArrayBuilder; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::{ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, ToCanonical, vtable}; +mod array; +pub use array::PrimitiveArray; mod compute; -mod downcast; -mod native_value; -mod operator; -mod ops; -mod patch; -mod serde; -mod top_value; - pub use compute::{IS_CONST_LANE_WIDTH, compute_is_constant}; -pub use native_value::NativeValue; - -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, -}; - -vtable!(Primitive); - -impl VTable for PrimitiveVTable { - type Array = PrimitiveArray; - type Encoding = PrimitiveEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = Self; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.primitive") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(PrimitiveEncoding.as_ref()) - } -} - -/// A primitive array that stores [native types][vortex_dtype::NativePType] in a contiguous buffer -/// of memory, along with an optional validity child. 
-/// -/// This mirrors the Apache Arrow Primitive layout and can be converted into and out of one -/// without allocations or copies. -/// -/// The underlying buffer must be natively aligned to the primitive type they are representing. -/// -/// Values are stored in their native representation with proper alignment. -/// Null values still occupy space in the buffer but are marked invalid in the validity mask. -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::PrimitiveArray; -/// use vortex_array::compute::sum; -/// /// -/// // Create from iterator using FromIterator impl -/// let array: PrimitiveArray = [1i32, 2, 3, 4, 5].into_iter().collect(); -/// -/// // Slice the array -/// let sliced = array.slice(1..3); -/// -/// // Access individual values -/// let value = sliced.scalar_at(0); -/// assert_eq!(value, 2i32.into()); -/// -/// // Convert into a type-erased array that can be passed to compute functions. -/// let summed = sum(sliced.as_ref()).unwrap().as_primitive().typed_value::().unwrap(); -/// assert_eq!(summed, 5i64); -/// ``` -#[derive(Clone, Debug)] -pub struct PrimitiveArray { - dtype: DType, - buffer: ByteBuffer, - validity: Validity, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct PrimitiveEncoding; - -// TODO(connor): There are a lot of places where we could be using `new_unchecked` in the codebase. -impl PrimitiveArray { - /// Creates a new [`PrimitiveArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`PrimitiveArray::new_unchecked`]. - pub fn new(buffer: impl Into>, validity: Validity) -> Self { - let buffer = buffer.into(); - Self::try_new(buffer, validity).vortex_expect("PrimitiveArray construction failed") - } - - /// Constructs a new `PrimitiveArray`. - /// - /// See [`PrimitiveArray::new_unchecked`] for more information. 
- /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`PrimitiveArray::new_unchecked`]. - #[inline] - pub fn try_new(buffer: Buffer, validity: Validity) -> VortexResult { - Self::validate(&buffer, &validity)?; - - // SAFETY: validate ensures all invariants are met. - Ok(unsafe { Self::new_unchecked(buffer, validity) }) - } - - /// Creates a new [`PrimitiveArray`] without validation from these components: - /// - /// * `buffer` is a typed buffer containing the primitive values. - /// * `validity` holds the null values. - /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// ## Validity Requirements - /// - /// - If `validity` is [`Validity::Array`], its length must exactly equal `buffer.len()`. - #[inline] - pub unsafe fn new_unchecked(buffer: Buffer, validity: Validity) -> Self { - Self { - dtype: DType::Primitive(T::PTYPE, validity.nullability()), - buffer: buffer.into_byte_buffer(), - validity, - stats_set: Default::default(), - } - } - - /// Validates the components that would be used to create a [`PrimitiveArray`]. - /// - /// This function checks all the invariants required by [`PrimitiveArray::new_unchecked`]. - #[inline] - pub(crate) fn validate( - buffer: &Buffer, - validity: &Validity, - ) -> VortexResult<()> { - if let Some(len) = validity.maybe_len() - && buffer.len() != len - { - return Err(vortex_err!( - "Buffer and validity length mismatch: buffer={}, validity={}", - buffer.len(), - len - )); - } - Ok(()) - } - - pub fn empty(nullability: Nullability) -> Self { - Self::new(Buffer::::empty(), nullability.into()) - } - - pub fn from_byte_buffer(buffer: ByteBuffer, ptype: PType, validity: Validity) -> Self { - match_each_native_ptype!(ptype, |T| { - Self::new::(Buffer::from_byte_buffer(buffer), validity) - }) - } - - /// Create a PrimitiveArray from an iterator of `T`. 
- /// NOTE: we cannot impl FromIterator trait since it conflicts with `FromIterator`. - pub fn from_option_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let mut values = BufferMut::with_capacity(iter.size_hint().0); - let mut validity = BooleanBufferBuilder::new(values.capacity()); - - for i in iter { - match i { - None => { - validity.append(false); - values.push(T::default()); - } - Some(e) => { - validity.append(true); - values.push(e); - } - } - } - Self::new(values.freeze(), Validity::from(validity.finish())) - } - - /// Create a PrimitiveArray from a byte buffer containing only the valid elements. - pub fn from_values_byte_buffer( - valid_elems_buffer: ByteBuffer, - ptype: PType, - validity: Validity, - n_rows: usize, - ) -> Self { - let byte_width = ptype.byte_width(); - let alignment = Alignment::new(byte_width); - let buffer = match &validity { - Validity::AllValid | Validity::NonNullable => valid_elems_buffer.aligned(alignment), - Validity::AllInvalid => ByteBuffer::zeroed_aligned(n_rows * byte_width, alignment), - Validity::Array(is_valid) => { - let bool_array = is_valid.to_bool(); - let bool_buffer = bool_array.boolean_buffer(); - let mut bytes = ByteBufferMut::zeroed_aligned(n_rows * byte_width, alignment); - for (i, valid_i) in bool_buffer.set_indices().enumerate() { - bytes[valid_i * byte_width..(valid_i + 1) * byte_width] - .copy_from_slice(&valid_elems_buffer[i * byte_width..(i + 1) * byte_width]) - } - bytes.freeze() - } - }; - - Self::from_byte_buffer(buffer, ptype, validity) - } - - pub fn ptype(&self) -> PType { - self.dtype().as_ptype() - } - - pub fn byte_buffer(&self) -> &ByteBuffer { - &self.buffer - } - pub fn into_byte_buffer(self) -> ByteBuffer { - self.buffer - } +mod vtable; +pub use vtable::{PrimitiveEncoding, PrimitiveVTable}; - pub fn buffer(&self) -> Buffer { - if T::PTYPE != self.ptype() { - vortex_panic!( - "Attempted to get buffer of type {} from array of type {}", - T::PTYPE, - self.ptype() - ) - } - 
Buffer::from_byte_buffer(self.byte_buffer().clone()) - } - - pub fn into_buffer(self) -> Buffer { - if T::PTYPE != self.ptype() { - vortex_panic!( - "Attempted to get buffer of type {} from array of type {}", - T::PTYPE, - self.ptype() - ) - } - Buffer::from_byte_buffer(self.buffer) - } - - /// Extract a mutable buffer from the PrimitiveArray. Attempts to do this with zero-copy - /// if the buffer is uniquely owned, otherwise will make a copy. - pub fn into_buffer_mut(self) -> BufferMut { - if T::PTYPE != self.ptype() { - vortex_panic!( - "Attempted to get buffer_mut of type {} from array of type {}", - T::PTYPE, - self.ptype() - ) - } - self.into_buffer() - .try_into_mut() - .unwrap_or_else(|buffer| BufferMut::::copy_from(&buffer)) - } - - /// Try to extract a mutable buffer from the PrimitiveArray with zero copy. - #[allow(clippy::panic_in_result_fn)] - pub fn try_into_buffer_mut(self) -> Result, PrimitiveArray> { - if T::PTYPE != self.ptype() { - vortex_panic!( - "Attempted to get buffer_mut of type {} from array of type {}", - T::PTYPE, - self.ptype() - ) - } - let validity = self.validity().clone(); - Buffer::::from_byte_buffer(self.into_byte_buffer()) - .try_into_mut() - .map_err(|buffer| PrimitiveArray::new(buffer, validity)) - } - - /// Map each element in the array to a new value. - /// - /// This ignores validity and maps over all maybe-null elements. - /// - /// TODO(ngates): we could be smarter here if validity is sparse and only run the function - /// over the valid elements. - pub fn map_each(self, f: F) -> PrimitiveArray - where - T: NativePType, - R: NativePType, - F: FnMut(T) -> R, - { - let validity = self.validity().clone(); - let buffer = match self.try_into_buffer_mut() { - Ok(buffer_mut) => buffer_mut.map_each(f), - Err(parray) => BufferMut::::from_iter(parray.buffer::().iter().copied().map(f)), - }; - PrimitiveArray::new(buffer.freeze(), validity) - } - - /// Map each element in the array to a new value. 
- /// - /// This doesn't ignore validity and maps over all maybe-null elements, with a bool true if - /// valid and false otherwise. - pub fn map_each_with_validity(self, f: F) -> VortexResult - where - T: NativePType, - R: NativePType, - F: FnMut((T, bool)) -> R, - { - let validity = self.validity(); - - let buf_iter = self.buffer::().into_iter(); - - let buffer = match &validity { - Validity::NonNullable | Validity::AllValid => { - BufferMut::::from_iter(buf_iter.zip(iter::repeat(true)).map(f)) - } - Validity::AllInvalid => { - BufferMut::::from_iter(buf_iter.zip(iter::repeat(false)).map(f)) - } - Validity::Array(val) => { - let val = val.to_bool(); - BufferMut::::from_iter(buf_iter.zip(val.boolean_buffer()).map(f)) - } - }; - Ok(PrimitiveArray::new(buffer.freeze(), validity.clone())) - } - - /// Return a slice of the array's buffer. - /// - /// NOTE: these values may be nonsense if the validity buffer indicates that the value is null. - pub fn as_slice(&self) -> &[T] { - if T::PTYPE != self.ptype() { - vortex_panic!( - "Attempted to get slice of type {} from array of type {}", - T::PTYPE, - self.ptype() - ) - } - let raw_slice = self.byte_buffer().as_ptr(); - // SAFETY: alignment of Buffer is checked on construction - unsafe { - std::slice::from_raw_parts(raw_slice.cast(), self.byte_buffer().len() / size_of::()) - } - } - - pub fn reinterpret_cast(&self, ptype: PType) -> Self { - if self.ptype() == ptype { - return self.clone(); - } - - assert_eq!( - self.ptype().byte_width(), - ptype.byte_width(), - "can't reinterpret cast between integers of two different widths" - ); - - PrimitiveArray::from_byte_buffer(self.byte_buffer().clone(), ptype, self.validity().clone()) - } -} - -impl ArrayVTable for PrimitiveVTable { - fn len(array: &PrimitiveArray) -> usize { - array.byte_buffer().len() / array.ptype().byte_width() - } - - fn dtype(array: &PrimitiveArray) -> &DType { - &array.dtype - } - - fn stats(array: &PrimitiveArray) -> StatsSetRef<'_> { - 
array.stats_set.to_ref(array.as_ref()) - } -} - -impl ValidityHelper for PrimitiveArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl FromIterator for PrimitiveArray { - fn from_iter>(iter: I) -> Self { - let values = BufferMut::from_iter(iter); - PrimitiveArray::new(values, Validity::NonNullable) - } -} - -impl IntoArray for Buffer { - fn into_array(self) -> ArrayRef { - PrimitiveArray::new(self, Validity::NonNullable).into_array() - } -} - -impl IntoArray for BufferMut { - fn into_array(self) -> ArrayRef { - self.freeze().into_array() - } -} - -impl CanonicalVTable for PrimitiveVTable { - fn canonicalize(array: &PrimitiveArray) -> Canonical { - Canonical::Primitive(array.clone()) - } - - fn append_to_builder(array: &PrimitiveArray, builder: &mut dyn ArrayBuilder) { - builder.extend_from_array(array.as_ref()) - } -} +mod native_value; +pub use native_value::NativeValue; #[cfg(test)] -mod tests { - use vortex_buffer::buffer; - use vortex_scalar::PValue; - - use crate::arrays::{BoolArray, PrimitiveArray}; - use crate::compute::conformance::filter::test_filter_conformance; - use crate::compute::conformance::mask::test_mask_conformance; - use crate::compute::conformance::search_sorted::rstest_reuse::apply; - use crate::compute::conformance::search_sorted::{search_sorted_conformance, *}; - use crate::search_sorted::{SearchResult, SearchSorted, SearchSortedSide}; - use crate::validity::Validity; - use crate::{ArrayRef, IntoArray}; - - #[apply(search_sorted_conformance)] - fn test_search_sorted_primitive( - #[case] array: ArrayRef, - #[case] value: i32, - #[case] side: SearchSortedSide, - #[case] expected: SearchResult, - ) { - let res = array - .as_primitive_typed() - .search_sorted(&Some(PValue::from(value)), side); - assert_eq!(res, expected); - } - - #[test] - fn test_mask_primitive_array() { - test_mask_conformance( - PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::NonNullable).as_ref(), - ); - test_mask_conformance( - 
PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllValid).as_ref(), - ); - test_mask_conformance( - PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllInvalid).as_ref(), - ); - test_mask_conformance( - PrimitiveArray::new( - buffer![0, 1, 2, 3, 4], - Validity::Array( - BoolArray::from_iter([true, false, true, false, true]).into_array(), - ), - ) - .as_ref(), - ); - } - - #[test] - fn test_filter_primitive_array() { - // Test various sizes - test_filter_conformance( - PrimitiveArray::new(buffer![42i32], Validity::NonNullable).as_ref(), - ); - test_filter_conformance(PrimitiveArray::new(buffer![0, 1], Validity::NonNullable).as_ref()); - test_filter_conformance( - PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::NonNullable).as_ref(), - ); - test_filter_conformance( - PrimitiveArray::new(buffer![0, 1, 2, 3, 4, 5, 6, 7], Validity::NonNullable).as_ref(), - ); - - // Test with validity - test_filter_conformance( - PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllValid).as_ref(), - ); - test_filter_conformance( - PrimitiveArray::new( - buffer![0, 1, 2, 3, 4, 5], - Validity::Array( - BoolArray::from_iter([true, false, true, false, true, true]).into_array(), - ), - ) - .as_ref(), - ); - } -} +mod tests; diff --git a/vortex-array/src/arrays/primitive/tests.rs b/vortex-array/src/arrays/primitive/tests.rs new file mode 100644 index 00000000000..43fc17d5f05 --- /dev/null +++ b/vortex-array/src/arrays/primitive/tests.rs @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_buffer::buffer; +use vortex_scalar::PValue; + +use crate::arrays::{BoolArray, PrimitiveArray}; +use crate::compute::conformance::filter::test_filter_conformance; +use crate::compute::conformance::mask::test_mask_conformance; +use crate::compute::conformance::search_sorted::rstest_reuse::apply; +use crate::compute::conformance::search_sorted::{search_sorted_conformance, *}; +use 
crate::search_sorted::{SearchResult, SearchSorted, SearchSortedSide}; +use crate::validity::Validity; +use crate::{ArrayRef, IntoArray}; + +#[apply(search_sorted_conformance)] +fn test_search_sorted_primitive( + #[case] array: ArrayRef, + #[case] value: i32, + #[case] side: SearchSortedSide, + #[case] expected: SearchResult, +) { + let res = array + .as_primitive_typed() + .search_sorted(&Some(PValue::from(value)), side); + assert_eq!(res, expected); +} + +#[test] +fn test_mask_primitive_array() { + test_mask_conformance( + PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::NonNullable).as_ref(), + ); + test_mask_conformance(PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllValid).as_ref()); + test_mask_conformance( + PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllInvalid).as_ref(), + ); + test_mask_conformance( + PrimitiveArray::new( + buffer![0, 1, 2, 3, 4], + Validity::Array(BoolArray::from_iter([true, false, true, false, true]).into_array()), + ) + .as_ref(), + ); +} + +#[test] +fn test_filter_primitive_array() { + // Test various sizes + test_filter_conformance(PrimitiveArray::new(buffer![42i32], Validity::NonNullable).as_ref()); + test_filter_conformance(PrimitiveArray::new(buffer![0, 1], Validity::NonNullable).as_ref()); + test_filter_conformance( + PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::NonNullable).as_ref(), + ); + test_filter_conformance( + PrimitiveArray::new(buffer![0, 1, 2, 3, 4, 5, 6, 7], Validity::NonNullable).as_ref(), + ); + + // Test with validity + test_filter_conformance( + PrimitiveArray::new(buffer![0, 1, 2, 3, 4], Validity::AllValid).as_ref(), + ); + test_filter_conformance( + PrimitiveArray::new( + buffer![0, 1, 2, 3, 4, 5], + Validity::Array( + BoolArray::from_iter([true, false, true, false, true, true]).into_array(), + ), + ) + .as_ref(), + ); +} diff --git a/vortex-array/src/arrays/primitive/vtable/array.rs b/vortex-array/src/arrays/primitive/vtable/array.rs new file mode 100644 index 
00000000000..23b2d0a2d1f --- /dev/null +++ b/vortex-array/src/arrays/primitive/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::{PrimitiveArray, PrimitiveVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for PrimitiveVTable { + fn len(array: &PrimitiveArray) -> usize { + array.byte_buffer().len() / array.ptype().byte_width() + } + + fn dtype(array: &PrimitiveArray) -> &DType { + &array.dtype + } + + fn stats(array: &PrimitiveArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/primitive/vtable/canonical.rs b/vortex-array/src/arrays/primitive/vtable/canonical.rs new file mode 100644 index 00000000000..e4fb46f7c63 --- /dev/null +++ b/vortex-array/src/arrays/primitive/vtable/canonical.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::{PrimitiveArray, PrimitiveVTable}; +use crate::builders::ArrayBuilder; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for PrimitiveVTable { + fn canonicalize(array: &PrimitiveArray) -> Canonical { + Canonical::Primitive(array.clone()) + } + + fn append_to_builder(array: &PrimitiveArray, builder: &mut dyn ArrayBuilder) { + builder.extend_from_array(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs new file mode 100644 index 00000000000..e8b8887bfc5 --- /dev/null +++ b/vortex-array/src/arrays/primitive/vtable/mod.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::PrimitiveArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + 
+mod array; +mod canonical; +mod operations; +mod pipeline; +mod serde; +mod validity; +mod visitor; + +vtable!(Primitive); + +impl VTable for PrimitiveVTable { + type Array = PrimitiveArray; + type Encoding = PrimitiveEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = Self; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.primitive") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(PrimitiveEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct PrimitiveEncoding; diff --git a/vortex-array/src/arrays/primitive/ops.rs b/vortex-array/src/arrays/primitive/vtable/operations.rs similarity index 100% rename from vortex-array/src/arrays/primitive/ops.rs rename to vortex-array/src/arrays/primitive/vtable/operations.rs diff --git a/vortex-array/src/arrays/primitive/operator.rs b/vortex-array/src/arrays/primitive/vtable/pipeline.rs similarity index 95% rename from vortex-array/src/arrays/primitive/operator.rs rename to vortex-array/src/arrays/primitive/vtable/pipeline.rs index c44273a4d5e..80fca778b3a 100644 --- a/vortex-array/src/arrays/primitive/operator.rs +++ b/vortex-array/src/arrays/primitive/vtable/pipeline.rs @@ -1,5 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors use std::any::Any; use std::fmt::Formatter; diff --git a/vortex-array/src/arrays/primitive/serde.rs b/vortex-array/src/arrays/primitive/vtable/serde.rs similarity index 78% rename from vortex-array/src/arrays/primitive/serde.rs rename to vortex-array/src/arrays/primitive/vtable/serde.rs index 
c4938da70c9..474baddb4d5 100644 --- a/vortex-array/src/arrays/primitive/serde.rs +++ b/vortex-array/src/arrays/primitive/vtable/serde.rs @@ -5,12 +5,12 @@ use vortex_buffer::{Alignment, Buffer, ByteBuffer}; use vortex_dtype::{DType, PType, match_each_native_ptype}; use vortex_error::{VortexResult, vortex_bail}; -use super::PrimitiveEncoding; -use crate::arrays::{PrimitiveArray, PrimitiveVTable}; +use super::PrimitiveArray; +use crate::EmptyMetadata; +use crate::arrays::{PrimitiveEncoding, PrimitiveVTable}; use crate::serde::ArrayChildren; use crate::validity::Validity; -use crate::vtable::{SerdeVTable, ValidityHelper, VisitorVTable}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, EmptyMetadata}; +use crate::vtable::SerdeVTable; impl SerdeVTable for PrimitiveVTable { type Metadata = EmptyMetadata; @@ -65,13 +65,3 @@ impl SerdeVTable for PrimitiveVTable { }) } } - -impl VisitorVTable for PrimitiveVTable { - fn visit_buffers(array: &PrimitiveArray, visitor: &mut dyn ArrayBufferVisitor) { - visitor.visit_buffer(array.byte_buffer()); - } - - fn visit_children(array: &PrimitiveArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_validity(array.validity(), array.len()); - } -} diff --git a/vortex-array/src/arrays/primitive/vtable/validity.rs b/vortex-array/src/arrays/primitive/vtable/validity.rs new file mode 100644 index 00000000000..47903e81a09 --- /dev/null +++ b/vortex-array/src/arrays/primitive/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::PrimitiveArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for PrimitiveArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/primitive/vtable/visitor.rs b/vortex-array/src/arrays/primitive/vtable/visitor.rs new file mode 100644 index 00000000000..53ce105d60e --- /dev/null +++ 
b/vortex-array/src/arrays/primitive/vtable/visitor.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::{PrimitiveArray, PrimitiveVTable}; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for PrimitiveVTable { + fn visit_buffers(array: &PrimitiveArray, visitor: &mut dyn ArrayBufferVisitor) { + visitor.visit_buffer(array.byte_buffer()); + } + + fn visit_children(array: &PrimitiveArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_validity(array.validity(), array.len()); + } +} diff --git a/vortex-array/src/arrays/struct_/array.rs b/vortex-array/src/arrays/struct_/array.rs new file mode 100644 index 00000000000..bcf23d80adb --- /dev/null +++ b/vortex-array/src/arrays/struct_/array.rs @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::iter::once; + +use vortex_dtype::{DType, FieldName, FieldNames, StructFields}; +use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err}; + +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; +use crate::{Array, ArrayRef, IntoArray}; + +/// A struct array that stores multiple named fields as columns, similar to a database row. +/// +/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation +/// of structured data where each row contains multiple named fields of potentially different types. 
+/// +/// ## Data Layout +/// +/// The struct array uses a columnar layout where: +/// - Each field is stored as a separate child array +/// - All fields must have the same length (number of rows) +/// - Field names and types are defined in the struct's dtype +/// - An optional validity mask indicates which entire rows are null +/// +/// ## Row-level nulls +/// +/// The StructArray contains its own top-level nulls, which are superimposed on top of the +/// field-level validity values. This applies even if the fields themselves are non-nullable: +/// accessing a particular row can yield a null even if all children are valid at that position. +/// +/// ``` +/// use vortex_array::arrays::{StructArray, BoolArray}; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_dtype::FieldNames; +/// use vortex_buffer::buffer; +/// +/// // Create struct with all non-null fields but struct-level nulls +/// let struct_array = StructArray::try_new( +/// FieldNames::from(["a", "b", "c"]), +/// vec![ +/// buffer![1i32, 2i32].into_array(), // non-null field a +/// buffer![10i32, 20i32].into_array(), // non-null field b +/// buffer![100i32, 200i32].into_array(), // non-null field c +/// ], +/// 2, +/// Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null +/// ).unwrap(); +/// +/// // Row 0 is valid - returns a struct scalar with field values +/// let row0 = struct_array.scalar_at(0); +/// assert!(!row0.is_null()); +/// +/// // Row 1 is null at struct level - returns null even though fields have values +/// let row1 = struct_array.scalar_at(1); +/// assert!(row1.is_null()); +/// ``` +/// +/// ## Name uniqueness +/// +/// It is valid for a StructArray to have multiple child columns that have the same name. In this +/// case, any accessors that use column names will find the first column in sequence with the name. 
+/// +/// ``` +/// use vortex_array::arrays::StructArray; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_dtype::FieldNames; +/// use vortex_buffer::buffer; +/// +/// // Create struct with duplicate "data" field names +/// let struct_array = StructArray::try_new( +/// FieldNames::from(["data", "data"]), +/// vec![ +/// buffer![1i32, 2i32].into_array(), // first "data" +/// buffer![3i32, 4i32].into_array(), // second "data" +/// ], +/// 2, +/// Validity::NonNullable, +/// ).unwrap(); +/// +/// // field_by_name returns the FIRST "data" field +/// let first_data = struct_array.field_by_name("data").unwrap(); +/// assert_eq!(first_data.scalar_at(0), 1i32.into()); +/// ``` +/// +/// ## Field Operations +/// +/// Struct arrays support efficient column operations: +/// - **Projection**: Select/reorder fields without copying data +/// - **Field access**: Get columns by name or index +/// - **Column addition**: Add new fields to create extended structs +/// - **Column removal**: Remove fields to create narrower structs +/// +/// ## Validity Semantics +/// +/// - Row-level nulls are tracked in the struct's validity child +/// - Individual field nulls are tracked in each field's own validity +/// - A null struct row means all fields in that row are conceptually null +/// - Field-level nulls can exist independently of struct-level nulls +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::{StructArray, PrimitiveArray}; +/// use vortex_array::validity::Validity; +/// use vortex_array::IntoArray; +/// use vortex_dtype::FieldNames; +/// use vortex_buffer::buffer; +/// +/// // Create arrays for each field +/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable); +/// let names = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable); +/// +/// // Create struct array with named fields +/// let struct_array = StructArray::try_new( +/// FieldNames::from(["id", "score"]), +/// 
vec![ids.into_array(), names.into_array()], +/// 3, +/// Validity::NonNullable, +/// ).unwrap(); +/// +/// assert_eq!(struct_array.len(), 3); +/// assert_eq!(struct_array.names().len(), 2); +/// +/// // Access field by name +/// let id_field = struct_array.field_by_name("id").unwrap(); +/// assert_eq!(id_field.len(), 3); +/// ``` +#[derive(Clone, Debug)] +pub struct StructArray { + pub(super) len: usize, + pub(super) dtype: DType, + pub(super) fields: Vec, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, +} + +impl StructArray { + pub fn fields(&self) -> &[ArrayRef] { + &self.fields + } + + pub fn field_by_name(&self, name: impl AsRef) -> VortexResult<&ArrayRef> { + let name = name.as_ref(); + self.field_by_name_opt(name).ok_or_else(|| { + vortex_err!( + "Field {name} not found in struct array with names {:?}", + self.names() + ) + }) + } + + pub fn field_by_name_opt(&self, name: impl AsRef) -> Option<&ArrayRef> { + let name = name.as_ref(); + self.names() + .iter() + .position(|field_name| field_name.as_ref() == name) + .map(|idx| &self.fields[idx]) + } + + pub fn names(&self) -> &FieldNames { + self.struct_fields().names() + } + + pub fn struct_fields(&self) -> &StructFields { + let Some(struct_dtype) = &self.dtype.as_struct_fields_opt() else { + unreachable!( + "struct arrays must have be a DType::Struct, this is likely an internal bug." + ) + }; + struct_dtype + } + + /// Create a new `StructArray` with the given length, but without any fields. + pub fn new_fieldless_with_len(len: usize) -> Self { + Self::try_new( + FieldNames::default(), + Vec::new(), + len, + Validity::NonNullable, + ) + .vortex_expect("StructArray::new_with_len should not fail") + } + + /// Creates a new [`StructArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`StructArray::new_unchecked`]. 
+ pub fn new( + names: FieldNames, + fields: Vec, + length: usize, + validity: Validity, + ) -> Self { + Self::try_new(names, fields, length, validity) + .vortex_expect("StructArray construction failed") + } + + /// Constructs a new `StructArray`. + /// + /// See [`StructArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`StructArray::new_unchecked`]. + pub fn try_new( + names: FieldNames, + fields: Vec, + length: usize, + validity: Validity, + ) -> VortexResult { + let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect(); + let dtype = StructFields::new(names, field_dtypes); + + Self::validate(&fields, &dtype, length, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) }) + } + + /// Creates a new [`StructArray`] without validation from these components: + /// + /// * `fields` is a vector of arrays, one for each field in the struct. + /// * `dtype` contains the field names and types. + /// * `length` is the number of struct rows. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// ## Field Requirements + /// + /// - `fields.len()` must exactly equal `dtype.names().len()`. + /// - Every field array in `fields` must have length exactly equal to `length`. + /// - For each index `i`, `fields[i].dtype()` must exactly match `dtype.fields()[i]`. + /// + /// ## Type Requirements + /// + /// - Field names in `dtype` may be duplicated (this is explicitly allowed). + /// - The nullability of `dtype` must match the nullability of `validity`. + /// + /// ## Validity Requirements + /// + /// - If `validity` is [`Validity::Array`], its length must exactly equal `length`. 
+ pub unsafe fn new_unchecked( + fields: Vec, + dtype: StructFields, + length: usize, + validity: Validity, + ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&fields, &dtype, length, &validity) + .vortex_expect("[Debug Assertion]: Invalid `StructArray` parameters"); + + Self { + len: length, + dtype: DType::Struct(dtype, validity.nullability()), + fields, + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`StructArray`]. + /// + /// This function checks all the invariants required by [`StructArray::new_unchecked`]. + pub(crate) fn validate( + fields: &[ArrayRef], + dtype: &StructFields, + length: usize, + validity: &Validity, + ) -> VortexResult<()> { + // Check field count matches + if fields.len() != dtype.names().len() { + vortex_bail!( + "Got {} fields but dtype has {} names", + fields.len(), + dtype.names().len() + ); + } + + // Check each field's length and dtype + for (i, (field, struct_dt)) in fields.iter().zip(dtype.fields()).enumerate() { + if field.len() != length { + vortex_bail!( + "Field {} has length {} but expected {}", + i, + field.len(), + length + ); + } + + if field.dtype() != &struct_dt { + vortex_bail!( + "Field {} has dtype {} but expected {}", + i, + field.dtype(), + struct_dt + ); + } + } + + // Check validity length + if let Some(validity_len) = validity.maybe_len() + && validity_len != length + { + vortex_bail!( + "Validity has length {} but expected {}", + validity_len, + length + ); + } + + Ok(()) + } + + pub fn try_new_with_dtype( + fields: Vec, + dtype: StructFields, + length: usize, + validity: Validity, + ) -> VortexResult { + Self::validate(&fields, &dtype, length, &validity)?; + + // SAFETY: validate ensures all invariants are met. 
+ Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) }) + } + + pub fn from_fields>(items: &[(N, ArrayRef)]) -> VortexResult { + Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array()))) + } + + pub fn try_from_iter_with_validity< + N: AsRef, + A: IntoArray, + T: IntoIterator, + >( + iter: T, + validity: Validity, + ) -> VortexResult { + let (names, fields): (Vec, Vec) = iter + .into_iter() + .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array())) + .unzip(); + let len = fields + .first() + .map(|f| f.len()) + .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?; + + Self::try_new(FieldNames::from_iter(names), fields, len, validity) + } + + pub fn try_from_iter, A: IntoArray, T: IntoIterator>( + iter: T, + ) -> VortexResult { + Self::try_from_iter_with_validity(iter, Validity::NonNullable) + } + + // TODO(aduffy): Add equivalent function to support field masks for nested column access. + /// Return a new StructArray with the given projection applied. + /// + /// Projection does not copy data arrays. Projection is defined by a slice of field names + /// which specifies the new ordering of columns in the struct. The projection can be used to + /// perform column re-ordering, deletion, or duplication at a logical level, without any data + /// copying. 
+ #[allow(clippy::same_name_method)] + pub fn project(&self, projection: &[FieldName]) -> VortexResult { + let mut children = Vec::with_capacity(projection.len()); + let mut names = Vec::with_capacity(projection.len()); + + for f_name in projection.iter() { + let idx = self + .names() + .iter() + .position(|name| name == f_name) + .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?; + + names.push(self.names()[idx].clone()); + children.push(self.fields()[idx].clone()); + } + + StructArray::try_new( + FieldNames::from(names.as_slice()), + children, + self.len(), + self.validity().clone(), + ) + } + + /// Removes and returns a column from the struct array by name. + /// If the column does not exist, returns `None`. + pub fn remove_column(&mut self, name: impl Into) -> Option { + let name = name.into(); + + let struct_dtype = self.struct_fields().clone(); + + let position = struct_dtype + .names() + .iter() + .position(|field_name| field_name.as_ref() == name.as_ref())?; + + let field = self.fields.remove(position); + + if let Ok(new_dtype) = struct_dtype.without_field(position) { + self.dtype = DType::Struct(new_dtype, self.dtype.nullability()); + return Some(field); + } + None + } + + /// Create a new StructArray by appending a new column onto the existing array. 
+ pub fn with_column(&self, name: impl Into, array: ArrayRef) -> VortexResult { + let name = name.into(); + let struct_dtype = self.struct_fields().clone(); + + let names = struct_dtype.names().iter().cloned().chain(once(name)); + let types = struct_dtype.fields().chain(once(array.dtype().clone())); + let new_fields = StructFields::new(names.collect(), types.collect()); + + let mut children = self.fields.clone(); + children.push(array); + + Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone()) + } +} diff --git a/vortex-array/src/arrays/struct_/mod.rs b/vortex-array/src/arrays/struct_/mod.rs index eff2e5946b5..16dd274adad 100644 --- a/vortex-array/src/arrays/struct_/mod.rs +++ b/vortex-array/src/arrays/struct_/mod.rs @@ -1,674 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt::Debug; -use std::iter::once; -use std::ops::Range; - -use itertools::Itertools; -use vortex_dtype::{DType, FieldName, FieldNames, StructFields}; -use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err}; -use vortex_scalar::Scalar; - -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, -}; -use crate::{Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, vtable}; +mod array; +pub use array::StructArray; mod compute; -mod operator; -mod serde; - -vtable!(Struct); - -impl VTable for StructVTable { - type Array = StructArray; - type Encoding = StructEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = Self; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> 
EncodingId { - EncodingId::new_ref("vortex.struct") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(StructEncoding.as_ref()) - } -} - -/// A struct array that stores multiple named fields as columns, similar to a database row. -/// -/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation -/// of structured data where each row contains multiple named fields of potentially different types. -/// -/// ## Data Layout -/// -/// The struct array uses a columnar layout where: -/// - Each field is stored as a separate child array -/// - All fields must have the same length (number of rows) -/// - Field names and types are defined in the struct's dtype -/// - An optional validity mask indicates which entire rows are null -/// -/// ## Row-level nulls -/// -/// The StructArray contains its own top-level nulls, which are superimposed on top of the -/// field-level validity values. This can be the case even if the fields themselves are non-nullable, -/// accessing a particular row can yield nulls even if all children are valid at that position. 
-/// -/// ``` -/// use vortex_array::arrays::{StructArray, BoolArray}; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_dtype::FieldNames; -/// use vortex_buffer::buffer; -/// -/// // Create struct with all non-null fields but struct-level nulls -/// let struct_array = StructArray::try_new( -/// FieldNames::from(["a", "b", "c"]), -/// vec![ -/// buffer![1i32, 2i32].into_array(), // non-null field a -/// buffer![10i32, 20i32].into_array(), // non-null field b -/// buffer![100i32, 200i32].into_array(), // non-null field c -/// ], -/// 2, -/// Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null -/// ).unwrap(); -/// -/// // Row 0 is valid - returns a struct scalar with field values -/// let row0 = struct_array.scalar_at(0); -/// assert!(!row0.is_null()); -/// -/// // Row 1 is null at struct level - returns null even though fields have values -/// let row1 = struct_array.scalar_at(1); -/// assert!(row1.is_null()); -/// ``` -/// -/// ## Name uniqueness -/// -/// It is valid for a StructArray to have multiple child columns that have the same name. In this -/// case, any accessors that use column names will find the first column in sequence with the name. 
-/// -/// ``` -/// use vortex_array::arrays::StructArray; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_dtype::FieldNames; -/// use vortex_buffer::buffer; -/// -/// // Create struct with duplicate "data" field names -/// let struct_array = StructArray::try_new( -/// FieldNames::from(["data", "data"]), -/// vec![ -/// buffer![1i32, 2i32].into_array(), // first "data" -/// buffer![3i32, 4i32].into_array(), // second "data" -/// ], -/// 2, -/// Validity::NonNullable, -/// ).unwrap(); -/// -/// // field_by_name returns the FIRST "data" field -/// let first_data = struct_array.field_by_name("data").unwrap(); -/// assert_eq!(first_data.scalar_at(0), 1i32.into()); -/// ``` -/// -/// ## Field Operations -/// -/// Struct arrays support efficient column operations: -/// - **Projection**: Select/reorder fields without copying data -/// - **Field access**: Get columns by name or index -/// - **Column addition**: Add new fields to create extended structs -/// - **Column removal**: Remove fields to create narrower structs -/// -/// ## Validity Semantics -/// -/// - Row-level nulls are tracked in the struct's validity child -/// - Individual field nulls are tracked in each field's own validity -/// - A null struct row means all fields in that row are conceptually null -/// - Field-level nulls can exist independently of struct-level nulls -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::{StructArray, PrimitiveArray}; -/// use vortex_array::validity::Validity; -/// use vortex_array::IntoArray; -/// use vortex_dtype::FieldNames; -/// use vortex_buffer::buffer; -/// -/// // Create arrays for each field -/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable); -/// let names = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable); -/// -/// // Create struct array with named fields -/// let struct_array = StructArray::try_new( -/// FieldNames::from(["id", "score"]), -/// 
vec![ids.into_array(), names.into_array()], -/// 3, -/// Validity::NonNullable, -/// ).unwrap(); -/// -/// assert_eq!(struct_array.len(), 3); -/// assert_eq!(struct_array.names().len(), 2); -/// -/// // Access field by name -/// let id_field = struct_array.field_by_name("id").unwrap(); -/// assert_eq!(id_field.len(), 3); -/// ``` -#[derive(Clone, Debug)] -pub struct StructArray { - len: usize, - dtype: DType, - fields: Vec, - validity: Validity, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct StructEncoding; - -impl StructArray { - pub fn fields(&self) -> &[ArrayRef] { - &self.fields - } - - pub fn field_by_name(&self, name: impl AsRef) -> VortexResult<&ArrayRef> { - let name = name.as_ref(); - self.field_by_name_opt(name).ok_or_else(|| { - vortex_err!( - "Field {name} not found in struct array with names {:?}", - self.names() - ) - }) - } - - pub fn field_by_name_opt(&self, name: impl AsRef) -> Option<&ArrayRef> { - let name = name.as_ref(); - self.names() - .iter() - .position(|field_name| field_name.as_ref() == name) - .map(|idx| &self.fields[idx]) - } - - pub fn names(&self) -> &FieldNames { - self.struct_fields().names() - } - - pub fn struct_fields(&self) -> &StructFields { - let Some(struct_dtype) = &self.dtype.as_struct_fields_opt() else { - unreachable!( - "struct arrays must have be a DType::Struct, this is likely an internal bug." - ) - }; - struct_dtype - } - - /// Create a new `StructArray` with the given length, but without any fields. - pub fn new_fieldless_with_len(len: usize) -> Self { - Self::try_new( - FieldNames::default(), - Vec::new(), - len, - Validity::NonNullable, - ) - .vortex_expect("StructArray::new_with_len should not fail") - } - - /// Creates a new [`StructArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`StructArray::new_unchecked`]. 
- pub fn new( - names: FieldNames, - fields: Vec, - length: usize, - validity: Validity, - ) -> Self { - Self::try_new(names, fields, length, validity) - .vortex_expect("StructArray construction failed") - } - - /// Constructs a new `StructArray`. - /// - /// See [`StructArray::new_unchecked`] for more information. - /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`StructArray::new_unchecked`]. - pub fn try_new( - names: FieldNames, - fields: Vec, - length: usize, - validity: Validity, - ) -> VortexResult { - let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect(); - let dtype = StructFields::new(names, field_dtypes); - - Self::validate(&fields, &dtype, length, &validity)?; - - // SAFETY: validate ensures all invariants are met. - Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) }) - } - - /// Creates a new [`StructArray`] without validation from these components: - /// - /// * `fields` is a vector of arrays, one for each field in the struct. - /// * `dtype` contains the field names and types. - /// * `length` is the number of struct rows. - /// * `validity` holds the null values. - /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// ## Field Requirements - /// - /// - `fields.len()` must exactly equal `dtype.names().len()`. - /// - Every field array in `fields` must have length exactly equal to `length`. - /// - For each index `i`, `fields[i].dtype()` must exactly match `dtype.fields()[i]`. - /// - /// ## Type Requirements - /// - /// - Field names in `dtype` may be duplicated (this is explicitly allowed). - /// - The nullability of `dtype` must match the nullability of `validity`. - /// - /// ## Validity Requirements - /// - /// - If `validity` is [`Validity::Array`], its length must exactly equal `length`. 
- pub unsafe fn new_unchecked( - fields: Vec, - dtype: StructFields, - length: usize, - validity: Validity, - ) -> Self { - Self { - len: length, - dtype: DType::Struct(dtype, validity.nullability()), - fields, - validity, - stats_set: Default::default(), - } - } - - /// Validates the components that would be used to create a [`StructArray`]. - /// - /// This function checks all the invariants required by [`StructArray::new_unchecked`]. - pub(crate) fn validate( - fields: &[ArrayRef], - dtype: &StructFields, - length: usize, - validity: &Validity, - ) -> VortexResult<()> { - // Check field count matches - if fields.len() != dtype.names().len() { - vortex_bail!( - "Got {} fields but dtype has {} names", - fields.len(), - dtype.names().len() - ); - } - - // Check each field's length and dtype - for (i, (field, struct_dt)) in fields.iter().zip(dtype.fields()).enumerate() { - if field.len() != length { - vortex_bail!( - "Field {} has length {} but expected {}", - i, - field.len(), - length - ); - } - - if field.dtype() != &struct_dt { - vortex_bail!( - "Field {} has dtype {} but expected {}", - i, - field.dtype(), - struct_dt - ); - } - } - - // Check validity length - if let Some(validity_len) = validity.maybe_len() - && validity_len != length - { - vortex_bail!( - "Validity has length {} but expected {}", - validity_len, - length - ); - } - - Ok(()) - } - - pub fn try_new_with_dtype( - fields: Vec, - dtype: StructFields, - length: usize, - validity: Validity, - ) -> VortexResult { - Self::validate(&fields, &dtype, length, &validity)?; - - // SAFETY: validate ensures all invariants are met. 
- Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) }) - } - - pub fn from_fields>(items: &[(N, ArrayRef)]) -> VortexResult { - Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array()))) - } - - pub fn try_from_iter_with_validity< - N: AsRef, - A: IntoArray, - T: IntoIterator, - >( - iter: T, - validity: Validity, - ) -> VortexResult { - let (names, fields): (Vec, Vec) = iter - .into_iter() - .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array())) - .unzip(); - let len = fields - .first() - .map(|f| f.len()) - .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?; - - Self::try_new(FieldNames::from_iter(names), fields, len, validity) - } - - pub fn try_from_iter, A: IntoArray, T: IntoIterator>( - iter: T, - ) -> VortexResult { - Self::try_from_iter_with_validity(iter, Validity::NonNullable) - } - - // TODO(aduffy): Add equivalent function to support field masks for nested column access. - /// Return a new StructArray with the given projection applied. - /// - /// Projection does not copy data arrays. Projection is defined by an ordinal array slice - /// which specifies the new ordering of columns in the struct. The projection can be used to - /// perform column re-ordering, deletion, or duplication at a logical level, without any data - /// copying. 
- #[allow(clippy::same_name_method)] - pub fn project(&self, projection: &[FieldName]) -> VortexResult { - let mut children = Vec::with_capacity(projection.len()); - let mut names = Vec::with_capacity(projection.len()); - - for f_name in projection.iter() { - let idx = self - .names() - .iter() - .position(|name| name == f_name) - .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?; - - names.push(self.names()[idx].clone()); - children.push(self.fields()[idx].clone()); - } - - StructArray::try_new( - FieldNames::from(names.as_slice()), - children, - self.len(), - self.validity().clone(), - ) - } - /// Removes and returns a column from the struct array by name. - /// If the column does not exist, returns `None`. - pub fn remove_column(&mut self, name: impl Into) -> Option { - let name = name.into(); - - let struct_dtype = self.struct_fields().clone(); - - let position = struct_dtype - .names() - .iter() - .position(|field_name| field_name.as_ref() == name.as_ref())?; - - let field = self.fields.remove(position); - - if let Ok(new_dtype) = struct_dtype.without_field(position) { - self.dtype = DType::Struct(new_dtype, self.dtype.nullability()); - return Some(field); - } - None - } - - /// Create a new StructArray by appending a new column onto the existing array. 
- pub fn with_column(&self, name: impl Into, array: ArrayRef) -> VortexResult { - let name = name.into(); - let struct_dtype = self.struct_fields().clone(); - - let names = struct_dtype.names().iter().cloned().chain(once(name)); - let types = struct_dtype.fields().chain(once(array.dtype().clone())); - let new_fields = StructFields::new(names.collect(), types.collect()); - - let mut children = self.fields.clone(); - children.push(array); - - Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone()) - } -} - -impl ValidityHelper for StructArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl ArrayVTable for StructVTable { - fn len(array: &StructArray) -> usize { - array.len - } - - fn dtype(array: &StructArray) -> &DType { - &array.dtype - } - - fn stats(array: &StructArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl CanonicalVTable for StructVTable { - fn canonicalize(array: &StructArray) -> Canonical { - Canonical::Struct(array.clone()) - } -} - -impl OperationsVTable for StructVTable { - fn slice(array: &StructArray, range: Range) -> ArrayRef { - let fields = array - .fields() - .iter() - .map(|field| field.slice(range.clone())) - .collect_vec(); - // SAFETY: All invariants are preserved: - // - fields.len() == dtype.names().len() (same struct fields) - // - Every field has length == range.len() (all sliced to same range) - // - Each field's dtype matches the struct dtype (unchanged from original) - // - Validity length matches array length (both sliced to same range) - unsafe { - StructArray::new_unchecked( - fields, - array.struct_fields().clone(), - range.len(), - array.validity().slice(range), - ) - } - .into_array() - } - - fn scalar_at(array: &StructArray, index: usize) -> Scalar { - Scalar::struct_( - array.dtype().clone(), - array - .fields() - .iter() - .map(|field| field.scalar_at(index)) - .collect_vec(), - ) - } -} +mod vtable; +pub use vtable::{StructEncoding, StructVTable}; 
#[cfg(test)] -mod test { - use vortex_buffer::buffer; - use vortex_dtype::{DType, FieldName, FieldNames, Nullability, PType}; - - use crate::arrays::primitive::PrimitiveArray; - use crate::arrays::struct_::StructArray; - use crate::arrays::varbin::VarBinArray; - use crate::arrays::{BoolArray, ConstantArray}; - use crate::validity::Validity; - use crate::{Array, IntoArray, ToCanonical}; - - #[test] - fn test_project() { - let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable); - let ys = VarBinArray::from_vec( - vec!["a", "b", "c", "d", "e"], - DType::Utf8(Nullability::NonNullable), - ); - let zs = BoolArray::from_iter([true, true, true, false, false]); - - let struct_a = StructArray::try_new( - FieldNames::from(["xs", "ys", "zs"]), - vec![xs.into_array(), ys.into_array(), zs.into_array()], - 5, - Validity::NonNullable, - ) - .unwrap(); - - let struct_b = struct_a - .project(&[FieldName::from("zs"), FieldName::from("xs")]) - .unwrap(); - assert_eq!( - struct_b.names().as_ref(), - [FieldName::from("zs"), FieldName::from("xs")], - ); - - assert_eq!(struct_b.len(), 5); - - let bools = &struct_b.fields[0]; - assert_eq!( - bools.to_bool().boolean_buffer().iter().collect::>(), - vec![true, true, true, false, false] - ); - - let prims = &struct_b.fields[1]; - assert_eq!(prims.to_primitive().as_slice::(), [0i64, 1, 2, 3, 4]); - } - - #[test] - fn test_remove_column() { - let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable); - let ys = PrimitiveArray::new(buffer![4u64, 5, 6, 7, 8], Validity::NonNullable); - - let mut struct_a = StructArray::try_new( - FieldNames::from(["xs", "ys"]), - vec![xs.into_array(), ys.into_array()], - 5, - Validity::NonNullable, - ) - .unwrap(); - - let removed = struct_a.remove_column("xs").unwrap(); - assert_eq!( - removed.dtype(), - &DType::Primitive(PType::I64, Nullability::NonNullable) - ); - assert_eq!(removed.to_primitive().as_slice::(), [0i64, 1, 2, 3, 4]); - - assert_eq!(struct_a.names(), 
&["ys"]); - assert_eq!(struct_a.fields.len(), 1); - assert_eq!(struct_a.len(), 5); - assert_eq!( - struct_a.fields[0].dtype(), - &DType::Primitive(PType::U64, Nullability::NonNullable) - ); - assert_eq!( - struct_a.fields[0].to_primitive().as_slice::(), - [4u64, 5, 6, 7, 8] - ); - - let empty = struct_a.remove_column("non_existent"); - assert!( - empty.is_none(), - "Expected None when removing non-existent column" - ); - assert_eq!(struct_a.names(), &["ys"]); - } - - #[test] - fn test_duplicate_field_names() { - // Test that StructArray allows duplicate field names and returns the first match - let field1 = buffer![1i32, 2, 3].into_array(); - let field2 = buffer![10i32, 20, 30].into_array(); - let field3 = buffer![100i32, 200, 300].into_array(); - - // Create struct with duplicate field names - "value" appears twice - let struct_array = StructArray::try_new( - FieldNames::from(["value", "other", "value"]), - vec![field1, field2, field3], - 3, - Validity::NonNullable, - ) - .unwrap(); - - // field_by_name should return the first field with the matching name - let first_value_field = struct_array.field_by_name("value").unwrap(); - assert_eq!( - first_value_field.to_primitive().as_slice::(), - [1i32, 2, 3] // This is field1, not field3 - ); - - // Verify field_by_name_opt also returns the first match - let opt_field = struct_array.field_by_name_opt("value").unwrap(); - assert_eq!( - opt_field.to_primitive().as_slice::(), - [1i32, 2, 3] // First "value" field - ); - - // Verify the third field (second "value") can be accessed by index - let third_field = &struct_array.fields()[2]; - assert_eq!( - third_field.to_primitive().as_slice::(), - [100i32, 200, 300] - ); - } - - #[test] - fn test_uncompressed_size_in_bytes() { - let struct_array = StructArray::new( - FieldNames::from(["integers"]), - vec![ConstantArray::new(5, 1000).into_array()], - 1000, - Validity::NonNullable, - ); - - let canonical_size = struct_array.to_canonical().into_array().nbytes(); - let 
uncompressed_size = struct_array - .statistics() - .compute_uncompressed_size_in_bytes(); - - assert_eq!(canonical_size, 2); - assert_eq!(uncompressed_size, Some(4000)); - } -} +mod tests; diff --git a/vortex-array/src/arrays/struct_/tests.rs b/vortex-array/src/arrays/struct_/tests.rs new file mode 100644 index 00000000000..c03fe5a725c --- /dev/null +++ b/vortex-array/src/arrays/struct_/tests.rs @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_buffer::buffer; +use vortex_dtype::{DType, FieldName, FieldNames, Nullability, PType}; + +use crate::arrays::primitive::PrimitiveArray; +use crate::arrays::struct_::StructArray; +use crate::arrays::varbin::VarBinArray; +use crate::arrays::{BoolArray, ConstantArray}; +use crate::validity::Validity; +use crate::{Array, IntoArray, ToCanonical}; + +#[test] +fn test_project() { + let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable); + let ys = VarBinArray::from_vec( + vec!["a", "b", "c", "d", "e"], + DType::Utf8(Nullability::NonNullable), + ); + let zs = BoolArray::from_iter([true, true, true, false, false]); + + let struct_a = StructArray::try_new( + FieldNames::from(["xs", "ys", "zs"]), + vec![xs.into_array(), ys.into_array(), zs.into_array()], + 5, + Validity::NonNullable, + ) + .unwrap(); + + let struct_b = struct_a + .project(&[FieldName::from("zs"), FieldName::from("xs")]) + .unwrap(); + assert_eq!( + struct_b.names().as_ref(), + [FieldName::from("zs"), FieldName::from("xs")], + ); + + assert_eq!(struct_b.len(), 5); + + let bools = &struct_b.fields[0]; + assert_eq!( + bools.to_bool().boolean_buffer().iter().collect::>(), + vec![true, true, true, false, false] + ); + + let prims = &struct_b.fields[1]; + assert_eq!(prims.to_primitive().as_slice::(), [0i64, 1, 2, 3, 4]); +} + +#[test] +fn test_remove_column() { + let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable); + let ys = 
PrimitiveArray::new(buffer![4u64, 5, 6, 7, 8], Validity::NonNullable); + + let mut struct_a = StructArray::try_new( + FieldNames::from(["xs", "ys"]), + vec![xs.into_array(), ys.into_array()], + 5, + Validity::NonNullable, + ) + .unwrap(); + + let removed = struct_a.remove_column("xs").unwrap(); + assert_eq!( + removed.dtype(), + &DType::Primitive(PType::I64, Nullability::NonNullable) + ); + assert_eq!(removed.to_primitive().as_slice::(), [0i64, 1, 2, 3, 4]); + + assert_eq!(struct_a.names(), &["ys"]); + assert_eq!(struct_a.fields.len(), 1); + assert_eq!(struct_a.len(), 5); + assert_eq!( + struct_a.fields[0].dtype(), + &DType::Primitive(PType::U64, Nullability::NonNullable) + ); + assert_eq!( + struct_a.fields[0].to_primitive().as_slice::(), + [4u64, 5, 6, 7, 8] + ); + + let empty = struct_a.remove_column("non_existent"); + assert!( + empty.is_none(), + "Expected None when removing non-existent column" + ); + assert_eq!(struct_a.names(), &["ys"]); +} + +#[test] +fn test_duplicate_field_names() { + // Test that StructArray allows duplicate field names and returns the first match + let field1 = buffer![1i32, 2, 3].into_array(); + let field2 = buffer![10i32, 20, 30].into_array(); + let field3 = buffer![100i32, 200, 300].into_array(); + + // Create struct with duplicate field names - "value" appears twice + let struct_array = StructArray::try_new( + FieldNames::from(["value", "other", "value"]), + vec![field1, field2, field3], + 3, + Validity::NonNullable, + ) + .unwrap(); + + // field_by_name should return the first field with the matching name + let first_value_field = struct_array.field_by_name("value").unwrap(); + assert_eq!( + first_value_field.to_primitive().as_slice::(), + [1i32, 2, 3] // This is field1, not field3 + ); + + // Verify field_by_name_opt also returns the first match + let opt_field = struct_array.field_by_name_opt("value").unwrap(); + assert_eq!( + opt_field.to_primitive().as_slice::(), + [1i32, 2, 3] // First "value" field + ); + + // Verify the 
third field (second "value") can be accessed by index + let third_field = &struct_array.fields()[2]; + assert_eq!( + third_field.to_primitive().as_slice::(), + [100i32, 200, 300] + ); +} + +#[test] +fn test_uncompressed_size_in_bytes() { + let struct_array = StructArray::new( + FieldNames::from(["integers"]), + vec![ConstantArray::new(5, 1000).into_array()], + 1000, + Validity::NonNullable, + ); + + let canonical_size = struct_array.to_canonical().into_array().nbytes(); + let uncompressed_size = struct_array + .statistics() + .compute_uncompressed_size_in_bytes(); + + assert_eq!(canonical_size, 2); + assert_eq!(uncompressed_size, Some(4000)); +} diff --git a/vortex-array/src/arrays/struct_/vtable/array.rs b/vortex-array/src/arrays/struct_/vtable/array.rs new file mode 100644 index 00000000000..7914cf22d1a --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::struct_::{StructArray, StructVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for StructVTable { + fn len(array: &StructArray) -> usize { + array.len + } + + fn dtype(array: &StructArray) -> &DType { + &array.dtype + } + + fn stats(array: &StructArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/struct_/vtable/canonical.rs b/vortex-array/src/arrays/struct_/vtable/canonical.rs new file mode 100644 index 00000000000..db67e65fd91 --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/canonical.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::Canonical; +use crate::arrays::struct_::{StructArray, StructVTable}; +use crate::vtable::CanonicalVTable; + +impl CanonicalVTable for StructVTable { + fn canonicalize(array: &StructArray) -> Canonical { 
+ Canonical::Struct(array.clone()) + } +} diff --git a/vortex-array/src/arrays/struct_/vtable/mod.rs b/vortex-array/src/arrays/struct_/vtable/mod.rs new file mode 100644 index 00000000000..659cc5d9d18 --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/mod.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::struct_::StructArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod operations; +mod pipeline; +mod serde; +mod validity; +mod visitor; + +vtable!(Struct); + +impl VTable for StructVTable { + type Array = StructArray; + type Encoding = StructEncoding; + + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = Self; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.struct") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(StructEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct StructEncoding; diff --git a/vortex-array/src/arrays/struct_/vtable/operations.rs b/vortex-array/src/arrays/struct_/vtable/operations.rs new file mode 100644 index 00000000000..7996f60b11b --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/operations.rs @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::ops::Range; + +use itertools::Itertools; +use vortex_scalar::Scalar; + +use crate::arrays::struct_::{StructArray, StructVTable}; +use crate::vtable::{OperationsVTable, ValidityHelper}; +use crate::{ArrayRef, IntoArray}; + +impl OperationsVTable for StructVTable { + fn 
slice(array: &StructArray, range: Range) -> ArrayRef { + let fields = array + .fields() + .iter() + .map(|field| field.slice(range.clone())) + .collect_vec(); + // SAFETY: All invariants are preserved: + // - fields.len() == dtype.names().len() (same struct fields) + // - Every field has length == range.len() (all sliced to same range) + // - Each field's dtype matches the struct dtype (unchanged from original) + // - Validity length matches array length (both sliced to same range) + unsafe { + StructArray::new_unchecked( + fields, + array.struct_fields().clone(), + range.len(), + array.validity().slice(range), + ) + } + .into_array() + } + + fn scalar_at(array: &StructArray, index: usize) -> Scalar { + Scalar::struct_( + array.dtype().clone(), + array + .fields() + .iter() + .map(|field| field.scalar_at(index)) + .collect_vec(), + ) + } +} diff --git a/vortex-array/src/arrays/struct_/operator.rs b/vortex-array/src/arrays/struct_/vtable/pipeline.rs similarity index 100% rename from vortex-array/src/arrays/struct_/operator.rs rename to vortex-array/src/arrays/struct_/vtable/pipeline.rs diff --git a/vortex-array/src/arrays/struct_/serde.rs b/vortex-array/src/arrays/struct_/vtable/serde.rs similarity index 75% rename from vortex-array/src/arrays/struct_/serde.rs rename to vortex-array/src/arrays/struct_/vtable/serde.rs index b7f3b006c12..0c6c3352de8 100644 --- a/vortex-array/src/arrays/struct_/serde.rs +++ b/vortex-array/src/arrays/struct_/vtable/serde.rs @@ -6,12 +6,11 @@ use vortex_buffer::ByteBuffer; use vortex_dtype::DType; use vortex_error::{VortexExpect, VortexResult, vortex_bail}; -use super::StructEncoding; -use crate::arrays::{StructArray, StructVTable}; +use crate::EmptyMetadata; +use crate::arrays::struct_::{StructArray, StructEncoding, StructVTable}; use crate::serde::ArrayChildren; use crate::validity::Validity; -use crate::vtable::{SerdeVTable, ValidityHelper, VisitorVTable}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, EmptyMetadata}; +use 
crate::vtable::SerdeVTable; impl SerdeVTable for StructVTable { type Metadata = EmptyMetadata; @@ -59,14 +58,3 @@ impl SerdeVTable for StructVTable { StructArray::try_new_with_dtype(children, struct_dtype.clone(), len, validity) } } - -impl VisitorVTable for StructVTable { - fn visit_buffers(_array: &StructArray, _visitor: &mut dyn ArrayBufferVisitor) {} - - fn visit_children(array: &StructArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_validity(array.validity(), array.len()); - for (idx, name) in array.names().iter().enumerate() { - visitor.visit_child(name.as_ref(), &array.fields()[idx]); - } - } -} diff --git a/vortex-array/src/arrays/struct_/vtable/validity.rs b/vortex-array/src/arrays/struct_/vtable/validity.rs new file mode 100644 index 00000000000..8d676e1f8d4 --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::struct_::StructArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for StructArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/struct_/vtable/visitor.rs b/vortex-array/src/arrays/struct_/vtable/visitor.rs new file mode 100644 index 00000000000..b49822e2307 --- /dev/null +++ b/vortex-array/src/arrays/struct_/vtable/visitor.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::struct_::{StructArray, StructVTable}; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for StructVTable { + fn visit_buffers(_array: &StructArray, _visitor: &mut dyn ArrayBufferVisitor) {} + + fn visit_children(array: &StructArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_validity(array.validity(), array.len()); + for (idx, name) 
in array.names().iter().enumerate() { + visitor.visit_child(name.as_ref(), &array.fields()[idx]); + } + } +} diff --git a/vortex-array/src/arrays/validation_tests.rs b/vortex-array/src/arrays/validation_tests.rs index 98a8f1182ad..1eecc512421 100644 --- a/vortex-array/src/arrays/validation_tests.rs +++ b/vortex-array/src/arrays/validation_tests.rs @@ -152,7 +152,7 @@ mod tests { #[test] fn test_varbinview_array_validation_success() { // Valid case: simple inline strings. - use crate::arrays::varbinview::BinaryView; + use crate::arrays::binary_view::BinaryView; // Create inline views (length <= 12). let view1 = BinaryView::new_inlined(b"foo"); @@ -171,7 +171,7 @@ mod tests { #[test] fn test_varbinview_array_validation_failure_buffer_index_out_of_bounds() { // Invalid case: view references non-existent buffer. - use crate::arrays::varbinview::BinaryView; + use crate::arrays::binary_view::BinaryView; // Create a view that references buffer 1, but we only have 1 buffer (index 0). let data = b"this is a long string that needs a buffer"; diff --git a/vortex-array/src/arrays/varbin/array.rs b/vortex-array/src/arrays/varbin/array.rs new file mode 100644 index 00000000000..8cd0aaa2def --- /dev/null +++ b/vortex-array/src/arrays/varbin/array.rs @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use num_traits::AsPrimitive; +use vortex_buffer::ByteBuffer; +use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype}; +use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_err}; + +use crate::arrays::varbin::builder::VarBinBuilder; +use crate::stats::ArrayStats; +use crate::validity::Validity; +use crate::{Array, ArrayRef, ToCanonical}; + +#[derive(Clone, Debug)] +pub struct VarBinArray { + pub(super) dtype: DType, + bytes: ByteBuffer, + offsets: ArrayRef, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, +} + +impl VarBinArray { + /// Creates a new 
[`VarBinArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`VarBinArray::new_unchecked`]. + pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self { + Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new") + } + + /// Constructs a new `VarBinArray`. + /// + /// See [`VarBinArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`VarBinArray::new_unchecked`]. + pub fn try_new( + offsets: ArrayRef, + bytes: ByteBuffer, + dtype: DType, + validity: Validity, + ) -> VortexResult { + Self::validate(&offsets, &bytes, &dtype, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) }) + } + + /// Creates a new [`VarBinArray`] without validation from these components: + /// + /// * `offsets` is an array of byte offsets into the `bytes` buffer. + /// * `bytes` is a buffer containing all the variable-length data concatenated. + /// * `dtype` specifies whether this contains UTF-8 strings or binary data. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// ## Offsets Requirements + /// + /// - `offsets` must be a non-nullable integer array. + /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]). + /// - All values in `offsets` must be monotonically non-decreasing. + /// - The first value in `offsets` must be 0. + /// - No offset value may exceed `bytes.len()`. + /// + /// ## Type Requirements + /// + /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`]. + /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8. 
+ /// - `dtype.is_nullable()` must match the nullability of `validity`. + /// + /// ## Validity Requirements + /// + /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`. + pub unsafe fn new_unchecked( + offsets: ArrayRef, + bytes: ByteBuffer, + dtype: DType, + validity: Validity, + ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&offsets, &bytes, &dtype, &validity) + .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters"); + + Self { + dtype, + bytes, + offsets, + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`VarBinArray`]. + /// + /// This function checks all the invariants required by [`VarBinArray::new_unchecked`]. + pub(crate) fn validate( + offsets: &dyn Array, + bytes: &ByteBuffer, + dtype: &DType, + validity: &Validity, + ) -> VortexResult<()> { + // Check offsets are non-nullable integer + vortex_ensure!( + offsets.dtype().is_int() && !offsets.dtype().is_nullable(), + MismatchedTypes: "non nullable int", offsets.dtype() + ); + + // Check dtype is Binary or Utf8 + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + MismatchedTypes: "utf8 or binary", dtype + ); + + // Check nullability matches + vortex_ensure!( + dtype.is_nullable() != (validity == &Validity::NonNullable), + "incorrect validity {:?} for dtype {}", + validity, + dtype + ); + + // Check offsets has at least one element + vortex_ensure!( + !offsets.is_empty(), + "Offsets must have at least one element" + ); + + // Check offsets are sorted + if let Some(is_sorted) = offsets.statistics().compute_is_sorted() { + vortex_ensure!(is_sorted, "offsets must be sorted"); + } + + let last_offset = offsets + .scalar_at(offsets.len() - 1) + .as_primitive() + .as_::() + .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?; + vortex_ensure!( + last_offset <= bytes.len(), + "Last offset {} exceeds bytes length {}", + last_offset, + 
bytes.len() + ); + + // Check validity length + if let Some(validity_len) = validity.maybe_len() { + vortex_ensure!( + validity_len == offsets.len() - 1, + "Validity length {} doesn't match array length {}", + validity_len, + offsets.len() - 1 + ); + } + + // Validate UTF-8 for Utf8 dtype + if matches!(dtype, DType::Utf8(_)) { + let primitive_offsets = offsets.to_primitive(); + match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| { + let offsets_slice = primitive_offsets.as_slice::(); + for (i, (start, end)) in offsets_slice + .windows(2) + .map(|o| (o[0].as_(), o[1].as_())) + .enumerate() + { + if validity.is_null(i) { + continue; + } + + let string_bytes = &bytes.as_ref()[start..end]; + simdutf8::basic::from_utf8(string_bytes).map_err(|_| { + #[allow(clippy::unwrap_used)] + // run validation using `compat` package to get more detailed error message + let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err(); + vortex_err!("invalid utf-8: {err} at index {i}") + })?; + } + }); + } + + Ok(()) + } + + #[inline] + pub fn offsets(&self) -> &ArrayRef { + &self.offsets + } + + /// Access the value bytes child buffer + /// + /// # Note + /// + /// Bytes child buffer is never sliced when the array is sliced so this can include values + /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes] + /// unless they're resolving values via the offset child array. + #[inline] + pub fn bytes(&self) -> &ByteBuffer { + &self.bytes + } + + /// Access value bytes child array limited to values that are logically present in + /// the array unlike [bytes][Self::bytes]. 
+ pub fn sliced_bytes(&self) -> ByteBuffer { + let first_offset: usize = self.offset_at(0); + let last_offset = self.offset_at(self.len()); + + self.bytes().slice(first_offset..last_offset) + } + + pub fn from_vec>(vec: Vec, dtype: DType) -> Self { + let size: usize = vec.iter().map(|v| v.as_ref().len()).sum(); + if size < u32::MAX as usize { + Self::from_vec_sized::(vec, dtype) + } else { + Self::from_vec_sized::(vec, dtype) + } + } + + fn from_vec_sized(vec: Vec, dtype: DType) -> Self + where + O: IntegerPType, + T: AsRef<[u8]>, + { + let mut builder = VarBinBuilder::::with_capacity(vec.len()); + for v in vec { + builder.append_value(v.as_ref()); + } + builder.finish(dtype) + } + + #[allow(clippy::same_name_method)] + pub fn from_iter, I: IntoIterator>>( + iter: I, + dtype: DType, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinBuilder::::with_capacity(iter.size_hint().0); + for v in iter { + builder.append(v.as_ref().map(|o| o.as_ref())); + } + builder.finish(dtype) + } + + pub fn from_iter_nonnull, I: IntoIterator>( + iter: I, + dtype: DType, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinBuilder::::with_capacity(iter.size_hint().0); + for v in iter { + builder.append_value(v); + } + builder.finish(dtype) + } + + /// Get value offset at a given index + /// + /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index + /// + /// Panics if index is out of bounds + pub fn offset_at(&self, index: usize) -> usize { + assert!( + index <= self.len(), + "Index {index} out of bounds 0..={}", + self.len() + ); + + self.offsets() + .scalar_at(index) + .as_ref() + .try_into() + .vortex_expect("Failed to convert offset to usize") + } + + /// Access value bytes at a given index + /// + /// Will return buffer referencing underlying data without performing a copy + pub fn bytes_at(&self, index: usize) -> ByteBuffer { + let start = self.offset_at(index); + let end = 
self.offset_at(index + 1); + + self.bytes().slice(start..end) + } + + /// Consumes self, returning a tuple containing the `DType`, the `bytes` array, + /// the `offsets` array, and the `validity`. + pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) { + (self.dtype, self.bytes, self.offsets, self.validity) + } +} + +impl From> for VarBinArray { + fn from(value: Vec<&[u8]>) -> Self { + Self::from_vec(value, DType::Binary(Nullability::NonNullable)) + } +} + +impl From>> for VarBinArray { + fn from(value: Vec>) -> Self { + Self::from_vec(value, DType::Binary(Nullability::NonNullable)) + } +} + +impl From> for VarBinArray { + fn from(value: Vec) -> Self { + Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) + } +} + +impl From> for VarBinArray { + fn from(value: Vec<&str>) -> Self { + Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) + } +} + +impl<'a> FromIterator> for VarBinArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter(iter, DType::Binary(Nullability::Nullable)) + } +} + +impl FromIterator>> for VarBinArray { + fn from_iter>>>(iter: T) -> Self { + Self::from_iter(iter, DType::Binary(Nullability::Nullable)) + } +} + +impl FromIterator> for VarBinArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter(iter, DType::Utf8(Nullability::Nullable)) + } +} + +impl<'a> FromIterator> for VarBinArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter(iter, DType::Utf8(Nullability::Nullable)) + } +} diff --git a/vortex-array/src/arrays/varbin/builder.rs b/vortex-array/src/arrays/varbin/builder.rs index 85b0fffa7f4..9a64a0c8552 100644 --- a/vortex-array/src/arrays/varbin/builder.rs +++ b/vortex-array/src/arrays/varbin/builder.rs @@ -110,7 +110,7 @@ impl VarBinBuilder { } #[cfg(test)] -mod test { +mod tests { use vortex_dtype::DType; use vortex_dtype::Nullability::Nullable; use vortex_scalar::Scalar; diff --git a/vortex-array/src/arrays/varbin/compute/min_max.rs b/vortex-array/src/arrays/varbin/compute/min_max.rs index 
ec27b765f2d..9ce469c9188 100644 --- a/vortex-array/src/arrays/varbin/compute/min_max.rs +++ b/vortex-array/src/arrays/varbin/compute/min_max.rs @@ -13,14 +13,14 @@ use crate::register_kernel; impl MinMaxKernel for VarBinVTable { fn min_max(&self, array: &VarBinArray) -> VortexResult> { - compute_min_max(array, array.dtype()) + varbin_compute_min_max(array, array.dtype()) } } register_kernel!(MinMaxKernelAdapter(VarBinVTable).lift()); /// Compute the min and max of VarBin like array. -pub(crate) fn compute_min_max>( +pub(crate) fn varbin_compute_min_max>( array: &T, dtype: &DType, ) -> VortexResult> { diff --git a/vortex-array/src/arrays/varbin/compute/mod.rs b/vortex-array/src/arrays/varbin/compute/mod.rs index be2c2fbf89d..a852db6ccc4 100644 --- a/vortex-array/src/arrays/varbin/compute/mod.rs +++ b/vortex-array/src/arrays/varbin/compute/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -pub(crate) use min_max::compute_min_max; +pub(crate) use min_max::varbin_compute_min_max; mod cast; mod compare; diff --git a/vortex-array/src/arrays/varbin/mod.rs b/vortex-array/src/arrays/varbin/mod.rs index 591c72d5d65..fc911834ca0 100644 --- a/vortex-array/src/arrays/varbin/mod.rs +++ b/vortex-array/src/arrays/varbin/mod.rs @@ -1,406 +1,24 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt::Debug; +mod array; +pub use array::VarBinArray; -pub(crate) use compute::compute_min_max; -use num_traits::AsPrimitive; -use vortex_buffer::ByteBuffer; -use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype}; -use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap as _, vortex_ensure, vortex_err}; -use vortex_scalar::Scalar; - -use crate::arrays::varbin::builder::VarBinBuilder; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, NotSupported, VTable, 
ValidityHelper, ValidityVTableFromValidityHelper, -}; -use crate::{Array, ArrayRef, EncodingId, EncodingRef, ToCanonical, vtable}; - -mod accessor; -pub mod builder; -mod canonical; mod compute; -mod operator; -mod ops; -mod serde; - -vtable!(VarBin); - -impl VTable for VarBinVTable { - type Array = VarBinArray; - type Encoding = VarBinEncoding; - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - EncodingId::new_ref("vortex.varbin") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(VarBinEncoding.as_ref()) - } -} - -#[derive(Clone, Debug)] -pub struct VarBinArray { - dtype: DType, - bytes: ByteBuffer, - offsets: ArrayRef, - validity: Validity, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct VarBinEncoding; - -impl VarBinArray { - /// Creates a new [`VarBinArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`VarBinArray::new_unchecked`]. - pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self { - Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new") - } - - /// Constructs a new `VarBinArray`. - /// - /// See [`VarBinArray::new_unchecked`] for more information. - /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`VarBinArray::new_unchecked`]. - pub fn try_new( - offsets: ArrayRef, - bytes: ByteBuffer, - dtype: DType, - validity: Validity, - ) -> VortexResult { - Self::validate(&offsets, &bytes, &dtype, &validity)?; - - // SAFETY: validate ensures all invariants are met. 
- Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) }) - } - - /// Creates a new [`VarBinArray`] without validation from these components: - /// - /// * `offsets` is an array of byte offsets into the `bytes` buffer. - /// * `bytes` is a buffer containing all the variable-length data concatenated. - /// * `dtype` specifies whether this contains UTF-8 strings or binary data. - /// * `validity` holds the null values. - /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// ## Offsets Requirements - /// - /// - `offsets` must be a non-nullable integer array. - /// - `offsets` must contain at least 1 element (for empty array, it contains \[0\]). - /// - All values in `offsets` must be monotonically non-decreasing. - /// - The first value in `offsets` must be 0. - /// - No offset value may exceed `bytes.len()`. - /// - /// ## Type Requirements - /// - /// - `dtype` must be exactly [`DType::Binary`] or [`DType::Utf8`]. - /// - If `dtype` is [`DType::Utf8`], every byte slice `bytes[offsets[i]..offsets[i+1]]` must be valid UTF-8. - /// - `dtype.is_nullable()` must match the nullability of `validity`. - /// - /// ## Validity Requirements - /// - /// - If `validity` is [`Validity::Array`], its length must exactly equal `offsets.len() - 1`. - pub unsafe fn new_unchecked( - offsets: ArrayRef, - bytes: ByteBuffer, - dtype: DType, - validity: Validity, - ) -> Self { - Self { - dtype, - bytes, - offsets, - validity, - stats_set: Default::default(), - } - } - - /// Validates the components that would be used to create a [`VarBinArray`]. - /// - /// This function checks all the invariants required by [`VarBinArray::new_unchecked`]. 
- pub(crate) fn validate( - offsets: &dyn Array, - bytes: &ByteBuffer, - dtype: &DType, - validity: &Validity, - ) -> VortexResult<()> { - // Check offsets are non-nullable integer - vortex_ensure!( - offsets.dtype().is_int() && !offsets.dtype().is_nullable(), - MismatchedTypes: "non nullable int", offsets.dtype() - ); - - // Check dtype is Binary or Utf8 - vortex_ensure!( - matches!(dtype, DType::Binary(_) | DType::Utf8(_)), - MismatchedTypes: "utf8 or binary", dtype - ); +pub(crate) use compute::varbin_compute_min_max; // For use in `varbinview`. - // Check nullability matches - vortex_ensure!( - dtype.is_nullable() != (validity == &Validity::NonNullable), - "incorrect validity {:?} for dtype {}", - validity, - dtype - ); - - // Check offsets has at least one element - vortex_ensure!( - !offsets.is_empty(), - "Offsets must have at least one element" - ); - - // Check offsets are sorted - if let Some(is_sorted) = offsets.statistics().compute_is_sorted() { - vortex_ensure!(is_sorted, "offsets must be sorted"); - } - - let last_offset = offsets - .scalar_at(offsets.len() - 1) - .as_primitive() - .as_::() - .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?; - vortex_ensure!( - last_offset <= bytes.len(), - "Last offset {} exceeds bytes length {}", - last_offset, - bytes.len() - ); - - // Check validity length - if let Some(validity_len) = validity.maybe_len() { - vortex_ensure!( - validity_len == offsets.len() - 1, - "Validity length {} doesn't match array length {}", - validity_len, - offsets.len() - 1 - ); - } - - // Validate UTF-8 for Utf8 dtype - if matches!(dtype, DType::Utf8(_)) { - let primitive_offsets = offsets.to_primitive(); - match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| { - let offsets_slice = primitive_offsets.as_slice::(); - for (i, (start, end)) in offsets_slice - .windows(2) - .map(|o| (o[0].as_(), o[1].as_())) - .enumerate() - { - if validity.is_null(i) { - continue; - } - - let string_bytes = 
&bytes.as_ref()[start..end]; - simdutf8::basic::from_utf8(string_bytes).map_err(|_| { - #[allow(clippy::unwrap_used)] - // run validation using `compat` package to get more detailed error message - let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err(); - vortex_err!("invalid utf-8: {err} at index {i}") - })?; - } - }); - } - - Ok(()) - } - - #[inline] - pub fn offsets(&self) -> &ArrayRef { - &self.offsets - } - - /// Access the value bytes child buffer - /// - /// # Note - /// - /// Bytes child buffer is never sliced when the array is sliced so this can include values - /// that are not logically present in the array. Users should prefer [sliced_bytes][Self::sliced_bytes] - /// unless they're resolving values via the offset child array. - #[inline] - pub fn bytes(&self) -> &ByteBuffer { - &self.bytes - } - - /// Access value bytes child array limited to values that are logically present in - /// the array unlike [bytes][Self::bytes]. - pub fn sliced_bytes(&self) -> ByteBuffer { - let first_offset: usize = self.offset_at(0); - let last_offset = self.offset_at(self.len()); - - self.bytes().slice(first_offset..last_offset) - } - - pub fn from_vec>(vec: Vec, dtype: DType) -> Self { - let size: usize = vec.iter().map(|v| v.as_ref().len()).sum(); - if size < u32::MAX as usize { - Self::from_vec_sized::(vec, dtype) - } else { - Self::from_vec_sized::(vec, dtype) - } - } - - fn from_vec_sized(vec: Vec, dtype: DType) -> Self - where - O: IntegerPType, - T: AsRef<[u8]>, - { - let mut builder = VarBinBuilder::::with_capacity(vec.len()); - for v in vec { - builder.append_value(v.as_ref()); - } - builder.finish(dtype) - } - - #[allow(clippy::same_name_method)] - pub fn from_iter, I: IntoIterator>>( - iter: I, - dtype: DType, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinBuilder::::with_capacity(iter.size_hint().0); - for v in iter { - builder.append(v.as_ref().map(|o| o.as_ref())); - } - builder.finish(dtype) - } - - pub fn from_iter_nonnull, 
I: IntoIterator>( - iter: I, - dtype: DType, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinBuilder::::with_capacity(iter.size_hint().0); - for v in iter { - builder.append_value(v); - } - builder.finish(dtype) - } - - /// Get value offset at a given index - /// - /// Note: There's 1 more offsets than the elements in the array, thus last offset is at array length index - /// - /// Panics if index is out of bounds - pub fn offset_at(&self, index: usize) -> usize { - assert!( - index <= self.len(), - "Index {index} out of bounds 0..={}", - self.len() - ); - - self.offsets() - .scalar_at(index) - .as_ref() - .try_into() - .vortex_expect("Failed to convert offset to usize") - } +mod vtable; +pub use vtable::{VarBinEncoding, VarBinVTable}; - /// Access value bytes at a given index - /// - /// Will return buffer referencing underlying data without performing a copy - pub fn bytes_at(&self, index: usize) -> ByteBuffer { - let start = self.offset_at(index); - let end = self.offset_at(index + 1); - - self.bytes().slice(start..end) - } - - /// Consumes self, returning a tuple containing the `DType`, the `bytes` array, - /// the `offsets` array, and the `validity`. 
- pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) { - (self.dtype, self.bytes, self.offsets, self.validity) - } -} - -impl ValidityHelper for VarBinArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl ArrayVTable for VarBinVTable { - fn len(array: &VarBinArray) -> usize { - array.offsets().len().saturating_sub(1) - } - - fn dtype(array: &VarBinArray) -> &DType { - &array.dtype - } - - fn stats(array: &VarBinArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl From> for VarBinArray { - fn from(value: Vec<&[u8]>) -> Self { - Self::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From>> for VarBinArray { - fn from(value: Vec>) -> Self { - Self::from_vec(value, DType::Binary(Nullability::NonNullable)) - } -} - -impl From> for VarBinArray { - fn from(value: Vec) -> Self { - Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - -impl From> for VarBinArray { - fn from(value: Vec<&str>) -> Self { - Self::from_vec(value, DType::Utf8(Nullability::NonNullable)) - } -} - -impl<'a> FromIterator> for VarBinArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Binary(Nullability::Nullable)) - } -} - -impl FromIterator>> for VarBinArray { - fn from_iter>>>(iter: T) -> Self { - Self::from_iter(iter, DType::Binary(Nullability::Nullable)) - } -} +pub mod builder; -impl FromIterator> for VarBinArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Utf8(Nullability::Nullable)) - } -} +mod accessor; +mod operator; -impl<'a> FromIterator> for VarBinArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter(iter, DType::Utf8(Nullability::Nullable)) - } -} +use vortex_buffer::ByteBuffer; +use vortex_dtype::DType; +use vortex_error::{VortexUnwrap, vortex_err}; +use vortex_scalar::Scalar; pub fn varbin_scalar(value: ByteBuffer, dtype: &DType) -> Scalar { if matches!(dtype, DType::Utf8(_)) { @@ -413,46 +31,4 @@ pub fn varbin_scalar(value: 
ByteBuffer, dtype: &DType) -> Scalar { } #[cfg(test)] -mod test { - use rstest::{fixture, rstest}; - use vortex_buffer::{Buffer, buffer}; - use vortex_dtype::{DType, Nullability}; - - use crate::arrays::varbin::VarBinArray; - use crate::validity::Validity; - use crate::{Array, ArrayRef, IntoArray}; - - #[fixture] - fn binary_array() -> ArrayRef { - let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes()); - let offsets = buffer![0, 11, 44].into_array(); - - VarBinArray::try_new( - offsets.into_array(), - values, - DType::Utf8(Nullability::NonNullable), - Validity::NonNullable, - ) - .unwrap() - .into_array() - } - - #[rstest] - pub fn test_scalar_at(binary_array: ArrayRef) { - assert_eq!(binary_array.len(), 2); - assert_eq!(binary_array.scalar_at(0), "hello world".into()); - assert_eq!( - binary_array.scalar_at(1), - "hello world this is a long string".into() - ) - } - - #[rstest] - pub fn slice_array(binary_array: ArrayRef) { - let binary_arr = binary_array.slice(1..2); - assert_eq!( - binary_arr.scalar_at(0), - "hello world this is a long string".into() - ); - } -} +mod tests; diff --git a/vortex-array/src/arrays/varbin/tests.rs b/vortex-array/src/arrays/varbin/tests.rs new file mode 100644 index 00000000000..23830e553f9 --- /dev/null +++ b/vortex-array/src/arrays/varbin/tests.rs @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use rstest::{fixture, rstest}; +use vortex_buffer::{Buffer, buffer}; +use vortex_dtype::{DType, Nullability}; + +use crate::arrays::varbin::VarBinArray; +use crate::validity::Validity; +use crate::{Array, ArrayRef, IntoArray}; + +#[fixture] +fn binary_array() -> ArrayRef { + let values = Buffer::copy_from("hello worldhello world this is a long string".as_bytes()); + let offsets = buffer![0, 11, 44].into_array(); + + VarBinArray::try_new( + offsets.into_array(), + values, + DType::Utf8(Nullability::NonNullable), + Validity::NonNullable, 
+ ) + .unwrap() + .into_array() +} + +#[rstest] +pub fn test_scalar_at(binary_array: ArrayRef) { + assert_eq!(binary_array.len(), 2); + assert_eq!(binary_array.scalar_at(0), "hello world".into()); + assert_eq!( + binary_array.scalar_at(1), + "hello world this is a long string".into() + ) +} + +#[rstest] +pub fn slice_array(binary_array: ArrayRef) { + let binary_arr = binary_array.slice(1..2); + assert_eq!( + binary_arr.scalar_at(0), + "hello world this is a long string".into() + ); +} diff --git a/vortex-array/src/arrays/varbin/vtable/array.rs b/vortex-array/src/arrays/varbin/vtable/array.rs new file mode 100644 index 00000000000..1dec8e73384 --- /dev/null +++ b/vortex-array/src/arrays/varbin/vtable/array.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_dtype::DType; + +use crate::arrays::varbin::{VarBinArray, VarBinVTable}; +use crate::stats::StatsSetRef; +use crate::vtable::ArrayVTable; + +impl ArrayVTable for VarBinVTable { + fn len(array: &VarBinArray) -> usize { + array.offsets().len().saturating_sub(1) + } + + fn dtype(array: &VarBinArray) -> &DType { + &array.dtype + } + + fn stats(array: &VarBinArray) -> StatsSetRef<'_> { + array.stats_set.to_ref(array.as_ref()) + } +} diff --git a/vortex-array/src/arrays/varbin/canonical.rs b/vortex-array/src/arrays/varbin/vtable/canonical.rs similarity index 99% rename from vortex-array/src/arrays/varbin/canonical.rs rename to vortex-array/src/arrays/varbin/vtable/canonical.rs index 5a4612a0fd3..3950f426692 100644 --- a/vortex-array/src/arrays/varbin/canonical.rs +++ b/vortex-array/src/arrays/varbin/vtable/canonical.rs @@ -52,7 +52,7 @@ impl CanonicalVTable for VarBinVTable { } #[cfg(test)] -mod test { +mod tests { use rstest::rstest; use vortex_dtype::{DType, Nullability}; diff --git a/vortex-array/src/arrays/varbin/vtable/mod.rs b/vortex-array/src/arrays/varbin/vtable/mod.rs new file mode 100644 index 00000000000..15295c495af --- 
/dev/null +++ b/vortex-array/src/arrays/varbin/vtable/mod.rs @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::varbin::VarBinArray; +use crate::vtable::{NotSupported, VTable, ValidityVTableFromValidityHelper}; +use crate::{EncodingId, EncodingRef, vtable}; + +mod array; +mod canonical; +mod operations; +mod serde; +mod validity; +mod visitor; + +vtable!(VarBin); + +impl VTable for VarBinVTable { + type Array = VarBinArray; + type Encoding = VarBinEncoding; + type ArrayVTable = Self; + type CanonicalVTable = Self; + type OperationsVTable = Self; + type ValidityVTable = ValidityVTableFromValidityHelper; + type VisitorVTable = Self; + type ComputeVTable = NotSupported; + type EncodeVTable = NotSupported; + type PipelineVTable = NotSupported; + type SerdeVTable = Self; + + fn id(_encoding: &Self::Encoding) -> EncodingId { + EncodingId::new_ref("vortex.varbin") + } + + fn encoding(_array: &Self::Array) -> EncodingRef { + EncodingRef::new_ref(VarBinEncoding.as_ref()) + } +} + +#[derive(Clone, Debug)] +pub struct VarBinEncoding; diff --git a/vortex-array/src/arrays/varbin/ops.rs b/vortex-array/src/arrays/varbin/vtable/operations.rs similarity index 100% rename from vortex-array/src/arrays/varbin/ops.rs rename to vortex-array/src/arrays/varbin/vtable/operations.rs diff --git a/vortex-array/src/arrays/varbin/serde.rs b/vortex-array/src/arrays/varbin/vtable/serde.rs similarity index 76% rename from vortex-array/src/arrays/varbin/serde.rs rename to vortex-array/src/arrays/varbin/vtable/serde.rs index 4840c6451d4..854071f5a84 100644 --- a/vortex-array/src/arrays/varbin/serde.rs +++ b/vortex-array/src/arrays/varbin/vtable/serde.rs @@ -9,8 +9,8 @@ use super::VarBinEncoding; use crate::arrays::{VarBinArray, VarBinVTable}; use crate::serde::ArrayChildren; use crate::validity::Validity; -use crate::vtable::{SerdeVTable, ValidityHelper, VisitorVTable}; -use crate::{Array, ArrayBufferVisitor, 
ArrayChildVisitor, ProstMetadata}; +use crate::vtable::SerdeVTable; +use crate::{Array, ProstMetadata}; #[derive(Clone, prost::Message)] pub struct VarBinMetadata { @@ -59,14 +59,3 @@ impl SerdeVTable for VarBinVTable { VarBinArray::try_new(offsets, bytes, dtype.clone(), validity) } } - -impl VisitorVTable for VarBinVTable { - fn visit_buffers(array: &VarBinArray, visitor: &mut dyn ArrayBufferVisitor) { - visitor.visit_buffer(array.bytes()); // TODO(ngates): sliced bytes? - } - - fn visit_children(array: &VarBinArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_child("offsets", array.offsets()); - visitor.visit_validity(array.validity(), array.len()); - } -} diff --git a/vortex-array/src/arrays/varbin/vtable/validity.rs b/vortex-array/src/arrays/varbin/vtable/validity.rs new file mode 100644 index 00000000000..cdcdf172d49 --- /dev/null +++ b/vortex-array/src/arrays/varbin/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::varbin::VarBinArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for VarBinArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/varbin/vtable/visitor.rs b/vortex-array/src/arrays/varbin/vtable/visitor.rs new file mode 100644 index 00000000000..9b8859eb99a --- /dev/null +++ b/vortex-array/src/arrays/varbin/vtable/visitor.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::{VarBinArray, VarBinVTable}; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for VarBinVTable { + fn visit_buffers(array: &VarBinArray, visitor: &mut dyn ArrayBufferVisitor) { + visitor.visit_buffer(array.bytes()); // TODO(ngates): sliced bytes? 
+ } + + fn visit_children(array: &VarBinArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_child("offsets", array.offsets()); + visitor.visit_validity(array.validity(), array.len()); + } +} diff --git a/vortex-array/src/arrays/varbinview/accessor.rs b/vortex-array/src/arrays/varbinview/accessor.rs index cf7aa33f334..3231eb7b37f 100644 --- a/vortex-array/src/arrays/varbinview/accessor.rs +++ b/vortex-array/src/arrays/varbinview/accessor.rs @@ -30,7 +30,7 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { } else { Some( &bytes[view.as_view().buffer_index() as usize] - [view.as_view().to_range()], + [view.as_view().as_range()], ) } }); @@ -49,7 +49,7 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { } else { Some( &bytes[view.as_view().buffer_index() as usize] - [view.as_view().to_range()], + [view.as_view().as_range()], ) } } else { diff --git a/vortex-array/src/arrays/varbinview/array.rs b/vortex-array/src/arrays/varbinview/array.rs new file mode 100644 index 00000000000..de83751f15f --- /dev/null +++ b/vortex-array/src/arrays/varbinview/array.rs @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_buffer::{Buffer, ByteBuffer}; +use vortex_dtype::{DType, Nullability}; +use vortex_error::{ + VortexExpect, VortexResult, vortex_bail, vortex_ensure, vortex_err, vortex_panic, +}; + +use crate::arrays::binary_view::BinaryView; +use crate::builders::{ArrayBuilder, VarBinViewBuilder}; +use crate::stats::ArrayStats; +use crate::validity::Validity; + +/// A variable-length binary view array that stores strings and binary data efficiently. +/// +/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides +/// an optimized representation for variable-length data with excellent performance +/// characteristics for both short and long strings. 
+/// +/// ## Data Layout +/// +/// The array uses a hybrid storage approach with two main components: +/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element) +/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes +/// +/// ## View Structure +/// +/// Commonly referred to as "German Strings", each 16-byte view entry contains either: +/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view +/// - **Reference data**: For strings > 12 bytes, contains: +/// - String length (4 bytes) +/// - First 4 bytes of string as prefix (4 bytes) +/// - Buffer index and offset (8 bytes total) +/// +/// The following ASCII graphic is reproduced verbatim from the Arrow documentation: +/// +/// ```text +/// ┌──────┬────────────────────────┐ +/// │length│ string value │ +/// Strings (len <= 12) │ │ (padded with 0) │ +/// └──────┴────────────────────────┘ +/// 0 31 127 +/// +/// ┌───────┬───────┬───────┬───────┐ +/// │length │prefix │ buf │offset │ +/// Strings (len > 12) │ │ │ index │ │ +/// └───────┴───────┴───────┴───────┘ +/// 0 31 63 95 127 +/// ``` +/// +/// # Examples +/// +/// ``` +/// use vortex_array::arrays::VarBinViewArray; +/// use vortex_dtype::{DType, Nullability}; +/// use vortex_array::IntoArray; +/// +/// // Create from an Iterator +/// let array = VarBinViewArray::from_iter_str([ +/// "inlined", +/// "this string is outlined" +/// ]); +/// +/// assert_eq!(array.len(), 2); +/// +/// // Access individual strings +/// let first = array.bytes_at(0); +/// assert_eq!(first.as_slice(), b"inlined"); // "short" +/// +/// let second = array.bytes_at(1); +/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string +/// ``` +#[derive(Clone, Debug)] +pub struct VarBinViewArray { + pub(super) dtype: DType, + pub(super) buffers: Arc<[ByteBuffer]>, + pub(super) views: Buffer, + pub(super) validity: Validity, + pub(super) stats_set: ArrayStats, +} + +impl VarBinViewArray 
{ + /// Creates a new [`VarBinViewArray`]. + /// + /// # Panics + /// + /// Panics if the provided components do not satisfy the invariants documented + /// in [`VarBinViewArray::new_unchecked`]. + pub fn new( + views: Buffer, + buffers: Arc<[ByteBuffer]>, + dtype: DType, + validity: Validity, + ) -> Self { + Self::try_new(views, buffers, dtype, validity) + .vortex_expect("VarBinViewArray construction failed") + } + + /// Constructs a new `VarBinViewArray`. + /// + /// See [`VarBinViewArray::new_unchecked`] for more information. + /// + /// # Errors + /// + /// Returns an error if the provided components do not satisfy the invariants documented in + /// [`VarBinViewArray::new_unchecked`]. + pub fn try_new( + views: Buffer, + buffers: Arc<[ByteBuffer]>, + dtype: DType, + validity: Validity, + ) -> VortexResult { + Self::validate(&views, &buffers, &dtype, &validity)?; + + // SAFETY: validate ensures all invariants are met. + Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) }) + } + + /// Creates a new [`VarBinViewArray`] without validation from these components: + /// + /// * `views` is a buffer of 16-byte view entries (one per logical element). + /// * `buffers` contains the backing storage for strings longer than 12 bytes. + /// * `dtype` specifies whether this contains UTF-8 strings or binary data. + /// * `validity` holds the null values. + /// + /// # Safety + /// + /// The caller must ensure all of the following invariants are satisfied: + /// + /// ## View Requirements + /// + /// - Views must be properly formatted 16-byte [`BinaryView`] entries. + /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes. + /// - Reference views (length > 12) must: + /// - Have a valid buffer index < `buffers.len()`. + /// - Have valid offsets that don't exceed the referenced buffer's bounds. + /// - Have a 4-byte prefix that matches the actual data at the referenced location. 
+ /// + /// ## Type Requirements + /// + /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`]. + /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8. + /// + /// ## Validity Requirements + /// + /// - The validity must have the same nullability as the dtype. + /// - If validity is an array, its length must match `views.len()`. + pub unsafe fn new_unchecked( + views: Buffer, + buffers: Arc<[ByteBuffer]>, + dtype: DType, + validity: Validity, + ) -> Self { + #[cfg(debug_assertions)] + Self::validate(&views, &buffers, &dtype, &validity) + .vortex_expect("[Debug Assertion]: Invalid `VarBinViewArray` parameters"); + + Self { + dtype, + buffers, + views, + validity, + stats_set: Default::default(), + } + } + + /// Validates the components that would be used to create a [`VarBinViewArray`]. + /// + /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`]. + pub(crate) fn validate( + views: &Buffer, + buffers: &Arc<[ByteBuffer]>, + dtype: &DType, + validity: &Validity, + ) -> VortexResult<()> { + vortex_ensure!( + validity.nullability() == dtype.nullability(), + "validity {:?} incompatible with nullability {:?}", + validity, + dtype.nullability() + ); + + match dtype { + DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| { + simdutf8::basic::from_utf8(string).is_ok() + })?, + DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?, + _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"), + } + + Ok(()) + } + + fn validate_views( + views: &Buffer, + buffers: &Arc<[ByteBuffer]>, + validity: &Validity, + validator: F, + ) -> VortexResult<()> + where + F: Fn(&[u8]) -> bool, + { + for (idx, &view) in views.iter().enumerate() { + if validity.is_null(idx) { + continue; + } + + if view.is_inlined() { + // Validate the inline bytestring + let bytes = &unsafe { view.inlined }.data[..view.len() as usize]; + vortex_ensure!( + validator(bytes), 
+ "view at index {idx}: inlined bytes failed utf-8 validation" + ); + } else { + // Validate the view pointer + let view = view.as_view(); + let buf_index = view.buffer_index as usize; + let start_offset = view.offset as usize; + let end_offset = start_offset.saturating_add(view.size as usize); + + let buf = buffers.get(buf_index).ok_or_else(|| + vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers", + buffers.len()))?; + + vortex_ensure!( + start_offset < buf.len(), + "start offset {start_offset} out of bounds for buffer {buf_index} with size {}", + buf.len(), + ); + + vortex_ensure!( + end_offset <= buf.len(), + "end offset {end_offset} out of bounds for buffer {buf_index} with size {}", + buf.len(), + ); + + // Make sure the prefix data matches the buffer data. + let bytes = &buf[start_offset..end_offset]; + vortex_ensure!( + view.prefix == bytes[..4], + "VarBinView prefix does not match full string" + ); + + // Validate the full string + vortex_ensure!( + validator(bytes), + "view at index {idx}: outlined bytes fails utf-8 validation" + ); + } + } + + Ok(()) + } + + /// Number of raw string data buffers held by this array. + pub fn nbuffers(&self) -> usize { + self.buffers.len() + } + + /// Access to the primitive views buffer. + /// + /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that + /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of + /// the string (if the string has 12 bytes or fewer). + #[inline] + pub fn views(&self) -> &Buffer { + &self.views + } + + /// Access value bytes at a given index + /// + /// Will return a `ByteBuffer` containing the data without performing a copy. + #[inline] + pub fn bytes_at(&self, index: usize) -> ByteBuffer { + let views = self.views(); + let view = &views[index]; + // Expect this to be the common case: strings > 12 bytes. 
+ if !view.is_inlined() { + let view_ref = view.as_view(); + self.buffer(view_ref.buffer_index() as usize) + .slice(view_ref.as_range()) + } else { + // Return access to the range of bytes around it. + views + .clone() + .into_byte_buffer() + .slice_ref(view.as_inlined().value()) + } + } + + /// Access one of the backing data buffers. + /// + /// # Panics + /// + /// This method panics if the provided index is out of bounds for the set of buffers provided + /// at construction time. + #[inline] + pub fn buffer(&self, idx: usize) -> &ByteBuffer { + if idx >= self.nbuffers() { + vortex_panic!( + "{idx} buffer index out of bounds, there are {} buffers", + self.nbuffers() + ); + } + &self.buffers[idx] + } + + /// Iterate over the underlying raw data buffers, not including the views buffer. + #[inline] + pub fn buffers(&self) -> &Arc<[ByteBuffer]> { + &self.buffers + } + + /// Accumulate an iterable set of values into our type here. + #[allow(clippy::same_name_method)] + pub fn from_iter, I: IntoIterator>>( + iter: I, + dtype: DType, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0); + + for item in iter { + match item { + None => builder.append_null(), + Some(v) => builder.append_value(v), + } + } + + builder.finish_into_varbinview() + } + + pub fn from_iter_str, I: IntoIterator>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinViewBuilder::with_capacity( + DType::Utf8(Nullability::NonNullable), + iter.size_hint().0, + ); + + for item in iter { + builder.append_value(item.as_ref()); + } + + builder.finish_into_varbinview() + } + + pub fn from_iter_nullable_str, I: IntoIterator>>( + iter: I, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinViewBuilder::with_capacity( + DType::Utf8(Nullability::Nullable), + iter.size_hint().0, + ); + + for item in iter { + match item { + None => builder.append_null(), + Some(v) => builder.append_value(v.as_ref()), + 
} + } + + builder.finish_into_varbinview() + } + + pub fn from_iter_bin, I: IntoIterator>(iter: I) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinViewBuilder::with_capacity( + DType::Binary(Nullability::NonNullable), + iter.size_hint().0, + ); + + for item in iter { + builder.append_value(item.as_ref()); + } + + builder.finish_into_varbinview() + } + + pub fn from_iter_nullable_bin, I: IntoIterator>>( + iter: I, + ) -> Self { + let iter = iter.into_iter(); + let mut builder = VarBinViewBuilder::with_capacity( + DType::Binary(Nullability::Nullable), + iter.size_hint().0, + ); + + for item in iter { + match item { + None => builder.append_null(), + Some(v) => builder.append_value(v.as_ref()), + } + } + + builder.finish_into_varbinview() + } +} + +impl<'a> FromIterator> for VarBinViewArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter_nullable_bin(iter) + } +} + +impl FromIterator>> for VarBinViewArray { + fn from_iter>>>(iter: T) -> Self { + Self::from_iter_nullable_bin(iter) + } +} + +impl FromIterator> for VarBinViewArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter_nullable_str(iter) + } +} + +impl<'a> FromIterator> for VarBinViewArray { + fn from_iter>>(iter: T) -> Self { + Self::from_iter_nullable_str(iter) + } +} diff --git a/vortex-array/src/arrays/varbinview/binary_view.rs b/vortex-array/src/arrays/varbinview/binary_view.rs new file mode 100644 index 00000000000..d4a98594eab --- /dev/null +++ b/vortex-array/src/arrays/varbinview/binary_view.rs @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::ops::Range; + +use static_assertions::{assert_eq_align, assert_eq_size}; +use vortex_error::VortexUnwrap; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(C, align(8))] +pub struct Inlined { + pub(super) size: u32, + pub(super) data: [u8; BinaryView::MAX_INLINED_SIZE], +} + +impl Inlined { + fn 
new(value: &[u8]) -> Self { + let mut inlined = Self { + size: N.try_into().vortex_unwrap(), + data: [0u8; BinaryView::MAX_INLINED_SIZE], + }; + inlined.data[..N].copy_from_slice(&value[..N]); + inlined + } + + #[inline] + pub fn value(&self) -> &[u8] { + &self.data[0..(self.size as usize)] + } +} + +#[derive(Clone, Copy, Debug)] +#[repr(C, align(8))] +pub struct Ref { + pub(super) size: u32, + pub(super) prefix: [u8; 4], + pub(super) buffer_index: u32, + pub(super) offset: u32, +} + +impl Ref { + pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self { + Self { + size, + prefix, + buffer_index, + offset, + } + } + + #[inline] + pub fn buffer_index(&self) -> u32 { + self.buffer_index + } + + #[inline] + pub fn offset(&self) -> u32 { + self.offset + } + + #[inline] + pub fn prefix(&self) -> &[u8; 4] { + &self.prefix + } + + #[inline] + pub fn as_range(&self) -> Range { + self.offset as usize..(self.offset + self.size) as usize + } +} + +#[derive(Clone, Copy)] +#[repr(C, align(16))] +pub union BinaryView { + // Numeric representation. This is logically `u128`, but we split it into the high and low + // bits to preserve the alignment. + pub(super) le_bytes: [u8; 16], + + // Inlined representation: strings <= 12 bytes + pub(super) inlined: Inlined, + + // Reference type: strings > 12 bytes. + pub(super) _ref: Ref, +} + +assert_eq_size!(BinaryView, [u8; 16]); +assert_eq_size!(Inlined, [u8; 16]); +assert_eq_size!(Ref, [u8; 16]); +assert_eq_align!(BinaryView, u128); + +impl Hash for BinaryView { + fn hash(&self, state: &mut H) { + unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state); + } +} + +impl Default for BinaryView { + fn default() -> Self { + Self::make_view(&[], 0, 0) + } +} + +impl BinaryView { + pub const MAX_INLINED_SIZE: usize = 12; + + /// Create a view from a value, block and offset + /// + /// Depending on the length of the provided value either a new inlined + /// or a reference view will be constructed. 
+ /// + /// Adapted from arrow-rs + /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores + #[inline(never)] + pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self { + match value.len() { + 0 => Self { + inlined: Inlined::new::<0>(value), + }, + 1 => Self { + inlined: Inlined::new::<1>(value), + }, + 2 => Self { + inlined: Inlined::new::<2>(value), + }, + 3 => Self { + inlined: Inlined::new::<3>(value), + }, + 4 => Self { + inlined: Inlined::new::<4>(value), + }, + 5 => Self { + inlined: Inlined::new::<5>(value), + }, + 6 => Self { + inlined: Inlined::new::<6>(value), + }, + 7 => Self { + inlined: Inlined::new::<7>(value), + }, + 8 => Self { + inlined: Inlined::new::<8>(value), + }, + 9 => Self { + inlined: Inlined::new::<9>(value), + }, + 10 => Self { + inlined: Inlined::new::<10>(value), + }, + 11 => Self { + inlined: Inlined::new::<11>(value), + }, + 12 => Self { + inlined: Inlined::new::<12>(value), + }, + _ => Self { + _ref: Ref::new( + u32::try_from(value.len()).vortex_unwrap(), + value[0..4].try_into().vortex_unwrap(), + block, + offset, + ), + }, + } + } + + /// Create a new empty view + #[inline] + pub fn empty_view() -> Self { + Self::new_inlined(&[]) + } + + /// Create a new inlined binary view + #[inline] + pub fn new_inlined(value: &[u8]) -> Self { + assert!( + value.len() <= Self::MAX_INLINED_SIZE, + "expected inlined value to be <= 12 bytes, was {}", + value.len() + ); + + Self::make_view(value, 0, 0) + } + + #[inline] + pub fn len(&self) -> u32 { + unsafe { self.inlined.size } + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + #[allow(clippy::cast_possible_truncation)] + pub fn is_inlined(&self) -> bool { + self.len() <= (Self::MAX_INLINED_SIZE as u32) + } + + pub fn as_inlined(&self) -> &Inlined { + unsafe { &self.inlined } + } + + pub fn as_view(&self) -> &Ref { + unsafe { &self._ref } + } + + pub fn 
as_u128(&self) -> u128 { + // SAFETY: binary view always safe to read as u128 LE bytes + unsafe { u128::from_le_bytes(self.le_bytes) } + } + + /// Override the buffer reference with the given buffer_idx, only if this view is not inlined. + #[inline(always)] + pub fn with_buffer_idx(self, buffer_idx: u32) -> Self { + if self.is_inlined() { + self + } else { + // Referencing views must have their buffer_index adjusted with new offsets + let view_ref = self.as_view(); + Self { + _ref: Ref::new( + self.len(), + *view_ref.prefix(), + buffer_idx, + view_ref.offset(), + ), + } + } + } + + /// Shifts the buffer reference by the view by a given offset, useful when merging many + /// varbinview arrays into one. + #[inline(always)] + pub fn offset_view(self, offset: u32) -> Self { + if self.is_inlined() { + self + } else { + // Referencing views must have their buffer_index adjusted with new offsets + let view_ref = self.as_view(); + Self { + _ref: Ref::new( + self.len(), + *view_ref.prefix(), + offset + view_ref.buffer_index(), + view_ref.offset(), + ), + } + } + } +} + +impl From for BinaryView { + fn from(value: u128) -> Self { + BinaryView { + le_bytes: value.to_le_bytes(), + } + } +} + +impl fmt::Debug for BinaryView { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut s = f.debug_struct("BinaryView"); + if self.is_inlined() { + s.field("inline", &self.as_inlined()); + } else { + s.field("ref", &self.as_view()); + } + s.finish() + } +} diff --git a/vortex-array/src/arrays/varbinview/compute/is_constant.rs b/vortex-array/src/arrays/varbinview/compute/is_constant.rs index 48a05b609ef..0c5a511b6bd 100644 --- a/vortex-array/src/arrays/varbinview/compute/is_constant.rs +++ b/vortex-array/src/arrays/varbinview/compute/is_constant.rs @@ -3,7 +3,8 @@ use vortex_error::{VortexExpect, VortexResult}; -use crate::arrays::{Ref, VarBinViewArray, VarBinViewVTable}; +use crate::arrays::varbinview::binary_view::Ref; +use crate::arrays::{VarBinViewArray, 
// Min/max kernel for `VarBinViewArray`.
impl MinMaxKernel for VarBinViewVTable {
    /// Computes the min/max of `array` by delegating to `varbin_compute_min_max`, passing the
    /// array together with its own dtype. The result (including the `None` case) is whatever
    /// that helper returns.
    fn min_max(&self, array: &VarBinViewArray) -> VortexResult<Option<MinMaxResult>> {
        varbin_compute_min_max(array, array.dtype())
    }
}
a/vortex-array/src/arrays/varbinview/mod.rs b/vortex-array/src/arrays/varbinview/mod.rs index 3640c18e59c..3f2e5cd5b61 100644 --- a/vortex-array/src/arrays/varbinview/mod.rs +++ b/vortex-array/src/arrays/varbinview/mod.rs @@ -1,803 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt::{Debug, Formatter}; -use std::hash::{Hash, Hasher}; -use std::ops::Range; -use std::sync::Arc; - -use static_assertions::{assert_eq_align, assert_eq_size}; -use vortex_buffer::{Buffer, ByteBuffer}; -use vortex_dtype::{DType, Nullability}; -use vortex_error::{ - VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_ensure, vortex_err, vortex_panic, -}; - -use crate::builders::{ArrayBuilder, VarBinViewBuilder}; -use crate::stats::{ArrayStats, StatsSetRef}; -use crate::validity::Validity; -use crate::vtable::{ - ArrayVTable, CanonicalVTable, NotSupported, VTable, ValidityHelper, - ValidityVTableFromValidityHelper, -}; -use crate::{Canonical, EncodingId, EncodingRef, vtable}; +mod array; +pub use array::VarBinViewArray; mod accessor; mod compact; -mod compute; -mod ops; -mod serde; - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -#[repr(C, align(8))] -pub struct Inlined { - size: u32, - data: [u8; BinaryView::MAX_INLINED_SIZE], -} - -impl Inlined { - fn new(value: &[u8]) -> Self { - let mut inlined = Self { - size: N.try_into().vortex_unwrap(), - data: [0u8; BinaryView::MAX_INLINED_SIZE], - }; - inlined.data[..N].copy_from_slice(&value[..N]); - inlined - } - - #[inline] - pub fn value(&self) -> &[u8] { - &self.data[0..(self.size as usize)] - } -} - -#[derive(Clone, Copy, Debug)] -#[repr(C, align(8))] -pub struct Ref { - size: u32, - prefix: [u8; 4], - buffer_index: u32, - offset: u32, -} - -impl Ref { - pub fn new(size: u32, prefix: [u8; 4], buffer_index: u32, offset: u32) -> Self { - Self { - size, - prefix, - buffer_index, - offset, - } - } - - #[inline] - pub fn buffer_index(&self) -> u32 { - 
self.buffer_index - } - - #[inline] - pub fn offset(&self) -> u32 { - self.offset - } - - #[inline] - pub fn prefix(&self) -> &[u8; 4] { - &self.prefix - } - - #[inline] - pub fn to_range(&self) -> Range { - self.offset as usize..(self.offset + self.size) as usize - } -} - -#[derive(Clone, Copy)] -#[repr(C, align(16))] -pub union BinaryView { - // Numeric representation. This is logically `u128`, but we split it into the high and low - // bits to preserve the alignment. - le_bytes: [u8; 16], - - // Inlined representation: strings <= 12 bytes - inlined: Inlined, - - // Reference type: strings > 12 bytes. - _ref: Ref, -} - -assert_eq_size!(BinaryView, [u8; 16]); -assert_eq_size!(Inlined, [u8; 16]); -assert_eq_size!(Ref, [u8; 16]); -assert_eq_align!(BinaryView, u128); - -impl Hash for BinaryView { - fn hash(&self, state: &mut H) { - unsafe { std::mem::transmute::<&BinaryView, &[u8; 16]>(self) }.hash(state); - } -} - -impl Default for BinaryView { - fn default() -> Self { - Self::make_view(&[], 0, 0) - } -} - -impl BinaryView { - pub const MAX_INLINED_SIZE: usize = 12; - - /// Create a view from a value, block and offset - /// - /// Depending on the length of the provided value either a new inlined - /// or a reference view will be constructed. 
- /// - /// Adapted from arrow-rs - /// Explicitly enumerating inlined view produces code that avoids calling generic `ptr::copy_non_interleave` that's slower than explicit stores - #[inline(never)] - pub fn make_view(value: &[u8], block: u32, offset: u32) -> Self { - match value.len() { - 0 => Self { - inlined: Inlined::new::<0>(value), - }, - 1 => Self { - inlined: Inlined::new::<1>(value), - }, - 2 => Self { - inlined: Inlined::new::<2>(value), - }, - 3 => Self { - inlined: Inlined::new::<3>(value), - }, - 4 => Self { - inlined: Inlined::new::<4>(value), - }, - 5 => Self { - inlined: Inlined::new::<5>(value), - }, - 6 => Self { - inlined: Inlined::new::<6>(value), - }, - 7 => Self { - inlined: Inlined::new::<7>(value), - }, - 8 => Self { - inlined: Inlined::new::<8>(value), - }, - 9 => Self { - inlined: Inlined::new::<9>(value), - }, - 10 => Self { - inlined: Inlined::new::<10>(value), - }, - 11 => Self { - inlined: Inlined::new::<11>(value), - }, - 12 => Self { - inlined: Inlined::new::<12>(value), - }, - _ => Self { - _ref: Ref::new( - u32::try_from(value.len()).vortex_unwrap(), - value[0..4].try_into().vortex_unwrap(), - block, - offset, - ), - }, - } - } - - /// Create a new empty view - #[inline] - pub fn empty_view() -> Self { - Self::new_inlined(&[]) - } - - /// Create a new inlined binary view - #[inline] - pub fn new_inlined(value: &[u8]) -> Self { - assert!( - value.len() <= Self::MAX_INLINED_SIZE, - "expected inlined value to be <= 12 bytes, was {}", - value.len() - ); - - Self::make_view(value, 0, 0) - } - - #[inline] - pub fn len(&self) -> u32 { - unsafe { self.inlined.size } - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.len() > 0 - } - - #[inline] - #[allow(clippy::cast_possible_truncation)] - pub fn is_inlined(&self) -> bool { - self.len() <= (Self::MAX_INLINED_SIZE as u32) - } - - pub fn as_inlined(&self) -> &Inlined { - unsafe { &self.inlined } - } - - pub fn as_view(&self) -> &Ref { - unsafe { &self._ref } - } - - pub fn 
as_u128(&self) -> u128 { - // SAFETY: binary view always safe to read as u128 LE bytes - unsafe { u128::from_le_bytes(self.le_bytes) } - } - - /// Override the buffer reference with the given buffer_idx, only if this view is not inlined. - #[inline(always)] - pub fn with_buffer_idx(self, buffer_idx: u32) -> Self { - if self.is_inlined() { - self - } else { - // Referencing views must have their buffer_index adjusted with new offsets - let view_ref = self.as_view(); - Self { - _ref: Ref::new( - self.len(), - *view_ref.prefix(), - buffer_idx, - view_ref.offset(), - ), - } - } - } - - /// Shifts the buffer reference by the view by a given offset, useful when merging many - /// varbinview arrays into one. - #[inline(always)] - pub fn offset_view(self, offset: u32) -> Self { - if self.is_inlined() { - self - } else { - // Referencing views must have their buffer_index adjusted with new offsets - let view_ref = self.as_view(); - Self { - _ref: Ref::new( - self.len(), - *view_ref.prefix(), - offset + view_ref.buffer_index(), - view_ref.offset(), - ), - } - } - } -} - -impl From for BinaryView { - fn from(value: u128) -> Self { - BinaryView { - le_bytes: value.to_le_bytes(), - } - } -} - -impl Debug for BinaryView { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let mut s = f.debug_struct("BinaryView"); - if self.is_inlined() { - s.field("inline", &self.as_inlined()); - } else { - s.field("ref", &self.as_view()); - } - s.finish() - } -} - -vtable!(VarBinView); - -impl VTable for VarBinViewVTable { - type Array = VarBinViewArray; - type Encoding = VarBinViewEncoding; - - type ArrayVTable = Self; - type CanonicalVTable = Self; - type OperationsVTable = Self; - type ValidityVTable = ValidityVTableFromValidityHelper; - type VisitorVTable = Self; - type ComputeVTable = NotSupported; - type EncodeVTable = NotSupported; - type PipelineVTable = NotSupported; - type SerdeVTable = Self; - - fn id(_encoding: &Self::Encoding) -> EncodingId { - 
EncodingId::new_ref("vortex.varbinview") - } - - fn encoding(_array: &Self::Array) -> EncodingRef { - EncodingRef::new_ref(VarBinViewEncoding.as_ref()) - } -} - -/// A variable-length binary view array that stores strings and binary data efficiently. -/// -/// This mirrors the Apache Arrow StringView/BinaryView array encoding and provides -/// an optimized representation for variable-length data with excellent performance -/// characteristics for both short and long strings. -/// -/// ## Data Layout -/// -/// The array uses a hybrid storage approach with two main components: -/// - **Views buffer**: Array of 16-byte `BinaryView` entries (one per logical element) -/// - **Data buffers**: Shared backing storage for strings longer than 12 bytes -/// -/// ## View Structure -/// -/// Commonly referred to as "German Strings", each 16-byte view entry contains either: -/// - **Inlined data**: For strings ≤ 12 bytes, the entire string is stored directly in the view -/// - **Reference data**: For strings > 12 bytes, contains: -/// - String length (4 bytes) -/// - First 4 bytes of string as prefix (4 bytes) -/// - Buffer index and offset (8 bytes total) -/// -/// The following ASCII graphic is reproduced verbatim from the Arrow documentation: -/// -/// ```text -/// ┌──────┬────────────────────────┐ -/// │length│ string value │ -/// Strings (len <= 12) │ │ (padded with 0) │ -/// └──────┴────────────────────────┘ -/// 0 31 127 -/// -/// ┌───────┬───────┬───────┬───────┐ -/// │length │prefix │ buf │offset │ -/// Strings (len > 12) │ │ │ index │ │ -/// └───────┴───────┴───────┴───────┘ -/// 0 31 63 95 127 -/// ``` -/// -/// # Examples -/// -/// ``` -/// use vortex_array::arrays::VarBinViewArray; -/// use vortex_dtype::{DType, Nullability}; -/// use vortex_array::IntoArray; -/// -/// // Create from an Iterator -/// let array = VarBinViewArray::from_iter_str([ -/// "inlined", -/// "this string is outlined" -/// ]); -/// -/// assert_eq!(array.len(), 2); -/// -/// // Access 
individual strings -/// let first = array.bytes_at(0); -/// assert_eq!(first.as_slice(), b"inlined"); // "short" -/// -/// let second = array.bytes_at(1); -/// assert_eq!(second.as_slice(), b"this string is outlined"); // Long string -/// ``` -#[derive(Clone, Debug)] -pub struct VarBinViewArray { - dtype: DType, - buffers: Arc<[ByteBuffer]>, - views: Buffer, - validity: Validity, - stats_set: ArrayStats, -} - -#[derive(Clone, Debug)] -pub struct VarBinViewEncoding; - -impl VarBinViewArray { - /// Creates a new [`VarBinViewArray`]. - /// - /// # Panics - /// - /// Panics if the provided components do not satisfy the invariants documented - /// in [`VarBinViewArray::new_unchecked`]. - pub fn new( - views: Buffer, - buffers: Arc<[ByteBuffer]>, - dtype: DType, - validity: Validity, - ) -> Self { - Self::try_new(views, buffers, dtype, validity) - .vortex_expect("VarBinViewArray construction failed") - } - - /// Constructs a new `VarBinViewArray`. - /// - /// See [`VarBinViewArray::new_unchecked`] for more information. - /// - /// # Errors - /// - /// Returns an error if the provided components do not satisfy the invariants documented in - /// [`VarBinViewArray::new_unchecked`]. - pub fn try_new( - views: Buffer, - buffers: Arc<[ByteBuffer]>, - dtype: DType, - validity: Validity, - ) -> VortexResult { - Self::validate(&views, &buffers, &dtype, &validity)?; - - // SAFETY: validate ensures all invariants are met. - Ok(unsafe { Self::new_unchecked(views, buffers, dtype, validity) }) - } - - /// Creates a new [`VarBinViewArray`] without validation from these components: - /// - /// * `views` is a buffer of 16-byte view entries (one per logical element). - /// * `buffers` contains the backing storage for strings longer than 12 bytes. - /// * `dtype` specifies whether this contains UTF-8 strings or binary data. - /// * `validity` holds the null values. 
- /// - /// # Safety - /// - /// The caller must ensure all of the following invariants are satisfied: - /// - /// ## View Requirements - /// - /// - Views must be properly formatted 16-byte [`BinaryView`] entries. - /// - Inlined views (length ≤ 12) must have valid data in the first `length` bytes. - /// - Reference views (length > 12) must: - /// - Have a valid buffer index < `buffers.len()`. - /// - Have valid offsets that don't exceed the referenced buffer's bounds. - /// - Have a 4-byte prefix that matches the actual data at the referenced location. - /// - /// ## Type Requirements - /// - /// - `dtype` must be either [`DType::Utf8`] or [`DType::Binary`]. - /// - For [`DType::Utf8`], all string data (both inlined and referenced) must be valid UTF-8. - /// - /// ## Validity Requirements - /// - /// - The validity must have the same nullability as the dtype. - /// - If validity is an array, its length must match `views.len()`. - pub unsafe fn new_unchecked( - views: Buffer, - buffers: Arc<[ByteBuffer]>, - dtype: DType, - validity: Validity, - ) -> Self { - Self { - dtype, - buffers, - views, - validity, - stats_set: Default::default(), - } - } - /// Validates the components that would be used to create a [`VarBinViewArray`]. - /// - /// This function checks all the invariants required by [`VarBinViewArray::new_unchecked`]. 
- pub(crate) fn validate( - views: &Buffer, - buffers: &Arc<[ByteBuffer]>, - dtype: &DType, - validity: &Validity, - ) -> VortexResult<()> { - vortex_ensure!( - validity.nullability() == dtype.nullability(), - "validity {:?} incompatible with nullability {:?}", - validity, - dtype.nullability() - ); +pub mod binary_view; - match dtype { - DType::Utf8(_) => Self::validate_views(views, buffers, validity, |string| { - simdutf8::basic::from_utf8(string).is_ok() - })?, - DType::Binary(_) => Self::validate_views(views, buffers, validity, |_| true)?, - _ => vortex_bail!("invalid DType {dtype} for `VarBinViewArray`"), - } - - Ok(()) - } - - fn validate_views( - views: &Buffer, - buffers: &Arc<[ByteBuffer]>, - validity: &Validity, - validator: F, - ) -> VortexResult<()> - where - F: Fn(&[u8]) -> bool, - { - for (idx, &view) in views.iter().enumerate() { - if validity.is_null(idx) { - continue; - } - - if view.is_inlined() { - // Validate the inline bytestring - let bytes = &unsafe { view.inlined }.data[..view.len() as usize]; - vortex_ensure!( - validator(bytes), - "view at index {idx}: inlined bytes failed utf-8 validation" - ); - } else { - // Validate the view pointer - let view = view.as_view(); - let buf_index = view.buffer_index as usize; - let start_offset = view.offset as usize; - let end_offset = start_offset.saturating_add(view.size as usize); - - let buf = buffers.get(buf_index).ok_or_else(|| - vortex_err!("view at index {idx} references invalid buffer: {buf_index} out of bounds for VarBinViewArray with {} buffers", - buffers.len()))?; - - vortex_ensure!( - start_offset < buf.len(), - "start offset {start_offset} out of bounds for buffer {buf_index} with size {}", - buf.len(), - ); - - vortex_ensure!( - end_offset <= buf.len(), - "end offset {end_offset} out of bounds for buffer {buf_index} with size {}", - buf.len(), - ); - - // Make sure the prefix data matches the buffer data. 
- let bytes = &buf[start_offset..end_offset]; - vortex_ensure!( - view.prefix == bytes[..4], - "VarBinView prefix does not match full string" - ); - - // Validate the full string - vortex_ensure!( - validator(bytes), - "view at index {idx}: outlined bytes fails utf-8 validation" - ); - } - } - - Ok(()) - } - - /// Number of raw string data buffers held by this array. - pub fn nbuffers(&self) -> usize { - self.buffers.len() - } - - /// Access to the primitive views buffer. - /// - /// Variable-sized binary view buffer contain a "view" child array, with 16-byte entries that - /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of - /// the string (if the string has 12 bytes or fewer). - #[inline] - pub fn views(&self) -> &Buffer { - &self.views - } - - /// Access value bytes at a given index - /// - /// Will return a `ByteBuffer` containing the data without performing a copy. - #[inline] - pub fn bytes_at(&self, index: usize) -> ByteBuffer { - let views = self.views(); - let view = &views[index]; - // Expect this to be the common case: strings > 12 bytes. - if !view.is_inlined() { - let view_ref = view.as_view(); - self.buffer(view_ref.buffer_index() as usize) - .slice(view_ref.to_range()) - } else { - // Return access to the range of bytes around it. - views - .clone() - .into_byte_buffer() - .slice_ref(view.as_inlined().value()) - } - } - - /// Access one of the backing data buffers. - /// - /// # Panics - /// - /// This method panics if the provided index is out of bounds for the set of buffers provided - /// at construction time. - #[inline] - pub fn buffer(&self, idx: usize) -> &ByteBuffer { - if idx >= self.nbuffers() { - vortex_panic!( - "{idx} buffer index out of bounds, there are {} buffers", - self.nbuffers() - ); - } - &self.buffers[idx] - } - - /// Iterate over the underlying raw data buffers, not including the views buffer. 
- #[inline] - pub fn buffers(&self) -> &Arc<[ByteBuffer]> { - &self.buffers - } - - /// Accumulate an iterable set of values into our type here. - #[allow(clippy::same_name_method)] - pub fn from_iter, I: IntoIterator>>( - iter: I, - dtype: DType, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity(dtype, iter.size_hint().0); - - for item in iter { - match item { - None => builder.append_null(), - Some(v) => builder.append_value(v), - } - } - - builder.finish_into_varbinview() - } - - pub fn from_iter_str, I: IntoIterator>(iter: I) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity( - DType::Utf8(Nullability::NonNullable), - iter.size_hint().0, - ); - - for item in iter { - builder.append_value(item.as_ref()); - } - - builder.finish_into_varbinview() - } - - pub fn from_iter_nullable_str, I: IntoIterator>>( - iter: I, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity( - DType::Utf8(Nullability::Nullable), - iter.size_hint().0, - ); - - for item in iter { - match item { - None => builder.append_null(), - Some(v) => builder.append_value(v.as_ref()), - } - } - - builder.finish_into_varbinview() - } - - pub fn from_iter_bin, I: IntoIterator>(iter: I) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity( - DType::Binary(Nullability::NonNullable), - iter.size_hint().0, - ); - - for item in iter { - builder.append_value(item.as_ref()); - } - - builder.finish_into_varbinview() - } - - pub fn from_iter_nullable_bin, I: IntoIterator>>( - iter: I, - ) -> Self { - let iter = iter.into_iter(); - let mut builder = VarBinViewBuilder::with_capacity( - DType::Binary(Nullability::Nullable), - iter.size_hint().0, - ); - - for item in iter { - match item { - None => builder.append_null(), - Some(v) => builder.append_value(v.as_ref()), - } - } - - builder.finish_into_varbinview() - } -} - -impl ArrayVTable for 
VarBinViewVTable { - fn len(array: &VarBinViewArray) -> usize { - array.views.len() - } - - fn dtype(array: &VarBinViewArray) -> &DType { - &array.dtype - } - - fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> { - array.stats_set.to_ref(array.as_ref()) - } -} - -impl ValidityHelper for VarBinViewArray { - fn validity(&self) -> &Validity { - &self.validity - } -} - -impl CanonicalVTable for VarBinViewVTable { - fn canonicalize(array: &VarBinViewArray) -> Canonical { - Canonical::VarBinView(array.clone()) - } - - fn append_to_builder(array: &VarBinViewArray, builder: &mut dyn ArrayBuilder) { - builder.extend_from_array(array.as_ref()) - } -} - -impl<'a> FromIterator> for VarBinViewArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter_nullable_bin(iter) - } -} - -impl FromIterator>> for VarBinViewArray { - fn from_iter>>>(iter: T) -> Self { - Self::from_iter_nullable_bin(iter) - } -} - -impl FromIterator> for VarBinViewArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter_nullable_str(iter) - } -} +mod compute; -impl<'a> FromIterator> for VarBinViewArray { - fn from_iter>>(iter: T) -> Self { - Self::from_iter_nullable_str(iter) - } -} +mod vtable; +pub use vtable::{VarBinViewEncoding, VarBinViewVTable}; #[cfg(test)] -mod test { - use vortex_scalar::Scalar; - - use crate::arrays::varbinview::{BinaryView, VarBinViewArray}; - use crate::{Array, ToCanonical}; - - #[test] - pub fn varbin_view() { - let binary_arr = - VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]); - assert_eq!(binary_arr.len(), 2); - assert_eq!(binary_arr.scalar_at(0), Scalar::from("hello world")); - assert_eq!( - binary_arr.scalar_at(1), - Scalar::from("hello world this is a long string") - ); - } - - #[test] - pub fn slice_array() { - let binary_arr = - VarBinViewArray::from_iter_str(["hello world", "hello world this is a long string"]) - .slice(1..2); - assert_eq!( - binary_arr.scalar_at(0), - Scalar::from("hello world this is a long string") - 
/// One value short enough to be inlined (11 bytes) and one that is not: both must read back
/// unchanged.
#[test]
pub fn varbin_view() {
    let values = ["hello world", "hello world this is a long string"];
    let array = VarBinViewArray::from_iter_str(values);

    assert_eq!(array.len(), 2);
    for (idx, expected) in values.iter().enumerate() {
        assert_eq!(array.scalar_at(idx), Scalar::from(*expected));
    }
}

/// Slicing off the first element leaves the long string at index 0.
#[test]
pub fn slice_array() {
    let long = "hello world this is a long string";
    let sliced = VarBinViewArray::from_iter_str(["hello world", long]).slice(1..2);

    assert_eq!(sliced.scalar_at(0), Scalar::from(long));
}

/// Converting to the canonical varbinview form keeps every value.
#[test]
pub fn flatten_array() {
    let values = ["string1", "string2"];
    let source = VarBinViewArray::from_iter_str(values);
    let canonical = source.to_varbinview();

    for (idx, expected) in values.iter().enumerate() {
        assert_eq!(canonical.scalar_at(idx), Scalar::from(*expected));
    }
}

/// A `BinaryView` is 16 bytes wide and 16-byte aligned.
#[test]
pub fn binary_view_size_and_alignment() {
    const VIEW_BYTES: usize = 16;

    assert_eq!(size_of::<BinaryView>(), VIEW_BYTES);
    assert_eq!(align_of::<BinaryView>(), VIEW_BYTES);
}
// vtable/array.rs: basic array accessors for `VarBinViewArray`.
impl ArrayVTable for VarBinViewVTable {
    /// Number of elements, taken from the length of the views buffer.
    fn len(array: &VarBinViewArray) -> usize {
        array.views.len()
    }

    /// The dtype stored on the array.
    fn dtype(array: &VarBinViewArray) -> &DType {
        &array.dtype
    }

    /// A reference to the array's statistics set, bound to this array.
    fn stats(array: &VarBinViewArray) -> StatsSetRef<'_> {
        array.stats_set.to_ref(array.as_ref())
    }
}

// vtable/canonical.rs: canonicalisation for `VarBinViewArray`.
impl CanonicalVTable for VarBinViewVTable {
    /// Wraps a clone of the array in `Canonical::VarBinView`; no re-encoding happens here.
    fn canonicalize(array: &VarBinViewArray) -> Canonical {
        Canonical::VarBinView(array.clone())
    }

    /// Appends the array to `builder` through `ArrayBuilder::extend_from_array`.
    fn append_to_builder(array: &VarBinViewArray, builder: &mut dyn ArrayBuilder) {
        builder.extend_from_array(array.as_ref())
    }
}
// NOTE(review): presumably this macro declares the `VarBinViewVTable` type implemented below;
// confirm against the `vtable!` definition.
vtable!(VarBinView);

// Ties `VarBinViewArray` to its vtable implementations.
impl VTable for VarBinViewVTable {
    type Array = VarBinViewArray;
    type Encoding = VarBinViewEncoding;

    // Implemented by `VarBinViewVTable` itself in the sibling modules.
    type ArrayVTable = Self;
    type CanonicalVTable = Self;
    type OperationsVTable = Self;
    // Validity is derived from the array's `ValidityHelper` implementation.
    type ValidityVTable = ValidityVTableFromValidityHelper;
    type VisitorVTable = Self;
    // Compute, encode and pipeline vtables are not provided for this encoding.
    type ComputeVTable = NotSupported;
    type EncodeVTable = NotSupported;
    type PipelineVTable = NotSupported;
    type SerdeVTable = Self;

    /// The encoding identifier, `"vortex.varbinview"`.
    fn id(_encoding: &Self::Encoding) -> EncodingId {
        EncodingId::new_ref("vortex.varbinview")
    }

    /// The encoding reference; independent of the array passed in.
    fn encoding(_array: &Self::Array) -> EncodingRef {
        EncodingRef::new_ref(VarBinViewEncoding.as_ref())
    }
}

/// Zero-sized encoding marker used as `VTable::Encoding` for `VarBinViewVTable`.
#[derive(Clone, Debug)]
pub struct VarBinViewEncoding;
crate::vtable::{SerdeVTable, ValidityHelper, VisitorVTable}; -use crate::{ArrayBufferVisitor, ArrayChildVisitor, EmptyMetadata}; +use crate::vtable::SerdeVTable; impl SerdeVTable for VarBinViewVTable { type Metadata = EmptyMetadata; @@ -53,16 +54,3 @@ impl SerdeVTable for VarBinViewVTable { VarBinViewArray::try_new(views, Arc::from(buffers), dtype.clone(), validity) } } - -impl VisitorVTable for VarBinViewVTable { - fn visit_buffers(array: &VarBinViewArray, visitor: &mut dyn ArrayBufferVisitor) { - for buffer in array.buffers().as_ref() { - visitor.visit_buffer(buffer); - } - visitor.visit_buffer(&array.views().clone().into_byte_buffer()); - } - - fn visit_children(array: &VarBinViewArray, visitor: &mut dyn ArrayChildVisitor) { - visitor.visit_validity(array.validity(), array.len()) - } -} diff --git a/vortex-array/src/arrays/varbinview/vtable/validity.rs b/vortex-array/src/arrays/varbinview/vtable/validity.rs new file mode 100644 index 00000000000..f15dd2f8696 --- /dev/null +++ b/vortex-array/src/arrays/varbinview/vtable/validity.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use crate::arrays::varbinview::VarBinViewArray; +use crate::validity::Validity; +use crate::vtable::ValidityHelper; + +impl ValidityHelper for VarBinViewArray { + fn validity(&self) -> &Validity { + &self.validity + } +} diff --git a/vortex-array/src/arrays/varbinview/vtable/visitor.rs b/vortex-array/src/arrays/varbinview/vtable/visitor.rs new file mode 100644 index 00000000000..b6fe3854390 --- /dev/null +++ b/vortex-array/src/arrays/varbinview/vtable/visitor.rs @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use super::VarBinViewVTable; +use crate::arrays::VarBinViewArray; +use crate::vtable::{ValidityHelper, VisitorVTable}; +use crate::{ArrayBufferVisitor, ArrayChildVisitor}; + +impl VisitorVTable for VarBinViewVTable { + fn 
visit_buffers(array: &VarBinViewArray, visitor: &mut dyn ArrayBufferVisitor) { + for buffer in array.buffers().as_ref() { + visitor.visit_buffer(buffer); + } + visitor.visit_buffer(&array.views().clone().into_byte_buffer()); + } + + fn visit_children(array: &VarBinViewArray, visitor: &mut dyn ArrayChildVisitor) { + visitor.visit_validity(array.validity(), array.len()) + } +} diff --git a/vortex-array/src/builders/mod.rs b/vortex-array/src/builders/mod.rs index d023ae09098..4ded5e8c541 100644 --- a/vortex-array/src/builders/mod.rs +++ b/vortex-array/src/builders/mod.rs @@ -35,7 +35,7 @@ use vortex_error::{VortexResult, vortex_panic}; use vortex_mask::Mask; use vortex_scalar::{Scalar, match_each_decimal_value_type}; -use crate::arrays::smallest_storage_type; +use crate::arrays::smallest_decimal_value_type; use crate::canonical::Canonical; use crate::{Array, ArrayRef}; @@ -246,7 +246,7 @@ pub fn builder_with_capacity(dtype: &DType, capacity: usize) -> Box { - match_each_decimal_value_type!(smallest_storage_type(decimal_type), |D| { + match_each_decimal_value_type!(smallest_decimal_value_type(decimal_type), |D| { Box::new(DecimalBuilder::with_capacity::( capacity, *decimal_type, diff --git a/vortex-array/src/builders/varbinview.rs b/vortex-array/src/builders/varbinview.rs index e848dc4540d..28294d76699 100644 --- a/vortex-array/src/builders/varbinview.rs +++ b/vortex-array/src/builders/varbinview.rs @@ -11,7 +11,8 @@ use vortex_mask::Mask; use vortex_scalar::{BinaryScalar, Scalar, Utf8Scalar}; use vortex_utils::aliases::hash_map::{Entry, HashMap}; -use crate::arrays::{BinaryView, VarBinViewArray}; +use crate::arrays::VarBinViewArray; +use crate::arrays::binary_view::BinaryView; use crate::builders::{ArrayBuilder, LazyNullBufferBuilder}; use crate::canonical::{Canonical, ToCanonical}; use crate::{Array, ArrayRef, IntoArray}; diff --git a/vortex-array/src/pipeline/types.rs b/vortex-array/src/pipeline/types.rs index e8fb0d560ac..5394bbd9682 100644 --- 
a/vortex-array/src/pipeline/types.rs +++ b/vortex-array/src/pipeline/types.rs @@ -7,7 +7,7 @@ use vortex_dtype::half::f16; use vortex_dtype::{DType, NativePType, PType}; use vortex_error::vortex_panic; -use crate::arrays::BinaryView; +use crate::arrays::binary_view::BinaryView; /// Defines the "vector type", a physical type describing the data that's held in the vector. /// diff --git a/vortex-duckdb/src/exporter/varbinview.rs b/vortex-duckdb/src/exporter/varbinview.rs index fe30a3cca22..a4353ae27ea 100644 --- a/vortex-duckdb/src/exporter/varbinview.rs +++ b/vortex-duckdb/src/exporter/varbinview.rs @@ -4,7 +4,8 @@ use std::ffi::c_char; use itertools::Itertools; -use vortex::arrays::{BinaryView, Inlined, VarBinViewArray}; +use vortex::arrays::VarBinViewArray; +use vortex::arrays::binary_view::{BinaryView, Inlined}; use vortex::buffer::{Buffer, ByteBuffer}; use vortex::error::VortexResult; use vortex::mask::Mask;