diff --git a/encodings/roaring/src/integer/compute.rs b/encodings/roaring/src/integer/compute.rs index 8c7699c3bbd..84df83251f2 100644 --- a/encodings/roaring/src/integer/compute.rs +++ b/encodings/roaring/src/integer/compute.rs @@ -1,5 +1,7 @@ +use croaring::Bitmap; use vortex::compute::unary::ScalarAtFn; -use vortex::compute::ArrayCompute; +use vortex::compute::{ArrayCompute, SliceFn}; +use vortex::{Array, IntoArray}; use vortex_dtype::PType; use vortex_error::{vortex_err, VortexResult, VortexUnwrap as _}; use vortex_scalar::Scalar; @@ -10,12 +12,16 @@ impl ArrayCompute for RoaringIntArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } + + fn slice(&self) -> Option<&dyn SliceFn> { + Some(self) + } } impl ScalarAtFn for RoaringIntArray { fn scalar_at(&self, index: usize) -> VortexResult { let bitmap_value = self - .bitmap() + .owned_bitmap() .select(index as u32) .ok_or_else(|| vortex_err!(OutOfBounds: index, 0, self.len()))?; let scalar: Scalar = match self.metadata().ptype { @@ -32,3 +38,62 @@ impl ScalarAtFn for RoaringIntArray { ::scalar_at(self, index).vortex_unwrap() } } + +impl SliceFn for RoaringIntArray { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + let mut bitmap = self.owned_bitmap(); + let start = bitmap + .select(start as u32) + .ok_or_else(|| vortex_err!(OutOfBounds: start, 0, self.len()))?; + let stop_inclusive = if stop == self.len() { + bitmap.maximum().unwrap_or(0) + } else { + bitmap + .select(stop.saturating_sub(1) as u32) + .ok_or_else(|| vortex_err!(OutOfBounds: stop, 0, self.len()))? + }; + + bitmap.and_inplace(&Bitmap::from_range(start..=stop_inclusive)); + Self::try_new(bitmap, self.ptype()).map(IntoArray::into_array) + } +} + +#[cfg(test)] +mod tests { + use vortex::array::PrimitiveArray; + use vortex::compute::slice; + use vortex::compute::unary::scalar_at; + + use super::*; + + #[test] + #[cfg_attr(miri, ignore)] + pub fn test_scalar_at() { + let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); + let array = RoaringIntArray::encode(ints).unwrap(); + + assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); + assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_slice() { + let array = RoaringIntArray::try_new(Bitmap::from_range(10..20), PType::U32).unwrap(); + + let sliced = slice(&array, 0, 5).unwrap(); + assert_eq!(sliced.len(), 5); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 10u32.into()); + assert_eq!(scalar_at(&sliced, 4).unwrap(), 14u32.into()); + + let sliced = slice(&array, 5, 10).unwrap(); + assert_eq!(sliced.len(), 5); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 15u32.into()); + assert_eq!(scalar_at(&sliced, 4).unwrap(), 19u32.into()); + + let sliced = slice(&sliced, 3, 5).unwrap(); + assert_eq!(sliced.len(), 2); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 18u32.into()); + assert_eq!(scalar_at(&sliced, 1).unwrap(), 19u32.into()); + } +} diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index 6ec2a031dd6..dedcd458de6 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -4,12 +4,16 @@ pub use compress::*; use croaring::{Bitmap, Portable}; use serde::{Deserialize, Serialize}; use vortex::array::PrimitiveArray; +use vortex::compute::unary::try_cast; use vortex::encoding::ids; -use vortex::stats::{ArrayStatisticsCompute, StatsSet}; -use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet}; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::variants::{ArrayVariants, PrimitiveArrayTrait}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, Array, ArrayTrait, Canonical, IntoArray, IntoCanonical, TypedArray}; +use vortex::{ + impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoCanonical, + TypedArray, +}; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{DType, PType}; @@ -34,9 +38,25 @@ impl Display for RoaringIntMetadata { impl RoaringIntArray { pub fn try_new(bitmap: Bitmap, ptype: PType) -> VortexResult { if !ptype.is_unsigned_int() { - vortex_bail!("RoaringInt expected unsigned int"); + vortex_bail!(MismatchedTypes: "unsigned int", ptype); } + let length = bitmap.statistics().cardinality as usize; + let max = bitmap.maximum(); + if max.map(|mv| mv as u64 > ptype.max_value()).unwrap_or(false) { + vortex_bail!( + "RoaringInt maximum value is greater than the maximum value for the primitive type" + ); + } + + let mut stats = StatsSet::new(); + stats.set(Stat::NullCount, 0.into()); + stats.set(Stat::Max, max.into()); + stats.set(Stat::Min, bitmap.minimum().into()); + stats.set(Stat::IsConstant, (length <= 1).into()); + stats.set(Stat::IsSorted, true.into()); + stats.set(Stat::IsStrictSorted, true.into()); + Ok(Self { typed: TypedArray::try_from_parts( DType::Primitive(ptype, NonNullable), @@ -49,8 +69,7 @@ impl RoaringIntArray { }) } - pub fn bitmap(&self) -> Bitmap { - //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + pub fn owned_bitmap(&self) -> Bitmap { Bitmap::deserialize::( self.as_ref() .buffer() @@ -88,39 +107,40 @@ impl ArrayValidity for RoaringIntArray { } fn logical_validity(&self) -> LogicalValidity { - LogicalValidity::AllValid(self.bitmap().iter().count()) + LogicalValidity::AllValid(self.len()) } } impl IntoCanonical for RoaringIntArray { fn into_canonical(self) -> VortexResult { - todo!() + try_cast( + PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable), + self.dtype(), + ) + .and_then(Array::into_canonical) } } impl AcceptArrayVisitor for RoaringIntArray { - fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - todo!() + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_buffer( + self.as_ref() + .buffer() + .vortex_expect("Missing buffer in RoaringIntArray"), + ) } } -impl ArrayStatisticsCompute for RoaringIntArray {} - -#[cfg(test)] -mod test { - use vortex::array::PrimitiveArray; - use vortex::compute::unary::scalar_at; - use vortex::IntoArray; - - use crate::RoaringIntArray; - - #[test] - #[cfg_attr(miri, ignore)] - pub fn test_scalar_at() { - let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); - let array = RoaringIntArray::encode(ints).unwrap(); - - assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); - assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); +impl ArrayStatisticsCompute for RoaringIntArray { + fn compute_statistics(&self, stat: Stat) -> VortexResult { + // possibly faster to write an accumulator over the iterator, though not necessarily + if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount { + let primitive = + PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable); + primitive.statistics().compute(stat); + Ok(primitive.statistics().to_set()) + } else { + Ok(StatsSet::new()) + } } } diff --git a/vortex-array/src/array/chunked/compute/take.rs b/vortex-array/src/array/chunked/compute/take.rs index 61bbb771daa..e7f91a6e83a 100644 --- a/vortex-array/src/array/chunked/compute/take.rs +++ b/vortex-array/src/array/chunked/compute/take.rs @@ -77,17 +77,18 @@ fn take_strict_sorted(chunked: &ChunkedArray, indices: &Array) -> VortexResult usize { - match_each_integer_ptype!(self, |$T| $T::MAX as usize) + pub const fn max_value(&self) -> u64 { + match_each_integer_ptype!(self, |$T| $T::MAX as u64) } pub fn to_signed(self) -> Self { diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 56cea12fa2f..bad77b7f125 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -18,11 +18,6 @@ impl EncodingCompressor for RoaringIntCompressor { } fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> { - // Only support primitive enc arrays - if array.encoding().id() != RoaringInt::ID { - return None; - } - // Only support non-nullable uint arrays if !array.dtype().is_unsigned_int() || array.dtype().is_nullable() { return None; @@ -60,3 +55,31 @@ impl EncodingCompressor for RoaringIntCompressor { HashSet::from([&RoaringIntEncoding as EncodingRef]) } } + +#[cfg(test)] +mod tests { + use vortex::array::PrimitiveArray; + use vortex::validity::Validity; + use vortex::IntoArray; + use vortex_roaring::RoaringIntArray; + + use crate::compressors::roaring_int::RoaringIntCompressor; + use crate::compressors::EncodingCompressor as _; + use crate::SamplingCompressor; + + #[test] + #[cfg_attr(miri, ignore)] + fn test_roaring_int_compressor() { + let array = + PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array(); + assert!(RoaringIntCompressor.can_compress(&array).is_some()); + let compressed = RoaringIntCompressor + .compress(&array, None, SamplingCompressor::default()) + .unwrap(); + assert_eq!(compressed.array.len(), 5); + assert!(compressed.path.is_some()); + + let roaring = RoaringIntArray::try_from(compressed.array).unwrap(); + assert!(roaring.owned_bitmap().contains_range(1..=5)); + } +}