From 051de404e48ea73512cf42be18878437592a2e59 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 09:38:51 -0400 Subject: [PATCH 1/9] fix roaring int array --- .../src/compressors/roaring_int.rs | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 56cea12fa2f..260f56bfdbf 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -18,11 +18,6 @@ impl EncodingCompressor for RoaringIntCompressor { } fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> { - // Only support primitive enc arrays - if array.encoding().id() != RoaringInt::ID { - return None; - } - // Only support non-nullable uint arrays if !array.dtype().is_unsigned_int() || array.dtype().is_nullable() { return None; @@ -60,3 +55,23 @@ impl EncodingCompressor for RoaringIntCompressor { HashSet::from([&RoaringIntEncoding as EncodingRef]) } } + +#[cfg(test)] +mod tests { + use vortex::{array::PrimitiveArray, validity::Validity, IntoArray}; + use vortex_roaring::RoaringIntArray; + + use crate::{compressors::{roaring_int::RoaringIntCompressor, EncodingCompressor as _}, SamplingCompressor}; + + #[test] + fn test_roaring_int_compressor() { + let array = PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array(); + assert!(RoaringIntCompressor.can_compress(&array).is_some()); + let compressed = RoaringIntCompressor.compress(&array, None, SamplingCompressor::default()).unwrap(); + assert_eq!(compressed.array.len(), 5); + assert!(compressed.path.is_some()); + + let roaring = RoaringIntArray::try_from(compressed.array).unwrap(); + assert!(roaring.bitmap().contains_range(1..=5)); + } +} From 1c656e87d5fa2ef5dbf523294c85b7048af25449 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 09:39:38 -0400 Subject: [PATCH 2/9] fmt --- .../src/compressors/roaring_int.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 260f56bfdbf..78927fab20f 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -58,16 +58,23 @@ impl EncodingCompressor for RoaringIntCompressor { #[cfg(test)] mod tests { - use vortex::{array::PrimitiveArray, validity::Validity, IntoArray}; + use vortex::array::PrimitiveArray; + use vortex::validity::Validity; + use vortex::IntoArray; use vortex_roaring::RoaringIntArray; - use crate::{compressors::{roaring_int::RoaringIntCompressor, EncodingCompressor as _}, SamplingCompressor}; + use crate::compressors::roaring_int::RoaringIntCompressor; + use crate::compressors::EncodingCompressor as _; + use crate::SamplingCompressor; #[test] fn test_roaring_int_compressor() { - let array = PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array(); + let array = + PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array(); assert!(RoaringIntCompressor.can_compress(&array).is_some()); - let compressed = RoaringIntCompressor.compress(&array, None, SamplingCompressor::default()).unwrap(); + let compressed = RoaringIntCompressor + .compress(&array, None, SamplingCompressor::default()) + .unwrap(); assert_eq!(compressed.array.len(), 5); assert!(compressed.path.is_some()); From 2c9e284cbf1e0d35a06712153241e950dde88a0b Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 10:45:58 -0400 Subject: [PATCH 3/9] more fixes --- encodings/roaring/src/integer/mod.rs | 60 ++++++++++++++++--- .../src/array/chunked/compute/take.rs | 23 +++---- vortex-dtype/src/ptype.rs | 4 +- 3 files changed, 66 insertions(+), 21 deletions(-) diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index 6ec2a031dd6..61f4d89f275 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -4,12 +4,16 @@ pub use compress::*; use croaring::{Bitmap, Portable}; use serde::{Deserialize, Serialize}; use vortex::array::PrimitiveArray; +use vortex::compute::unary::try_cast; use vortex::encoding::ids; -use vortex::stats::{ArrayStatisticsCompute, StatsSet}; -use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet}; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::variants::{ArrayVariants, PrimitiveArrayTrait}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::{impl_encoding, Array, ArrayTrait, Canonical, IntoArray, IntoCanonical, TypedArray}; +use vortex::{ + impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, + IntoCanonical, TypedArray, +}; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; use vortex_dtype::{DType, PType}; @@ -34,9 +38,25 @@ impl Display for RoaringIntMetadata { impl RoaringIntArray { pub fn try_new(bitmap: Bitmap, ptype: PType) -> VortexResult { if !ptype.is_unsigned_int() { - vortex_bail!("RoaringInt expected unsigned int"); + vortex_bail!(MismatchedTypes: "unsigned int", ptype); } + let length = bitmap.statistics().cardinality as usize; + let max = bitmap.maximum(); + if max.map(|mv| mv as u64 > ptype.max_value()).unwrap_or(false) { + vortex_bail!( + "RoaringInt maximum value is greater than the maximum value for the primitive type" + ); + } + + let mut stats = StatsSet::new(); + stats.set(Stat::NullCount, 0.into()); + stats.set(Stat::Max, max.into()); + stats.set(Stat::Min, bitmap.minimum().into()); + stats.set(Stat::IsConstant, (length <= 1).into()); + stats.set(Stat::IsSorted, true.into()); + stats.set(Stat::IsStrictSorted, true.into()); + Ok(Self { typed: TypedArray::try_from_parts( DType::Primitive(ptype, NonNullable), @@ -94,17 +114,41 @@ impl ArrayValidity for RoaringIntArray { impl IntoCanonical for RoaringIntArray { fn into_canonical(self) -> VortexResult { - todo!() + try_cast( + PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable), + self.dtype(), + ) + .and_then(|a| a.into_primitive()) + .map(Canonical::Primitive) } } impl AcceptArrayVisitor for RoaringIntArray { - fn accept(&self, _visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - todo!() + fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { + visitor.visit_buffer( + self.as_ref() + .buffer() + .vortex_expect("Missing buffer in RoaringIntArray"), + ) } } -impl ArrayStatisticsCompute for RoaringIntArray {} +impl ArrayStatisticsCompute for RoaringIntArray { + fn compute_statistics(&self, stat: Stat) -> VortexResult { + let mut stats = self.statistics().to_set(); + if stats.get(stat).is_some() { + return Ok(stats); + } + + if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount { + let primitive = PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable); + let prim_stats = primitive.statistics().to_set(); + stats.merge(&prim_stats); + } + + Ok(stats) + } +} #[cfg(test)] mod test { diff --git a/vortex-array/src/array/chunked/compute/take.rs b/vortex-array/src/array/chunked/compute/take.rs index 61bbb771daa..e7f91a6e83a 100644 --- a/vortex-array/src/array/chunked/compute/take.rs +++ b/vortex-array/src/array/chunked/compute/take.rs @@ -77,17 +77,18 @@ fn take_strict_sorted(chunked: &ChunkedArray, indices: &Array) -> VortexResult usize { - match_each_integer_ptype!(self, |$T| $T::MAX as usize) + pub const fn max_value(&self) -> u64 { + match_each_integer_ptype!(self, |$T| $T::MAX as u64) } pub fn to_signed(self) -> Self { From 2b4b64c104b178134f59c9c33b2f394af4966cd9 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 10:53:02 -0400 Subject: [PATCH 4/9] moar --- encodings/roaring/src/integer/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index 61f4d89f275..925090f5c46 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -140,10 +140,13 @@ impl ArrayStatisticsCompute for RoaringIntArray { return Ok(stats); } + // possibly faster to write an accumulator over the iterator, though not necessarily if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount { let primitive = PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable); - let prim_stats = primitive.statistics().to_set(); - stats.merge(&prim_stats); + if let Some(prim_stat) = primitive.statistics().compute(stat) { + stats.set(stat, prim_stat); + } + stats.merge(&primitive.statistics().to_set()); } Ok(stats) From 9652876d44f93fdbea31b686c66875af0156e5ec Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 10:54:27 -0400 Subject: [PATCH 5/9] CR feedback --- encodings/roaring/src/integer/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index 925090f5c46..a383ee67c3b 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -11,7 +11,7 @@ use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::variants::{ArrayVariants, PrimitiveArrayTrait}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; use vortex::{ - impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, + impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoCanonical, TypedArray, }; use vortex_buffer::Buffer; @@ -118,8 +118,7 @@ impl IntoCanonical for RoaringIntArray { PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable), self.dtype(), ) - .and_then(|a| a.into_primitive()) - .map(Canonical::Primitive) + .and_then(Array::into_canonical) } } From dd15b5e0ac7c8a901eefd1c9a162a2c9d32c5de4 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 10:59:11 -0400 Subject: [PATCH 6/9] moar --- encodings/roaring/src/integer/mod.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index a383ee67c3b..5b671703063 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -11,8 +11,8 @@ use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::variants::{ArrayVariants, PrimitiveArrayTrait}; use vortex::visitor::{AcceptArrayVisitor, ArrayVisitor}; use vortex::{ - impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, - IntoCanonical, TypedArray, + impl_encoding, Array, ArrayDType as _, ArrayTrait, Canonical, IntoArray, IntoCanonical, + TypedArray, }; use vortex_buffer::Buffer; use vortex_dtype::Nullability::NonNullable; @@ -134,21 +134,14 @@ impl AcceptArrayVisitor for RoaringIntArray { impl ArrayStatisticsCompute for RoaringIntArray { fn compute_statistics(&self, stat: Stat) -> VortexResult { - let mut stats = self.statistics().to_set(); - if stats.get(stat).is_some() { - return Ok(stats); - } - // possibly faster to write an accumulator over the iterator, though not necessarily if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount { let primitive = PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable); - if let Some(prim_stat) = primitive.statistics().compute(stat) { - stats.set(stat, prim_stat); - } - stats.merge(&primitive.statistics().to_set()); + primitive.statistics().compute(stat); + Ok(primitive.statistics().to_set()) + } else { + Ok(StatsSet::new()) } - - Ok(stats) } } From 59b68bed1c81d0309ded8bf9bae84c559ebdab69 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 11:37:46 -0400 Subject: [PATCH 7/9] even moar --- encodings/roaring/src/integer/compute.rs | 68 ++++++++++++++++++- encodings/roaring/src/integer/mod.rs | 29 ++------ .../src/compressors/roaring_int.rs | 2 +- 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/encodings/roaring/src/integer/compute.rs b/encodings/roaring/src/integer/compute.rs index 8c7699c3bbd..733c38000ae 100644 --- a/encodings/roaring/src/integer/compute.rs +++ b/encodings/roaring/src/integer/compute.rs @@ -1,5 +1,7 @@ +use croaring::Bitmap; use vortex::compute::unary::ScalarAtFn; -use vortex::compute::ArrayCompute; +use vortex::compute::{ArrayCompute, SliceFn}; +use vortex::{Array, IntoArray}; use vortex_dtype::PType; use vortex_error::{vortex_err, VortexResult, VortexUnwrap as _}; use vortex_scalar::Scalar; @@ -10,12 +12,16 @@ impl ArrayCompute for RoaringIntArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { Some(self) } + + fn slice(&self) -> Option<&dyn SliceFn> { + Some(self) + } } impl ScalarAtFn for RoaringIntArray { fn scalar_at(&self, index: usize) -> VortexResult { let bitmap_value = self - .bitmap() + .owned_bitmap() .select(index as u32) .ok_or_else(|| vortex_err!(OutOfBounds: index, 0, self.len()))?; let scalar: Scalar = match self.metadata().ptype { @@ -32,3 +38,61 @@ impl ScalarAtFn for RoaringIntArray { ::scalar_at(self, index).vortex_unwrap() } } + +impl SliceFn for RoaringIntArray { + fn slice(&self, start: usize, stop: usize) -> VortexResult { + let mut bitmap = self.owned_bitmap(); + let start = bitmap + .select(start as u32) + .ok_or_else(|| vortex_err!(OutOfBounds: start, 0, self.len()))?; + let stop_inclusive = if stop == self.len() { + bitmap.maximum().unwrap_or(0) + } else { + bitmap + .select(stop.saturating_sub(1) as u32) + .ok_or_else(|| vortex_err!(OutOfBounds: stop, 0, self.len()))? + }; + + bitmap.and_inplace(&Bitmap::from_range(start..=stop_inclusive)); + Self::try_new(bitmap, self.ptype()).map(IntoArray::into_array) + } +} + +#[cfg(test)] +mod tests { + use vortex::compute::slice; + use vortex::compute::unary::scalar_at; + + use super::*; + + #[test] + #[cfg_attr(miri, ignore)] + pub fn test_scalar_at() { + let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); + let array = RoaringIntArray::encode(ints).unwrap(); + + assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); + assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_slice() { + let array = RoaringIntArray::try_new(Bitmap::from_range(10..20), PType::U32).unwrap(); + + let sliced = slice(&array, 0, 5).unwrap(); + assert_eq!(sliced.len(), 5); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 10u32.into()); + assert_eq!(scalar_at(&sliced, 4).unwrap(), 14u32.into()); + + let sliced = slice(&array, 5, 10).unwrap(); + assert_eq!(sliced.len(), 5); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 15u32.into()); + assert_eq!(scalar_at(&sliced, 4).unwrap(), 19u32.into()); + + let sliced = slice(&sliced, 3, 5).unwrap(); + assert_eq!(sliced.len(), 2); + assert_eq!(scalar_at(&sliced, 0).unwrap(), 18u32.into()); + assert_eq!(scalar_at(&sliced, 1).unwrap(), 19u32.into()); + } +} diff --git a/encodings/roaring/src/integer/mod.rs b/encodings/roaring/src/integer/mod.rs index 5b671703063..dedcd458de6 100644 --- a/encodings/roaring/src/integer/mod.rs +++ b/encodings/roaring/src/integer/mod.rs @@ -69,8 +69,7 @@ impl RoaringIntArray { }) } - pub fn bitmap(&self) -> Bitmap { - //TODO(@jdcasale): figure out a way to avoid this deserialization per-call + pub fn owned_bitmap(&self) -> Bitmap { Bitmap::deserialize::( self.as_ref() .buffer() @@ -108,14 +107,14 @@ impl ArrayValidity for RoaringIntArray { } fn logical_validity(&self) -> LogicalValidity { - LogicalValidity::AllValid(self.bitmap().iter().count()) + LogicalValidity::AllValid(self.len()) } } impl IntoCanonical for RoaringIntArray { fn into_canonical(self) -> VortexResult { try_cast( - PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable), + PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable), self.dtype(), ) .and_then(Array::into_canonical) @@ -136,7 +135,8 @@ impl ArrayStatisticsCompute for RoaringIntArray { fn compute_statistics(&self, stat: Stat) -> VortexResult { // possibly faster to write an accumulator over the iterator, though not necessarily if stat == Stat::TrailingZeroFreq || stat == Stat::BitWidthFreq || stat == Stat::RunCount { - let primitive = PrimitiveArray::from_vec(self.bitmap().to_vec(), Validity::NonNullable); + let primitive = + PrimitiveArray::from_vec(self.owned_bitmap().to_vec(), Validity::NonNullable); primitive.statistics().compute(stat); Ok(primitive.statistics().to_set()) } else { @@ -144,22 +144,3 @@ impl ArrayStatisticsCompute for RoaringIntArray { } } } - -#[cfg(test)] -mod test { - use vortex::array::PrimitiveArray; - use vortex::compute::unary::scalar_at; - use vortex::IntoArray; - - use crate::RoaringIntArray; - - #[test] - #[cfg_attr(miri, ignore)] - pub fn test_scalar_at() { - let ints = PrimitiveArray::from(vec![2u32, 12, 22, 32]).into_array(); - let array = RoaringIntArray::encode(ints).unwrap(); - - assert_eq!(scalar_at(&array, 0).unwrap(), 2u32.into()); - assert_eq!(scalar_at(&array, 1).unwrap(), 12u32.into()); - } -} diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 78927fab20f..27c675a9947 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -79,6 +79,6 @@ mod tests { assert!(compressed.path.is_some()); let roaring = RoaringIntArray::try_from(compressed.array).unwrap(); - assert!(roaring.bitmap().contains_range(1..=5)); + assert!(roaring.owned_bitmap().contains_range(1..=5)); } } From 98d4d9b16c4b28bb33689efd8a3210061960c185 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 11:42:04 -0400 Subject: [PATCH 8/9] fix --- encodings/roaring/src/integer/compute.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/encodings/roaring/src/integer/compute.rs b/encodings/roaring/src/integer/compute.rs index 733c38000ae..84df83251f2 100644 --- a/encodings/roaring/src/integer/compute.rs +++ b/encodings/roaring/src/integer/compute.rs @@ -60,6 +60,7 @@ impl SliceFn for RoaringIntArray { #[cfg(test)] mod tests { + use vortex::array::PrimitiveArray; use vortex::compute::slice; use vortex::compute::unary::scalar_at; From 316fa88ec523c6dabc6f9e7743249f8abfbd46ac Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 9 Oct 2024 11:58:00 -0400 Subject: [PATCH 9/9] derp miri --- vortex-sampling-compressor/src/compressors/roaring_int.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 27c675a9947..bad77b7f125 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -68,6 +68,7 @@ mod tests { use crate::SamplingCompressor; #[test] + #[cfg_attr(miri, ignore)] fn test_roaring_int_compressor() { let array = PrimitiveArray::from_vec(vec![1u32, 2, 3, 4, 5], Validity::NonNullable).into_array();