From 889b25f7de26dc1522ebe3cd87002d9e9d8bde2f Mon Sep 17 00:00:00 2001 From: Brian Hart Date: Sat, 25 Apr 2026 18:29:37 -0700 Subject: [PATCH] Fix dtype mismatch in FileStatsLayoutReader for stat scalars Use the stat's own dtype (e.g. u64 for NullCount) rather than the field dtype when constructing stat scalars in stats_ref. This fixes IS NULL pruning on nullable timestamp columns which previously failed with a dtype mismatch. Add regression test for is_null pruning on a nullable timestamp column. Signed-off-by: Brian Hart --- vortex-file/src/v2/file_stats_reader.rs | 63 ++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/vortex-file/src/v2/file_stats_reader.rs b/vortex-file/src/v2/file_stats_reader.rs index e5b504d1af4..8857e6422e2 100644 --- a/vortex-file/src/v2/file_stats_reader.rs +++ b/vortex-file/src/v2/file_stats_reader.rs @@ -126,7 +126,10 @@ impl StatsCatalog for FileStatsLayoutReader { let stat_value = field_stats.get(stat)?.as_exact()?; let field_dtype = self.struct_fields.field_by_index(field_idx)?; - let stat_scalar = Scalar::try_new(field_dtype, Some(stat_value)).ok()?; + // Use the stat's own dtype rather than the field dtype. For example, + // NullCount is always u64 regardless of the column type. + let stat_dtype = stat.dtype(&field_dtype)?; + let stat_scalar = Scalar::try_new(stat_dtype, Some(stat_value)).ok()?; Some(lit(stat_scalar)) } @@ -209,16 +212,20 @@ mod tests { use vortex_array::ArrayContext; use vortex_array::IntoArray as _; + use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::StructArray; + use vortex_array::arrays::datetime::TemporalData; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; use vortex_array::expr::get_item; use vortex_array::expr::gt; + use vortex_array::expr::is_null; use vortex_array::expr::lit; use vortex_array::expr::root; use vortex_array::expr::stats::Precision; use vortex_array::expr::stats::Stat; + use vortex_array::extension::datetime::TimeUnit; use vortex_array::scalar::ScalarValue; use vortex_array::scalar_fn::session::ScalarFnSession; use vortex_array::session::ArraySession; @@ -337,4 +344,58 @@ mod tests { Ok(()) }) } + + /// Regression test: `IS NULL` on a nullable timestamp column must not fail with a + /// dtype mismatch. The bug was that `stats_ref` used the *field* dtype (timestamp) + /// for the `NullCount` stat scalar instead of the stat's own dtype (u64). + #[test] + fn is_null_pruning_on_nullable_timestamp_column() -> VortexResult<()> { + block_on(|handle| async { + let ctx = ArrayContext::empty(); + let segments = Arc::new(TestSegments::default()); + let (ptr, eof) = SequenceId::root().split(); + + // Build a struct with a nullable timestamp column containing some nulls. + let prim_array = + PrimitiveArray::from_option_iter([Some(1_000_000i64), None, Some(3_000_000)]) + .into_array(); + let ts_data = TemporalData::new_timestamp(prim_array, TimeUnit::Microseconds, None); + let ts_dtype = ts_data.dtype().clone(); + let ts_array = ts_data.into_array(); + + let struct_array = StructArray::from_fields([("deleted_at", ts_array)].as_slice())?; + + let strategy = TableStrategy::new( + Arc::new(FlatLayoutStrategy::default()), + Arc::new(FlatLayoutStrategy::default()), + ); + let layout = strategy + .write_stream( + ctx, + segments.clone(), + struct_array.into_array().to_array_stream().sequenced(ptr), + eof, + handle, + ) + .await?; + + let child = layout.new_reader("".into(), segments, &SESSION)?; + + // File-level stats: 1 null in deleted_at. + let mut stats = StatsSet::default(); + stats.set(Stat::NullCount, Precision::exact(ScalarValue::from(1u64))); + let file_stats = FileStatistics::new(Arc::from([stats]), Arc::from([ts_dtype])); + + let reader = FileStatsLayoutReader::new(child, file_stats, SESSION.clone()); + + // `is_null(deleted_at)` — should NOT panic or error due to dtype mismatch. + let expr = is_null(get_item("deleted_at", root())); + let mask = Mask::new_true(3); + let result = reader.pruning_evaluation(&(0..3), &expr, mask)?.await?; + // null_count is 1 (non-zero), so is_null is not falsified => not pruned. + assert_eq!(result, Mask::new_true(3)); + + Ok(()) + }) + } }