diff --git a/vortex-file/public-api.lock b/vortex-file/public-api.lock index bb2a9b00d81..04db1feb837 100644 --- a/vortex-file/public-api.lock +++ b/vortex-file/public-api.lock @@ -184,6 +184,10 @@ pub fn vortex_file::Footer::into_serializer(self) -> vortex_file::FooterSerializ pub fn vortex_file::Footer::layout(&self) -> &vortex_layout::layout::LayoutRef +pub fn vortex_file::Footer::metadata_segment(&self, &str) -> core::option::Option<&vortex_buffer::ByteBuffer> + +pub fn vortex_file::Footer::metadata_segments(&self) -> impl core::iter::traits::iterator::Iterator + pub fn vortex_file::Footer::row_count(&self) -> u64 pub fn vortex_file::Footer::segment_map(&self) -> &alloc::sync::Arc<[vortex_file::SegmentSpec]> @@ -206,10 +210,14 @@ pub fn vortex_file::FooterDeserializer::buffer(&self) -> &vortex_buffer::ByteBuf pub fn vortex_file::FooterDeserializer::deserialize(&mut self) -> vortex_error::VortexResult +pub fn vortex_file::FooterDeserializer::include_metadata(self) -> Self + pub fn vortex_file::FooterDeserializer::prefix_data(&mut self, vortex_buffer::ByteBuffer) pub fn vortex_file::FooterDeserializer::with_dtype(self, vortex_array::dtype::DType) -> Self +pub fn vortex_file::FooterDeserializer::with_include_metadata(self, bool) -> Self + pub fn vortex_file::FooterDeserializer::with_size(self, u64) -> Self pub fn vortex_file::FooterDeserializer::with_some_dtype(self, core::option::Option) -> Self @@ -276,6 +284,10 @@ pub fn vortex_file::VortexFile::footer(&self) -> &vortex_file::Footer pub fn vortex_file::VortexFile::layout_reader(&self) -> vortex_error::VortexResult> +pub fn vortex_file::VortexFile::metadata_segment(&self, &str) -> core::option::Option<&vortex_buffer::ByteBuffer> + +pub fn vortex_file::VortexFile::metadata_segments(&self) -> impl core::iter::traits::iterator::Iterator + pub fn vortex_file::VortexFile::row_count(&self) -> u64 pub fn vortex_file::VortexFile::scan(&self) -> vortex_error::VortexResult> @@ -292,6 +304,8 @@ pub struct vortex_file::VortexOpenOptions impl vortex_file::VortexOpenOptions +pub fn vortex_file::VortexOpenOptions::include_metadata(self) -> Self + pub async fn vortex_file::VortexOpenOptions::open(self, alloc::sync::Arc) -> vortex_error::VortexResult pub fn vortex_file::VortexOpenOptions::open_buffer>(self, B) -> vortex_error::VortexResult @@ -306,6 +320,8 @@ pub fn vortex_file::VortexOpenOptions::with_file_size(self, u64) -> Self pub fn vortex_file::VortexOpenOptions::with_footer(self, vortex_file::Footer) -> Self +pub fn vortex_file::VortexOpenOptions::with_include_metadata(self, bool) -> Self + pub fn vortex_file::VortexOpenOptions::with_initial_read_size(self, usize) -> Self pub fn vortex_file::VortexOpenOptions::with_labels(self, alloc::vec::Vec) -> Self @@ -338,6 +354,10 @@ pub fn vortex_file::VortexWriteOptions::new(vortex_session::VortexSession) -> Se pub fn vortex_file::VortexWriteOptions::with_file_statistics(self, alloc::vec::Vec) -> Self +pub fn vortex_file::VortexWriteOptions::with_metadata_segment(self, impl core::convert::Into, impl core::convert::Into) -> Self + +pub fn vortex_file::VortexWriteOptions::with_metadata_segments(self, I) -> Self where I: core::iter::traits::collect::IntoIterator, K: core::convert::Into, B: core::convert::Into + pub fn vortex_file::VortexWriteOptions::with_strategy(self, alloc::sync::Arc) -> Self pub struct vortex_file::WriteStrategyBuilder diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index f321d774197..a588d5648f4 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -23,6 +23,7 @@ use vortex_array::dtype::FieldPathSet; use vortex_array::expr::Expression; use vortex_array::expr::pruning::checked_pruning_expr; use vortex_array::scalar_fn::internal::row_count::substitute_row_count; +use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; @@ -76,6 +77,16 @@ impl VortexFile { self.footer.statistics() } + /// Returns the user-defined metadata segments loaded for this file. + pub fn metadata_segments(&self) -> impl Iterator { + self.footer.metadata_segments() + } + + /// Returns the loaded user-defined metadata segment for the given key. + pub fn metadata_segment(&self, key: &str) -> Option<&ByteBuffer> { + self.footer.metadata_segment(key) + } + /// Create a new segment source for reading from the file. /// /// This may spawn a background I/O driver that will exit when the returned segment source diff --git a/vortex-file/src/footer/deserializer.rs b/vortex-file/src/footer/deserializer.rs index 769b94c715b..1ca141635a1 100644 --- a/vortex-file/src/footer/deserializer.rs +++ b/vortex-file/src/footer/deserializer.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::Arc; + use flatbuffers::root; use vortex_array::dtype::DType; use vortex_buffer::ByteBuffer; @@ -12,6 +14,7 @@ use vortex_error::vortex_err; use vortex_flatbuffers::FlatBuffer; use vortex_flatbuffers::ReadFlatBuffer; use vortex_session::VortexSession; +use vortex_utils::aliases::hash_map::HashMap; use crate::EOF_SIZE; use crate::Footer; @@ -19,6 +22,7 @@ use crate::MAGIC_BYTES; use crate::VERSION; use crate::footer::FileStatistics; use crate::footer::postscript::Postscript; +use crate::footer::postscript::PostscriptMetadata; use crate::footer::postscript::PostscriptSegment; /// Deserialize a footer from the end of a Vortex file or created from a @@ -37,6 +41,8 @@ pub struct FooterDeserializer { // The file size, possibly provided externally. file_size: Option, + // Whether to include user-defined metadata segments in the footer read. + include_metadata: bool, // The postscript, once we've parsed it. postscript: Option, } @@ -48,6 +54,7 @@ impl FooterDeserializer { session, dtype: None, file_size: None, + include_metadata: false, postscript: None, } } @@ -72,6 +79,18 @@ impl FooterDeserializer { self } + /// Include user-defined metadata segments in footer deserialization. + pub fn include_metadata(mut self) -> Self { + self.include_metadata = true; + self + } + + /// Whether to include user-defined metadata segments in footer deserialization. + pub fn with_include_metadata(mut self, include_metadata: bool) -> Self { + self.include_metadata = include_metadata; + self + } + /// Prefix more data to the existing buffer when requested by the deserializer. pub fn prefix_data(&mut self, more_data: ByteBuffer) { let mut buffer = ByteBufferMut::with_capacity(self.buffer.len() + more_data.len()); @@ -119,6 +138,11 @@ impl FooterDeserializer { if let Some(stats_segment) = &postscript.statistics { read_more_offset = read_more_offset.min(stats_segment.offset); } + if self.include_metadata { + for metadata in &postscript.metadata { + read_more_offset = read_more_offset.min(metadata.segment.offset); + } + } read_more_offset = read_more_offset.min(postscript.layout.offset); read_more_offset = read_more_offset.min(postscript.footer.offset); @@ -151,14 +175,27 @@ impl FooterDeserializer { ) }) .transpose()?; + let metadata = if self.include_metadata { + Arc::new( + postscript + .metadata + .iter() + .map(|metadata| { + self.parse_metadata_segment(initial_offset, &self.buffer, metadata) + }) + .collect::>>()?, + ) + } else { + Arc::new(HashMap::default()) + }; Ok(DeserializeStep::Done(self.parse_footer( initial_offset, &self.buffer, - &postscript.footer, - &postscript.layout, + postscript, dtype, file_stats, + metadata, )?)) } @@ -238,27 +275,67 @@ impl FooterDeserializer { FileStatistics::from_flatbuffer(&fb, dtype, session) } + /// Parse a user-defined metadata segment from the initial read buffer. + fn parse_metadata_segment( + &self, + initial_offset: u64, + initial_read: &ByteBuffer, + metadata: &PostscriptMetadata, + ) -> VortexResult<(String, ByteBuffer)> { + let offset = usize::try_from(metadata.segment.offset - initial_offset)?; + let length = metadata.segment.length as usize; + let end = offset + .checked_add(length) + .ok_or_else(|| vortex_err!("Metadata segment range overflowed usize"))?; + + if end > initial_read.len() { + vortex_bail!( + "Metadata segment {} range {}..{} out of bounds for initial read of length {}", + metadata.key, + offset, + end, + initial_read.len() + ); + } + + Ok(( + metadata.key.clone(), + initial_read + .slice_unaligned(offset..end) + .aligned(metadata.segment.alignment), + )) + } + /// Parse the rest of the footer from the initial read. fn parse_footer( &self, initial_offset: u64, initial_read: &[u8], - footer_segment: &PostscriptSegment, - layout_segment: &PostscriptSegment, + postscript: &Postscript, dtype: DType, file_stats: Option, + metadata: Arc>, ) -> VortexResult