From 195f368869fcd1799c23d7ad79a09fa1cf66b99d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 17 Apr 2026 11:11:03 -0400 Subject: [PATCH 1/4] fix #6599 Signed-off-by: Andrew Duffy --- vortex-io/src/filesystem/glob.rs | 105 ++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 15 deletions(-) diff --git a/vortex-io/src/filesystem/glob.rs b/vortex-io/src/filesystem/glob.rs index 6bbe26fa5dc..255c4e983c3 100644 --- a/vortex-io/src/filesystem/glob.rs +++ b/vortex-io/src/filesystem/glob.rs @@ -23,13 +23,21 @@ impl dyn FileSystem + '_ { validate_glob(pattern)?; // If there are no glob characters, the pattern is an exact file path. - // Return it directly without listing the filesystem. + // Use [`list`](FileSystem::list) with the full path as the prefix so that + // the filesystem confirms the file exists and populates its size, then + // filter to the exact match to avoid yielding prefix-collisions such as + // `foo.vortex.backup` when the caller asked for `foo.vortex`. if !pattern.contains(['*', '?', '[']) { - let listing = FileListing { - path: pattern.to_string(), - size: None, - }; - return Ok(futures::stream::once(async { Ok(listing) }).boxed()); + let pattern = pattern.to_string(); + let stream = self + .list(&pattern) + .try_filter(move |listing| { + let matches = listing.path == pattern; + async move { matches } + }) + .into_stream() + .boxed(); + return Ok(stream); } let glob_pattern = glob::Pattern::new(pattern) @@ -87,20 +95,40 @@ mod tests { use async_trait::async_trait; use futures::TryStreamExt; + use futures::stream; + use parking_lot::Mutex; use vortex_error::vortex_panic; use super::*; use crate::VortexReadAt; use crate::filesystem::FileSystem; - /// A mock filesystem that panics if `list` is called. + /// A mock filesystem whose `list` records the requested prefix and returns + /// a preconfigured set of listings. #[derive(Debug)] - struct NoListFileSystem; + struct MockFileSystem { + listings: Vec, + last_prefix: Mutex>, + } + + impl MockFileSystem { + fn new(listings: Vec) -> Self { + Self { + listings, + last_prefix: Mutex::new(None), + } + } + + fn last_prefix(&self) -> Option { + self.last_prefix.lock().clone() + } + } #[async_trait] - impl FileSystem for NoListFileSystem { - fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult> { - vortex_panic!("list() should not be called for exact paths") + impl FileSystem for MockFileSystem { + fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { + *self.last_prefix.lock() = Some(prefix.to_string()); + stream::iter(self.listings.clone().into_iter().map(Ok)).boxed() } async fn open_read(&self, _path: &str) -> VortexResult> { @@ -113,12 +141,59 @@ mod tests { } #[tokio::test] - async fn test_glob_exact_path_skips_list() -> VortexResult<()> { - let fs: &dyn FileSystem = &NoListFileSystem; - let results: Vec = fs.glob("data/file.vortex")?.try_collect().await?; + async fn test_glob_exact_path_existing_returns_listing_with_size() -> VortexResult<()> { + let fs = MockFileSystem::new(vec![FileListing { + path: "data/file.vortex".to_string(), + size: Some(1024), + }]); + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("data/file.vortex")?.try_collect().await?; assert_eq!(results.len(), 1); assert_eq!(results[0].path, "data/file.vortex"); - assert_eq!(results[0].size, None); + assert_eq!( + results[0].size, + Some(1024), + "exact-path glob should propagate the size reported by list" + ); + assert_eq!( + fs.last_prefix().as_deref(), + Some("data/file.vortex"), + "exact path should be passed as the list prefix" + ); + Ok(()) + } + + #[tokio::test] + async fn test_glob_exact_path_missing_returns_empty_stream() -> VortexResult<()> { + let fs = MockFileSystem::new(vec![]); + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("data/missing.vortex")?.try_collect().await?; + assert!( + results.is_empty(), + "missing exact path should yield an empty stream" + ); + Ok(()) + } + + #[tokio::test] + async fn test_glob_exact_path_filters_prefix_collisions() -> VortexResult<()> { + // Object stores list by prefix, so `list("foo.vortex")` can return `foo.vortex.backup`. + // The exact-path branch must filter to an equality match. + let fs = MockFileSystem::new(vec![ + FileListing { + path: "foo.vortex".to_string(), + size: Some(10), + }, + FileListing { + path: "foo.vortex.backup".to_string(), + size: Some(20), + }, + ]); + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("foo.vortex")?.try_collect().await?; + assert_eq!(results.len(), 1); + assert_eq!(results[0].path, "foo.vortex"); + assert_eq!(results[0].size, Some(10)); Ok(()) } From f11756fd388d4078c022de3b2b7341f2c11f96eb Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 4 Jun 2026 14:29:51 +0100 Subject: [PATCH 2/4] more Signed-off-by: Robert Kruszewski --- vortex-duckdb/src/filesystem.rs | 16 ++++ vortex-io/src/compat/filesystem.rs | 4 + vortex-io/src/filesystem/glob.rs | 98 +++++++++++------------- vortex-io/src/filesystem/mod.rs | 11 +++ vortex-io/src/filesystem/prefix.rs | 12 +++ vortex-io/src/object_store/filesystem.rs | 86 +++++++++++++++++++++ 6 files changed, 172 insertions(+), 55 deletions(-) diff --git a/vortex-duckdb/src/filesystem.rs b/vortex-duckdb/src/filesystem.rs index b88ce80fd38..b9c09dfe9d3 100644 --- a/vortex-duckdb/src/filesystem.rs +++ b/vortex-duckdb/src/filesystem.rs @@ -147,6 +147,22 @@ impl FileSystem for DuckDbFileSystem { .boxed() } + async fn head(&self, path: &str) -> VortexResult> { + // DuckDB's filesystem exposes no stat/exists call, so probe the path by opening it and + // reading its size. DuckDB does not distinguish "not found" from other open failures, so + // any open error is treated as a missing file (logged for diagnosability). + match self.open_read(path).await { + Ok(reader) => Ok(Some(FileListing { + path: path.to_string(), + size: Some(reader.size().await?), + })), + Err(e) => { + tracing::debug!("head({path}): treating open error as not-found: {e}"); + Ok(None) + } + } + } + async fn open_read(&self, path: &str) -> VortexResult> { let mut url = self.base_url.clone(); url.set_path(path); diff --git a/vortex-io/src/compat/filesystem.rs b/vortex-io/src/compat/filesystem.rs index 9f9902b6a08..440ecfe8a71 100644 --- a/vortex-io/src/compat/filesystem.rs +++ b/vortex-io/src/compat/filesystem.rs @@ -21,6 +21,10 @@ impl FileSystem for Compat { Compat::new(self.inner().list(prefix)).boxed() } + async fn head(&self, path: &str) -> VortexResult> { + Compat::new(self.inner().head(path)).await + } + async fn open_read(&self, path: &str) -> VortexResult> { let read_at = Compat::new(self.inner().open_read(path)).await?; Ok(Arc::new(Compat::new(read_at))) diff --git a/vortex-io/src/filesystem/glob.rs b/vortex-io/src/filesystem/glob.rs index 255c4e983c3..030bf4f56d1 100644 --- a/vortex-io/src/filesystem/glob.rs +++ b/vortex-io/src/filesystem/glob.rs @@ -3,6 +3,7 @@ use futures::StreamExt; use futures::TryStreamExt; +use futures::stream; use futures::stream::BoxStream; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -22,20 +23,16 @@ impl dyn FileSystem + '_ { pub fn glob(&self, pattern: &str) -> VortexResult>> { validate_glob(pattern)?; - // If there are no glob characters, the pattern is an exact file path. - // Use [`list`](FileSystem::list) with the full path as the prefix so that - // the filesystem confirms the file exists and populates its size, then - // filter to the exact match to avoid yielding prefix-collisions such as - // `foo.vortex.backup` when the caller asked for `foo.vortex`. + // If there are no glob characters, the pattern is an exact file path. `list` enumerates + // entries *under* a prefix on a path-segment basis and never yields the prefix itself, so + // listing an exact path would report an existing file as missing (and could surface prefix + // collisions such as `foo.vortex.backup` when the caller asked for `foo.vortex`). Use + // `head` to confirm the file exists and capture its size, yielding a single-element stream + // when it does and an empty stream when it does not. if !pattern.contains(['*', '?', '[']) { let pattern = pattern.to_string(); - let stream = self - .list(&pattern) - .try_filter(move |listing| { - let matches = listing.path == pattern; - async move { matches } - }) - .into_stream() + let stream = stream::once(async move { self.head(&pattern).await }) + .try_filter_map(|listing| async move { Ok(listing) }) .boxed(); return Ok(stream); } @@ -95,40 +92,47 @@ mod tests { use async_trait::async_trait; use futures::TryStreamExt; - use futures::stream; - use parking_lot::Mutex; use vortex_error::vortex_panic; use super::*; use crate::VortexReadAt; use crate::filesystem::FileSystem; - /// A mock filesystem whose `list` records the requested prefix and returns - /// a preconfigured set of listings. + /// A mock filesystem that resolves exact paths through [`head`](FileSystem::head) and + /// panics if [`list`](FileSystem::list) is called. This encodes the invariant the fix + /// depends on: the exact-path glob branch must never list, because an object store's `list` + /// does not return the exact path of a file. #[derive(Debug)] - struct MockFileSystem { - listings: Vec, - last_prefix: Mutex>, + struct HeadFileSystem { + files: Vec, } - impl MockFileSystem { - fn new(listings: Vec) -> Self { + impl HeadFileSystem { + fn new(files: &[(&str, u64)]) -> Self { Self { - listings, - last_prefix: Mutex::new(None), + files: files + .iter() + .map(|&(path, size)| FileListing { + path: path.to_string(), + size: Some(size), + }) + .collect(), } } - - fn last_prefix(&self) -> Option { - self.last_prefix.lock().clone() - } } #[async_trait] - impl FileSystem for MockFileSystem { - fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { - *self.last_prefix.lock() = Some(prefix.to_string()); - stream::iter(self.listings.clone().into_iter().map(Ok)).boxed() + impl FileSystem for HeadFileSystem { + fn list(&self, _prefix: &str) -> BoxStream<'_, VortexResult> { + vortex_panic!("list() must not be called for an exact path; glob should use head()") + } + + async fn head(&self, path: &str) -> VortexResult> { + Ok(self + .files + .iter() + .find(|listing| listing.path == path) + .cloned()) } async fn open_read(&self, _path: &str) -> VortexResult> { @@ -142,10 +146,7 @@ mod tests { #[tokio::test] async fn test_glob_exact_path_existing_returns_listing_with_size() -> VortexResult<()> { - let fs = MockFileSystem::new(vec![FileListing { - path: "data/file.vortex".to_string(), - size: Some(1024), - }]); + let fs = HeadFileSystem::new(&[("data/file.vortex", 1024)]); let fs_dyn: &dyn FileSystem = &fs; let results: Vec = fs_dyn.glob("data/file.vortex")?.try_collect().await?; assert_eq!(results.len(), 1); @@ -153,19 +154,14 @@ mod tests { assert_eq!( results[0].size, Some(1024), - "exact-path glob should propagate the size reported by list" - ); - assert_eq!( - fs.last_prefix().as_deref(), - Some("data/file.vortex"), - "exact path should be passed as the list prefix" + "exact-path glob should propagate the size reported by head" ); Ok(()) } #[tokio::test] async fn test_glob_exact_path_missing_returns_empty_stream() -> VortexResult<()> { - let fs = MockFileSystem::new(vec![]); + let fs = HeadFileSystem::new(&[]); let fs_dyn: &dyn FileSystem = &fs; let results: Vec = fs_dyn.glob("data/missing.vortex")?.try_collect().await?; assert!( @@ -176,19 +172,11 @@ mod tests { } #[tokio::test] - async fn test_glob_exact_path_filters_prefix_collisions() -> VortexResult<()> { - // Object stores list by prefix, so `list("foo.vortex")` can return `foo.vortex.backup`. - // The exact-path branch must filter to an equality match. - let fs = MockFileSystem::new(vec![ - FileListing { - path: "foo.vortex".to_string(), - size: Some(10), - }, - FileListing { - path: "foo.vortex.backup".to_string(), - size: Some(20), - }, - ]); + async fn test_glob_exact_path_ignores_prefix_siblings() -> VortexResult<()> { + // A real object store lists by prefix and would surface `foo.vortex.backup` when asked to + // list `foo.vortex`. Resolving the exact path via head sidesteps that: only the requested + // key is returned, and the panicking `list` proves the branch never enumerated. + let fs = HeadFileSystem::new(&[("foo.vortex", 10), ("foo.vortex.backup", 20)]); let fs_dyn: &dyn FileSystem = &fs; let results: Vec = fs_dyn.glob("foo.vortex")?.try_collect().await?; assert_eq!(results.len(), 1); diff --git a/vortex-io/src/filesystem/mod.rs b/vortex-io/src/filesystem/mod.rs index 97535909d28..740d9e2bc43 100644 --- a/vortex-io/src/filesystem/mod.rs +++ b/vortex-io/src/filesystem/mod.rs @@ -51,6 +51,17 @@ pub trait FileSystem: Debug + Send + Sync { /// callers should sort if deterministic ordering is required. fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult>; + /// Fetch metadata for the file at the exact `path`, if it exists. + /// + /// Unlike [`list`](FileSystem::list), which enumerates files *under* a prefix on a + /// path-segment basis and never yields the prefix itself, `head` looks up the object at + /// exactly `path`. It is the correct primitive for confirming that a single known file + /// exists and reading its size. + /// + /// Returns `Ok(Some(_))` with the file's [`FileListing`] when it exists, `Ok(None)` when no + /// file exists at `path`, and `Err(_)` for any other failure (I/O or permission errors, etc.). + async fn head(&self, path: &str) -> VortexResult>; + /// Open a file for reading at the given path. async fn open_read(&self, path: &str) -> VortexResult>; diff --git a/vortex-io/src/filesystem/prefix.rs b/vortex-io/src/filesystem/prefix.rs index 5f6304430aa..d96f54cd9bb 100644 --- a/vortex-io/src/filesystem/prefix.rs +++ b/vortex-io/src/filesystem/prefix.rs @@ -52,6 +52,18 @@ impl FileSystem for PrefixFileSystem { .boxed() } + async fn head(&self, path: &str) -> VortexResult> { + let full_path = format!("{}{}", self.prefix, path.trim_start_matches('/')); + Ok(self.inner.head(&full_path).await?.map(|mut listing| { + listing.path = listing + .path + .strip_prefix(&self.prefix) + .unwrap_or(&listing.path) + .to_string(); + listing + })) + } + async fn open_read(&self, path: &str) -> VortexResult> { self.inner .open_read(&format!("{}{}", self.prefix, path.trim_start_matches('/'))) diff --git a/vortex-io/src/object_store/filesystem.rs b/vortex-io/src/object_store/filesystem.rs index b8c2aa39ce2..9fb0adb636c 100644 --- a/vortex-io/src/object_store/filesystem.rs +++ b/vortex-io/src/object_store/filesystem.rs @@ -76,6 +76,19 @@ impl FileSystem for ObjectStoreFileSystem { .boxed() } + async fn head(&self, path: &str) -> VortexResult> { + // `head` issues a single metadata lookup (e.g. an S3 HEAD) for the exact key, unlike + // `list`, which enumerates by path-segment prefix and never returns the key itself. + match self.store.head(&Path::from(path)).await { + Ok(meta) => Ok(Some(FileListing { + path: meta.location.to_string(), + size: Some(meta.size), + })), + Err(object_store::Error::NotFound { .. }) => Ok(None), + Err(e) => Err(e.into()), + } + } + async fn open_read(&self, path: &str) -> VortexResult> { Ok(Arc::new(ObjectStoreReadAt::new( Arc::clone(&self.store), @@ -94,3 +107,76 @@ impl FileSystem for ObjectStoreFileSystem { Ok(()) } } + +// Exercises the fix against a real object store, whose `list` excludes the exact-path match. +// `Handle::find` only resolves a runtime under the `tokio` feature, so gate these tests on it. +#[cfg(test)] +#[cfg(feature = "tokio")] +mod tests { + use futures::TryStreamExt; + use object_store::ObjectStoreExt; + use object_store::memory::InMemory; + + use super::*; + use crate::filesystem::FileSystem; + use crate::runtime::Handle; + + /// Build an [`ObjectStoreFileSystem`] over an in-memory store seeded with `(path, size)` files. + async fn memory_fs(files: &[(&str, usize)]) -> VortexResult { + let store = Arc::new(InMemory::new()) as Arc; + for &(path, size) in files { + store.put(&Path::from(path), vec![0u8; size].into()).await?; + } + let handle = Handle::find().expect("tokio runtime available within #[tokio::test]"); + Ok(ObjectStoreFileSystem::new(store, handle)) + } + + /// Regression test for #6599: globbing an exact path that exists must return that one file. + /// `ObjectStore::list` never yields the prefix itself, so this would return nothing if the + /// exact-path branch used `list`. + #[tokio::test] + async fn test_glob_exact_existing_path() -> VortexResult<()> { + let fs = memory_fs(&[("data/file.vortex", 1024)]).await?; + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("data/file.vortex")?.try_collect().await?; + assert_eq!(results.len(), 1); + assert_eq!(results[0].path, "data/file.vortex"); + assert_eq!(results[0].size, Some(1024)); + Ok(()) + } + + #[tokio::test] + async fn test_glob_exact_missing_path_is_empty() -> VortexResult<()> { + let fs = memory_fs(&[("data/other.vortex", 1)]).await?; + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("data/missing.vortex")?.try_collect().await?; + assert!(results.is_empty()); + Ok(()) + } + + /// `list("foo.vortex")` would surface the prefix-sibling `foo.vortex.backup`; `head` does not. + #[tokio::test] + async fn test_glob_exact_path_ignores_prefix_siblings() -> VortexResult<()> { + let fs = memory_fs(&[("foo.vortex", 10), ("foo.vortex.backup", 20)]).await?; + let fs_dyn: &dyn FileSystem = &fs; + let results: Vec = fs_dyn.glob("foo.vortex")?.try_collect().await?; + assert_eq!(results.len(), 1); + assert_eq!(results[0].path, "foo.vortex"); + assert_eq!(results[0].size, Some(10)); + Ok(()) + } + + #[tokio::test] + async fn test_head_existing_and_missing() -> VortexResult<()> { + let fs = memory_fs(&[("a/b.vortex", 7)]).await?; + assert_eq!( + fs.head("a/b.vortex").await?, + Some(FileListing { + path: "a/b.vortex".to_string(), + size: Some(7), + }) + ); + assert_eq!(fs.head("a/missing.vortex").await?, None); + Ok(()) + } +} From e1df09e2dbfd0dcd4011f30f4e4de432e0695d54 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 4 Jun 2026 14:42:15 +0100 Subject: [PATCH 3/4] fixerror Signed-off-by: Robert Kruszewski --- vortex-ffi/test/scan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-ffi/test/scan.cpp b/vortex-ffi/test/scan.cpp index bc39fa12074..2e138a12913 100644 --- a/vortex-ffi/test/scan.cpp +++ b/vortex-ffi/test/scan.cpp @@ -209,7 +209,7 @@ TEST_CASE("Creating datasources", "[datasource]") { ds = vx_data_source_new(session, &opts, &error); REQUIRE(ds == nullptr); REQUIRE(error != nullptr); - REQUIRE_THAT(to_string(error), ContainsSubstring("No such file or directory")); + REQUIRE_THAT(to_string(error), ContainsSubstring("No files matched the glob pattern")); vx_error_free(error); opts.paths = "/tmp2/*.vortex"; From 134e11c069be255e1eb5186a77539939e4d9d8e7 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 4 Jun 2026 15:50:49 +0100 Subject: [PATCH 4/4] more Signed-off-by: Robert Kruszewski --- vortex-io/src/filesystem/mod.rs | 10 ++ vortex-io/src/object_store/filesystem.rs | 138 +++++++++++++++++++++-- 2 files changed, 136 insertions(+), 12 deletions(-) diff --git a/vortex-io/src/filesystem/mod.rs b/vortex-io/src/filesystem/mod.rs index 740d9e2bc43..d21a1bd8200 100644 --- a/vortex-io/src/filesystem/mod.rs +++ b/vortex-io/src/filesystem/mod.rs @@ -36,6 +36,16 @@ pub type FileSystemRef = Arc; /// Implementations handle the details of a particular storage backend (local disk, S3, GCS, etc.) /// while consumers work through this uniform interface. /// +/// # Paths +/// +/// Path strings are *literal* object keys / file paths: the characters are used verbatim, with no +/// shell-style `~` expansion (`~` is a literal tilde, not the home directory) and no +/// percent-encoding or -decoding applied by this layer (`%20` is the three characters `%`, `2`, +/// `0`, not a space). A path produced by [`list`](FileSystem::list) or [`head`](FileSystem::head) +/// is the object's actual key, so it can be passed straight back to +/// [`open_read`](FileSystem::open_read) — including when it contains characters such as `~`, `%`, +/// `[`, `]`, or `#`. +/// /// # Future Work /// /// An `open_write` method will be added once [`VortexWrite`](crate::VortexWrite) is diff --git a/vortex-io/src/object_store/filesystem.rs b/vortex-io/src/object_store/filesystem.rs index 9fb0adb636c..0688216fc0f 100644 --- a/vortex-io/src/object_store/filesystem.rs +++ b/vortex-io/src/object_store/filesystem.rs @@ -55,22 +55,41 @@ impl ObjectStoreFileSystem { } } +/// Convert a literal filesystem path into an object-store [`Path`] (key). +/// +/// Object stores key their objects *literally*: a file named `a~b.vortex` has the key +/// `a~b.vortex`, and `LocalFileSystem` likewise surfaces real filenames verbatim. [`Path::parse`] +/// preserves those characters, whereas [`Path::from`] percent-encodes `~`, `%`, `[`, `]`, `#`, +/// etc. — turning `a~b.vortex` into the key `a%7Eb.vortex`, which no real object has. Using +/// `parse` keeps inputs and the keys returned by [`list`](FileSystem::list) on the same literal +/// representation, so a path from `list`/`head` round-trips back through `open_read` unchanged. +/// +/// `parse` rejects empty, `.`, and `..` segments; for those we fall back to [`Path::from`], which +/// normalizes them (this never applies to a key `list` produced, so it cannot break a round-trip). +fn to_object_path(path: &str) -> Path { + Path::parse(path).unwrap_or_else(|_| Path::from(path)) +} + +fn listing_from_meta(location: &Path, size: u64) -> FileListing { + FileListing { + path: location.to_string(), + size: Some(size), + } +} + #[async_trait] impl FileSystem for ObjectStoreFileSystem { fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { let path = if prefix.is_empty() { None } else { - Some(Path::from(prefix)) + Some(to_object_path(prefix)) }; self.store .list(path.as_ref()) .map(|result| { result - .map(|meta| FileListing { - path: meta.location.to_string(), - size: Some(meta.size), - }) + .map(|meta| listing_from_meta(&meta.location, meta.size)) .map_err(Into::into) }) .boxed() @@ -79,11 +98,8 @@ impl FileSystem for ObjectStoreFileSystem { async fn head(&self, path: &str) -> VortexResult> { // `head` issues a single metadata lookup (e.g. an S3 HEAD) for the exact key, unlike // `list`, which enumerates by path-segment prefix and never returns the key itself. - match self.store.head(&Path::from(path)).await { - Ok(meta) => Ok(Some(FileListing { - path: meta.location.to_string(), - size: Some(meta.size), - })), + match self.store.head(&to_object_path(path)).await { + Ok(meta) => Ok(Some(listing_from_meta(&meta.location, meta.size))), Err(object_store::Error::NotFound { .. }) => Ok(None), Err(e) => Err(e.into()), } @@ -92,7 +108,7 @@ impl FileSystem for ObjectStoreFileSystem { async fn open_read(&self, path: &str) -> VortexResult> { Ok(Arc::new(ObjectStoreReadAt::new( Arc::clone(&self.store), - path.into(), + to_object_path(path), self.handle.clone(), ))) } @@ -115,17 +131,24 @@ impl FileSystem for ObjectStoreFileSystem { mod tests { use futures::TryStreamExt; use object_store::ObjectStoreExt; + use object_store::local::LocalFileSystem; use object_store::memory::InMemory; + use rstest::rstest; use super::*; use crate::filesystem::FileSystem; use crate::runtime::Handle; /// Build an [`ObjectStoreFileSystem`] over an in-memory store seeded with `(path, size)` files. + /// + /// Keys are written with [`to_object_path`] so the store holds the same literal keys a real + /// backend would (e.g. `a~b.vortex`, not the percent-encoded `a%7Eb.vortex`). async fn memory_fs(files: &[(&str, usize)]) -> VortexResult { let store = Arc::new(InMemory::new()) as Arc; for &(path, size) in files { - store.put(&Path::from(path), vec![0u8; size].into()).await?; + store + .put(&to_object_path(path), vec![0u8; size].into()) + .await?; } let handle = Handle::find().expect("tokio runtime available within #[tokio::test]"); Ok(ObjectStoreFileSystem::new(store, handle)) @@ -166,6 +189,97 @@ mod tests { Ok(()) } + /// Paths containing characters that `object_store` percent-encodes (`~ % [ ] # { } ^`, …) + /// must round-trip on a single literal-path convention: `head` returns the literal key, and + /// that exact string must reopen the file via `open_read` (what multi-file scan does). + /// Previously these were converted with `Path::from`, which encoded them into a key no real + /// object has, so the file was silently lost. + #[tokio::test] + #[rstest] + #[case::tilde("dir/a~b.vortex")] + #[case::percent("dir/a%20b.vortex")] + #[case::brackets("dir/a[1].vortex")] + #[case::hash("dir/a#b.vortex")] + #[case::braces("dir/a{x}.vortex")] + #[case::caret("dir/a^b.vortex")] + #[case::backslash_tilde("dir/a\\~b.vortex")] + #[case::space("dir/a b.vortex")] + async fn test_head_open_read_round_trip_special_chars(#[case] path: &str) -> VortexResult<()> { + let fs = memory_fs(&[(path, 5)]).await?; + // `head` returns the literal key, matching the caller's input. + assert_eq!( + fs.head(path).await?, + Some(FileListing { + path: path.to_string(), + size: Some(5), + }) + ); + // and that literal path reopens the file (`size()` issues a `head` under the hood). + assert_eq!(fs.open_read(path).await?.size().await?, 5); + Ok(()) + } + + /// Glob (both branches) over paths with encoded—but not glob-metacharacter—characters returns + /// the literal path, which must reopen the file. (`*`, `?`, `[` are excluded here: they make + /// the input a pattern rather than an exact path.) + #[tokio::test] + #[rstest] + #[case::tilde("dir/a~b.vortex")] + #[case::percent("dir/a%20b.vortex")] + #[case::hash("dir/a#b.vortex")] + #[case::backslash_tilde("dir/a\\~b.vortex")] + #[case::space("dir/a b.vortex")] + #[case::plain("dir/plain.vortex")] + async fn test_glob_round_trip_special_chars(#[case] path: &str) -> VortexResult<()> { + let fs = memory_fs(&[(path, 5)]).await?; + let fs_dyn: &dyn FileSystem = &fs; + let expected = FileListing { + path: path.to_string(), + size: Some(5), + }; + + // Exact-path glob returns the literal path. + let exact: Vec = fs_dyn.glob(path)?.try_collect().await?; + assert_eq!(exact, vec![expected.clone()]); + + // Wildcard glob over the directory lists the same literal path (not an encoded one). + let wild: Vec = fs_dyn.glob("dir/*.vortex")?.try_collect().await?; + assert!( + wild.contains(&expected), + "wildcard glob should list {path:?}, got {wild:?}" + ); + + // The returned path reopens the file. + assert_eq!(fs.open_read(path).await?.size().await?, 5); + Ok(()) + } + + /// The same round-trip against the real local-filesystem backend: a `~` in an on-disk filename + /// must be addressed literally (converting it with `Path::from` would percent-encode it and + /// miss the file). Confirms the literal-path convention holds across backends, not just for the + /// in-memory store. + #[tokio::test] + async fn test_local_filesystem_special_char_round_trip() -> anyhow::Result<()> { + let dir = tempfile::tempdir()?; + std::fs::write(dir.path().join("a~b.vortex"), [0u8; 5])?; + let store = Arc::new(LocalFileSystem::new_with_prefix(dir.path())?) as Arc; + let fs = ObjectStoreFileSystem::new(store, Handle::find().expect("tokio runtime")); + let fs_dyn: &dyn FileSystem = &fs; + let expected = FileListing { + path: "a~b.vortex".to_string(), + size: Some(5), + }; + + let wild: Vec = fs_dyn.glob("*.vortex")?.try_collect().await?; + assert!(wild.contains(&expected), "wildcard glob got {wild:?}"); + + let exact: Vec = fs_dyn.glob("a~b.vortex")?.try_collect().await?; + assert_eq!(exact, vec![expected]); + + assert_eq!(fs.open_read("a~b.vortex").await?.size().await?, 5); + Ok(()) + } + #[tokio::test] async fn test_head_existing_and_missing() -> VortexResult<()> { let fs = memory_fs(&[("a/b.vortex", 7)]).await?;