diff --git a/ci/doc.yml b/ci/doc.yml
index aaa1293..7d7e922 100644
--- a/ci/doc.yml
+++ b/ci/doc.yml
@@ -13,4 +13,4 @@ dependencies:
   - "sphinx_design"
   - "sphinx_togglebutton"
   - "sphinx-autodoc-typehints"
-  - -e ..
+  - -e "..[test]"
diff --git a/docs/usage.md b/docs/usage.md
index 4fc5411..baaf5b1 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -27,6 +27,7 @@ vds = open_virtual_dataset('air.nc')
 
 (Notice we did not have to explicitly indicate the file format, as {py:func}`open_virtual_dataset ` will attempt to automatically infer it.)
 
+
 ```{note}
 In future we would like for it to be possible to just use `xr.open_dataset`, e.g.
 
@@ -61,6 +62,15 @@ Attributes:
 
 These {py:class}`ManifestArray ` objects are each a virtual reference to some data in the `air.nc` netCDF file, with the references stored in the form of "Chunk Manifests".
 
+### Opening remote files
+
+To open remote files as virtual datasets, pass the `reader_options` argument, e.g.
+
+```python
+aws_credentials = {"key": ..., "secret": ...}
+vds = open_virtual_dataset("s3://some-bucket/file.nc", reader_options={'storage_options': aws_credentials})
+```
+
 ## Chunk Manifests
 
 In the Zarr model N-dimensional arrays are stored as a series of compressed chunks, each labelled by a chunk key which indicates its position in the array. Whilst conventionally each of these Zarr chunks are a separate compressed binary file stored within a Zarr Store, there is no reason why these chunks could not actually already exist as part of another file (e.g. a netCDF file), and be loaded by reading a specific byte range from this pre-existing file.
diff --git a/pyproject.toml b/pyproject.toml
index 4665db8..423574d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,8 @@ dependencies = [
     "numpy",
     "ujson",
     "packaging",
+    "universal-pathlib"
+
 ]
 
 [project.optional-dependencies]
@@ -40,6 +42,7 @@ test = [
     "scipy",
     "pooch",
     "ruff",
+    "s3fs"
 ]
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
index 6d7cd2d..f24d5f3 100644
--- a/virtualizarr/kerchunk.py
+++ b/virtualizarr/kerchunk.py
@@ -1,9 +1,10 @@
 from pathlib import Path
-from typing import NewType, cast
+from typing import NewType, Optional, cast
 
 import ujson  # type: ignore
 import xarray as xr
 
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import ZArray, ZAttrs
 
 # Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean
@@ -38,7 +39,11 @@ class FileType(AutoName):
 
 
 def read_kerchunk_references_from_file(
-    filepath: str, filetype: FileType | None
+    filepath: str,
+    filetype: FileType | None,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
 ) -> KerchunkStoreRefs:
     """
     Read a single legacy file and return kerchunk references to its contents.
@@ -50,10 +55,15 @@ def read_kerchunk_references_from_file(
     filepath : FileType, default: None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
         If not provided will attempt to automatically infer the correct filetype from the
         the filepath's extension.
+    reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+        Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
+        so ensure reader_options match the selected Kerchunk reader's arguments.
""" if filetype is None: - filetype = _automatically_determine_filetype(filepath) + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) # if filetype is user defined, convert to FileType filetype = FileType(filetype) @@ -61,12 +71,14 @@ def read_kerchunk_references_from_file( if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr - refs = NetCDF3ToZarr(filepath, inline_threshold=0).translate() + refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() elif filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr - refs = SingleHdf5ToZarr(filepath, inline_threshold=0).translate() + refs = SingleHdf5ToZarr( + filepath, inline_threshold=0, **reader_options + ).translate() elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 @@ -74,11 +86,11 @@ def read_kerchunk_references_from_file( elif filetype.name.lower() == "tiff": from kerchunk.tiff import tiff_to_zarr - refs = tiff_to_zarr(filepath, inline_threshold=0) + refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options) elif filetype.name.lower() == "fits": from kerchunk.fits import process_file - refs = process_file(filepath, inline_threshold=0) + refs = process_file(filepath, inline_threshold=0, **reader_options) else: raise NotImplementedError(f"Unsupported file type: {filetype.name}") @@ -86,20 +98,24 @@ def read_kerchunk_references_from_file( return refs -def _automatically_determine_filetype(filepath: str) -> FileType: +def _automatically_determine_filetype( + *, filepath: str, reader_options: Optional[dict] = {} +) -> FileType: file_extension = Path(filepath).suffix + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - with open(filepath, "rb") as f: - magic = f.read() + magic = fpath.read() + if magic[0:3] == b"CDF": filetype = FileType.netcdf3 elif magic[1:4] == b"HDF": filetype = FileType.netcdf4 else: raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4") - elif file_extension == ".zarr": # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
         raise NotImplementedError()
@@ -112,6 +128,7 @@ def _automatically_determine_filetype(filepath: str) -> FileType:
     else:
         raise NotImplementedError(f"Unrecognised file extension: {file_extension}")
 
+    fpath.close()
     return filetype
 
 
diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py
index 66315f1..d482097 100644
--- a/virtualizarr/tests/test_kerchunk.py
+++ b/virtualizarr/tests/test_kerchunk.py
@@ -170,8 +170,12 @@ def test_automatically_determine_filetype_netcdf3_netcdf4():
     ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC")
     ds.to_netcdf(netcdf4_file_path, engine="h5netcdf")
 
-    assert FileType("netcdf3") == _automatically_determine_filetype(netcdf3_file_path)
-    assert FileType("netcdf4") == _automatically_determine_filetype(netcdf4_file_path)
+    assert FileType("netcdf3") == _automatically_determine_filetype(
+        filepath=netcdf3_file_path
+    )
+    assert FileType("netcdf4") == _automatically_determine_filetype(
+        filepath=netcdf4_file_path
+    )
 
 
 def test_FileType():
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index cf00ad6..d145550 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -1,6 +1,7 @@
 from collections.abc import Mapping
 
 import numpy as np
+import pytest
 import xarray as xr
 import xarray.testing as xrt
 from xarray.core.indexes import Index
@@ -268,6 +269,24 @@ def test_combine_by_coords(self, netcdf4_files):
         assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing
 
 
+pytest.importorskip("s3fs")
+
+
+@pytest.mark.parametrize(
+    "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"]
+)
+@pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"])
+def test_anon_read_s3(filetype, indexes):
+    """Parameterized tests for empty vs supplied indexes and filetypes."""
+    # TODO: Switch away from this s3 url after minIO is implemented.
+    fpath = "s3://carbonplan-share/virtualizarr/local.nc"
+    vds = open_virtual_dataset(fpath, filetype=filetype, indexes=indexes)
+
+    assert vds.dims == {"time": 2920, "lat": 25, "lon": 53}
+    for var in vds.variables:
+        assert isinstance(vds[var].data, ManifestArray), var
+
+
 class TestLoadVirtualDataset:
     def test_loadable_variables(self, netcdf4_file):
         vars_to_load = ["air", "time"]
@@ -280,6 +299,7 @@ def test_loadable_variables(self, netcdf4_file):
                 assert isinstance(vds[name].data, ManifestArray), name
 
         full_ds = xr.open_dataset(netcdf4_file)
+
         for name in full_ds.variables:
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
new file mode 100644
index 0000000..6ba7105
--- /dev/null
+++ b/virtualizarr/utils.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from fsspec.implementations.local import LocalFileOpener
+    from s3fs.core import S3File
+
+
+def _fsspec_openfile_from_filepath(
+    *,
+    filepath: str,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
+) -> S3File | LocalFileOpener:
+    """Converts input filepath to fsspec openfile object.
+
+    Parameters
+    ----------
+    filepath : str
+        Input filepath
+    reader_options : dict, optional
+        Dict containing kwargs to pass to file opener, by default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+
+    Returns
+    -------
+    S3File | LocalFileOpener
+        Either S3File or LocalFileOpener, depending on which protocol was supplied.
+
+    Raises
+    ------
+    NotImplementedError
+        Raises a NotImplementedError if the filepath protocol is not supported.
+    """
+
+    import fsspec
+    from upath import UPath
+
+    universal_filepath = UPath(filepath)
+    protocol = universal_filepath.protocol
+
+    if protocol == "":
+        fpath = fsspec.open(filepath, "rb").open()
+
+    elif protocol in ["s3"]:
+        s3_anon_defaults = {"key": "", "secret": "", "anon": True}
+        if not bool(reader_options):
+            storage_options = s3_anon_defaults
+
+        else:
+            storage_options = reader_options.get("storage_options")  # type: ignore
+
+            # using dict merge operator to add in defaults if keys are not specified
+            storage_options = s3_anon_defaults | storage_options
+
+        fpath = fsspec.filesystem(protocol, **storage_options).open(filepath)
+
+    else:
+        raise NotImplementedError(
+            "Only local and s3 file protocols are currently supported"
+        )
+
+    return fpath
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 3fcb6dc..41d9985 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import (
     Literal,
+    Optional,
     overload,
 )
 
@@ -15,6 +16,7 @@
 import virtualizarr.kerchunk as kerchunk
 from virtualizarr.kerchunk import FileType, KerchunkStoreRefs
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,
@@ -35,6 +37,9 @@ def open_virtual_dataset(
     loadable_variables: Iterable[str] | None = None,
     indexes: Mapping[str, Index] | None = None,
     virtual_array_class=ManifestArray,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
 ) -> xr.Dataset:
     """
     Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
@@ -63,6 +68,9 @@ def open_virtual_dataset(
     virtual_array_class
         Virtual array class to use to represent the references to the chunks in each on-disk array.
         Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
+    reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
+        Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
+        so ensure reader_options match the selected Kerchunk reader's arguments.
 
     Returns
     -------
@@ -112,7 +120,11 @@ def open_virtual_dataset(
         # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
         # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-        ds = xr.open_dataset(filepath, drop_variables=drop_variables)
+        fpath = _fsspec_openfile_from_filepath(
+            filepath=filepath, reader_options=reader_options
+        )
+
+        ds = xr.open_dataset(fpath, drop_variables=drop_variables)
 
         if indexes is None:
             # add default indexes by reading data from file
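
For reference, a minimal usage sketch of the `reader_options` argument this diff adds to `open_virtual_dataset`. The bucket names and credential values below are placeholders, and the import assumes `open_virtual_dataset` is exported from the top-level `virtualizarr` package as it is used in docs/usage.md.

```python
from virtualizarr import open_virtual_dataset

# Anonymous S3 access: reader_options defaults to
# {"storage_options": {"key": "", "secret": "", "anon": True}}, so it can be
# omitted for public buckets (placeholder URL shown here).
vds = open_virtual_dataset("s3://some-public-bucket/file.nc")

# Credentialed S3 access: pass fsspec-style storage_options explicitly.
# The key/secret values are placeholders, not real credentials.
vds = open_virtual_dataset(
    "s3://some-private-bucket/file.nc",
    reader_options={
        "storage_options": {"key": "my-key", "secret": "my-secret", "anon": False}
    },
)
```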
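
And a sketch of how the new `_fsspec_openfile_from_filepath` helper dispatches on the URL protocol, assuming the local file exists and the public S3 object used in the new test is reachable; the helper is private, so this is illustration only rather than a supported API.

```python
from virtualizarr.utils import _fsspec_openfile_from_filepath

# Local path: the protocol is empty, so plain fsspec.open(..., "rb") is used.
local_file = _fsspec_openfile_from_filepath(filepath="air.nc", reader_options={})
local_file.close()

# s3:// path: user storage_options are merged over the anonymous defaults
# ({"key": "", "secret": "", "anon": True}), with user-supplied keys taking precedence.
s3_file = _fsspec_openfile_from_filepath(
    filepath="s3://carbonplan-share/virtualizarr/local.nc",
    reader_options={"storage_options": {"anon": True}},
)
print(s3_file.read(4))  # first bytes of the object, e.g. the HDF5 magic number
s3_file.close()
```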