Skip to content

Commit

Permalink
Test fsspec roundtrip (#42)
Browse files Browse the repository at this point in the history
* move kerchunk backend imports to be specific to each backend filetype

* test roundtrip to json file then reading using fsspec

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test env dependencies

* more test env deps

* more

* add pip install of xarray PR

* correct pip url

* roundtrip test involving concatenation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove duplication of pooch

* correct formatting

* try removing netcdf4-python from the environment

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
TomNicholas and pre-commit-ci[bot] committed May 16, 2024
1 parent f9ca667 commit ca99d5a
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ dependencies = [
"numpy",
"ujson",
"packaging",
"universal-pathlib"

"universal-pathlib",
]

[project.optional-dependencies]
Expand All @@ -39,8 +38,9 @@ test = [
"pytest-mypy",
"pytest-cov",
"pytest",
"scipy",
"fsspec",
"pooch",
"scipy",
"ruff",
"fastparquet",
"s3fs"
Expand Down
60 changes: 60 additions & 0 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,70 @@
import fsspec
import pytest
import xarray as xr
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset


def test_kerchunk_roundtrip_no_concat(tmpdir):
    """Roundtrip a single netCDF file through kerchunk JSON references.

    Writes a tutorial dataset to disk, virtualizes it into chunk references,
    serializes those references as kerchunk JSON, then reads them back through
    fsspec's reference filesystem and checks the data survives unchanged.
    """
    # create an example dataset and persist it as netCDF in the temp directory
    original = xr.tutorial.open_dataset("air_temperature", decode_times=False)
    original.to_netcdf(f"{tmpdir}/air.nc")

    # open the netCDF file as a virtual dataset of chunk references
    vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})

    # serialize the references to disk in kerchunk's JSON format
    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")

    # read the references back through fsspec's reference filesystem
    ref_fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
    mapper = ref_fs.get_mapper("")
    roundtrip = xr.open_dataset(mapper, engine="kerchunk")

    # the roundtripped dataset should match the original exactly
    xrt.assert_equal(roundtrip, original)


def test_kerchunk_roundtrip_concat(tmpdir):
    """Roundtrip two netCDF files through kerchunk references after virtual concat.

    Splits a tutorial dataset into two netCDF files, opens each as a virtual
    dataset of chunk references, concatenates them virtually along ``time``,
    serializes the combined references as kerchunk JSON, and checks that
    reading them back via fsspec reproduces the original data.
    """
    # set up example xarray dataset, truncated so the test stays small
    ds = xr.tutorial.open_dataset("air_temperature", decode_times=False).isel(
        time=slice(None, 2000)
    )

    # split into two datasets along the time dimension
    ds1, ds2 = ds.isel(time=slice(None, 1000)), ds.isel(time=slice(1000, None))

    # save both halves to disk as netCDF (in temporary directory)
    ds1.to_netcdf(f"{tmpdir}/air1.nc")
    ds2.to_netcdf(f"{tmpdir}/air2.nc")

    # use open_virtual_dataset to read each file as references
    vds1 = open_virtual_dataset(f"{tmpdir}/air1.nc", indexes={})
    vds2 = open_virtual_dataset(f"{tmpdir}/air2.nc", indexes={})

    # concatenate virtually along time; coords/compat chosen so no chunk data
    # needs to be loaded to perform the concatenation
    vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")

    # write those references to disk as kerchunk json
    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")

    # use fsspec to read the dataset from disk via the zarr store
    fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
    m = fs.get_mapper("")

    roundtrip = xr.open_dataset(m, engine="kerchunk")

    # assert equal to original dataset
    xrt.assert_equal(roundtrip, ds)


def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100

Expand Down

0 comments on commit ca99d5a

Please sign in to comment.