Skip to content

Commit

Permalink
Test fsspec roundtrip (#42)
Browse files Browse the repository at this point in the history
* move kerchunk backend imports to be specific to each backend filetype

* test roundtrip to json file then reading using fsspec

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test env dependencies

* more test env deps

* more

* add pip install of xarray PR

* correct pip url

* roundtrip test involving concatenation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove duplication of pooch

* correct formatting

* try removing netcdf4-python from the environment

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
TomNicholas and pre-commit-ci[bot] committed May 16, 2024
1 parent f9ca667 commit ca99d5a
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ dependencies = [
"numpy",
"ujson",
"packaging",
"universal-pathlib"

"universal-pathlib",
]

[project.optional-dependencies]
Expand All @@ -39,8 +38,9 @@ test = [
"pytest-mypy",
"pytest-cov",
"pytest",
"scipy",
"fsspec",
"pooch",
"scipy",
"ruff",
"fastparquet",
"s3fs"
Expand Down
60 changes: 60 additions & 0 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,70 @@
import fsspec
import pytest
import xarray as xr
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset


def test_kerchunk_roundtrip_no_concat(tmpdir):
    """Roundtrip a single netCDF file through kerchunk JSON references.

    Writes a tutorial dataset to disk, virtualizes it into chunk references,
    serializes those references as kerchunk JSON, then reads them back through
    fsspec's reference filesystem and checks the data survives unchanged.
    """
    # create an example dataset and persist it as netCDF in the temp directory
    original = xr.tutorial.open_dataset("air_temperature", decode_times=False)
    original.to_netcdf(f"{tmpdir}/air.nc")

    # open the netCDF file as a virtual dataset of chunk references
    vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})

    # serialize the references to disk in kerchunk's JSON format
    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")

    # read the references back through fsspec's reference filesystem
    ref_fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
    mapper = ref_fs.get_mapper("")
    roundtrip = xr.open_dataset(mapper, engine="kerchunk")

    # the roundtripped dataset should match the original exactly
    xrt.assert_equal(roundtrip, original)


def test_kerchunk_roundtrip_concat(tmpdir):
    """Roundtrip two netCDF files through kerchunk references after virtual concat.

    Splits a tutorial dataset into two netCDF files, opens each as a virtual
    dataset of chunk references, concatenates them virtually along ``time``,
    serializes the combined references as kerchunk JSON, and checks that
    reading them back via fsspec reproduces the original data.
    """
    # set up example xarray dataset, truncated so the test stays small
    ds = xr.tutorial.open_dataset("air_temperature", decode_times=False).isel(
        time=slice(None, 2000)
    )

    # split into two datasets along the time dimension
    ds1, ds2 = ds.isel(time=slice(None, 1000)), ds.isel(time=slice(1000, None))

    # save both halves to disk as netCDF (in temporary directory)
    ds1.to_netcdf(f"{tmpdir}/air1.nc")
    ds2.to_netcdf(f"{tmpdir}/air2.nc")

    # use open_virtual_dataset to read each file as references
    vds1 = open_virtual_dataset(f"{tmpdir}/air1.nc", indexes={})
    vds2 = open_virtual_dataset(f"{tmpdir}/air2.nc", indexes={})

    # concatenate virtually along time; coords/compat chosen so no chunk data
    # needs to be loaded to perform the concatenation
    vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")

    # write those references to disk as kerchunk json
    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")

    # use fsspec to read the dataset from disk via the zarr store
    fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
    m = fs.get_mapper("")

    roundtrip = xr.open_dataset(m, engine="kerchunk")

    # assert equal to original dataset
    xrt.assert_equal(roundtrip, ds)


def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100

Expand Down

0 comments on commit ca99d5a

Please sign in to comment.