From ca99d5adeb794b35bca91fe76587b44c6d13db12 Mon Sep 17 00:00:00 2001
From: Tom Nicholas <tom@cworthy.org>
Date: Wed, 15 May 2024 21:18:15 -0600
Subject: [PATCH] Test fsspec roundtrip (#42)

* move kerchunk backend imports to be specific to each backend filetype

* test roundtrip to json file then reading using fsspec

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test env dependencies

* more test env deps

* more

* add pip install of xarray PR

* correct pip url

* roundtrip test involving concatenation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove duplication of pooch

* correct formatting

* try removing netcdf4-python from the environment

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 pyproject.toml                         |  6 +--
 virtualizarr/tests/test_integration.py | 60 ++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 16e8486..8338279 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,8 +28,7 @@ dependencies = [
     "numpy",
     "ujson",
     "packaging",
-    "universal-pathlib"
-
+    "universal-pathlib",
 ]
 
 [project.optional-dependencies]
@@ -39,8 +38,9 @@ test = [
     "pytest-mypy",
     "pytest-cov",
     "pytest",
-    "scipy",
+    "fsspec",
     "pooch",
+    "scipy",
     "ruff",
     "fastparquet",
     "s3fs"
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 578bfab..3d199b7 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -1,3 +1,4 @@
+import fsspec
 import pytest
 import xarray as xr
 import xarray.testing as xrt
@@ -5,6 +6,65 @@
 from virtualizarr import open_virtual_dataset
 
 
+def test_kerchunk_roundtrip_no_concat(tmpdir):
+    # load xarray's "air_temperature" tutorial dataset, leaving times undecoded
+    ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
+
+    # save it to disk as netCDF (in temporary directory)
+    ds.to_netcdf(f"{tmpdir}/air.nc")
+
+    # use open_virtual_dataset to read the netCDF file as references
+    vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})
+
+    # write those references to disk as kerchunk json
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
+
+    # open the json with fsspec's "reference" filesystem and get a mapper for it
+    fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
+    m = fs.get_mapper("")
+
+    roundtrip = xr.open_dataset(m, engine="kerchunk")
+
+    # the dataset read back through the references must equal the original
+    xrt.assert_equal(roundtrip, ds)
+
+
+def test_kerchunk_roundtrip_concat(tmpdir):
+    """Round-trip two virtually concatenated netCDF files via kerchunk json."""
+    # set up example xarray dataset (first 2000 time steps only)
+    ds = xr.tutorial.open_dataset("air_temperature", decode_times=False).isel(
+        time=slice(None, 2000)
+    )
+
+    # split into two datasets
+    ds1, ds2 = ds.isel(time=slice(None, 1000)), ds.isel(time=slice(1000, None))
+
+    # save both halves to disk as netCDF (in temporary directory)
+    ds1.to_netcdf(f"{tmpdir}/air1.nc")
+    ds2.to_netcdf(f"{tmpdir}/air2.nc")
+
+    # use open_virtual_dataset to read each file as references
+    vds1 = open_virtual_dataset(f"{tmpdir}/air1.nc", indexes={})
+    vds2 = open_virtual_dataset(f"{tmpdir}/air2.nc", indexes={})
+
+    # concatenate virtually along time
+    vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")
+
+    # write those references to disk as kerchunk json
+    vds.virtualize.to_kerchunk(f"{tmpdir}/refs.json", format="json")
+
+    # open the json with fsspec's "reference" filesystem and get a mapper for it
+    fs = fsspec.filesystem("reference", fo=f"{tmpdir}/refs.json")
+    m = fs.get_mapper("")
+
+    roundtrip = xr.open_dataset(m, engine="kerchunk")
+
+    # user does analysis here
+
+    # the recombined dataset must equal the original (unsplit) dataset
+    xrt.assert_equal(roundtrip, ds)
+
+
 def test_open_scalar_variable(tmpdir):
     # regression test for GH issue #100