Skip to content

Commit

Permalink
Convert zarr helpers to utilities, update numpy chunk encoding in zarr router (#260)

Browse files Browse the repository at this point in the history

* Move zmetadata and zvariable to utils

* Handle numpy chunk encoding, Update tests

* Update tests (I think this is correct)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* ruff

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Clean up, add docs for xpublish_id

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update cache test with new behavior

* Small doc update

* Update docs

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update docs/source/user-guide/plugins.md

Co-authored-by: Alex Kerney <abk@mac.com>

* Update docs/source/user-guide/plugins.md

Co-authored-by: Alex Kerney <abk@mac.com>

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Alex Kerney <abk@mac.com>
  • Loading branch information
3 people committed Apr 12, 2024
1 parent 3dfdc26 commit a4c7d17
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 179 deletions.
15 changes: 14 additions & 1 deletion docs/source/user-guide/plugins.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,17 @@ The plugin should return a dataset if it knows about the dataset corresponding t
otherwise it should return None, so that Xpublish knows to continue looking to the next
plugin or the passed in dictionary of datasets.

```{warning}
When creating a dataset provider, you need to set a unique `_xpublish_id` attribute (use `DATASET_ID_ATTR_KEY` from `xpublish.utils.api`) on each dataset for routers to manage caching appropriately. See the `assign_attrs` call below for an example.
We suggest including the plugin name as part of the attribute to help uniqueness.
```

A plugin that provides the Xarray tutorial `air_temperature` dataset.

```python
from xpublish import Plugin, hookimpl
from xpublish.utils.api import DATASET_ID_ATTR_KEY


class TutorialDataset(Plugin):
Expand All @@ -307,7 +314,9 @@ class TutorialDataset(Plugin):
@hookimpl
def get_dataset(self, dataset_id: str):
if dataset_id == "air":
return xr.tutorial.open_dataset("air_temperature")
return xr.tutorial.open_dataset("air_temperature").assign_attrs(
{DATASET_ID_ATTR_KEY: f"{self.name}_air"}
)

return None
```
Expand All @@ -318,3 +327,7 @@ Plugins can also provide new hook specifications that other plugins can then imp
This allows Xpublish to support things that we haven't even thought of yet.

These return a class of hookspecs from {py:meth}`xpublish.plugins.hooks.PluginSpec.register_hookspec`.

```
```
9 changes: 4 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,17 @@ def test_invalid_encoding_chunks_with_dask_raise():
data = dask.array.zeros((10, 20, 30), chunks=expected)
ds = xr.Dataset({'foo': (['x', 'y', 'z'], data)})
ds['foo'].encoding['chunks'] = [8, 5, 1]
with pytest.raises(NotImplementedError) as excinfo:
with pytest.raises(ValueError) as excinfo:
_ = create_zmetadata(ds)
excinfo.match(r'Specified zarr chunks .*')


def test_invalid_encoding_chunks_with_numpy_raise():
def test_ignore_encoding_chunks_with_numpy():
data = np.zeros((10, 20, 30))
ds = xr.Dataset({'foo': (['x', 'y', 'z'], data)})
ds['foo'].encoding['chunks'] = [8, 5, 1]
with pytest.raises(ValueError) as excinfo:
_ = create_zmetadata(ds)
excinfo.match(r'Encoding chunks do not match inferred.*')
zmetadata = create_zmetadata(ds)
assert zmetadata['metadata']['foo/.zarray']['chunks'] == [10, 20, 30]


def test_get_data_chunk_numpy():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def test_cache(airtemp_ds):

response1 = client.get('/zarr/air/0.0.0')
assert response1.status_code == 200
assert '/air/0.0.0' in rest.cache
assert 'airtemp/air/0.0.0' in rest.cache

# test that we can retrieve
response2 = client.get('/zarr/air/0.0.0')
Expand Down
65 changes: 3 additions & 62 deletions xpublish/dependencies.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
"""Helper functions to use a FastAPI dependencies."""
from typing import (
TYPE_CHECKING,
Dict,
List,
)

from typing import TYPE_CHECKING, Dict, List

import cachey
import pluggy
import xarray as xr
from fastapi import Depends

from .utils.api import DATASET_ID_ATTR_KEY
from .utils.zarr import ZARR_METADATA_KEY, create_zmetadata, create_zvariables

if TYPE_CHECKING:
from .plugins import Plugin # pragma: no cover
Expand All @@ -28,6 +21,7 @@ def get_dataset_ids() -> List[str]:
Returns:
A list of unique keys for datasets
"""
return [] # pragma: no cover

Expand Down Expand Up @@ -66,58 +60,6 @@ def get_cache() -> cachey.Cache:
return cachey.Cache(available_bytes=1e6) # pragma: no cover


def get_zvariables(
dataset: xr.Dataset = Depends(get_dataset),
cache: cachey.Cache = Depends(get_cache),
) -> dict:
"""FastAPI dependency that returns a dictionary of zarr encoded variables.
Args:
dataset: The dataset to get the zvariables from.
cache: The cache to use for storing the zvariables.
Returns:
A dictionary of zarr encoded variables.
"""
cache_key = dataset.attrs.get(DATASET_ID_ATTR_KEY, '') + '/' + 'zvariables'
zvariables = cache.get(cache_key)

if zvariables is None:
zvariables = create_zvariables(dataset)

# we want to permanently cache this: set high cost value
cache.put(cache_key, zvariables, 99999)

return zvariables


def get_zmetadata(
dataset: xr.Dataset = Depends(get_dataset),
cache: cachey.Cache = Depends(get_cache),
zvariables: dict = Depends(get_zvariables),
) -> dict:
"""FastAPI dependency that returns a consolidated zmetadata dictionary.
Args:
dataset: The dataset to get the zmetadata from.
cache: The cache to use for storing the zmetadata.
zvariables: The zvariables to use for creating the zmetadata.
Returns:
A consolidated zmetadata dictionary.
"""
cache_key = dataset.attrs.get(DATASET_ID_ATTR_KEY, '') + '/' + ZARR_METADATA_KEY
zmeta = cache.get(cache_key)

if zmeta is None:
zmeta = create_zmetadata(dataset)

# we want to permanently cache this: set high cost value
cache.put(cache_key, zmeta, 99999)

return zmeta


def get_plugins() -> Dict[str, 'Plugin']:
    """FastAPI dependency that returns a dictionary of loaded plugins.
Expand All @@ -129,4 +71,3 @@ def get_plugins() -> Dict[str, 'Plugin']:

def get_plugin_manager() -> pluggy.PluginManager:
"""Return the active plugin manager."""
...
13 changes: 6 additions & 7 deletions xpublish/plugins/included/dataset_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,20 @@

from xpublish.utils.api import JSONResponse

from ...dependencies import get_zmetadata, get_zvariables
from ...utils.zarr import get_zmetadata, get_zvariables
from .. import Dependencies, Plugin, hookimpl


class DatasetInfoPlugin(Plugin):
"""Dataset metadata and schema routes."""
"""Dataset metadata."""

name: str = 'dataset_info'

dataset_router_prefix: str = ''
dataset_router_tags: Sequence[str] = ['dataset_info']

@hookimpl
def dataset_router(self, deps: Dependencies) -> APIRouter:
"""Returns a router with dataset metadata and schema routes."""
def dataset_router(self, deps: Dependencies) -> APIRouter: # noqa: D102
router = APIRouter(
prefix=self.dataset_router_prefix,
tags=list(self.dataset_router_tags),
Expand All @@ -39,22 +38,22 @@ def html_representation(
def list_keys(
dataset=Depends(deps.dataset),
) -> list[str]:
"""Returns a of the keys in a dataset."""
"""List of the keys in a dataset."""
return JSONResponse(list(dataset.variables))

@router.get('/dict')
def to_dict(
dataset=Depends(deps.dataset),
) -> dict:
"""Returns the full dataset as a dictionary."""
"""The full dataset as a dictionary."""
return JSONResponse(dataset.to_dict(data=False))

@router.get('/info')
def info(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns the dataset schema (close to the NCO-JSON schema)."""
"""Dataset schema (close to the NCO-JSON schema)."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand Down
12 changes: 6 additions & 6 deletions xpublish/plugins/included/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@

from xpublish.utils.api import JSONResponse

from ...dependencies import get_zmetadata, get_zvariables
from ...utils.api import DATASET_ID_ATTR_KEY
from ...utils.cache import CostTimer
from ...utils.zarr import (
ZARR_METADATA_KEY,
encode_chunk,
get_data_chunk,
get_zmetadata,
get_zvariables,
jsonify_zmetadata,
)
from .. import Dependencies, Plugin, hookimpl
Expand All @@ -32,8 +33,7 @@ class ZarrPlugin(Plugin):
dataset_router_tags: Sequence[str] = ['zarr']

@hookimpl
def dataset_router(self, deps: Dependencies) -> APIRouter:
"""Returns a router with Zarr-like accessing endpoints for datasets."""
def dataset_router(self, deps: Dependencies) -> APIRouter: # noqa: D102
router = APIRouter(
prefix=self.dataset_router_prefix,
tags=list(self.dataset_router_tags),
Expand All @@ -44,7 +44,7 @@ def get_zarr_metadata(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns consolidated Zarr metadata."""
"""Consolidated Zarr metadata."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand All @@ -57,7 +57,7 @@ def get_zarr_group(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns Zarr group data."""
"""Zarr group data."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand All @@ -68,7 +68,7 @@ def get_zarr_attrs(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns Zarr attributes."""
"""Zarr attributes."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand Down
3 changes: 3 additions & 0 deletions xpublish/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from .routers import dataset_collection_router
from .utils.api import (
DATASET_ID_ATTR_KEY,
SingleDatasetOpenAPIOverrider,
check_route_conflicts,
normalize_app_routers,
Expand Down Expand Up @@ -163,6 +164,8 @@ def get_dataset_from_plugins(
dataset = self.pm.hook.get_dataset(dataset_id=dataset_id)

if dataset:
if dataset.attrs.get(DATASET_ID_ATTR_KEY, None) is None:
dataset.attrs[DATASET_ID_ATTR_KEY] = dataset_id
return dataset

if dataset_id not in self._datasets:
Expand Down

0 comments on commit a4c7d17

Please sign in to comment.