# Kvikio demo

In [1]:
%load_ext watermark

# Apart from xarray, the imports below are not referenced directly by the
# cells shown in this notebook; they are imported so that `%watermark -iv`
# reports their installed versions.
# cupy_xarray registers the kvikio backend entrypoint on install.
# import cupy as cp
# import cudf
import cupy_xarray  # registers cupy accessor
import kvikio.zarr

import flox
import numpy_groupies
import numpy as np
import xarray as xr
import zarr

# Path of the Zarr store written and read by the cells below
# (relative to the current working directory).
store = "./air-temperature.zarr"

# Print the versions of the imported modules.
%watermark -iv

flox          : 0.7.3.dev12+g796dcd2
json          : 2.0.9
xarray        : 2023.7.0
cupy_xarray   : 0.1.1+21.gd2da1e4.dirty
kvikio        : 23.2.0
zarr          : 2.16.0
numpy         : 1.24.4
sys           : 3.9.17 | packaged by conda-forge | (main, Aug 10 2023, 07:02:31) 
[GCC 12.3.0]
numpy_groupies: 0.9.22+2.gd148074



In [2]:
# List the backends xarray can see; "kvikio" should be among them.
xr.backends.list_engines()

{'kvikio': <KvikioBackendEntrypoint>
   Open zarr files (.zarr) using Kvikio
   Learn more at https://docs.rapids.ai/api/kvikio/nightly/api.html#zarr,
 'store': <StoreBackendEntrypoint>
   Open AbstractDataStore instances in Xarray
   Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.StoreBackendEntrypoint.html,
 'zarr': <ZarrBackendEntrypoint>
   Open zarr files (.zarr) using zarr in Xarray
   Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.ZarrBackendEntrypoint.html}

In [23]:
%autoreload

# Consolidated must be False
ds = xr.open_dataset(store, engine="kvikio", consolidated=False)
print(ds.air._variable._data)
ds

MemoryCachedArray(array=CopyOnWriteArray(array=LazilyIndexedArray(array=_ElementwiseFunctionArray(LazilyIndexedArray(array=<cupy_xarray.kvikio.CupyZarrArrayWrapper object at 0x2b5b9f9b4130>, key=BasicIndexer((slice(None, None, None), slice(None, None, None), slice(None, None, None)))), func=functools.partial(<function _scale_offset_decoding at 0x2b5b86161d30>, scale_factor=0.01, add_offset=None, dtype=<class 'numpy.float32'>), dtype=dtype('float32')), key=BasicIndexer((slice(None, None, None), slice(None, None, None), slice(None, None, None))))))


## Create example dataset

- The example data cannot be compressed, so variables are written with `compressor=None`.

In [None]:
# Build the example store from xarray's tutorial air-temperature dataset.
airt = xr.tutorial.open_dataset("air_temperature", engine="netcdf4")
# Add the scalar variable *before* the encoding loop so that it is written
# uncompressed too; otherwise it would get zarr's default compressor.
airt["scalar"] = 12.0
# Disable compression for every variable (coordinates included).
for var in airt.variables:
    airt[var].encoding["compressor"] = None
# mode="w" overwrites any existing store at this path.
airt.to_zarr(store, mode="w", consolidated=True)

## Test opening

### Standard usage

In [24]:
# Baseline: open the same store with xarray's built-in zarr engine.
xr.open_dataset(store, engine="zarr")["air"]

### Now with kvikio!

 - The store must be read with `consolidated=False` (see https://github.com/rapidsai/kvikio/issues/119).
 - dask.from_zarr to GDSStore / open_mfdataset

In [25]:
# Consolidated must be False
ds = xr.open_dataset(store, engine="kvikio", consolidated=False)
print(ds.air._variable._data)
ds

MemoryCachedArray(array=CopyOnWriteArray(array=LazilyIndexedArray(array=_ElementwiseFunctionArray(LazilyIndexedArray(array=<cupy_xarray.kvikio.CupyZarrArrayWrapper object at 0x2b5b9f92e950>, key=BasicIndexer((slice(None, None, None), slice(None, None, None), slice(None, None, None)))), func=functools.partial(<function _scale_offset_decoding at 0x2b5b86161d30>, scale_factor=0.01, add_offset=None, dtype=<class 'numpy.float32'>), dtype=dtype('float32')), key=BasicIndexer((slice(None, None, None), slice(None, None, None), slice(None, None, None))))))


In [26]:
# Inspect the wrappers around the 0-d scalar variable.
ds["scalar"].variable._data

MemoryCachedArray(array=CopyOnWriteArray(array=LazilyIndexedArray(array=<cupy_xarray.kvikio.CupyZarrArrayWrapper object at 0x2b5b9f9c4310>, key=BasicIndexer(()))))

## Lazy reading

In [27]:
# Display the variable; its data are still wrapped in lazy-indexing adapters.
ds["air"]

## Data load for repr

In [31]:
# Select a single (time, lat) position and load it so the repr shows values.
ds["air"].isel(time=0, lat=10).load()

In [29]:
# Display the scalar variable that was added to the example dataset.
ds["scalar"]

## CuPy array on load

In [32]:
# Accessing .data on an indexed selection yields a CuPy array.
type(ds["air"].isel(time=0, lat=10).data)

cupy.ndarray

In [33]:
# An explicit .load() also leaves the data as a CuPy array.
type(ds["air"].isel(time=0, lat=10).load().data)

cupy.ndarray

## Load to host

In [34]:
# Convert the CuPy-backed variable into a NumPy-backed one.
ds["air"].as_numpy()

In [35]:
# Confirm that the converted data is a NumPy array.
type(ds["air"].as_numpy().data)

numpy.ndarray

In [36]:
# A reduction followed by .load() still yields a CuPy array.
type(ds["air"].mean(dim="time").load().data)

cupy.ndarray

## Chunk with dask

In [37]:
# Split the dataset into dask chunks of 10 steps along the time dimension.
ds.chunk(time=10)

Unnamed: 0,Array,Chunk
Bytes,14.76 MiB,51.76 kiB
Shape,"(2920, 25, 53)","(10, 25, 53)"
Dask graph,292 chunks in 2 graph layers,292 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 14.76 MiB 51.76 kiB Shape (2920, 25, 53) (10, 25, 53) Dask graph 292 chunks in 2 graph layers Data type float32 numpy.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,14.76 MiB,51.76 kiB
Shape,"(2920, 25, 53)","(10, 25, 53)"
Dask graph,292 chunks in 2 graph layers,292 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## GroupBy with flox

Requires:

1. The flox main branch (to be confirmed).
2. The numpy-groupies change in https://github.com/ml31415/numpy-groupies/pull/63

In [None]:
# Group by the calendar month of the time coordinate and average each group.
ds["air"].groupby("time.month").mean()