In [8]:
import xarray as xr
import numpy as np

# reference: http://james.hiebert.name/blog/work/2015/04/18/NetCDF-Scale-Factors.html

In [4]:
def compute_scale_and_offset(min_value, max_value, n):
    """Compute the scale factor and add offset for packing data into n-bit signed integers.

    Packed values are recovered as ``packed * scale_factor + add_offset``. With the
    returned pair, ``min_value`` maps to ``-2**(n-1)`` and ``max_value`` maps to
    ``2**(n-1) - 1``, i.e. the full signed n-bit range.

    Args:
        min_value (float): minimum value in the dataset
        max_value (float): maximum value in the dataset
        n (int): number of bits to use for packing
    Returns:
        tuple: scale factor and add offset. For scalar input with
            ``max_value == min_value`` the scale factor is 1.0 instead of 0.
    """
    # stretch/compress data to the available packed range
    scale_factor = (max_value - min_value) / (2**n - 1)
    # constant data (max == min) would give a zero scale factor, and packing
    # would then divide by zero; any non-zero factor round-trips a single value,
    # so fall back to 1. Only applied to scalar (0-d) input.
    if np.ndim(scale_factor) == 0 and scale_factor == 0:
        scale_factor = 1.0
    # translate the range to be symmetric about zero
    add_offset = min_value + 2 ** (n - 1) * scale_factor
    return (scale_factor, add_offset)


def pack_dataset_by_var(ds, ds_packed, n=16, var=None):
    """Pack data variables of a dataset into n-bit signed integers.

    Each selected variable is rescaled with its own scale factor and offset
    (computed by ``compute_scale_and_offset`` from the variable's min and max),
    floored, and cast to ``int{n}``. NaNs are stored as -9999. The packed variable
    is written into ``ds_packed`` (which is modified in place) and its attributes
    are replaced by ``_FillValue``, ``scale_factor`` and ``add_offset``. Data
    variables of ``ds_packed`` that were not packed are dropped from the result.

    Args:
        ds (xarray.Dataset): input dataset
        ds_packed (xarray.Dataset): output dataset with packed values. This should be a copy of the original dataset.
        n (int): number of bits to use for packing, must be one of [8, 16, 32]. Defaults to 16.
        var (str, optional): variable to pack. If None, all variables will be packed. Defaults to None.

    Returns:
        xarray.Dataset: dataset with packed values

    Raises:
        ValueError: if n is not one of [8, 16, 32].
    """
    # validate n
    if n not in [8, 16, 32]:
        raise ValueError("n must be one of [8, 16, 32]")

    vars_to_pack = [var] if var else list(ds.data_vars)

    for name in vars_to_pack:
        source = ds[name]
        min_value = source.min(skipna=True).values
        max_value = source.max(skipna=True).values
        scale_factor, add_offset = compute_scale_and_offset(min_value, max_value, n)
        packed_array = (source.values - add_offset) / scale_factor
        # apply floor function to all values in the packed array, while replacing NaNs with -9999
        # convert to integer type using the number of bits specified by n
        # NOTE(review): -9999 is itself a valid packed value for n=16/32, so data that
        # happens to pack to -9999 cannot be told apart from missing data, and -9999
        # does not fit in int8 (n=8). Consider reserving a dedicated fill value.
        packed_array = np.floor(np.nan_to_num(packed_array, nan=-9999)).astype(
            f"int{n}"
        )

        # overwrite the values in copied dataset with the packed values.
        # Label the array with the variable's own dimensions: the dataset-wide
        # dims may be ordered differently or include dims this variable lacks.
        ds_packed[name] = xr.DataArray(
            packed_array,
            dims=source.dims,
            attrs={
                "_FillValue": -9999,
                "scale_factor": scale_factor,
                "add_offset": add_offset,
            },
        )

    # drop variables that are not packed
    not_packed = [name for name in ds_packed.data_vars if name not in vars_to_pack]
    if not_packed:
        ds_packed = ds_packed.drop_vars(not_packed)

    return ds_packed

In [5]:
# input ensemble file and destination for the packed `pr` output
in_fp = "/beegfs/CMIP6/jdpaul3/cmip6_daily_for_rasdaman_full/cmip6_regrid_day_pr_tasmax_tasmin_historical_ssp126_ssp245_ssp370_ssp585_ensemble.nc"
out_fp = "/beegfs/CMIP6/jdpaul3/cmip6_daily_for_rasdaman_full/cmip6_regrid_day_pr_full_16bit.nc"

ds = xr.open_dataset(in_fp)
# copy of the dataset that the packing function overwrites with packed values
ds_packed = ds.copy(deep=False)

In [6]:
# pack only the `pr` variable into 16-bit integers, then write the result to disk
ds_packed = pack_dataset_by_var(ds=ds, ds_packed=ds_packed, n=16, var="pr")
ds_packed.to_netcdf(path=out_fp)

In [37]:
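# MANUAL STEP: ingest the packed NetCDF written above into Rasdaman before running the cells below
# (the requests below expect the coverage ID `cmip6_daily_pr_full_16bit`)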

In [9]:
import requests
import io

In [12]:
# WCS GetCoverage request for the packed coverage, subset to time(0.5), lat(64.8), lon(-147.5)
url = "https://zeus.snap.uaf.edu/rasdaman/ows?&SERVICE=WCS&VERSION=2.0.1&REQUEST=GetCoverage&COVERAGEID=cmip6_daily_pr_full_16bit&SUBSET=time(0.5)&SUBSET=lat(64.8)&SUBSET=lon(-147.5)&FORMAT=application/netcdf"

with requests.get(url) as r:
    # fail here on an HTTP error instead of passing an error response body to xarray
    r.raise_for_status()
    # mask_and_scale=False: keep the stored values and their scale/offset/fill attributes as served
    ds_test = xr.open_dataset(io.BytesIO(r.content), mask_and_scale=False)

In [13]:
# inspect the response; it was opened with mask_and_scale=False, so scale/offset
# and fill-value masking have not been applied
ds_test

In [14]:
# apply the scale and offset to unpack the data,
# but only for values that are not the fill value in the packed data
ds_test_unpacked = ds_test.copy()
for var in ds_test.data_vars:
    packed = ds_test[var]
    # the attributes can arrive as strings, so cast them explicitly
    scale_factor = float(packed.attrs["scale_factor"])
    add_offset = float(packed.attrs["add_offset"])
    # use the fill value recorded on the variable itself; fall back to -9999 if it is absent
    fill_value = packed.attrs.get("_FillValue", -9999)
    masked = packed.where(packed != fill_value, other=np.nan)
    ds_test_unpacked[var] = (masked * scale_factor) + add_offset

ds_test_unpacked

In [33]:
# reference value from the original dataset: first time step, grid cell nearest
# to lat 64.8 / lon -147.5, for comparison with the unpacked values
(
    ds["pr"]
    .isel(time=0)
    .sel(lat=64.8, lon=-147.5, method="nearest")
)

In [15]:
# unpacked values (scale_factor and add_offset applied manually in the loop above)
ds_test_unpacked["pr"]

In [36]:
# compare packed values (the stored values as served, before scale/offset are applied)
ds_test["pr"]

In [10]:
# will xarray do the scaling automatically?
with requests.get(url) as r:
    # fail here on an HTTP error instead of passing an error response body to xarray
    r.raise_for_status()
    # mask_and_scale=True asks xarray to apply _FillValue / scale_factor / add_offset itself
    ds_test_2 = xr.open_dataset(io.BytesIO(r.content), mask_and_scale=True)

In [29]:
# try to print the pr values
# (accessing .values triggers the scale/offset decoding, which raises the
# UFuncTypeError shown in the output below)
ds_test_2["pr"].values

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('float64'), dtype('<U20')) -> None

In [33]:
# the decoding fails because Rasdaman returns the scale and offset values as strings;
# they are visible on the first dataset, which was opened with mask_and_scale=False:
ds_test["pr"].attrs

{'valid_min': np.int16(-32768),
 'valid_max': np.int16(32767),
 'missing_value': np.int16(-9999),
 '_FillValue': np.int16(-9999),
 'add_offset': '318.85487144274055',
 'definition': '',
 'description': '',
 'scale_factor': '0.009730678449790666',
 'units': '10^0'}

In [34]:
# on the second dataset (opened with mask_and_scale=True) xarray recognizes the scale
# and offset attributes, so they no longer appear in the variable's metadata;
# because loading is lazy, the error only surfaces once the values are accessed:
ds_test_2["pr"].attrs

{'valid_min': np.int16(-32768),
 'valid_max': np.int16(32767),
 'definition': '',
 'description': '',
 'units': '10^0'}

In [35]:
# display the lazily decoded variable (per the note above, the error only
# appears once the values themselves are accessed)
ds_test_2["pr"]

In [36]:
# accessing the values reproduces the UFuncTypeError shown in the output below,
# because the scale/offset attributes are strings
ds_test_2["pr"].values

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('float64'), dtype('<U20')) -> None