In [82]:
import xarray as xr
import numpy as np
import os

In [83]:
def compute_scale_and_offset(min_value, max_value, n):
    """Function to compute the scale factor and add offset for packing data.

    The returned pair maps the interval [min_value, max_value] linearly onto the
    signed n-bit integer range via ``packed = (value - add_offset) / scale_factor``.

    Args:
        min_value (float): minimum value in the dataset
        max_value (float): maximum value in the dataset
        n (int): number of bits to use for packing
    Returns:
        tuple: scale factor and add offset. When ``min_value == max_value`` the
            scale factor is 1.0 and the offset is ``min_value``, so every value
            packs to 0 and unpacks to exactly ``min_value``."""
    value_range = max_value - min_value
    if value_range == 0:
        # A constant field would otherwise produce scale_factor == 0 and make the
        # caller divide by zero; any non-zero scale round-trips a single value.
        return (1.0, min_value)
    # stretch/compress data to the available packed range
    scale_factor = value_range / (2**n - 1)
    # translate the range to be symmetric about zero
    add_offset = min_value + 2 ** (n - 1) * scale_factor
    return (scale_factor, add_offset)


def pack_dataset_by_var(ds, ds_packed, n):
    """Function to pack data in a dataset by variable, reducing the number of bits used to store the data.

    Every data variable is mapped linearly onto the signed n-bit integer range with
    its own scale factor and offset, and stored together with ``scale_factor``,
    ``add_offset`` and ``_FillValue`` attributes. Missing values (NaN) are written
    as the fill value.

    Args:
        ds (xarray.Dataset): input dataset
        ds_packed (xarray.Dataset): output dataset with packed values. This should be a copy of the original dataset.
            Its data variables are overwritten in place.
        n (int): number of bits to use for packing; must be a width for which numpy
            has a signed integer type (e.g. 8, 16 or 32)
    Returns:
        xarray.Dataset: dataset with packed values (the same object as ``ds_packed``)
    Raises:
        TypeError: if ``int{n}`` is not a numpy integer type"""
    int_info = np.iinfo(f"int{n}")
    # The fill value has to be representable in the target integer type and must
    # never coincide with a packed data value, otherwise real data is masked when
    # the file is read back. Reserve the largest integer for it: because values are
    # floored, only data exactly equal to the variable's maximum can reach that
    # integer, and those are clipped one step down below.
    fill_value = int_info.max

    for var in ds.data_vars:
        data = ds[var]
        min_value = float(data.min(skipna=True).values)
        max_value = float(data.max(skipna=True).values)
        scale_factor, add_offset = compute_scale_and_offset(min_value, max_value, n)
        scale_factor = float(scale_factor)
        add_offset = float(add_offset)
        if not np.isfinite(scale_factor) or scale_factor == 0:
            # Constant or all-missing variable: there is no range to stretch, so use
            # a unit scale to avoid dividing by zero / propagating NaN attributes.
            scale_factor = 1.0
            add_offset = min_value if np.isfinite(min_value) else 0.0

        # work in float64 so that 32-bit integer levels are represented exactly
        values = np.asarray(data.values, dtype="float64")
        packed = np.floor((values - add_offset) / scale_factor)
        # keep data inside the integer range and off the reserved fill value
        np.clip(packed, int_info.min, fill_value - 1, out=packed)

        # NaNs have no defined integer cast: park them on 0 for the conversion and
        # then overwrite them with the fill value in the integer array
        missing = np.isnan(packed)
        packed[missing] = 0
        packed_array = packed.astype(int_info.dtype)
        packed_array[missing] = fill_value

        # overwrite the values in copied dataset with the packed values; label the
        # axes with this variable's own dimensions, which need not match the
        # dataset-wide dimension list
        ds_packed[var] = xr.DataArray(
            packed_array,
            dims=data.dims,
            attrs={
                "_FillValue": fill_value,
                "scale_factor": scale_factor,
                "add_offset": add_offset,
            },
        )

    return ds_packed

In [84]:
# Open the unpacked source file and take a shallow copy to receive the packed variables.
source_path = "/beegfs/CMIP6/jdpaul3/hydroviz_data/nc/seg.nc"
ds = xr.open_dataset(source_path)
ds_packed = ds.copy(deep=False)

In [86]:
# Write one packed copy of the dataset per integer width, collecting the output paths.
bits = [32, 16, 8]
# start the list with the original (unpacked) file so it sits alongside the packed outputs
files = ["/beegfs/CMIP6/jdpaul3/hydroviz_data/nc/seg.nc"]
for n in bits:
    filename = f"/beegfs/CMIP6/jdpaul3/scratch/seg_packed_{n}bit.nc"
    ds_packed = pack_dataset_by_var(ds, ds_packed, n)
    ds_packed.to_netcdf(filename)
    files.append(filename)
    print(f"Finished packing {n}-bit dataset")

Finished packing 32-bit dataset
Finished packing 16-bit dataset
Finished packing 8-bit dataset


In [87]:
def test_unpacked_values(ds):
    # check two values we know are float & nan in the original dataset.... how do the packed & unpacked values compare?
    print(ds["dh15"].sel(lc=1, model=0, scenario=0, era=0, geom_id=12).values)
    print(ds["dh15"].sel(lc=1, model=0, scenario=0, era=1, geom_id=12).values)


# Report the on-disk size of every file and spot-check the values it decodes to.
for file in files:
    size_gb = os.path.getsize(file) / (1024 * 1024 * 1024)  # bytes -> units of 1024**3 bytes
    print(file, " size: ", size_gb, "GB")
    # with mask_and_scale=True, xr.open_dataset() is expected to apply the stored
    # scale factor / offset and mask fill values while reading
    ds_unpacked = xr.open_dataset(file, mask_and_scale=True)
    test_unpacked_values(ds_unpacked)

/beegfs/CMIP6/jdpaul3/hydroviz_data/nc/seg.nc  size:  5.031551872380078 GB
9.8125
nan
/beegfs/CMIP6/jdpaul3/scratch/seg_packed_32bit.nc  size:  2.516008894890547 GB
9.812499981097062
nan
/beegfs/CMIP6/jdpaul3/scratch/seg_packed_16bit.nc  size:  1.2582361213862896 GB
9.811215381094073
nan
/beegfs/CMIP6/jdpaul3/scratch/seg_packed_8bit.nc  size:  0.629349734634161 GB
9.529411764705884
nan
