# Zarr files to netCDF

With this script, the zarr files from CLEOS output can be transformed into a single netcdf file which has the dimensions
- time
- sd_id (super droplet id)

In [7]:
import sys
import numpy as np
import random
from pathlib import Path
import yaml
import secrets
import xarray as xr
import matplotlib.pyplot as plt

# from plotssrc import pltsds, pltmoms, animations
from pySD.plotssrc import pltsds, pltmoms, animations
from pySD.sdmout_src import *
from pySD.sdmout_src import sdtracing
from pySD.gbxboundariesbinary_src import read_gbxboundaries as rgrid
from pySD.initsuperdropsbinary_src import *
from pySD.initsuperdropsbinary_src import read_initsuperdrops as rsupers
from pySD.thermobinary_src import read_thermodynamics as rthermo


from sdm_eurec4a.visulization import set_custom_rcParams

set_custom_rcParams()

path2CLEO = Path("/home/m/m301096/CLEO")
path2sdm_eurec4a = Path("/home/m/m301096/repositories/sdm-eurec4a")
sys.path.append(path2CLEO)  # for imports from pySD package
# sys.path.append(path2CLEO / "examples/exampleplotting") # for imports from example plotting package

# use paths to files
path2build = path2CLEO / "build"
configfile = path2CLEO / "eurec4a/experiment_02/src/config/rain1d_config.txt"
yaml_config_file = path2sdm_eurec4a / "data/model/input/example_input_18.yaml"


In [8]:
with open(yaml_config_file, 'r') as f:
    config_yaml = yaml.safe_load(f)


### ----------------------- INPUT PARAMETERS ----------------------- ###
### --- essential paths and filenames --- ###
# path and filenames for creating initial SD conditions
constsfile    = path2CLEO / "libs/cleoconstants.hpp"
# path and file names for plotting results
setupfile     = path2CLEO / "data/output/raw/rain1d_setup.txt"
dataset       = path2CLEO / "data/output/raw/rain1d_sol.zarr"

# get cloud imformation
cloud_id = config_yaml['cloud']['cloud_id']
identification_type = config_yaml['cloud']['identification_type']
savefigpath = path2CLEO / "results/experiment_02" / f"{identification_type}_{cloud_id}" # directory for saving figures
savefigpath.mkdir(exist_ok=True, parents=True)

Get the config, constants and also an initial superdroplet dataset

In [11]:
# read in constants and intial setup from setup .txt file
config = pysetuptxt.get_config(setupfile, nattrs=3, isprint=False)
consts = pysetuptxt.get_consts(setupfile, isprint=False)

# Create a first simple dataset to have the coordinates for later netcdf creation
sddata = pyzarr.get_supers(str(dataset), consts)
simple_ds = xr.open_dataset(dataset, engine="zarr",
                                consolidated=False);

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------
supers dataset:  /home/m/m301096/CLEO/data/output/raw/rain1d_sol.zarr


## Create a xarray dataset from the zarr file


For this, the ``sdtracing.attribute_for_superdroplets_sample`` function will be used for a subset of superdroplet ids.
Using a size of 100 makes sense, and is not too slow.

The subdatasets will be stored in a temporary folder.

To do so, we create a list of temporaray filenames

In [15]:
path2dataoutput = path2CLEO / "data/output"
TEMPORARY_DIR = path2dataoutput / "temporary"
OUTPUT_DIR = path2dataoutput / "processed/" f"{identification_type}_{cloud_id}"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [33]:
all_ids = np.arange(config['totnsupers'], dtype = int)
len(all_ids)
# all_ids_reshaped = all_ids.reshape(100)
# use each gridcell on its own
n = config['totnsupers'] / 256
all_ids = all_ids.reshape(int(n), -1)
# temporary file names based on seed

seed = 2
np.random.seed(seed)
hashs = np.random.choice(int(1e6), size = all_ids.shape[0])
str_func = np.vectorize(lambda x : TEMPORARY_DIR / f"{hex(x)}.nc")
temp_filepaths = str_func(hashs)

assert len(temp_filepaths) == all_ids.shape[0]
print(temp_filepaths[0])

/home/m/m301096/CLEO/data/output/temporary/0xd5ca8.nc


In [34]:
import tqdm
attributes = ["xi", "radius", "coord3", "sdgbxindex", "msol"]

for ids, temp_filepath in tqdm(zip(all_ids[0:2], temp_filepaths[0:2])):
    print(f"Processing {ids.min()}-{ids.max()} to \t{temp_filepath}")
    ids = list(ids)
    list_dataarrays = []
    for attr in attributes:
        data = sdtracing.attribute_for_superdroplets_sample(
            sddata,
            attr,
            ids=ids,
        )
        da = xr.DataArray(
            data=data,
            dims=["time", "sd_id"],
            coords={"time": simple_ds.time, "sd_id": ids}
        )
        da.name = attr
        list_dataarrays.append(da)

    ds = xr.merge(list_dataarrays)
    ds.to_netcdf(temp_filepath)
# ds = xr.concate(list_dataarrays)
# display(ds)

Processing 0-255 to /home/m/m301096/CLEO/data/output/temporary/0xd5ca8.nc


Processing 256-511 to /home/m/m301096/CLEO/data/output/temporary/0x18a0f.nc


## Combine datasets and store in folder

In [46]:
full_dataset = xr.open_mfdataset(temp_filepaths[0:2], parallel = True)
full_dataset.to_netcdf(OUTPUT_DIR / "full_dataset.nc")
