In [78]:
import xarray as xr
from pathlib import Path

# local imports
# go up one directory to get the config file
import sys
sys.path.append("..")
from transfers.config import prod_variant_lu

In [74]:
# Input root: searched recursively for "*.zarr" stores when the file list is built.
adjusted_dir = Path("/beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted")
# Output root: each reformatted store is written here under its original file name.
# The processing cell creates this directory if it does not exist yet.
output_dir = Path("/beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted_reformatted")

In [75]:
# collect every zarr store found anywhere under adjusted_dir, in sorted path order
files = sorted(adjusted_dir.glob("**/*.zarr"))

In [76]:
# Every reformatted dataset must end up with its dimensions ordered as
# (time, y, x, ensemble); the notebook uses this layout to match CF conventions.
def reorder_vars(ds):
    """Return ``ds`` transposed to the (time, y, x, ensemble) dimension order.

    ``ds`` is any object exposing an xarray-style ``transpose(*dims)`` method;
    whatever that call returns is handed back unchanged.
    """
    target_order = ("time", "y", "x", "ensemble")
    return ds.transpose(*target_order)

# Each store gets a new "ensemble" dimension whose single label combines the
# model, variant and scenario as <model>_<variant>_<scenario>.
# Model and scenario are read from the filename, which looks like
# <variable>_<model>_<scenario>_adjusted.zarr, and the variant is looked up in
# the prod_variant_lu dict, which is keyed by model.

def get_ensemble_str_from_filename(file, prod_variant_lu):
    """Build the ``<model>_<variant>_<scenario>`` label for one store.

    Args:
        file: path-like object with a ``name`` attribute; the second and third
            "_"-separated fields of that name are taken as model and scenario.
        prod_variant_lu: mapping from model name to variant label.

    Returns:
        The ensemble label string. If the model is missing from the lookup, a
        warning is printed and the variant part is set to "unknown".

    Raises:
        IndexError: if the name has fewer than three "_"-separated fields.
    """
    name_parts = file.name.split("_")
    model, scenario = name_parts[1], name_parts[2]
    try:
        variant = prod_variant_lu[model]
    except KeyError:
        # fall back to a placeholder that can be searched for later during QC
        print(f"Warning: model {model} not found in variant lookup table, setting variant to 'unknown'!")
        variant = "unknown"
    return f"{model}_{variant}_{scenario}"

# helper that gives a dataset its length-1 "ensemble" dimension
def add_ensemble_dim(ds, ensemble_str):
    """Return ``ds`` expanded with an "ensemble" dimension labelled ``ensemble_str``.

    The new dimension has exactly one coordinate value, the given label.
    """
    ensemble_coord = {"ensemble": [ensemble_str]}
    return ds.expand_dims(ensemble_coord)


# drop global attributes that are not 'contact', 'creation_date', or 'history' (the default keep-list)
def clean_attrs(ds, attrs_to_keep=("contact", "creation_date", "history")):
    """Strip every global attribute from ``ds`` except those in ``attrs_to_keep``.

    Args:
        ds: object with a dict-like ``attrs`` attribute (e.g. an xarray Dataset).
        attrs_to_keep: attribute names to retain. Names that are absent from
            ``ds.attrs`` are skipped. The retained attributes are stored in the
            order given here.

    Returns:
        The same ``ds`` object; its ``attrs`` is replaced in place with the
        filtered dict.
    """
    ds.attrs = {attr: ds.attrs[attr] for attr in attrs_to_keep if attr in ds.attrs}
    return ds

# run the full reformatting pipeline on a single zarr store
def process_file(file, prod_variant_lu):
    """Open one store and return it cleaned, labelled and reordered.

    Steps, in order: open ``file`` with ``xr.open_zarr`` (``chunks="auto"``),
    filter its global attributes with ``clean_attrs``, derive the ensemble
    label from the filename and ``prod_variant_lu``, add the "ensemble"
    dimension, then transpose with ``reorder_vars``.

    Nothing is written to disk here; the caller is responsible for saving and
    closing the returned dataset.
    """
    opened = xr.open_zarr(file, chunks="auto")
    cleaned = clean_attrs(opened)
    label = get_ensemble_str_from_filename(file, prod_variant_lu)
    return reorder_vars(add_ensemble_dim(cleaned, label))



In [None]:
# Process every dataset and save it as a new zarr store under output_dir,
# creating the output directory if it doesn't exist.
#
# A file that fails to process is reported, counted and skipped. Nothing is
# written for it, so a failure can never save a previous iteration's dataset
# under the wrong file name.

success_count = 0
failure_count = 0

output_dir.mkdir(parents=True, exist_ok=True)
for file in files:
    print(f"Processing {file}...")
    try:
        ds = process_file(file, prod_variant_lu)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long run.
        print(f"File could not be processed! ({type(exc).__name__}: {exc})")
        failure_count += 1
        continue  # there is no valid dataset to write for this file
    print("File processed successfully!")
    success_count += 1

    output_file = output_dir / file.name
    try:
        ds.to_zarr(output_file, mode="w")
    finally:
        # release the dataset even if the write raises
        ds.close()
    print(f"Saved to {output_file}")

print(f"\n\nProcessing complete. Success: {success_count}, Failure: {failure_count}")

Processing /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted/dtr_EC-Earth3-Veg_historical_adjusted.zarr...
File processed successfully!
Saved to /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted_reformatted/dtr_EC-Earth3-Veg_historical_adjusted.zarr
Processing /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted/dtr_EC-Earth3-Veg_ssp370_adjusted.zarr...
File processed successfully!
Saved to /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted_reformatted/dtr_EC-Earth3-Veg_ssp370_adjusted.zarr
Processing /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted/dtr_EC-Earth3-Veg_ssp585_adjusted.zarr...
File processed successfully!
Saved to /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted_reformatted/dtr_EC-Earth3-Veg_ssp585_adjusted.zarr
Processing /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted/pr_EC-Earth3-Veg_historical_adjusted.zarr...
File processed successfully!
Saved to /beegfs/CMIP6/jdpaul3/cmip6_4km_downscaling/adjusted_reformatted/pr_EC-Earth3-Veg_historical_adjusted