In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Directory that holds the NetCDF input files.
data_path = Path("/home/jovyan/shared_materials/climattr_data_please-dont-copy/nc")

# Every *.nc file directly inside data_path, in sorted order.
data_files = list(data_path.glob("*.nc"))
data_files.sort()

# Show how many files were found.
len(data_files)

In [None]:
def parse_file_names(file_name):
    """Split a data file name into its named components.

    Everything from the first "." onwards is discarded; the remainder must
    consist of exactly nine "_"-separated fields in this order: variable,
    domain_resolution, gcm, experiment, member, rcm, version,
    time_resolution, start_end.

    Parameters
    ----------
    file_name : str
        File name (without directory part), e.g. "tasmax_EUR-11_..._day_20360101-20401231.nc".

    Returns
    -------
    dict
        Maps each field name to the corresponding piece of the file name.

    Raises
    ------
    ValueError
        If the name does not split into exactly nine fields.
    """
    stem = file_name.partition(".")[0]
    fields = stem.split("_")
    (
        variable,
        domain_resolution,
        gcm,
        experiment,
        member,
        rcm,
        version,
        time_resolution,
        start_end,
    ) = fields
    # The key order below differs from the order in the file name and is
    # kept as is, since dict order is visible to callers.
    return dict(
        variable=variable,
        domain_resolution=domain_resolution,
        experiment=experiment,
        member=member,
        gcm=gcm,
        rcm=rcm,
        time_resolution=time_resolution,
        version=version,
        start_end=start_end,
    )


def build_nc_file_names(parsed):
    """Assemble a ".nc" file name (or glob pattern) from name components.

    Parameters
    ----------
    parsed : mapping
        Field name -> value. Must support ``keys()`` and item lookup. Any of
        the nine file-name fields that is absent is replaced by "*", which
        turns the result into a glob pattern. Keys other than the nine
        file-name fields are ignored.

    Returns
    -------
    str
        The nine fields joined by "_" in file-name order, followed by ".nc".
    """
    field_order = (
        "variable",
        "domain_resolution",
        "gcm",
        "experiment",
        "member",
        "rcm",
        "version",
        "time_resolution",
        "start_end",
    )
    pieces = []
    for field in field_order:
        if field in parsed.keys():
            pieces.append(parsed[field])
        else:
            pieces.append("*")
    return "_".join(pieces) + ".nc"


def build_zarr_store_names(parsed):
    """Build the Zarr store name for a set of name components.

    Parameters
    ----------
    parsed : dict
        Must contain the keys "variable", "domain_resolution", "gcm" and
        "rcm"; any other keys are ignored.

    Returns
    -------
    str
        "<variable>_<domain_resolution>_<gcm>_<rcm>.zarr"

    Raises
    ------
    KeyError
        If one of the four required keys is missing.
    """
    template = "{}_{}_{}_{}.zarr"
    return template.format(
        parsed["variable"],
        parsed["domain_resolution"],
        parsed["gcm"],
        parsed["rcm"],
    )

    
def test_parse_file_names():
    """Check that every field of a sample file name is parsed as expected."""
    sample_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    expected_fields = {
        "variable": "tasmax",
        "domain_resolution": "EUR-11",
        "experiment": "rcp85",
        "member": "r1i1p1",
        "gcm": "MPI-M-MPI-ESM-LR",
        "rcm": "MPI-CSC-REMO2009",
        "time_resolution": "day",
        "version": "v1",
        "start_end": "20360101-20401231",
    }
    result = parse_file_names(sample_name)
    # Compare field by field so extra keys in the result are tolerated.
    for field, expected_value in expected_fields.items():
        assert result[field] == expected_value


def test_build_zarr_name():
    """Check the Zarr store name derived from a parsed sample file name."""
    expected_store = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009.zarr"
    fields = parse_file_names(
        "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    )
    assert build_zarr_store_names(fields) == expected_store


def test_parse_build_roundtrip():
    """Check that parsing a file name and rebuilding it yields the same name."""
    original_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    rebuilt_name = build_nc_file_names(parse_file_names(original_name))
    assert rebuilt_name == original_name


def test_parse_build_wildcard():
    """Check that fields removed from the parsed dict come back as "*"."""
    source_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    expected_pattern = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_*_*_MPI-CSC-REMO2009_*_day_*.nc"

    fields = parse_file_names(source_name)
    # Drop the fields that should turn into wildcards.
    for dropped in ("experiment", "member", "version", "start_end"):
        fields.pop(dropped)

    assert build_nc_file_names(fields) == expected_pattern


# Run the checks as soon as the helpers are defined, so a failing assertion
# surfaces in this cell.
for check in (
    test_build_zarr_name,
    test_parse_file_names,
    test_parse_build_roundtrip,
    test_parse_build_wildcard,
):
    check()

In [None]:
# One row per NetCDF file: the fields parsed from its name plus the file
# path itself in the "data_file" column.
df = pd.DataFrame.from_records(
    [dict(parse_file_names(data_file.name), data_file=data_file) for data_file in data_files]
)

# Glob pattern, in the file's own directory, built from domain_resolution,
# gcm, rcm and variable only. The remaining name fields are not passed to
# build_nc_file_names and therefore become "*".
df["data_file_glob"] = df[["domain_resolution", "gcm", "rcm", "variable", "data_file"]].apply(
    lambda r: str(r["data_file"].parent / build_nc_file_names(r)),
    axis=1,
)
df

In [None]:
# Write the NetCDF catalog next to the data files: one row per distinct
# combination of the selected columns, without the index column.
(
    df[["domain_resolution", "gcm", "rcm", "variable", "data_file_glob"]]
    .drop_duplicates()
    .to_csv(data_path / "catalog.csv", index=False)
)

In [None]:
# Read the catalog back from disk to inspect what was written.
pd.read_csv(data_path.joinpath("catalog.csv"))

In [None]:
df_zarr = df[["domain_resolution", "gcm", "rcm", "variable"]].copy()
df_zarr["zarr_name"] = df_zarr.apply(lambda r: build_zarr_store_names(dict(r)), axis=1)
df_zarr = df_zarr.drop_duplicates(ignore_index=True)
df_zarr.to_csv(data_path / "../zarr/catalog.csv", index=False)
!cat {str(data_path / "../zarr/catalog.csv")}