In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory that is searched for NetCDF input files (hard-coded absolute path).
data_path = Path("/home/jovyan/shared_materials/climattr_data_please-dont-copy/nc")
# Sort the matches so the file order, and with it the row order of the table built from it, is deterministic.
data_files = sorted(data_path.glob("*.nc"))
# Display the number of files that were found.
len(data_files)

598

In [3]:
def parse_file_names(file_name):
    """Split a data file name into its nine underscore-separated fields.

    The expected layout is
    ``variable_domain_gcm_experiment_member_rcm_version_timeres_startend.ext``.

    Parameters
    ----------
    file_name : str
        Bare file name (no directory part), with or without extension(s).

    Returns
    -------
    dict
        Maps ``variable``, ``domain_resolution``, ``experiment``, ``member``,
        ``gcm``, ``rcm``, ``time_resolution``, ``version`` and ``start_end``
        to the corresponding piece of the file name (all strings).

    Raises
    ------
    ValueError
        If the name does not consist of exactly nine fields.
    """
    # Split into fields first and only then drop the extension from the last
    # field. Cutting the whole name at its first dot would also truncate any
    # earlier field that happens to contain a dot (e.g. a version "v1.1").
    fields = file_name.split("_", 8)
    if len(fields) == 9:
        fields[-1] = fields[-1].split(".")[0]
    # A leftover underscore in the last field means there were more than nine fields.
    if len(fields) != 9 or "_" in fields[-1]:
        raise ValueError(
            f"expected 9 underscore-separated fields in file name {file_name!r}"
        )
    variable, domain_resolution, gcm, experiment, member, rcm, version, time_resolution, start_end = fields
    return {
        "variable": variable,
        "domain_resolution": domain_resolution,
        "experiment": experiment,
        "member": member,
        "gcm": gcm,
        "rcm": rcm,
        "time_resolution": time_resolution,
        "version": version,
        "start_end": start_end,
    }


def build_nc_file_names(parsed):
    """Assemble a ``.nc`` file name (or glob pattern) from parsed fields.

    ``parsed`` is a mapping-like object (anything offering ``keys()`` and
    item access). Every field that is absent from it is replaced by ``"*"``,
    so a partial mapping yields a glob pattern instead of a concrete name.
    """
    field_order = (
        "variable",
        "domain_resolution",
        "gcm",
        "experiment",
        "member",
        "rcm",
        "version",
        "time_resolution",
        "start_end",
    )
    pieces = []
    for field in field_order:
        if field in parsed.keys():
            pieces.append(parsed[field])
        else:
            # Missing field: match anything in this position.
            pieces.append("*")
    return "_".join(pieces) + ".nc"


def build_zarr_store_names(parsed):
    """Return the Zarr store name for a set of parsed file-name fields.

    Only ``variable``, ``domain_resolution``, ``gcm`` and ``rcm`` are used;
    any other entries in ``parsed`` are ignored. A missing one of these four
    keys raises ``KeyError``.
    """
    variable = parsed["variable"]
    domain_resolution = parsed["domain_resolution"]
    gcm = parsed["gcm"]
    rcm = parsed["rcm"]
    return f"{variable}_{domain_resolution}_{gcm}_{rcm}.zarr"

    
def test_parse_file_names():
    """Check that each field of a known file name ends up under the right key."""
    sample_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    expected_fields = {
        "variable": "tasmax",
        "domain_resolution": "EUR-11",
        "experiment": "rcp85",
        "member": "r1i1p1",
        "gcm": "MPI-M-MPI-ESM-LR",
        "rcm": "MPI-CSC-REMO2009",
        "time_resolution": "day",
        "version": "v1",
        "start_end": "20360101-20401231",
    }
    result = parse_file_names(sample_name)
    for key, expected_value in expected_fields.items():
        assert result[key] == expected_value


def test_build_zarr_name():
    """Check the Zarr store name derived from a parsed sample file name."""
    source_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    expected_store = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_MPI-CSC-REMO2009.zarr"
    fields = parse_file_names(source_name)
    assert build_zarr_store_names(fields) == expected_store


def test_parse_build_roundtrip():
    """Check that parsing a file name and rebuilding it gives the same name back."""
    original_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    rebuilt_name = build_nc_file_names(parse_file_names(original_name))
    assert rebuilt_name == original_name


def test_parse_build_wildcard():
    """Check that fields removed from the parsed mapping become ``*`` in the rebuilt name."""
    concrete_name = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_MPI-CSC-REMO2009_v1_day_20360101-20401231.nc"
    expected_pattern = "tasmax_EUR-11_MPI-M-MPI-ESM-LR_*_*_MPI-CSC-REMO2009_*_day_*.nc"
    fields = parse_file_names(concrete_name)
    # Drop the fields that should turn into wildcards.
    for dropped in ("experiment", "member", "version", "start_end"):
        fields.pop(dropped)
    assert build_nc_file_names(fields) == expected_pattern


# Run the inline tests right away; a failing assert raises AssertionError and stops the cell.
test_build_zarr_name()
test_parse_file_names()
test_parse_build_roundtrip()
test_parse_build_wildcard()

In [4]:
# One row per data file: the parsed file-name fields plus the file's path.
df = pd.DataFrame.from_records(
    [dict(parse_file_names(data_file.name), data_file=data_file) for data_file in data_files]
)
# Glob pattern (in the file's own directory) keyed on variable, domain, GCM and RCM.
# The row passed to build_nc_file_names lacks the remaining fields, so they become "*".
df["data_file_glob"] = df[["domain_resolution", "gcm", "rcm", "variable", "data_file"]].apply(
    lambda row: str(row["data_file"].parent / build_nc_file_names(row)),
    axis=1,
)
df

Unnamed: 0,variable,domain_resolution,experiment,member,gcm,rcm,time_resolution,version,start_end,data_file,data_file_glob
0,pr,AFR-22,historical,r1i1p1,CCCma-CanESM2,CCCma-CanRCM4,day,r2,19500101-19501231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
1,pr,AFR-22,historical,r1i1p1,CCCma-CanESM2,CCCma-CanRCM4,day,r2,19510101-19551231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
2,pr,AFR-22,historical,r1i1p1,CCCma-CanESM2,CCCma-CanRCM4,day,r2,19560101-19601231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
3,pr,AFR-22,historical,r1i1p1,CCCma-CanESM2,CCCma-CanRCM4,day,r2,19610101-19651231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
4,pr,AFR-22,historical,r1i1p1,CCCma-CanESM2,CCCma-CanRCM4,day,r2,19660101-19701231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
...,...,...,...,...,...,...,...,...,...,...,...
593,tasmax,EUR-11,rcp85,r1i1p1,NCC-NorESM1-M,SMHI-RCA4,day,v1,20260101-20301231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
594,tasmax,EUR-11,rcp85,r1i1p1,NCC-NorESM1-M,SMHI-RCA4,day,v1,20310101-20351231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
595,tasmax,EUR-11,rcp85,r1i1p1,NCC-NorESM1-M,SMHI-RCA4,day,v1,20360101-20401231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...
596,tasmax,EUR-11,rcp85,r1i1p1,NCC-NorESM1-M,SMHI-RCA4,day,v1,20410101-20451231,/home/jovyan/shared_materials/climattr_data_pl...,/home/jovyan/shared_materials/climattr_data_pl...


In [5]:
# Write one catalog row per distinct (domain, GCM, RCM, variable, glob pattern) combination.
(
    df[["domain_resolution", "gcm", "rcm", "variable", "data_file_glob"]]
    .drop_duplicates()
    .to_csv(data_path / "catalog.csv", index=False)
)

In [6]:
# Read the catalog file back from disk and display it.
pd.read_csv(data_path/ "catalog.csv")

Unnamed: 0,domain_resolution,gcm,rcm,variable,data_file_glob
0,AFR-22,CCCma-CanESM2,CCCma-CanRCM4,pr,/home/jovyan/shared_materials/climattr_data_pl...
1,AFR-22,MOHC-HadGEM2-ES,CLMcom-KIT-CCLM5-0-15,pr,/home/jovyan/shared_materials/climattr_data_pl...
2,AFR-22,MOHC-HadGEM2-ES,GERICS-REMO2015,pr,/home/jovyan/shared_materials/climattr_data_pl...
3,AFR-22,MPI-M-MPI-ESM-LR,CLMcom-KIT-CCLM5-0-15,pr,/home/jovyan/shared_materials/climattr_data_pl...
4,AFR-22,MPI-M-MPI-ESM-LR,GERICS-REMO2015,pr,/home/jovyan/shared_materials/climattr_data_pl...
5,AFR-22,NCC-NorESM1-M,CLMcom-KIT-CCLM5-0-15,pr,/home/jovyan/shared_materials/climattr_data_pl...
6,AFR-22,NCC-NorESM1-M,GERICS-REMO2015,pr,/home/jovyan/shared_materials/climattr_data_pl...
7,EUR-11,CCCma-CanESM2,CLMcom-CCLM4-8-17,pr,/home/jovyan/shared_materials/climattr_data_pl...
8,EUR-11,CNRM-CERFACS-CNRM-CM5,CNRM-ALADIN63,pr,/home/jovyan/shared_materials/climattr_data_pl...
9,EUR-11,IPSL-IPSL-CM5A-MR,DMI-HIRHAM5,pr,/home/jovyan/shared_materials/climattr_data_pl...


In [7]:
df_zarr = df[["domain_resolution", "gcm", "rcm", "variable"]].copy()
df_zarr["zarr_name"] = df_zarr.apply(lambda r: build_zarr_store_names(dict(r)), axis=1)
df_zarr = df_zarr.drop_duplicates(ignore_index=True)
df_zarr.to_csv(data_path / "../zarr/catalog.csv", index=False)
!cat {str(data_path / "../zarr/catalog.csv")}

domain_resolution,gcm,rcm,variable,zarr_name
AFR-22,CCCma-CanESM2,CCCma-CanRCM4,pr,pr_AFR-22_CCCma-CanESM2_CCCma-CanRCM4.zarr
AFR-22,MOHC-HadGEM2-ES,CLMcom-KIT-CCLM5-0-15,pr,pr_AFR-22_MOHC-HadGEM2-ES_CLMcom-KIT-CCLM5-0-15.zarr
AFR-22,MOHC-HadGEM2-ES,GERICS-REMO2015,pr,pr_AFR-22_MOHC-HadGEM2-ES_GERICS-REMO2015.zarr
AFR-22,MPI-M-MPI-ESM-LR,CLMcom-KIT-CCLM5-0-15,pr,pr_AFR-22_MPI-M-MPI-ESM-LR_CLMcom-KIT-CCLM5-0-15.zarr
AFR-22,MPI-M-MPI-ESM-LR,GERICS-REMO2015,pr,pr_AFR-22_MPI-M-MPI-ESM-LR_GERICS-REMO2015.zarr
AFR-22,NCC-NorESM1-M,CLMcom-KIT-CCLM5-0-15,pr,pr_AFR-22_NCC-NorESM1-M_CLMcom-KIT-CCLM5-0-15.zarr
AFR-22,NCC-NorESM1-M,GERICS-REMO2015,pr,pr_AFR-22_NCC-NorESM1-M_GERICS-REMO2015.zarr
EUR-11,CCCma-CanESM2,CLMcom-CCLM4-8-17,pr,pr_EUR-11_CCCma-CanESM2_CLMcom-CCLM4-8-17.zarr
EUR-11,CNRM-CERFACS-CNRM-CM5,CNRM-ALADIN63,pr,pr_EUR-11_CNRM-CERFACS-CNRM-CM5_CNRM-ALADIN63.zarr
EUR-11,IPSL-IPSL-CM5A-MR,DMI-HIRHAM5,pr,pr_EUR-11_IPSL-IPSL-CM5A-MR_DMI-HIRHAM5.zarr
EUR-11,MIROC-MIROC5,UHOH-WRF361H,pr,p