# Create CMIP6 daily batch files

This notebook is used to derive the `batch_files/daily_*.txt` files that contain the endpoints and files to transfer to the Arctic Climate Data Node.

In [93]:
from itertools import product
from multiprocessing import Pool
import pandas as pd
from config import *
import utils

import importlib
importlib.reload(utils)

<module 'utils' from '/workspace/UA/kmredilla/cmip6-utils/transfers/utils.py'>

Define the models and scenarios we are interested in transferring data for, using the same values used in the directory structure of ESGF:

In [145]:
# Lookup of model (ESGF source_id) -> institution (ESGF institution_id).
# Both values are used verbatim as directory names when building paths on the
# LLNL endpoint, so they must match the ESGF directory structure exactly.
model_inst_lu = {
    "ACCESS-CM2": "CSIRO-ARCCSS",
    "CESM2": "NCAR",
    "CNRM-CM6-1-HR": "CNRM-CERFACS",
    "EC-Earth3-Veg-LR": "EC-Earth-Consortium",
    "GFDL-ESM4": "NOAA-GFDL",
    "HadGEM3-GC31-LL": "MOHC",
    "HadGEM3-GC31-MM": "MOHC",
    "KACE-1-0-G": "NIMS-KMA",
    "MIROC6": "MIROC",
    "MPI-ESM1-2-LR": "MPI-M",
    # MRI-ESM2-0 is published under the "MRI" institution directory, not "MPI-M"
    "MRI-ESM2-0": "MRI",
    "NorESM2-MM": "NCC",
}

# future scenarios (ScenarioMIP experiment names) to transfer
scenarios = [
    "ssp126",
    "ssp245",
    "ssp585",
]

# format for the variable tables below is variable ID: descriptive name
variables_tier1 = {
    "tas": "near_surface_air_temperature",
    "pr": "precipitation",
    "psl": "sea_level_pressure",
}

variables_tier2 = {
    "evspsbl": "evaporation_including_sublimation_and_transpiration",
    "mrsos": "moisture_in_upper_portion_of_soil_column",
    "prsn": "snowfall_flux",
    "snw": "surface_snow_amount",
    "mrro": "total_runoff",
}

const_variables = {
    "orog": "surface_altitude",
    "sftlf": "percentage_of_the_grid_cell_occupied_by_land_including_lakes",
    "sftof": "sea_area_percentage",
}

# prefix for LLNL CMIP6 data
llnl_prefix = Path("/css03_data/CMIP6")



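Iterate over the models, scenarios, and variables of interest and see what is available at the LLNL endpoint for each. Every variant found under a model/scenario is queried (e.g. `r1i1p1f1`, `r2i1p1f1`, ...), not only `r1i1p1f1`.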

A useful table would be one with the following columns: model, scenario, variable, grid, version, number of files available

Note that file paths are not consistent between models. The parts likely to differ are the name of the grid subfolder under the variable folder (e.g. `gn` vs. `gr1`) and the version folder under that. For example, here are two paths from two different models for daily temperature data:

```
# NCAR-CESM2
/css03_data/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/tas/gn/v20190308/
# GFDL-ESM4
/css03_data/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/day/tas/gr1/v20190726/
```

So we will need to use the Globus CLI to make queries along the way.

In [151]:
# Build one (activity, model, scenario, variable) tuple per query:
# historical runs live under the "CMIP" activity, the future scenarios
# under "ScenarioMIP". Iterating the dicts yields their keys.
args = [
    *product(["CMIP"], model_inst_lu, ["historical"], variables_tier1),
    *product(["ScenarioMIP"], model_inst_lu, scenarios, variables_tier1),
]

In [159]:
def _attrs_row(model, scenario, variant=None, variable=None, grid_type=None, version=None, filenames=None):
    """Build one row of the availability table; anything not resolved stays None."""
    return {
        "model": model,
        "scenario": scenario,
        "variant": variant,
        "variable": variable,
        "grid_type": grid_type,
        "version": version,
        "n_files": None if filenames is None else len(filenames),
        "filenames": filenames,
    }


def get_attrs(args):
    """List what is available on the LLNL endpoint for one model / scenario / variable.

    For every variant found under the scenario directory, this drills down
    through <variant>/<freq>/<variable>/<grid>/<version>/ and records the grid
    type, the newest version, and the files in that version.

    Args:
        args (tuple): (activity, model, scenario, varname), e.g.
            ("CMIP", "CESM2", "historical", "tas"). Packed into a single tuple
            so the function can be used directly with ``Pool.map``.

    Returns:
        list of dict: one row per variant with keys model, scenario, variant,
            variable, grid_type, version, n_files, filenames. If the scenario
            directory itself cannot be listed, a single row is returned with
            everything but model and scenario set to None. If a listing further
            down fails or is empty, the row for that variant keeps the values
            resolved so far and the remaining fields are None.

    Notes:
        * An int returned by ``utils.get_contents`` is treated as a failed
          listing (presumably an error / return code -- confirm in utils).
        * Variant names are stored exactly as returned by the listing (the
          directory entries carry a trailing "/"), while grid_type and version
          have the "/" removed.
        * Only the first grid type listed for a variable is used.
    """
    activity, model, scenario, varname = args
    scenario_path = llnl_prefix.joinpath(activity, model_inst_lu[model], model, scenario)
    variants = utils.get_contents(llnl_ep, scenario_path)

    if isinstance(variants, int):
        # model / scenario combination not available at all
        return [_attrs_row(model, scenario)]

    variant_list = []
    for variant in variants:
        var_path = scenario_path.joinpath(variant, freq, varname)
        grid_types = utils.get_contents(llnl_ep, var_path)
        if isinstance(grid_types, int) or not grid_types:
            # variable not available for this variant
            variant_list.append(_attrs_row(model, scenario, variant, varname))
            continue
        grid_type = grid_types[0].replace("/", "")

        versions = utils.get_contents(llnl_ep, var_path.joinpath(grid_type))
        if isinstance(versions, int) or not versions:
            # a failed listing here must not abort the whole (long) pool run
            variant_list.append(_attrs_row(model, scenario, variant, varname, grid_type))
            continue
        # go with newer version; names look like "v20191108", so the
        # lexicographic maximum is the most recent one
        use_version = max(v.replace("/", "") for v in versions)

        fns = utils.get_contents(llnl_ep, var_path.joinpath(grid_type, use_version))
        if isinstance(fns, int):
            # version folder could not be listed; keep the version but no files
            fns = None
        variant_list.append(
            _attrs_row(model, scenario, variant, varname, grid_type, use_version, fns)
        )

    return variant_list

In [160]:
%%time
# Run the endpoint queries in 32 worker processes. Each task returns a list
# of row dicts, so `results` is a list of lists (one per entry in `args`).
with Pool(processes=32) as pool:
    results = pool.map(func=get_attrs, iterable=args)

CPU times: user 89.3 ms, sys: 332 ms, total: 421 ms
Wall time: 18min 29s


In [162]:
# Flatten the per-task lists of row dicts into a single table,
# one row per dict returned by get_attrs.
df = pd.DataFrame(
    [row for task_rows in results for row in task_rows]
)

In [163]:
# Display the assembled availability table.
df

Unnamed: 0,model,scenario,variant,variable,grid_type,version,n_files,filenames
0,ACCESS-CM2,historical,r10i1p1f1/,tas,gn,v20220819,4.0,[tas_day_ACCESS-CM2_historical_r10i1p1f1_gn_18...
1,ACCESS-CM2,historical,r1i1p1f1/,tas,gn,v20191108,4.0,[tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_185...
2,ACCESS-CM2,historical,r2i1p1f1/,tas,gn,v20191125,4.0,[tas_day_ACCESS-CM2_historical_r2i1p1f1_gn_185...
3,ACCESS-CM2,historical,r3i1p1f1/,tas,gn,v20200306,4.0,[tas_day_ACCESS-CM2_historical_r3i1p1f1_gn_185...
4,ACCESS-CM2,historical,r4i1p1f1/,tas,gn,v20210607,4.0,[tas_day_ACCESS-CM2_historical_r4i1p1f1_gn_185...
...,...,...,...,...,...,...,...,...
1330,NorESM2-MM,ssp245,r1i1p1f1/,psl,gn,v20191108,9.0,[psl_day_NorESM2-MM_ssp245_r1i1p1f1_gn_2015010...
1331,NorESM2-MM,ssp245,r2i1p1f1/,psl,gn,v20200702,9.0,[psl_day_NorESM2-MM_ssp245_r2i1p1f1_gn_2015010...
1332,NorESM2-MM,ssp585,r1i1p1f1/,tas,gn,v20191108,9.0,[tas_day_NorESM2-MM_ssp585_r1i1p1f1_gn_2015010...
1333,NorESM2-MM,ssp585,r1i1p1f1/,pr,gn,v20191108,9.0,[pr_day_NorESM2-MM_ssp585_r1i1p1f1_gn_20150101...
