# Create CMIP6 daily batch files

This notebook derives the `batch_files/daily_*.txt` batch files, which list the source (LLNL ESGF node) and destination (Arctic Climate Data Node) paths of the daily CMIP6 files to transfer with Globus.

In [1]:
import pandas as pd
import luts
from config import *

In [2]:
# The "filenames" column stores stringified lists; split each cell into a Python list.
# NOTE: individual entries may still carry their quote characters after this step.
filename_converters = {"filenames": lambda cell: cell.strip("[]").split(", ")}
df = pd.read_csv("llnl_esgf_day_filenames.csv", converters=filename_converters)
# for now, keep only rows with a non-null n_files (i.e. data that is on the LLNL node)
df = df[df["n_files"].notnull()]

In [3]:
def generate_transfer_paths(row):
    """Generate the paths for transferring between LLNL ESGF node and ACDN

    The relative path of every file follows the ESGF directory convention
    <activity>/<institution>/<model>/<scenario>/<variant>/<frequency>/<variable>/<grid type>/<version>/<filename>
    with the frequency fixed to "day". The same relative path is appended to both
    llnl_prefix (remote side) and acdn_prefix (target side).
    NOTE: Path, llnl_prefix and acdn_prefix are not defined in this notebook;
    presumably they come from `from config import *` -- confirm.

    Args:
        row (pandas.core.series.Series): a single row series from pandas.DataFrame.iterrows() on dataframe of desired data filenames.
            Must provide "model", "scenario", "variant", "variable", "grid_type", "version" and "filenames".

    Returns:
        transfer_list (list): has format [(<remote path>, <target path>), ...] for all non-empty entries in row["filenames"]

    Raises:
        KeyError: if row["model"] has no entry in luts.model_inst_lu
    """
    # historical runs live under the CMIP activity, every other scenario under ScenarioMIP
    activity = "CMIP" if row["scenario"] == "historical" else "ScenarioMIP"
    model = row["model"]
    institution = luts.model_inst_lu[model]["institution"]
    group_path = Path().joinpath(
        activity,
        institution,
        model,
        row["scenario"],
        row["variant"],
        "day",
        row["variable"],
        row["grid_type"],
        row["version"],
    )

    transfer_list = []
    for fn in row["filenames"]:
        # entries come from a stringified list, so they may still carry quotes / padding
        fn = fn.replace("'", "").strip()
        if not fn:
            # an empty list cell ("[]") is parsed to [""]; joining it would point at the
            # version directory instead of a file, so skip it
            continue
        fp = group_path.joinpath(fn)
        transfer_list.append((llnl_prefix.joinpath(fp), acdn_prefix.joinpath(fp)))

    return transfer_list

In [4]:
# Collect the (remote, local) path pairs that will go into the batch file.
# Both sides follow the ESGF directory structure convention
# /<activity>/<institution>/<model>/<scenario>/<variant>/<frequency>/<variable>/<grid type>/<version>/
# so the two paths of each pair should be nearly identical.
# A query string selects only a subset of the data, because a single batch file
# for all daily data may not be a great idea.
query_str = "variable == 'tas' & scenario == 'historical' & model == 'ACCESS-CM2'"
transfer_paths = [
    path_pair
    for _, selected_row in df.query(query_str).iterrows()
    for path_pair in generate_transfer_paths(selected_row)
]

In [7]:
# Write the text file consumed by the `globus transfer --batch` option:
# one transfer per line, formatted as "<remote> <local>".
batch_file = "batch_llnl_day_tas_historical_test.txt"
with open(batch_file, "w") as batch_handle:
    batch_handle.writelines(
        f"{remote_path} {local_path}\n" for remote_path, local_path in transfer_paths
    )

In [6]:
pycat batch_llnl_day_tas_historical.txt

[0;34m/[0m[0mcss03_data[0m[0;34m/[0m[0mCMIP6[0m[0;34m/[0m[0mCMIP[0m[0;34m/[0m[0mCSIRO[0m[0;34m-[0m[0mARCCSS[0m[0;34m/[0m[0mACCESS[0m[0;34m-[0m[0mCM2[0m[0;34m/[0m[0mhistorical[0m[0;34m/[0m[0mr1i1p1f1[0m[0;34m/[0m[0mday[0m[0;34m/[0m[0mtas[0m[0;34m/[0m[0mgn[0m[0;34m/[0m[0mv20191108[0m[0;34m/[0m[0mtas_day_ACCESS[0m[0;34m-[0m[0mCM2_historical_r1i1p1f1_gn_18500101[0m[0;34m-[0m[0;36m18991231.[0m[0mnc[0m [0;34m/[0m[0mCMIP6[0m[0;34m/[0m[0mCMIP[0m[0;34m/[0m[0mCSIRO[0m[0;34m-[0m[0mARCCSS[0m[0;34m/[0m[0mACCESS[0m[0;34m-[0m[0mCM2[0m[0;34m/[0m[0mhistorical[0m[0;34m/[0m[0mr1i1p1f1[0m[0;34m/[0m[0mday[0m[0;34m/[0m[0mtas[0m[0;34m/[0m[0mgn[0m[0;34m/[0m[0mv20191108[0m[0;34m/[0m[0mtas_day_ACCESS[0m[0;34m-[0m[0mCM2_historical_r1i1p1f1_gn_18500101[0m[0;34m-[0m[0;36m18991231.[0m[0mnc[0m[0;34m[0m
[0;34m[0m[0;34m/[0m[0mcss03_data[0m[0;34m/[0m[0mCMIP6[0m[0;34m/[0m[0mCMIP

In [None]:
!globus transfer 415a6320-e49c-11e5-9798-22000b9da45e 7235217a-be50-46ba-be31-70bffe2b5bf4 --label "BAtch day test historical tas" --batch batch_llnl_day_tas_historical_test.txt