# ESGF Virtual Aggregation

The aim of this project is to create a ready-to-deploy THREDDS Data Server (TDS) catalog including ALL available data in ESGF. The catalog uses OPeNDAP endpoints to provide ESGF data analysis while avoiding the download of any data from remote repositories.

In [None]:
import time
import psutil

import xarray
import dask

In [None]:
# Set dask's default scheduler to "processes" for the computations that follow.
dask.config.set(scheduler="processes")

## Performance testing

OPeNDAP reads chunks from netCDF files, performs decompression on the server, and transmits the uncompressed data over the network. Zarr and netCDF+kerchunk, on the other hand, both send the chunks compressed over the network.

netCDF clients support HTTP compression, but that compression is applied by the HTTP component; OPeNDAP still performs decompression when reading.

In [None]:
# Benchmark matrix: every access method is measured `runs` times for each
# worker count in `nworkers`, against each host in `data_nodes`.
runs = 3
nworkers = [2, 4, 8]

# One record per measurement is appended here by the benchmark cells.
results = []

# Author's notes on two of the hosts below:
#   DKRZ - no OPeNDAP any longer
#   NCI  - 503, reading from tape?
data_nodes = [
    "esgf.ceda.ac.uk",
    "aims3.llnl.gov",
    "esg1.umr-cnrm.fr",
    "esgf.nci.org.au",
    "esgf3.dkrz.de",
]

# Presumably the URL scheme to use for each data node.
# NOTE(review): "esgf.nci.org.au" is listed in data_nodes but has no entry
# here - confirm the intended scheme before indexing this dict by data node.
protocol = {
    "esgf.ceda.ac.uk": "https",
    "esg1.umr-cnrm.fr": "http",
    "aims3.llnl.gov": "https",
    "esgf3.dkrz.de": "https",
}

def measure(op, name, nworkers, run, data_node):
    """Run ``op.compute`` once and record its duration and network traffic.

    Network figures are the differences between two
    ``psutil.net_io_counters()`` snapshots taken immediately before and after
    the computation.

    Args:
        op: Object exposing ``compute(num_workers=...)``; the benchmark cells
            pass lazy xarray reductions.
        name: Label of the access method being measured; stored unchanged.
        nworkers: Passed to ``op.compute`` as ``num_workers`` and stored under
            the ``"workers"`` key.
        run: Repetition index; stored unchanged.
        data_node: Data node label; stored unchanged.

    Returns:
        dict: Keys ``name``, ``run``, ``data_node``, ``time`` (elapsed
        seconds), ``bytes_recv``, ``bytes_sent``, ``packets_recv``,
        ``packets_sent``, ``errin``, ``errout``, ``dropin``, ``dropout`` and
        ``workers``, in that order.
    """
    counter_fields = (
        "bytes_recv",
        "bytes_sent",
        "packets_recv",
        "packets_sent",
        "errin",
        "errout",
        "dropin",
        "dropout",
    )

    start_net = psutil.net_io_counters()
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    op.compute(num_workers=nworkers)

    elapsed = time.perf_counter() - start_time
    end_net = psutil.net_io_counters()

    result = {
        "name": name,
        "run": run,
        "data_node": data_node,
        "time": elapsed,
    }
    # Per-counter delta between the two snapshots, in the documented key order.
    result.update(
        {
            field: getattr(end_net, field) - getattr(start_net, field)
            for field in counter_fields
        }
    )
    result["workers"] = nworkers

    return result

### Kerchunk

In [None]:
# Benchmark reading through the kerchunk reference files (zarr engine).
for dn in data_nodes:
    # The try wraps the whole node: the first failure skips the remaining
    # runs for this data node and the loop moves on to the next one.
    try:
        for n in nworkers:
            for r in range(runs):
                # Open inside a context manager so the dataset is closed after
                # every run instead of leaving one open handle per iteration.
                with xarray.open_dataset(
                    "reference://",
                    engine="zarr",
                    backend_kwargs={
                        "consolidated": False,
                        "storage_options": {"fo": f"kerchunks/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp245_day_tas_gr_v20190410_{dn}.json", "remote_protocol": "https"}
                    }) as ds:
                    v = ds.chunk({"variant_label": 1, "time": 400})["tas"]
                    results.append(measure(v.mean(["lat", "lon"]), "Kerchunk", n, r, dn))
    except Exception as err:
        # Catch Exception rather than everything, so that a keyboard/kernel
        # interrupt still stops the benchmark; report what actually failed.
        print(f"Error with {dn}: {err!r}")

### OPeNDAP

In [None]:
# Disable HTTP compression: on every line of ~/.dodsrc that mentions
# HTTP.DEFLATE, replace the first "1" with "0" (the file is edited in place).
!sed -i '/HTTP\.DEFLATE/{s|1|0|}' ~/.dodsrc

In [None]:
# Benchmark reading the same dataset through the OPeNDAP endpoints.
for dn in data_nodes:
    # The try wraps the whole node: the first failure skips the remaining
    # runs for this data node and the loop moves on to the next one.
    try:
        for n in nworkers:
            for r in range(runs):
                dataset = f"https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/demo/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp245_day_tas_gr_v20190410_{dn}.ncml"
                # Open inside a context manager so the dataset is closed after
                # every run instead of leaving one open handle per iteration.
                with xarray.open_dataset(dataset) as ds:
                    v = ds.chunk({"variant_label": 1, "time": 400})["tas"]
                    results.append(measure(v.mean(["lat", "lon"]), "OPeNDAP", n, r, dn))
    except Exception as err:
        # Catch Exception rather than everything, so that a keyboard/kernel
        # interrupt still stops the benchmark; report what actually failed.
        print(f"Error with {dn}: {err!r}")

### OPeNDAP with HTTP compression

In [None]:
# Enable HTTP compression: on every line of ~/.dodsrc that mentions
# HTTP.DEFLATE, replace the first "0" with "1" (the file is edited in place).
!sed -i '/HTTP\.DEFLATE/{s|0|1|}' ~/.dodsrc

In [None]:
# Benchmark the OPeNDAP endpoints again, recorded as "OPeNDAP-compression".
for dn in data_nodes:
    # The try wraps the whole node: the first failure skips the remaining
    # runs for this data node and the loop moves on to the next one.
    try:
        for n in nworkers:
            for r in range(runs):
                dataset = f"https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/demo/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp245_day_tas_gr_v20190410_{dn}.ncml"
                # Open inside a context manager so the dataset is closed after
                # every run instead of leaving one open handle per iteration.
                with xarray.open_dataset(dataset) as ds:
                    v = ds.chunk({"variant_label": 1, "time": 400})["tas"]
                    results.append(measure(v.mean(["lat", "lon"]), "OPeNDAP-compression", n, r, dn))
    except Exception as err:
        # Catch Exception rather than everything, so that a keyboard/kernel
        # interrupt still stops the benchmark; report what actually failed.
        print(f"Error with {dn}: {err!r}")

### Store the results

In [None]:
import pandas as pd

In [None]:
# Turn the list of per-run measurement dicts into a table, one row per run.
df = pd.DataFrame.from_records(results)
# Bare expression: displays the table as the cell output.
df

In [None]:
# Write the measurements to results.csv, leaving out the DataFrame index column.
df.to_csv("results.csv", index=False)