# ESGF Virtual Aggregation

The aim of this project is to create a ready-to-deploy TDS catalog that includes ALL the data available in ESGF, using OPeNDAP endpoints to enable analysis of ESGF data without having to download any data from the remote repositories.

In [1]:
import time
import psutil

import xarray
import dask

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cartopy.crs as ccrs

In [2]:
# Make dask's process-based scheduler the default for the computations that follow.
dask.config.set(scheduler="processes")

<dask.config.set at 0x7fbc7b20fa00>

## Performance testing

OPeNDAP reads chunks from netCDF files, decompresses them on the server, and transmits the uncompressed data over the network. In contrast, Zarr and netCDF+kerchunk both send the chunks over the network in compressed form.

netCDF clients support HTTP compression, but that compression is applied by the HTTP component; OPeNDAP still decompresses the chunks when reading them.

In [3]:
# Worker counts to benchmark; each value is handed to measure() as its nworkers argument.
nworkers = [2, 4, 8]
# Accumulates one result dict per measure() call across all the benchmark cells.
results = []

def measure(op, name, nworkers):
    """Run ``op.compute`` once and record its duration and network I/O deltas.

    Parameters
    ----------
    op : object
        Lazy computation exposing ``compute(num_workers=...)``; the computed
        value itself is discarded.
    name : str
        Label stored under ``"name"`` to identify the access method.
    nworkers : int
        Forwarded to ``op.compute`` as ``num_workers`` and stored under
        ``"workers"``.

    Returns
    -------
    dict
        Keys, in order: ``name``, ``time`` (elapsed seconds), ``bytes_recv``,
        ``bytes_sent``, ``packets_recv``, ``packets_sent``, ``errin``,
        ``errout``, ``dropin``, ``dropout`` (each the difference between the
        ``psutil.net_io_counters()`` snapshots taken after and before the
        computation) and ``workers``.

    Notes
    -----
    ``psutil.net_io_counters()`` is documented as system-wide, so traffic from
    unrelated processes during the run is included in the deltas.
    """
    counter_fields = (
        "bytes_recv",
        "bytes_sent",
        "packets_recv",
        "packets_sent",
        "errin",
        "errout",
        "dropin",
        "dropout",
    )

    start_net = psutil.net_io_counters()
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or turn negative) when the system wall clock is adjusted mid-run.
    start_time = time.perf_counter()

    op.compute(num_workers=nworkers)

    elapsed = time.perf_counter() - start_time
    end_net = psutil.net_io_counters()

    result = {"name": name, "time": elapsed}
    for field in counter_fields:
        result[field] = getattr(end_net, field) - getattr(start_net, field)
    result["workers"] = nworkers

    return result

### Kerchunk

In [4]:
# Open the dataset through the Zarr engine using a local JSON file as "fo"
# (presumably the kerchunk reference file, per the section heading); the
# remote protocol is HTTPS.
ds = xarray.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp245_day_tas_gr_v20190410_aims3.llnl.gov.json",
            "remote_protocol": "https",
        },
    },
)
# Re-chunk: one variant_label per chunk, 400 time steps per chunk.
ds = ds.chunk({"variant_label": 1, "time": 400})
v = ds["tas"]

In [None]:
# Benchmark the lat/lon mean of the Kerchunk-backed variable once per worker count.
for worker_count in nworkers:
    spatial_mean = v.mean(["lat", "lon"])
    results.append(measure(spatial_mean, "Kerchunk", worker_count))

### OPeNDAP

In [None]:
# OPeNDAP (dodsC) endpoint of the NcML aggregation served by the THREDDS instance.
dataset = "https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/demo/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp245_day_tas_gr_v20190410_aims3.llnl.gov.ncml"

ds = xarray.open_dataset(dataset)
# Re-chunk: one variant_label per chunk, 400 time steps per chunk.
ds = ds.chunk({"variant_label": 1, "time": 400})
v = ds["tas"]

In [None]:
# Disable HTTP compression: on every line of ~/.dodsrc that matches HTTP.DEFLATE,
# replace the first "1" with "0" (the file is edited in place).
# NOTE(review): nothing changes if ~/.dodsrc has no HTTP.DEFLATE line — confirm it is present.
!sed -i '/HTTP\.DEFLATE/{s|1|0|}' ~/.dodsrc

In [None]:
# Benchmark the lat/lon mean over OPeNDAP once per worker count.
for worker_count in nworkers:
    spatial_mean = v.mean(["lat", "lon"])
    results.append(measure(spatial_mean, "OPeNDAP", worker_count))

### OPeNDAP with HTTP compression

In [None]:
# Enable HTTP compression: on every line of ~/.dodsrc that matches HTTP.DEFLATE,
# replace the first "0" with "1" (the file is edited in place).
# NOTE(review): nothing changes if ~/.dodsrc has no HTTP.DEFLATE line — confirm it is present.
!sed -i '/HTTP\.DEFLATE/{s|0|1|}' ~/.dodsrc

In [None]:
# Benchmark the lat/lon mean over OPeNDAP again, labelled as the deflate run, once per worker count.
for worker_count in nworkers:
    spatial_mean = v.mean(["lat", "lon"])
    results.append(measure(spatial_mean, "OPeNDAP-deflate", worker_count))

### Store the results

In [None]:
# Collect every result dict gathered so far into one table (one row per measure() call).
df = pd.DataFrame.from_records(results)
df

In [None]:
# Left commented out so that re-running the notebook does not overwrite the saved
# results file; uncomment to persist a fresh benchmark run.
# df.to_csv("kerchunk-results-2.csv", index=False)

### Analyze the results

In [None]:
# Load the results from the CSV file; this rebinds df, so the analysis below
# uses the file contents rather than any table built earlier in the session.
df = pd.read_csv("kerchunk-results-2.csv")
df

In [None]:
with sns.axes_style("darkgrid"):
    # Throughput in MiB per unit of "time": bytes received scaled by 2**20.
    df["throughput"] = (df["bytes_recv"] / 2**20) / df["time"]

    fig, axes = plt.subplots(1, 3, figsize=(16,5))

    # One panel per metric: (column to plot, y-axis label).
    panels = [
        ("time", "Time (seconds)"),
        ("throughput", "Throughput (MiB/s)"),
        ("bytes_recv", "Size (bytes)"),
    ]

    for ax, (column, ylabel) in zip(axes, panels):
        sns.barplot(data=df, y=column, x="name", hue="workers", ax=ax)
        # Blank the x-axis label that barplot derives from the "name" column.
        ax.set_xlabel("")
        ax.set_ylabel(ylabel)