# Access DART data from NCAR's data origin and benchmark

In [1]:
# Display output of plots directly in Notebook
import intake
import numpy as np
import pandas as pd
import xarray as xr
import aiohttp
import time
from contextlib import contextmanager
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
init_year0  = '1991'
init_year1  = '2020'
final_year0 = '2071'
final_year1 = '2100'

In [5]:
# # This overwrites the default scheduler with a single-threaded scheduler
# dask.config.set(scheduler='synchronous')  

In [6]:
# Scratch space and the location of the RDA-hosted dataset
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
rda_url = 'https://data.rda.ucar.edu/'
database_num = 'd345001'
cam6_dart_url = f'{rda_url}{database_num}'
# One catalog file per access protocol (HTTPS and OSDF)
https_catalog = f'{cam6_dart_url}/catalogs/https/{database_num}-https-zarr.json'
osdf_catalog = f'{cam6_dart_url}/catalogs/osdf/{database_num}-osdf-zarr.json'

### Create a Dask cluster

#### Dask Introduction

Dask is a solution that enables the scaling of Python libraries. It mimics popular scientific libraries such as numpy, pandas, and xarray, which provides an easier path to parallel processing without having to refactor code.

There are 3 components to parallel processing with Dask: the client, the scheduler, and the workers.

The Client is best envisioned as the application that sends information to the Dask cluster. In Python applications this is handled when the client is defined with client = Client(CLUSTER_TYPE). A Dask cluster comprises a single scheduler that manages the execution of tasks on workers. The CLUSTER_TYPE can be defined in a number of different ways.

There is LocalCluster, created directly in Python with dask.distributed: a cluster that runs on the same hardware as the application and shares its available resources.

In certain JupyterHubs, Dask Gateway may be available, and a dedicated Dask cluster with its own resources can be created dynamically with dask_gateway.

On HPC systems dask_jobqueue is used to connect to the HPC Slurm, PBS or HTCondor job schedulers to provision resources.

The dask.distributed client python module can also be used to connect to existing clusters. A Dask Scheduler and Workers can be deployed in containers, or on Kubernetes, without using a Python function to create a dask cluster. The dask.distributed Client is configured to connect to the scheduler either by container name, or by the Kubernetes service name.

#### Select the Dask cluster type
The default will be LocalCluster as that can run on any system.

If running on an HPC computer with a PBS Scheduler, set to True. Otherwise, set to False.

In [7]:
USE_PBS_SCHEDULER = False

If running on a Jupyter server with Dask Gateway configured, set to True. Otherwise, set to False.

In [8]:
USE_DASK_GATEWAY = False

#### Python function for a PBS cluster

In [9]:
# Create a PBS cluster object
def get_pbs_cluster():
    """Create a ``dask_jobqueue.PBSCluster`` with this notebook's settings.

    The worker local directory and the log directory are both placed under
    ``rda_scratch``.

    Returns:
        The configured ``PBSCluster``.
    """
    from dask_jobqueue import PBSCluster

    dask_dir = rda_scratch + '/dask'
    settings = dict(
        job_name='dask-osdf-24',
        cores=1,
        memory='4GiB',
        processes=1,
        local_directory=dask_dir + '/spill',
        log_directory=dask_dir + '/logs/',
        resource_spec='select=1:ncpus=1:mem=4GB',
        queue='casper',
        walltime='3:00:00',
        # 'ib0' was left commented out in the notebook as an alternative interface
        interface='ext',
    )
    return PBSCluster(**settings)

#### Python function for a Gateway Cluster

In [10]:
def get_gateway_cluster():
    """Request a new cluster from Dask Gateway and enable adaptive scaling.

    Returns:
        The new gateway cluster, set to adapt between 2 and 4 workers.
    """
    from dask_gateway import Gateway

    new_cluster = Gateway().new_cluster()
    # Adaptive scaling bounds: never fewer than 2 workers, never more than 4
    new_cluster.adapt(minimum=2, maximum=4)
    return new_cluster

#### Python function for a Local Cluster

In [11]:
def get_local_cluster(n_workers=4):
    """Create a cluster using the Jupyter server's resources.

    Args:
        n_workers: Number of workers to scale the cluster to. Defaults to 4,
            the value that used to be hard-coded here.

    Returns:
        The ``distributed.LocalCluster`` after the scale request.
    """
    from distributed import LocalCluster

    # Construct with LocalCluster()'s own defaults, then request the worker
    # count, exactly as before; only the count is now configurable.
    cluster = LocalCluster()
    cluster.scale(n_workers)
    return cluster

#### Python logic to select the Dask Cluster type

This uses True/False boolean logic based on the variables set in the previous cells

In [12]:
# Build the cluster that matches the flags set above. The PBS flag is
# checked first, then the Gateway flag; a local cluster is the fallback.
cluster = (
    get_pbs_cluster() if USE_PBS_SCHEDULER
    else get_gateway_cluster() if USE_DASK_GATEWAY
    else get_local_cluster()
)

# Attach a client to the chosen cluster
from distributed import Client
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43011 instead


In [13]:
# Scale the cluster and display cluster dashboard URL
# cluster.scale(4)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43011/status,Workers: 4
Total threads: 4,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45045,Workers: 4
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43011/status,Total threads: 4
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:39633,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/32857/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:44883,
Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-nmx8gyv9,Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-nmx8gyv9

0,1
Comm: tcp://127.0.0.1:37735,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33247/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:40219,
Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-e6wsuy6k,Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-e6wsuy6k

0,1
Comm: tcp://127.0.0.1:41017,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/42717/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:46437,
Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-nxgbqfqb,Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-nxgbqfqb

0,1
Comm: tcp://127.0.0.1:42185,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37505/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:43113,
Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-h4a42ocm,Local directory: /glade/derecho/scratch/harshah/tmp/dask-scratch-space/worker-h4a42ocm


## Access the data from NCAR's Research Data Archive using intake

In [14]:
df_https_test = intake.open_esm_datastore(https_catalog)
df_https_test.df['path'].values

array(['https://data.rda.ucar.edu/d345001/hourly6/HR.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/TSA.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/EFLX_LH_TOT.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/ER.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/VS.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/PS.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/Q.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/US.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/CLDICE.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/T.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/CLDLIQ.zarr'],
      dtype=object)

In [15]:
df_osdf_test = intake.open_esm_datastore(osdf_catalog)
df_osdf_test.df['path'].values

array(['osdf:///ncar/rda/d345001/hourly6/HR.zarr',
       'osdf:///ncar/rda/d345001/hourly6/TSA.zarr',
       'osdf:///ncar/rda/d345001/hourly6/EFLX_LH_TOT.zarr',
       'osdf:///ncar/rda/d345001/hourly6/ER.zarr',
       'osdf:///ncar/rda/d345001/weekly/VS.zarr',
       'osdf:///ncar/rda/d345001/weekly/PS.zarr',
       'osdf:///ncar/rda/d345001/weekly/Q.zarr',
       'osdf:///ncar/rda/d345001/weekly/US.zarr',
       'osdf:///ncar/rda/d345001/weekly/CLDICE.zarr',
       'osdf:///ncar/rda/d345001/weekly/T.zarr',
       'osdf:///ncar/rda/d345001/weekly/CLDLIQ.zarr'], dtype=object)

In [16]:
data_var = 'PS'
col_subset_https = df_https_test.search(variable=data_var)
col_subset_osdf  = df_osdf_test.search(variable=data_var)

In [17]:
dsets_https = col_subset_https.to_dataset_dict(zarr_kwargs={"consolidated": True})
# Show the keys of the returned dictionary of datasets
print(f"\nDataset dictionary keys:\n {dsets_https.keys()}")
# Take the key of the first dataset in the dictionary.
dataset_key = list(dsets_https.keys())[0]
# dataset_key is reused further down to pick the matching OSDF dataset
ds_https = dsets_https[dataset_key]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.frequency.component.vertical_levels'



Dataset dictionary keys:
 dict_keys(['PS.weekly.atm.1'])


In [18]:
dsets_osdf  = col_subset_osdf.to_dataset_dict()
ds_osdf     = dsets_osdf[dataset_key]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.frequency.component.vertical_levels'


In [19]:
ds_osdf  = ds_osdf.PS
ds_https = ds_https.PS
ds_osdf

Unnamed: 0,Array,Chunk
Bytes,15.52 GiB,50.00 MiB
Shape,"(80, 471, 192, 288)","(80, 80, 32, 32)"
Dask graph,324 chunks in 2 graph layers,324 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.52 GiB 50.00 MiB Shape (80, 471, 192, 288) (80, 80, 32, 32) Dask graph 324 chunks in 2 graph layers Data type float64 numpy.ndarray",80  1  288  192  471,

Unnamed: 0,Array,Chunk
Bytes,15.52 GiB,50.00 MiB
Shape,"(80, 471, 192, 288)","(80, 80, 32, 32)"
Dask graph,324 chunks in 2 graph layers,324 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


## Data Access Speed tests
- We will now test how long it takes to access data (via the OSDF and HTTPS-only protocols) for various sizes using the above array

### Prepare data subsets

In [20]:
# Define file path for CSV
csv_file_path = "ncar_benchmark_results.csv"

In [21]:
ds_osdf_1Kb  = ds_osdf.isel(lat=0,lon=0,member_id=0).isel(time=np.arange(130))
ds_https_1Kb = ds_https.isel(lat=0,lon=0,member_id=0).isel(time=np.arange(130))
ds_https_1Kb

Unnamed: 0,Array,Chunk
Bytes,1.02 kiB,640 B
Shape,"(130,)","(80,)"
Dask graph,2 chunks in 4 graph layers,2 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.02 kiB 640 B Shape (130,) (80,) Dask graph 2 chunks in 4 graph layers Data type float64 numpy.ndarray",130  1,

Unnamed: 0,Array,Chunk
Bytes,1.02 kiB,640 B
Shape,"(130,)","(80,)"
Dask graph,2 chunks in 4 graph layers,2 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [22]:
ds_osdf_1Mb  = ds_osdf.isel(time=0).isel(member_id =1+ np.arange(3))
ds_https_1Mb = ds_https.isel(time=0).isel(member_id =1+ np.arange(3))
ds_osdf_1Mb

Unnamed: 0,Array,Chunk
Bytes,1.27 MiB,24.00 kiB
Shape,"(3, 192, 288)","(3, 32, 32)"
Dask graph,54 chunks in 4 graph layers,54 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.27 MiB 24.00 kiB Shape (3, 192, 288) (3, 32, 32) Dask graph 54 chunks in 4 graph layers Data type float64 numpy.ndarray",288  192  3,

Unnamed: 0,Array,Chunk
Bytes,1.27 MiB,24.00 kiB
Shape,"(3, 192, 288)","(3, 32, 32)"
Dask graph,54 chunks in 4 graph layers,54 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [23]:
ds_osdf_10Mb  = ds_osdf.isel(member_id =4).isel(time=np.arange(24))
ds_https_10Mb = ds_https.isel(member_id =4).isel(time=np.arange(24))
ds_osdf_10Mb

Unnamed: 0,Array,Chunk
Bytes,10.12 MiB,192.00 kiB
Shape,"(24, 192, 288)","(24, 32, 32)"
Dask graph,54 chunks in 4 graph layers,54 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 10.12 MiB 192.00 kiB Shape (24, 192, 288) (24, 32, 32) Dask graph 54 chunks in 4 graph layers Data type float64 numpy.ndarray",288  192  24,

Unnamed: 0,Array,Chunk
Bytes,10.12 MiB,192.00 kiB
Shape,"(24, 192, 288)","(24, 32, 32)"
Dask graph,54 chunks in 4 graph layers,54 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [24]:
ds_osdf_100Mb  = ds_osdf.isel(member_id =5).isel(time=np.arange(238))
ds_https_100Mb = ds_https.isel(member_id =5).isel(time=np.arange(238))
#ds_osdf_100Mb

In [25]:
ds_osdf_200Mb  = ds_osdf.isel(member_id = 6)
ds_https_200Mb = ds_https.isel(member_id =6)
#ds_https_200Mb 

In [26]:
ds_osdf_400Mb  = ds_osdf.isel(member_id = 7 +np.arange(2))
ds_https_400Mb = ds_https.isel(member_id =7 + np.arange(2))

In [27]:
ds_osdf_600Mb  = ds_osdf.isel(member_id  = 10 +np.arange(3))
ds_https_600Mb = ds_https.isel(member_id = 10 + np.arange(3))

In [28]:
ds_osdf_800Mb  = ds_osdf.isel(member_id  = 14 +np.arange(4))
ds_https_800Mb = ds_https.isel(member_id = 14 + np.arange(4))

In [29]:
ds_osdf_1Gb  = ds_osdf.isel(member_id  = 19 + np.arange(6)).isel(time = np.arange(410))
ds_https_1Gb = ds_https.isel(member_id = 19 + np.arange(6)).isel(time = np.arange(410))
ds_osdf_1Gb

Unnamed: 0,Array,Chunk
Bytes,1.01 GiB,3.75 MiB
Shape,"(6, 410, 192, 288)","(6, 80, 32, 32)"
Dask graph,324 chunks in 4 graph layers,324 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.01 GiB 3.75 MiB Shape (6, 410, 192, 288) (6, 80, 32, 32) Dask graph 324 chunks in 4 graph layers Data type float64 numpy.ndarray",6  1  288  192  410,

Unnamed: 0,Array,Chunk
Bytes,1.01 GiB,3.75 MiB
Shape,"(6, 410, 192, 288)","(6, 80, 32, 32)"
Dask graph,324 chunks in 4 graph layers,324 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [30]:
# ds_osdf_10Gb  = ds_osdf.isel(member_id  = 12 + np.arange(52))
# ds_https_10Gb = ds_https.isel(member_id = 12 + np.arange(52))
# ds_osdf_10Gb

### Now access data and plot

In [31]:
# Subsets to benchmark, listed in the same order for both protocols.
# Note: the *_1Kb subsets defined earlier are not part of either list.
ds_osdf_list  = [ds_osdf_1Mb,ds_osdf_10Mb,ds_osdf_100Mb,ds_osdf_200Mb,ds_osdf_400Mb,
                 ds_osdf_600Mb,ds_osdf_800Mb,ds_osdf_1Gb]
ds_https_list = [ds_https_1Mb,ds_https_10Mb,ds_https_100Mb,ds_https_200Mb,ds_https_400Mb,
                 ds_https_600Mb,ds_https_800Mb,ds_https_1Gb]

In [32]:
# Number of data access calls
num_calls = 7  # Modify this as needed
n_workers = 4  # Set this to your preferred number of workers

In [33]:
# DiagnosticTimer class to keep track of runtimes
class DiagnosticTimer:
    """Collect runtimes of code blocks together with free-form labels.

    Every completed ``with timer.time(...)`` block appends one dict to
    ``self.diagnostics``: the keyword arguments that were passed plus a
    ``"runtime"`` entry holding the elapsed seconds.
    """

    def __init__(self):
        # One dict per timed block, in the order the blocks finished
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block.

        Args:
            **kwargs: Labels stored alongside the measured runtime.

        A block that raises is not recorded; the exception propagates.
        """
        # perf_counter() is monotonic and high-resolution, so the interval
        # cannot be distorted by system clock adjustments as time.time() can.
        tic = time.perf_counter()
        yield
        toc = time.perf_counter()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return all recorded measurements as a DataFrame, one row each."""
        return pd.DataFrame(self.diagnostics)


# Initialize the DiagnosticTimer
diag_timer = DiagnosticTimer()

In [34]:
# Function to check existing CSV file and determine missing runs
def load_existing_results():
    """Return the benchmark results recorded so far.

    Returns:
        The contents of ``csv_file_path`` as a DataFrame when that file
        exists, otherwise an empty DataFrame with the result columns.
    """
    if not os.path.exists(csv_file_path):
        # Nothing on disk yet: return an empty frame with the expected columns
        return pd.DataFrame(columns=["dataset_size", "protocol", "call_number", "runtime", "MBps"])
    return pd.read_csv(csv_file_path)

def filter_missing_runs(datasets, protocol_name, existing_df, n_calls=None):
    """List the (dataset, call) combinations that still have to be benchmarked.

    Args:
        datasets: Iterable of objects exposing ``nbytes``.
        protocol_name: Label to match against the ``protocol`` column.
        existing_df: Previously recorded results with at least the columns
            ``dataset_size``, ``protocol`` and ``call_number``.
        n_calls: Number of calls expected per dataset. Defaults to the
            module-level ``num_calls``.

    Returns:
        A list of ``(dataset, dataset_size_mb, call_num)`` tuples, ordered by
        dataset and then by 1-based call number, for every combination that
        is not present in ``existing_df``.
    """
    if n_calls is None:
        n_calls = num_calls

    # Narrow to this protocol once instead of rebuilding a three-way boolean
    # mask over the whole frame for every (dataset, call) pair.
    recorded = existing_df[existing_df["protocol"] == protocol_name]
    recorded_sizes = recorded["dataset_size"].to_numpy(dtype=float)
    recorded_calls = recorded["call_number"].to_numpy()

    filtered_datasets = []
    for dataset in datasets:
        dataset_size_mb = dataset.nbytes / (1024 ** 2)
        # Recorded sizes are floats that were written to and parsed from CSV;
        # exact equality can miss a finished run, so compare with a tight
        # relative tolerance instead.
        same_size = np.isclose(recorded_sizes, dataset_size_mb, rtol=1e-9, atol=0.0)
        done_calls = set(recorded_calls[same_size].tolist())
        filtered_datasets.extend(
            (dataset, dataset_size_mb, call_num)
            for call_num in range(1, n_calls + 1)
            if call_num not in done_calls
        )

    return filtered_datasets

In [35]:
def benchmark_protocol(datasets, protocol_name, cluster=None):
    """Time loading each dataset into memory and checkpoint every run to CSV.

    Runs already present in ``csv_file_path`` are skipped. Timings are kept
    in a timer local to this call, so they leave the function only through
    the CSV file.

    Args:
        datasets: Datasets to load; each must provide ``nbytes``, ``copy()``
            and ``load()``.
        protocol_name: Label written to the ``protocol`` column.
        cluster: Optional Dask cluster. When given, it is scaled to zero and
            back to ``n_workers`` before every timed load.
    """
    # Resume from the checkpoint: only runs absent from the CSV are executed
    recorded = load_existing_results()
    pending = filter_missing_runs(datasets, protocol_name, recorded)
    timer = DiagnosticTimer()

    for dataset, dataset_size_mb, call_num in pending:
        if cluster is not None:
            # Cycle the workers (intended to release worker memory), then
            # wait until the requested number is available again.
            cluster.scale(0)
            cluster.scale(n_workers)
            client.wait_for_workers(n_workers)

        print(f"Starting processing of dataset for protocol '{protocol_name}' (Size: {dataset_size_mb} MB) in call {call_num}")

        # The copy is made outside the timed region so only load() is measured
        fresh = dataset.copy()
        with timer.time(dataset_size=dataset_size_mb, protocol=protocol_name, call_number=call_num):
            fresh.load()

        # Latest measurement as a one-row frame, extended with the throughput
        latest = timer.dataframe().iloc[[-1]].copy()
        latest["MBps"] = latest["dataset_size"] / latest["runtime"]

        # Emit the header row only when the CSV does not exist yet
        write_header = not os.path.exists(csv_file_path)
        latest.to_csv(csv_file_path, mode='a', header=write_header, index=False)
        print(f"Appended results for protocol '{protocol_name}', call {call_num} to '{csv_file_path}'")

        print(f"Finished processing dataset for protocol '{protocol_name}' in call {call_num}")


In [36]:
# def benchmark_protocol(datasets, protocol_name, cluster=None):
#     for index, dataset in enumerate(datasets):
#         # Calculate dataset size in MB for logging
#         dataset_size_mb = dataset.nbytes / (1024 ** 2)
        
#         # Each dataset will be loaded multiple times to capture caching effect
#         for call_num in range(num_calls):
#             if cluster is not None:
#                 # Scale down to zero workers to clear memory
#                 cluster.scale(0)  # Stop all workers
#                 cluster.scale(n_workers)  # Scale up to the required number of workers
#                 client.wait_for_workers(n_workers)  # Wait for the workers to be ready

#             # Only count the time for loading dataset into memory, excluding cluster scaling time
#             dataset_copy = dataset.copy()
#             with diag_timer.time(dataset_size=dataset_size_mb, protocol=protocol_name, call_number=call_num + 1):
#                 dataset_copy.load()  # Load the dataset into memory
#             print(f" Finished processing dataset {index + 1} in {call_num + 1} th call") 

In [None]:
# Run benchmark for each protocol
benchmark_protocol(ds_https_list, "HTTPS-only",cluster=None)
benchmark_protocol(ds_osdf_list, "OSDF-director",cluster=None)

# benchmark_protocol() records into its own local timer and checkpoints each
# run to the CSV, so the module-level diag_timer stays empty. Read the results
# back from the CSV instead; that also picks up runs already recorded there.
df_diagnostics = load_existing_results()

# Calculate MB/s for each run
df_diagnostics['MBps'] = df_diagnostics['dataset_size'] / df_diagnostics['runtime']
df_diagnostics

Starting processing of dataset for protocol 'HTTPS-only' (Size: 100.40625 MB) in call 3


In [None]:
# Plotting MBps vs data size for each protocol and call type
# Per-protocol transparency and marker so the two series can be told apart
alpha_values = {"HTTPS-only": 0.8, "OSDF-director": 0.5}  # Adjust transparency as needed
marker_style = {"HTTPS-only": "o", "OSDF-director": "x"}  # Define different markers for each protocol

fig, ax = plt.subplots(figsize=(10, 6))
for protocol in ["HTTPS-only", "OSDF-director"]:
    # First access (call_number == 1)
    first_access = df_diagnostics[(df_diagnostics['protocol'] == protocol) & (df_diagnostics['call_number'] == 1)]
    # scatter() has no `markersize` keyword; its size argument is `s`, the
    # marker area in points**2, so 8**2 matches the markersize=8 used by plot().
    ax.scatter(first_access['dataset_size'], first_access['MBps'], label=f"{protocol} - First Access",
               alpha=alpha_values[protocol], marker=marker_style[protocol], s=64)

    # Subsequent access (call_number > 1), averaged per dataset size
    subsequent_access = df_diagnostics[(df_diagnostics['protocol'] == protocol) & (df_diagnostics['call_number'] > 1)]
    subsequent_access_avg = subsequent_access.groupby('dataset_size')['MBps'].mean()
    ax.plot(subsequent_access_avg.index, subsequent_access_avg.values,
            linestyle='--', label=f"{protocol} - Subsequent Access (Avg)",
            alpha=alpha_values[protocol], marker=marker_style[protocol], markersize=8)

# Customize plot appearance
ax.set_xlabel("Data Size (MB)")
ax.set_ylabel("Data Access Speed (MBps)")
ax.set_title("NCAR origin benchmark")
ax.legend()
plt.show()


In [None]:
# Convert dataset size to categorical to control the order in the plot.
# This is done on a copy: changing the dtype in df_diagnostics itself would
# alter what the scatter/line cell sees if it is re-run after this one.
box_data = df_diagnostics.copy()
box_data['dataset_size'] = box_data['dataset_size'].astype("category")

# Set the order for dataset sizes to appear in ascending order
size_order = sorted(box_data['dataset_size'].unique())

# Create the box plot
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=box_data,
    x="dataset_size",
    y="MBps",
    hue="protocol",
    order=size_order
)

# Customize plot appearance
plt.xlabel("Data Size (MB)")
plt.ylabel("Data Access Speed (MBps)")
plt.title("NCAR to UWMadison, Dask: 4x4GiB, 7 requests")
plt.legend(title="Protocol")
plt.show()

### Try with a specific cache

In [None]:
# historical_smbb_test1 = historical_smbb.isel(time=0).isel(member_id =1+ np.arange(5))
# historical_smbb_test1

In [None]:
# %%timeit -r2 -n3 -o
# historical_smbb_test1.compute()

In [None]:
# #Try using a specific cache
# sdsc_cache='https://sdsc-cache.nationalresearchplatform.org:8443/aws-opendata/us-west-2/ncar-cesm2-lens/atm/monthly/'+\
#             'cesm2LE-historical-smbb-TREFHTMX.zarr'

In [None]:
# %%time
# test_1 = xr.open_zarr(sdsc_cache).TREFHTMX.isel(time=0)
# test_1