# Access AWS CESM2 data from the AWS Open Data origin and benchmark access speed

In [30]:
# Display output of plots directly in Notebook
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given)
warnings.filterwarnings("ignore")
import intake
import numpy as np
import pandas as pd
import xarray as xr
import re
import aiohttp

In [31]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 

In [32]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [33]:
# Year-range bounds, stored as strings
init_year0, init_year1 = '1991', '2020'
final_year0, final_year1 = '2071', '2100'

In [34]:
# Make the single-threaded 'synchronous' scheduler dask's default.
# NOTE(review): a distributed Client is created further down in this notebook —
# confirm this setting is still wanted once that Client is active.
dask.config.set({'scheduler': 'synchronous'})

<dask.config.set at 0x14a685ec7200>

In [35]:
# Scratch area (used further down for the dask spill and log directories)
rda_scratch   = '/gpfs/csfs1/collections/rda/scratch/harshah'

# Dataset root on the RDA data server: base URL followed by the dataset id
rda_url       = 'https://data.rda.ucar.edu/'
database_num  = 'd345001'
cam6_dart_url = f'{rda_url}{database_num}'

# Catalog JSON files for the two access methods (HTTPS and OSDF)
https_catalog = f'{cam6_dart_url}/catalogs/https/{database_num}-https-zarr.json'
osdf_catalog  = f'{cam6_dart_url}/catalogs/osdf/{database_num}-osdf-zarr.json'

## Open intake catalog and load files after spinning up a cluster

In [36]:
# Create a PBS cluster object describing the jobs that host the dask workers
cluster = PBSCluster(
    # PBS job settings
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='5:00:00',
    resource_spec='select=1:ncpus=1:mem=4GB',
    # Worker layout per job: one process, one core, 4 GiB
    cores=1,
    processes=1,
    memory='4GiB',
    # Network interface for the workers (alternative kept for reference: 'ib0')
    interface='ext',
    # Spill and log directories under the scratch area
    local_directory=f'{rda_scratch}/dask/spill',
    log_directory=f'{rda_scratch}/dask/logs',
)

In [37]:
# Attach a distributed client to the PBS cluster and display its summary
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37047/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37047/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:44211,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37047/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [38]:
# Scale the cluster to 2 workers, then display the cluster summary
cluster.scale(2)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37047/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:44211,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37047/status,Total threads: 0
Started: Just now,Total memory: 0 B


# Access the data through the intake catalogs (HTTPS and OSDF)

In [40]:
# Open the HTTPS catalog and show the store paths it lists
df_https_test = intake.open_esm_datastore(https_catalog)
df_https_test.df.loc[:, 'path'].values

array(['https://data.rda.ucar.edu/d345001/hourly6/HR.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/TSA.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/EFLX_LH_TOT.zarr',
       'https://data.rda.ucar.edu/d345001/hourly6/ER.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/VS.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/PS.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/Q.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/US.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/CLDICE.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/T.zarr',
       'https://data.rda.ucar.edu/d345001/weekly/CLDLIQ.zarr'],
      dtype=object)

In [50]:
# Open the OSDF catalog and show the store paths it lists
df_osdf_test = intake.open_esm_datastore(osdf_catalog)
df_osdf_test.df.loc[:, 'path'].values

array(['osdf:///ncar/rda/d345001/hourly6/HR.zarr',
       'osdf:///ncar/rda/d345001/hourly6/TSA.zarr',
       'osdf:///ncar/rda/d345001/hourly6/EFLX_LH_TOT.zarr',
       'osdf:///ncar/rda/d345001/hourly6/ER.zarr',
       'osdf:///ncar/rda/d345001/weekly/VS.zarr',
       'osdf:///ncar/rda/d345001/weekly/PS.zarr',
       'osdf:///ncar/rda/d345001/weekly/Q.zarr',
       'osdf:///ncar/rda/d345001/weekly/US.zarr',
       'osdf:///ncar/rda/d345001/weekly/CLDICE.zarr',
       'osdf:///ncar/rda/d345001/weekly/T.zarr',
       'osdf:///ncar/rda/d345001/weekly/CLDLIQ.zarr'], dtype=object)

In [42]:
# Restrict both catalogs (HTTPS first, then OSDF) to a single variable
data_var = 'PS'
col_subset_https, col_subset_osdf = (
    catalog.search(variable=data_var)
    for catalog in (df_https_test, df_osdf_test)
)

In [48]:
# Load every asset matched by the HTTPS search into a dict of datasets,
# asking for consolidated Zarr metadata.
# NOTE(review): newer intake-esm releases may prefer `xarray_open_kwargs` over
# `zarr_kwargs` — confirm against the installed version.
dsets_https = col_subset_https.to_dataset_dict(zarr_kwargs=dict(consolidated=True))
print(f"\nDataset dictionary keys:\n {dsets_https.keys()}")

# Keep the dataset stored under the first key of the dictionary
dataset_key = [*dsets_https][0]
ds_https = dsets_https[dataset_key]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.frequency.component.vertical_levels'



Dataset dictionary keys:
 dict_keys(['PS.weekly.atm.1'])


In [49]:
# Load the OSDF search results into a dict of datasets.
# NOTE(review): the saved output of this cell is an ESMDataSourceError for key
# 'PS.weekly.atm.1'. This call also omits the zarr_kwargs passed in the HTTPS
# load — confirm whether the two are related before relying on dsets_osdf.
dsets_osdf  = col_subset_osdf.to_dataset_dict()
#ds_osdf = dsets_osdf[dataset_key]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.frequency.component.vertical_levels'


ESMDataSourceError: Failed to load dataset with key='PS.weekly.atm.1'
                 You can use `cat['PS.weekly.atm.1'].df` to inspect the assets/files for this key.
                 

## Data Access Speed tests
- We will now test how long it takes to access data (via OSDF) for various sizes using one of the above arrays

#### Test 0 : Single data point, Memory = 4 bytes.

In [17]:
# Select index 0 along every one of lat, lon, time and member_id (a single value).
# NOTE(review): `historical_smbb` is not assigned anywhere in the visible part of
# this notebook — confirm where it is defined so the cell runs on a fresh kernel.
historical_smbb_test0 = historical_smbb.isel(
    {'lat': 0, 'lon': 0, 'time': 0, 'member_id': 0}
)
historical_smbb_test0

Unnamed: 0,Array,Chunk
Bytes,4 B,4 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
Array Chunk Bytes 4 B 4 B Shape () () Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
%%timeit -r2 -n3 -o
# Time the compute of the Test 0 selection: 2 repeats (-r2) of 3 loops each (-n3);
# -o returns the TimeitResult object.
historical_smbb_test0.compute()

#### Test 1: Whole globe, 5 members for 1 time step, Memory ~ 1 MiB

In [19]:
# First time step, then the five members at positions 1..5
member_positions = np.arange(1, 6)
historical_smbb_test1 = historical_smbb.isel(time=0).isel(member_id=member_positions)
historical_smbb_test1

Unnamed: 0,Array,Chunk
Bytes,1.05 MiB,216.00 kiB
Shape,"(5, 192, 288)","(1, 192, 288)"
Dask graph,5 chunks in 4 graph layers,5 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.05 MiB 216.00 kiB Shape (5, 192, 288) (1, 192, 288) Dask graph 5 chunks in 4 graph layers Data type float32 numpy.ndarray",288  192  5,

Unnamed: 0,Array,Chunk
Bytes,1.05 MiB,216.00 kiB
Shape,"(5, 192, 288)","(1, 192, 288)"
Dask graph,5 chunks in 4 graph layers,5 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [20]:
%%timeit -r2 -n3 -o
# Time the compute of the Test 1 selection: 2 repeats (-r2) of 3 loops each (-n3);
# -o returns the TimeitResult object.
historical_smbb_test1.compute()

#### Test 2: Whole globe + all member_ids for 1 time step, Memory ~ 10 MiB

In [21]:
# Single time step (position 1), keeping every other dimension whole
historical_smbb_test2 = historical_smbb.isel({'time': 1})
historical_smbb_test2

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.55 MiB 216.00 kiB Shape (50, 192, 288) (1, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",288  192  50,

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [22]:
%%timeit -r2 -n3 -o
# Time the compute of the Test 2 selection: 2 repeats (-r2) of 3 loops each (-n3);
# -o returns the TimeitResult object.
historical_smbb_test2.compute()

#### Test 3: Whole globe, all members and 10 time steps, Memory ~ 100 MiB

In [23]:
# Ten consecutive time steps, positions 2..11
time_positions = np.arange(2, 12)
historical_smbb_test3 = historical_smbb.isel(time=time_positions)
historical_smbb_test3

Unnamed: 0,Array,Chunk
Bytes,105.47 MiB,2.11 MiB
Shape,"(50, 10, 192, 288)","(1, 10, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 105.47 MiB 2.11 MiB Shape (50, 10, 192, 288) (1, 10, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  10,

Unnamed: 0,Array,Chunk
Bytes,105.47 MiB,2.11 MiB
Shape,"(50, 10, 192, 288)","(1, 10, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [24]:
%%timeit -r2 -n3 -o
# Time the compute of the Test 3 selection: 2 repeats (-r2) of 3 loops each (-n3);
# -o returns the TimeitResult object.
historical_smbb_test3.compute()

#### Test 4: Whole globe, all members and 100 time steps, Memory ~ 1 GiB

In [25]:
# One hundred consecutive time steps, positions 20..119
time_positions = np.arange(20, 120)
historical_smbb_test4 = historical_smbb.isel(time=time_positions)
historical_smbb_test4

Unnamed: 0,Array,Chunk
Bytes,1.03 GiB,21.09 MiB
Shape,"(50, 100, 192, 288)","(1, 100, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.03 GiB 21.09 MiB Shape (50, 100, 192, 288) (1, 100, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  100,

Unnamed: 0,Array,Chunk
Bytes,1.03 GiB,21.09 MiB
Shape,"(50, 100, 192, 288)","(1, 100, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [26]:
%%timeit -r2 -n3 -o
# Time the compute of the Test 4 selection: 2 repeats (-r2) of 3 loops each (-n3);
# -o returns the TimeitResult object.
historical_smbb_test4.compute()

#### Test 5: Whole globe, all members and 1000 time steps, Memory ~ 10 GiB

In [27]:
# One thousand consecutive time steps, positions 120..1119
time_positions = np.arange(120, 1120)
historical_smbb_test5 = historical_smbb.isel(time=time_positions)
historical_smbb_test5

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,128.67 MiB
Shape,"(50, 1000, 192, 288)","(1, 610, 192, 288)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.30 GiB 128.67 MiB Shape (50, 1000, 192, 288) (1, 610, 192, 288) Dask graph 100 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  1000,

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,128.67 MiB
Shape,"(50, 1000, 192, 288)","(1, 610, 192, 288)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
%timeit -r2 -n3 -o
historical_smbb_test5.compute()

In [None]:
###########################################################################

In [33]:
# Try using a specific cache: address the Zarr store through one named cache host
sdsc_cache = (
    'https://sdsc-cache.nationalresearchplatform.org:8443/aws-opendata/us-west-2/ncar-cesm2-lens/atm/monthly/'
    'cesm2LE-historical-smbb-TREFHTMX.zarr'
)

In [34]:
%%time
test_1 = xr.open_zarr(sdsc_cache).TREFHTMX.isel(time=0)
test_1

CPU times: user 27.5 ms, sys: 8.92 ms, total: 36.4 ms
Wall time: 704 ms


Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.55 MiB 216.00 kiB Shape (50, 192, 288) (1, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",288  192  50,

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
