# Access AWS CESM2 using the AWS open data origin data and benchmark

In [1]:
# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import intake
import numpy as np
import pandas as pd
import xarray as xr
import re
import aiohttp

In [2]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
init_year0  = '1991'
init_year1  = '2020'
final_year0 = '2071'
final_year1 = '2100'

In [5]:
# This overwrites the default scheduler with a single-threaded scheduler
dask.config.set(scheduler='synchronous')  

<dask.config.set at 0x14e4e217d1c0>

In [6]:
# File paths
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
#
rda_url        =  'https://data.rda.ucar.edu/'
intake_url = rda_url + 'harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'

## Open intake catalog and load files after spinning up a cluster

In [7]:
# Create a PBS cluster object
cluster = PBSCluster(
    job_name = 'dask-wk24-hpc',
    cores = 1,
    memory = '4GiB',
    processes = 1,
    local_directory = rda_scratch+'/dask/spill',
    resource_spec = 'select=1:ncpus=1:mem=4GB',
    queue = 'casper',
    walltime = '5:00:00',
    log_directory = rda_scratch+'/dask/logs',
    #interface = 'ib0'
    interface = 'ext'
)

In [8]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44775/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44775/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:39465,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44775/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [9]:
cluster.scale(4)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44775/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:39465,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44775/status,Total threads: 0
Started: Just now,Total memory: 0 B


# Access the data from the AWS bucket using intake

In [10]:
osdf_catalog = intake.open_esm_datastore(
   intake_url 
)
osdf_catalog

Unnamed: 0,unique
Unnamed: 0,322
variable,53
long_name,51
component,4
experiment,2
forcing_variant,2
frequency,3
vertical_levels,3
spatial_domain,3
units,20


In [11]:
osdf_catalog.df['path'].head().values

array(['osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNSC.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLUT.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNSC.zarr'],
      dtype=object)

In [12]:
osdf_catalog_temp = osdf_catalog.search(variable ='TREFHTMX', frequency ='daily')
osdf_catalog_temp

Unnamed: 0,unique
Unnamed: 0,4
variable,1
long_name,1
component,1
experiment,2
forcing_variant,2
frequency,1
vertical_levels,1
spatial_domain,1
units,1


In [13]:
%%time
#dsets = osdf_catalog_temp.to_dataset_dict(storage_options={'anon':True})
dsets = osdf_catalog_temp.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.frequency.forcing_variant'


CPU times: user 1.37 s, sys: 255 ms, total: 1.62 s
Wall time: 59 s


In [14]:
%%time
dsets.keys()

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 5.48 μs


dict_keys(['atm.historical.daily.smbb', 'atm.ssp370.daily.smbb', 'atm.ssp370.daily.cmip6', 'atm.historical.daily.cmip6'])

In [15]:
historical_smbb = dsets['atm.historical.daily.smbb']
historical_smbb = historical_smbb.TREFHTMX
historical_smbb

Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 620.30 GiB 153.98 MiB Shape (50, 60225, 192, 288) (1, 730, 192, 288) Dask graph 4150 chunks in 2 graph layers Data type float32 numpy.ndarray",50  1  288  192  60225,

Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Data Access Speed tests
- We will now test how long it takes to access data (via OSDF) for various sizes using one of the above arrays

#### Test 0 : Single data point, Memory = 4 bytes.

In [16]:
historical_smbb_test0 = historical_smbb.isel(lat=0,lon=0,time=0,member_id=0)
historical_smbb_test0 

Unnamed: 0,Array,Chunk
Bytes,4 B,4 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
Array Chunk Bytes 4 B 4 B Shape () () Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,4 B,4 B
Shape,(),()
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
%%timeit -r2 -n3 -o
historical_smbb_test0.compute()

The slowest run took 41.06 times longer than the fastest. This could mean that an intermediate result is being cached.
115 ms ± 110 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 115 ms ± 110 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

#### Test 1: Whole globe, 5 membes for 1 time step, Memory ~ 1Mb

In [18]:
historical_smbb_test1 = historical_smbb.isel(time=0).isel(member_id =1 + np.arange(5))
historical_smbb_test1

Unnamed: 0,Array,Chunk
Bytes,1.05 MiB,216.00 kiB
Shape,"(5, 192, 288)","(1, 192, 288)"
Dask graph,5 chunks in 4 graph layers,5 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.05 MiB 216.00 kiB Shape (5, 192, 288) (1, 192, 288) Dask graph 5 chunks in 4 graph layers Data type float32 numpy.ndarray",288  192  5,

Unnamed: 0,Array,Chunk
Bytes,1.05 MiB,216.00 kiB
Shape,"(5, 192, 288)","(1, 192, 288)"
Dask graph,5 chunks in 4 graph layers,5 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [19]:
%%timeit -r2 -n3 -o
historical_smbb_test1.compute()

The slowest run took 44.87 times longer than the fastest. This could mean that an intermediate result is being cached.
183 ms ± 175 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 183 ms ± 175 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

#### Test 2: Whole globe + all member_ids for 1 time step, Memory ~ 10Mb

In [20]:
historical_smbb_test2 = historical_smbb.isel(time=1)
historical_smbb_test2

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.55 MiB 216.00 kiB Shape (50, 192, 288) (1, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",288  192  50,

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [21]:
%%timeit -r2 -n3 -o
historical_smbb_test2.compute()

The slowest run took 76.50 times longer than the fastest. This could mean that an intermediate result is being cached.
1.01 s ± 989 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 1.01 s ± 989 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

#### Test 3: Whole globe, all members and 10 time steps, Memory ~ 100 Mb

In [22]:
historical_smbb_test3 = historical_smbb.isel(time= 2 + np.arange(10))
historical_smbb_test3

Unnamed: 0,Array,Chunk
Bytes,105.47 MiB,2.11 MiB
Shape,"(50, 10, 192, 288)","(1, 10, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 105.47 MiB 2.11 MiB Shape (50, 10, 192, 288) (1, 10, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  10,

Unnamed: 0,Array,Chunk
Bytes,105.47 MiB,2.11 MiB
Shape,"(50, 10, 192, 288)","(1, 10, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [23]:
%%timeit -r2 -n3 -o
historical_smbb_test3.compute()

6.14 s ± 14.5 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 6.14 s ± 14.5 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

#### Test 4: Whole globe, all members and 100 time steps, Memory: 1 Gb

In [24]:
historical_smbb_test4 = historical_smbb.isel(time=20 + np.arange(100))
historical_smbb_test4

Unnamed: 0,Array,Chunk
Bytes,1.03 GiB,21.09 MiB
Shape,"(50, 100, 192, 288)","(1, 100, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.03 GiB 21.09 MiB Shape (50, 100, 192, 288) (1, 100, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  100,

Unnamed: 0,Array,Chunk
Bytes,1.03 GiB,21.09 MiB
Shape,"(50, 100, 192, 288)","(1, 100, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [25]:
%%timeit -r2 -n3 -o
historical_smbb_test4.compute()

7.41 s ± 263 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 7.41 s ± 263 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

#### Test 5: Whole globe, all members and 1000 time steps, Memory: 10 Gb

In [26]:
historical_smbb_test5 = historical_smbb.isel(time= 130 + np.arange(1000))
historical_smbb_test5

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,126.56 MiB
Shape,"(50, 1000, 192, 288)","(1, 600, 192, 288)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.30 GiB 126.56 MiB Shape (50, 1000, 192, 288) (1, 600, 192, 288) Dask graph 100 chunks in 3 graph layers Data type float32 numpy.ndarray",50  1  288  192  1000,

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,126.56 MiB
Shape,"(50, 1000, 192, 288)","(1, 600, 192, 288)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [27]:
%%timeit -r2 -n3 -o
historical_smbb_test5.compute()

4min 23s ± 455 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


<TimeitResult : 4min 23s ± 455 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)>

In [None]:
###########################################################################

In [33]:
#Try using a specific cache
sdsc_cache='https://sdsc-cache.nationalresearchplatform.org:8443/aws-opendata/us-west-2/ncar-cesm2-lens/atm/monthly/'+\
            'cesm2LE-historical-smbb-TREFHTMX.zarr'

In [34]:
%%time
test_1 = xr.open_zarr(sdsc_cache).TREFHTMX.isel(time=0)
test_1

CPU times: user 27.5 ms, sys: 8.92 ms, total: 36.4 ms
Wall time: 704 ms


Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.55 MiB 216.00 kiB Shape (50, 192, 288) (1, 192, 288) Dask graph 50 chunks in 3 graph layers Data type float32 numpy.ndarray",288  192  50,

Unnamed: 0,Array,Chunk
Bytes,10.55 MiB,216.00 kiB
Shape,"(50, 192, 288)","(1, 192, 288)"
Dask graph,50 chunks in 3 graph layers,50 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
