# Access CESM2 Large Ensemble data on AWS through the OSDF AWS Open Data origin

In [1]:
# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import intake
import numpy as np
import pandas as pd
import xarray as xr
# import s3fs
import seaborn as sns
import re
# import nest_asyncio
# nest_asyncio.apply()
import aiohttp

In [2]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
# file = PelicanMap('/chtc/PUBLIC/eturetsky/ncar-subset/ncar/monthly/cesm2LE-historical-cmip6-FLUT.zarr', pelfs)
# test = xr.open_dataset(file, engine='zarr')
# test

In [5]:
# Local filesystem locations used by this notebook:
#   rda_scratch        - scratch area (Dask spill and log directories live under it)
#   new_intake_path    - JSON description of the OSDF version of the catalog
#   new_intake_csvpath - CSV asset table of the OSDF version of the catalog
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
rda_data = '/gpfs/csfs1/collections/rda/data/'
new_intake_path = f'{rda_data}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'
new_intake_csvpath = f'{rda_data}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.csv'

### Set up osdf url to use with PelicanFS
- We should use one of the two PelicanFS fsspec protocols (`osdf` or `pelican`) instead of the `https` protocol.
- We will use the 'osdf' protocol and modify the existing CESM2-LENS catalog
- So, the urls will look like: osdf_discovery_url + namespace prefix + path to file or object
- In this case, the urls will look like osdf:///aws-opendata/us-west-2/ncar-cesm2-lens + path to individual zarr stores

In [6]:
# URL building blocks.
# Catalog paths start with the S3 scheme; it is swapped for the PelicanFS 'osdf'
# protocol plus the AWS open-data namespace. (The https director form,
# 'https://osdf-director.osg-htc.org/aws-opendata/us-west-2/', is the alternative
# that is not used here.)
s3_link = 's3://'
osdf_url = 'osdf:///aws-opendata/us-west-2/'

# https location of the modified (OSDF) intake catalog.
rda_url = 'https://data.rda.ucar.edu/'
new_intake_url = f'{rda_url}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'

In [7]:
%%time
# Create the PelicanFS filesystem object; it is passed to PelicanMap further down
# to back the zarr store mapping.
pelfs = OSDFFileSystem() # OSDFFileSystem is already aware of the osdf discovery url
# Alternative kept for reference: name the federation explicitly.
#pelfs = PelicanFileSystem("pelican://osg-htc.org")
# pelfs.ls('/ncar/rda/')

CPU times: user 1.46 ms, sys: 0 ns, total: 1.46 ms
Wall time: 1.47 ms


In [8]:
%%time
# pelfs = PelicanFileSystem("https://osdf-director.osg-htc.org/")
# pelfs.ls('/aws-opendata/us-west-2/ncar-cesm2-lens/')

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 4.77 μs


In [9]:
# Build the namespace-rooted path of the daily TREFHT zarr store (following
# Emma's example) and wrap it in a PelicanMap backed by `pelfs`.
# No director URL (e.g. 'https://osdf-director.osg-htc.org/') is prepended.
cesm2_lens_path = '/aws-opendata/us-west-2/ncar-cesm2-lens/'
zarr_path = f'{cesm2_lens_path}atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr'
pel_zarr = PelicanMap(zarr_path, pelfs)
print(zarr_path)

/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr


- Note the leading `/` in `cesm2_lens_path`, i.e. the extra `/` that sits between the Pelican director URL and the namespace path. This is required.

In [10]:
%%time
# Now, try to access the zarr store using open_zarr
# `pel_zarr` is the PelicanMap built over the TREFHT store; only the TREFHT
# variable is kept. The recorded output shows a dask-backed array, so the
# values themselves are not loaded by this cell.
test_zarr = xr.open_zarr(pel_zarr).TREFHT
# Alternative kept for reference:
#test_zarr = xr.open_mfdataset(pel_zarr, engine='zarr') 
test_zarr

CPU times: user 539 ms, sys: 181 ms, total: 721 ms
Wall time: 19.3 s


Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 620.30 GiB 153.98 MiB Shape (50, 60225, 192, 288) (1, 730, 192, 288) Dask graph 4150 chunks in 2 graph layers Data type float32 numpy.ndarray",50  1  288  192  60225,

Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [11]:
%%time
#Using HTTPS protocol
#test = xr.open_dataset('https://osdf-director.osg-htc.org/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',\
#                       engine='zarr').TREFHT
#Using pelicanfs' OSDF protocol
test = xr.open_dataset('osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',\
                       engine='zarr').TREFHT
test

CPU times: user 103 ms, sys: 3.44 ms, total: 107 ms
Wall time: 430 ms


In [12]:
# %%time
# test.values

# Comments
- PelicanFs doesn't seem to support the 'ls' command
- However, we can load the zarr store using the full url/ pelfs 
- So, let us try using an intake catalog
- The original intake catalog can be found at 'https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json'
- Let us modify this catalog by replacing the `s3://` prefix of every path (e.g. `s3://ncar-cesm2-lens/...`) with the OSDF protocol + namespace, `osdf:///aws-opendata/us-west-2/`
- The size of this catalog is > 100 MB. So, let us not upload it to a github repo
- Let us save the catalog to a folder on NCAR's disk storage

## Modify the intake catalog

In [13]:
# Load the original CESM2-LENS collection description file with intake;
# this is the catalog whose paths get rewritten for OSDF.
aws_catalog_url = 'https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json'
catalog = intake.open_esm_datastore(aws_catalog_url)
catalog

Unnamed: 0,unique
variable,53
long_name,51
component,4
experiment,2
forcing_variant,2
frequency,3
vertical_levels,3
spatial_domain,3
units,20
start_time,4


In [13]:
# df_s3 = catalog.df
# df_s3['path']

In [14]:
# # Change s3 paths to osdf paths
# df_s3['path'] = df_s3['path'].str.replace(s3_link, '')
# df_s3['path'] = osdf_url + df_s3['path'] 
# df_s3['path'].head().values

In [15]:
# %%time
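# df_s3.to_csv(new_intake_csvpath, index=False)  # index=False: otherwise the index is written and shows up as an 'Unnamed: 0' column in the catalog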

## Test the new intake catalog after spinning up a cluster

In [14]:
# Define the PBS-backed Dask cluster: each job is a single-process,
# single-core worker with 4 GiB of memory on the 'casper' queue.
# Spill files and worker logs are written under the scratch area.
dask_spill_dir = f'{rda_scratch}/dask/spill'
dask_log_dir = f'{rda_scratch}/dask/logs'

cluster = PBSCluster(
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='5:00:00',
    # per-worker resources
    cores=1,
    processes=1,
    memory='4GiB',
    resource_spec='select=1:ncpus=1:mem=4GB',
    # network interface ('ib0' is the alternative that is not used here)
    interface='ext',
    # working directories
    local_directory=dask_spill_dir,
    log_directory=dask_log_dir,
)

In [15]:
# Connect a distributed client to the PBS cluster and display its summary
# (dashboard link, worker count).
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.115:42503,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [16]:
# Request 4 workers from the cluster, then display the cluster summary.
# The recorded output still reports "Workers: 0" — presumably the PBS jobs had
# not started yet when the cell finished.
cluster.scale(4)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.115:42503,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


# Access the data from the AWS bucket using intake to compare

In [17]:
# Open the modified catalog (paths rewritten to the osdf:// protocol) from its https URL.
osdf_catalog = intake.open_esm_datastore(new_intake_url)
osdf_catalog

Unnamed: 0,unique
Unnamed: 0,322
variable,53
long_name,51
component,4
experiment,2
forcing_variant,2
frequency,3
vertical_levels,3
spatial_domain,3
units,20


In [18]:
# Peek at the first few asset paths to confirm they use the osdf:// protocol.
osdf_catalog.df['path'].head().values

array(['osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNSC.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLUT.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNSC.zarr'],
      dtype=object)

In [21]:
# Narrow the catalog down to the daily TREFHTMX assets.
trefhtmx_daily_query = {'variable': 'TREFHTMX', 'frequency': 'daily'}
osdf_catalog_temp = osdf_catalog.search(**trefhtmx_daily_query)
osdf_catalog_temp

Unnamed: 0,unique
Unnamed: 0,4
variable,1
long_name,1
component,1
experiment,2
forcing_variant,2
frequency,1
vertical_levels,1
spatial_domain,1
units,1


In [22]:
%%time
# Load every asset matched by the search into a dictionary of datasets.
# NOTE(review): in the recorded run this call raised ESMDataSourceError for
# key='atm.historical.daily.smbb', so `dsets` was never assigned and the cells
# that use it fail with NameError. The root cause is not visible here —
# TODO investigate before relying on the cells that follow.
# Variant from the S3 workflow, kept for reference:
#dsets = osdf_catalog_temp.to_dataset_dict(storage_options={'anon':True})
dsets = osdf_catalog_temp.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.frequency.forcing_variant'


ESMDataSourceError: Failed to load dataset with key='atm.historical.daily.smbb'
                 You can use `cat['atm.historical.daily.smbb'].df` to inspect the assets/files for this key.
                 

In [23]:
%%time
# List the dataset keys (format: 'component.experiment.frequency.forcing_variant').
# Needs `dsets` from the to_dataset_dict() cell; the recorded run failed with
# NameError because that cell did not complete.
dsets.keys()

NameError: name 'dsets' is not defined

In [24]:
# Pick the historical / daily / smbb atmosphere dataset out of the dictionary
# and display it. Needs `dsets` from the to_dataset_dict() cell.
historical_smbb = dsets['atm.historical.daily.smbb']
historical_smbb

NameError: name 'dsets' is not defined

In [None]:
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).plot()

In [None]:
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).values