# Access CESM2 Large Ensemble data hosted on AWS and compute a climatology average

In [1]:
# Imports
import geocat.comp as gc
import intake
import numpy as np
import pandas as pd
import xarray as xr
# import seaborn as sns
import re
import aiohttp

In [2]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
# Inclusive climatology window; the *_str copies are the ones used to build date labels.
year0, year1 = '1991', '2020'
year0_str, year1_str = str(year0), str(year1)

# Boulder coordinates: latitude as given, longitude wrapped into the [0, 360) range
boulder_lat = 40.0150
boulder_lon = (360 - 105.2705) % 360
print(boulder_lat, boulder_lon)

40.015 254.7295


In [5]:
# Locations used by the rest of the notebook.
# Scratch area on the shared filesystem (used for Dask spill and log directories).
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'

# Base URL of the data server and the intake catalog JSON published beneath it.
rda_url = 'https://data.rda.ucar.edu/'
catalog_relpath = 'harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'
intake_url = rda_url + catalog_relpath

### Spin up a cluster

In [6]:
# Build the PBS-backed Dask cluster; spill and log directories sit under the scratch area.
spill_dir = rda_scratch+'/dask/spill'
log_dir = rda_scratch+'/dask/logs'

cluster = PBSCluster(
    # Scheduling: job name, queue and wall-clock limit for each submitted job
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='3:00:00',
    # Sizing: one core / one process per job, with a matching PBS resource request
    cores=1,
    processes=1,
    memory='4GiB',
    resource_spec='select=1:ncpus=1:mem=4GB',
    # Storage for worker spill files and job logs
    local_directory=spill_dir,
    log_directory=log_dir,
    # Network interface for worker communication ('ib0' is the alternative tried previously)
    interface='ext',
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34375 instead


In [7]:
# Attach a distributed client to the cluster; the bare name on the last line
# renders the client summary (dashboard link, worker count) as the cell output.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/34375/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/34375/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:37661,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/34375/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [8]:
# Ask the cluster to scale to 2, then display the cluster summary.
# The summary can still report 0 workers right after this call, as in the output below.
cluster.scale(2)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/34375/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.98:37661,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/34375/status,Total threads: 0
Started: Just now,Total memory: 0 B


### Load CESM2 temperature data and apply geocat-comp's climatology average

In [9]:
# Open the intake-ESM datastore described by the JSON file at intake_url
# and display its summary of unique values per column.
osdf_catalog = intake.open_esm_datastore(intake_url)
osdf_catalog

Unnamed: 0,unique
Unnamed: 0,322
variable,53
long_name,51
component,4
experiment,2
forcing_variant,2
frequency,3
vertical_levels,3
spatial_domain,3
units,20


In [10]:
# Peek at the first few entries of the catalog's 'path' column to see where the stores live.
first_paths = osdf_catalog.df['path'].head()
first_paths.values

array(['osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNSC.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLUT.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNS.zarr',
       'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNSC.zarr'],
      dtype=object)

In [11]:
# Narrow the catalog to the daily TREFHT temperature entries with the cmip6 forcing variant.
temp_query = {
    'variable': 'TREFHT',
    'frequency': 'daily',
    'forcing_variant': 'cmip6',
}
osdf_catalog_temp = osdf_catalog.search(**temp_query)
osdf_catalog_temp

Unnamed: 0,unique
Unnamed: 0,2
variable,1
long_name,1
component,1
experiment,2
forcing_variant,1
frequency,1
vertical_levels,1
spatial_domain,1
units,1


In [12]:
%%time
# Load every entry of the filtered catalog into a dictionary of datasets.
# Earlier variant, kept for reference: to_dataset_dict(storage_options={'anon':True})
dsets = osdf_catalog_temp.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.frequency.forcing_variant'


CPU times: user 467 ms, sys: 84.7 ms, total: 551 ms
Wall time: 23.5 s


In [13]:
%%time
# List the keys of the returned dictionary; they are used below to pick a dataset.
dsets.keys()

CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 5.25 μs


dict_keys(['atm.ssp370.daily.cmip6', 'atm.historical.daily.cmip6'])

In [14]:
# Take the historical dataset and keep only its TREFHT variable.
historical_ds = dsets['atm.historical.daily.cmip6']
historical_cmip6 = historical_ds['TREFHT']
historical_cmip6

Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 620.30 GiB 153.98 MiB Shape (50, 60225, 192, 288) (1, 730, 192, 288) Dask graph 4150 chunks in 2 graph layers Data type float32 numpy.ndarray",50  1  288  192  60225,

Unnamed: 0,Array,Chunk
Bytes,620.30 GiB,153.98 MiB
Shape,"(50, 60225, 192, 288)","(1, 730, 192, 288)"
Dask graph,4150 chunks in 2 graph layers,4150 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


#### To illustrate how the function works, select a small subset
- Choose data between year0 and year1
- Choose data from only one member
- Choose data for Boulder

In [15]:
%%time
historical_cmip6_30years = historical_cmip6.isel(member_id=0).sel(lat =boulder_lat,lon=boulder_lon,method='nearest').\
                             sel(time = slice(f'{year0_str}-01-01', f'{year1_str}-12-31'))
historical_cmip6_30years

CPU times: user 46.2 ms, sys: 0 ns, total: 46.2 ms
Wall time: 46.5 ms


Unnamed: 0,Array,Chunk
Bytes,34.22 kiB,2.85 kiB
Shape,"(8760,)","(730,)"
Dask graph,13 chunks in 5 graph layers,13 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 34.22 kiB 2.85 kiB Shape (8760,) (730,) Dask graph 13 chunks in 5 graph layers Data type float32 numpy.ndarray",8760  1,

Unnamed: 0,Array,Chunk
Bytes,34.22 kiB,2.85 kiB
Shape,"(8760,)","(730,)"
Dask graph,13 chunks in 5 graph layers,13 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [16]:
# Force evaluation of the lazy selection and return the series as an in-memory array.
# This is the first step that actually sends work to the Dask cluster.
historical_cmip6_30years.values

2024-10-25 11:00:36,108 - distributed.protocol.core - CRITICAL - Failed to Serialize
Traceback (most recent call last):
  File "/glade/u/home/harshah/.conda/envs/osdf/lib/python3.12/site-packages/distributed/protocol/core.py", line 109, in dumps
    frames[0] = msgpack.dumps(msg, default=_encode_default, use_bin_type=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/glade/u/home/harshah/.conda/envs/osdf/lib/python3.12/site-packages/msgpack/__init__.py", line 35, in packb
    return Packer(**kwargs).pack(o)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/glade/u/home/harshah/.conda/envs/osdf/lib/python3.12/site-packages/msgpack/fallback.py", line 885, in pack
    self._pack(obj)
  File "/glade/u/home/harshah/.conda/envs/osdf/lib/python3.12/site-packages/msgpack/fallback.py", line 861, in _pack
    self._pack(obj[i], nest_limit - 1)
  File "/glade/u/home/harshah/.conda/envs/osdf/lib/python3.12/site-packages/msgpack/fallback.py", line 864, in _pack

FutureCancelledError: ('getitem-ea52235805ccd7ddea05566a3ede2fa3', 10) cancelled for reason: scheduler-connection-lost.
Client lost the connection to the scheduler. Please check your connection and re-run your work.

In [None]:
%%time
# Apply geocat-comp's climatology_average to the Boulder series with freq='month'.
hist_cmip6_monthly = gc.climatology_average(historical_cmip6_30years,freq='month')
hist_cmip6_monthly

In [None]:
%%time
# Materialize the monthly climatology as an in-memory array.
hist_cmip6_monthly.values

## Shut down the cluster

When you are finished, run `cluster.close()` to release the cluster's resources.