# Access CMIP6 zarr data from AWS using the osdf protocol and compute Equilibrium Climate Sensitivity (ECS)
- This workflow is an adaptation of https://gallery.pangeo.io/repos/pangeo-gallery/cmip6/ECS_Gregory_method.html
- We use the [Gregory method](https://agupubs.onlinelibrary.wiley.com/doi/epdf/10.1029/2003GL018747) to compute ECS

## Table of Contents
- [Section 1: Introduction](#Section-1:-Introduction) 
- [Section 2: Select Dask Cluster](#Section-2:-Select-Dask-Cluster) 
- [Section 3: Data Loading](#Section-3:-Data-Loading) 
- [Section 4: Data Analysis](#Section-4:-Data-Analysis) 

## Section 1: Introduction
- Load Python packages
- Load catalog url

In [1]:
from matplotlib import pyplot as plt
import xarray as xr
import numpy as np
import dask
from dask.diagnostics import progress
from tqdm.autonotebook import tqdm
import intake
import fsspec
import seaborn as sns
import re
import aiohttp
from dask_jobqueue import PBSCluster
import pandas as pd
from xhistogram.xarray import histogram

  from tqdm.autonotebook import tqdm


In [2]:
# import fsspec.implementations.http as fshttp
from pelicanfs.core import OSDFFileSystem,PelicanMap 

In [3]:
# Scratch directory; the PBS cluster places its Dask spill files and logs beneath it
rda_scratch = '/glade/campaign/collections/rda/scratch/harshah'
# Base URL of the RDA data server
rda_url = 'https://data.rda.ucar.edu/'
# JSON catalog of the CMIP6 zarr stores, resolved relative to the RDA base URL
cat_url = f'{rda_url}d850001/catalogs/osdf/cmip6-aws/cmip6-osdf-zarr.json'
# Alternative catalog served directly from S3 (disabled):
# cat_url     = 'https://cmip6-pds.s3.amazonaws.com/pangeo-cmip6.json'

## Section 2: Select Dask Cluster

#### Select the Dask cluster type
The default will be LocalCluster as that can run on any system.

If running on a HPC computer with a PBS Scheduler, set to True. Otherwise, set to False.

In [4]:
# True -> create the cluster through the PBS scheduler; checked before USE_DASK_GATEWAY
USE_PBS_SCHEDULER = True

If running on Jupyter server with Dask Gateway configured, set to True. Otherwise, set to False.

In [5]:
# True -> create the cluster through Dask Gateway; only consulted when USE_PBS_SCHEDULER is False
USE_DASK_GATEWAY = False

#### Python function for a PBS cluster

In [6]:
# Create a PBS cluster object
def get_pbs_cluster():
    """Build a ``dask_jobqueue.PBSCluster`` with one core and 4 GiB per job.

    Spill files and logs are written beneath ``rda_scratch + '/dask'``.

    Returns
    -------
    PBSCluster
        The newly constructed cluster object.
    """
    from dask_jobqueue import PBSCluster

    dask_dir = rda_scratch + '/dask'
    pbs_options = dict(
        job_name='dask-osdf-24',
        cores=1,
        memory='4GiB',
        processes=1,
        local_directory=dask_dir + '/spill',
        log_directory=dask_dir + '/logs/',
        resource_spec='select=1:ncpus=1:mem=4GB',
        queue='casper',
        walltime='3:00:00',
        # interface='ib0'
        interface='ext',
    )
    return PBSCluster(**pbs_options)

#### Python function for a Gateway Cluster

In [7]:
def get_gateway_cluster():
    """Request a new cluster from Dask Gateway and enable adaptive scaling.

    The cluster adapts between 2 and 4 workers.

    Returns
    -------
    The cluster object returned by ``Gateway().new_cluster()``.
    """
    from dask_gateway import Gateway

    gateway_cluster = Gateway().new_cluster()
    gateway_cluster.adapt(minimum=2, maximum=4)
    return gateway_cluster

In [8]:
def get_local_cluster(n_workers=6):
    """Create a ``LocalCluster`` on the Jupyter server's own resources.

    Parameters
    ----------
    n_workers : int, optional
        Number of workers to scale the cluster to. Defaults to 6, the value
        that was previously hard-coded.

    Returns
    -------
    LocalCluster
        The started cluster, after ``scale(n_workers)`` has been requested.
    """
    # Only LocalCluster is needed here; the unused performance_report import was dropped.
    from distributed import LocalCluster

    cluster = LocalCluster()
    cluster.scale(n_workers)
    return cluster

#### Python logic to select the cluster type
This uses True/False boolean logic based on the variables set in the previous cells

In [9]:
# Pick the cluster factory: PBS is checked first, then Dask Gateway, otherwise a local cluster
if USE_PBS_SCHEDULER:
    cluster_factory = get_pbs_cluster
elif USE_DASK_GATEWAY:
    cluster_factory = get_gateway_cluster
else:
    cluster_factory = get_local_cluster
cluster = cluster_factory()

# Attach a client to the chosen cluster
from distributed import Client
client = Client(cluster)

In [10]:
# Request the workers, block until that many have connected,
# then display the cluster (its repr includes the dashboard URL)
n_workers = 8
cluster.scale(n_workers)
client.wait_for_workers(n_workers)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Workers: 8
Total threads: 8,Total memory: 32.00 GiB

0,1
Comm: tcp://128.117.208.94:44533,Workers: 8
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Total threads: 8
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://128.117.208.173:36975,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/41043/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:40231,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-3reiw4rl,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-3reiw4rl
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 128.77 MiB,Spilled bytes: 0 B
Read bytes: 1.06 GiB,Write bytes: 732.05 MiB

0,1
Comm: tcp://128.117.208.173:41521,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/46261/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:46589,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-e52xkfso,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-e52xkfso
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 124.70 MiB,Spilled bytes: 0 B
Read bytes: 1.24 GiB,Write bytes: 752.14 MiB

0,1
Comm: tcp://128.117.208.173:43773,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44879/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:34687,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-9f9ttkqr,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-9f9ttkqr
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 50.7%,Last seen: Just now
Memory usage: 124.70 MiB,Spilled bytes: 0 B
Read bytes: 1.42 GiB,Write bytes: 0.93 GiB

0,1
Comm: tcp://128.117.208.173:39683,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33203/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:42369,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-t4lqur7t,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-t4lqur7t
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 124.81 MiB,Spilled bytes: 0 B
Read bytes: 1.44 GiB,Write bytes: 777.01 MiB

0,1
Comm: tcp://128.117.208.173:36039,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/42801/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:35931,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-91guto8g,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-91guto8g
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 124.86 MiB,Spilled bytes: 0 B
Read bytes: 1.44 GiB,Write bytes: 777.47 MiB

0,1
Comm: tcp://128.117.208.173:42811,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33751/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:46359,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-4uevg1ck,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-4uevg1ck
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 51.50 MiB,Spilled bytes: 0 B
Read bytes: 2.17 GiB,Write bytes: 1.72 MiB

0,1
Comm: tcp://128.117.208.173:35227,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33959/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:33255,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-hxlwxmnt,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-hxlwxmnt
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 51.54 MiB,Spilled bytes: 0 B
Read bytes: 1.34 MiB,Write bytes: 1.35 GiB

0,1
Comm: tcp://128.117.208.173:37525,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/36907/status,Memory: 4.00 GiB
Nanny: tcp://128.117.208.173:35107,
Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-pm9i8c5e,Local directory: /glade/campaign/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-pm9i8c5e
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 51.50 MiB,Spilled bytes: 0 B
Read bytes: 1.18 MiB,Write bytes: 1.00 GiB


## Section 3: Data Loading
- Load catalog and select data subset

In [11]:
# Open the catalog at `cat_url`; the bare `col` on the last line displays it
col = intake.open_esm_datastore(cat_url)
col

Unnamed: 0,unique
activity_id,18
institution_id,36
source_id,88
experiment_id,170
member_id,657
table_id,37
variable_id,709
grid_label,10
zstore,522217
dcpp_init_year,60


In [12]:
# List every unique experiment_id in the catalog whose name contains 'ssp'
[eid for eid in col.df['experiment_id'].unique() if 'ssp' in eid]

['esm-ssp585-ssp126Lu',
 'ssp126-ssp370Lu',
 'ssp370-ssp126Lu',
 'ssp585',
 'ssp245',
 'ssp370-lowNTCF',
 'ssp370SST-ssp126Lu',
 'ssp370SST',
 'ssp370pdSST',
 'ssp370SST-lowCH4',
 'ssp370SST-lowNTCF',
 'ssp126',
 'ssp119',
 'ssp370',
 'esm-ssp585',
 'ssp245-nat',
 'ssp245-GHG',
 'ssp460',
 'ssp434',
 'ssp534-over',
 'ssp245-aer',
 'ssp245-stratO3',
 'ssp245-cov-fossil',
 'ssp245-cov-modgreen',
 'ssp245-cov-strgreen',
 'ssp245-covid',
 'ssp585-bgc']

In [13]:
query = dict(
    experiment_id=['abrupt-4xCO2','piControl'], # pick the `abrupt-4xCO2` and `piControl` forcing experiments
    table_id='Amon',                            # choose to look at atmospheric variables (A) saved at monthly resolution (mon)
    variable_id=['tas', 'rsut','rsdt','rlut'],  # near-surface air temperature (tas) plus the radiation fluxes (rsut, rsdt, rlut) combined later into the imbalance
    member_id = 'r1i1p1f1',                     # arbitrarily pick one realization for each model (i.e. just one set of initial conditions)
)

# Search the catalog; require_all_on=["source_id"] is intended to keep only models that match every query value
col_subset = col.search(require_all_on=["source_id"], **query)
# Per model, count the distinct experiments, variables and tables that matched
col_subset.df.groupby("source_id")[
    ["experiment_id", "variable_id", "table_id"]
].nunique()

Unnamed: 0_level_0,experiment_id,variable_id,table_id
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACCESS-CM2,2,4,1
ACCESS-ESM1-5,2,4,1
AWI-CM-1-1-MR,2,4,1
BCC-CSM2-MR,2,4,1
BCC-ESM1,2,4,1
CAMS-CSM1-0,2,4,1
CAS-ESM2-0,2,4,1
CESM2,2,4,1
CESM2-FV2,2,4,1
CESM2-WACCM,2,4,1


In [14]:
def drop_all_bounds(ds):
    """Remove bounds coordinates from a dataset.

    Every coordinate whose name contains '_bounds' or '_bnds'
    (e.g. 'time_bounds') is dropped, since such coordinates can lead
    to issues when merging.

    Returns the result of ``ds.drop_vars`` on the matching names.
    """
    bound_markers = ('_bounds', '_bnds')
    bounds_coords = [
        coord_name
        for coord_name in ds.coords
        if any(marker in coord_name for marker in bound_markers)
    ]
    return ds.drop_vars(bounds_coords)

def open_dsets(df):
    """Open every zarr store listed in ``df.zstore`` and merge them.

    Each store is opened with consolidated metadata and stripped of its
    bounds coordinates before merging with ``join='exact'``.

    Returns the merged xarray dataset, or ``None`` when the merge raises
    ``ValueError``.
    """
    opened = []
    for store_url in df.zstore:
        store = fsspec.get_mapper(store_url)
        dataset = xr.open_zarr(store, consolidated=True)
        opened.append(drop_all_bounds(dataset))

    try:
        return xr.merge(opened, join='exact')
    except ValueError:
        # Callers treat None as "this group could not be merged"
        return None

def open_delayed(df):
    """Return a ``dask.delayed`` call of ``open_dsets(df)``.

    Deferring the open lets many datasets be opened in parallel when the
    delayed objects are computed together.
    """
    lazy_open = dask.delayed(open_dsets)
    return lazy_open(df)

In [15]:
from collections import defaultdict

# Nested mapping: dsets[source_id][experiment_id] -> delayed dataset
dsets = defaultdict(dict)
grouped = col_subset.df.groupby(by=['source_id', 'experiment_id'])
for (source_id, experiment_id), df in grouped:
    dsets[source_id][experiment_id] = open_delayed(df)

In [16]:
# Time a single eager (non-delayed) open.
# NOTE: `df` is the loop variable left over from the groupby loop that fills `dsets`,
# i.e. the last group it iterated over, so this cell depends on that loop having run.
%time open_dsets(df)

CPU times: user 2.14 s, sys: 114 ms, total: 2.25 s
Wall time: 11.5 s


Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,69.19 MiB
Shape,"(6000, 192, 288)","(328, 192, 288)"
Dask graph,19 chunks in 2 graph layers,19 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.24 GiB 69.19 MiB Shape (6000, 192, 288) (328, 192, 288) Dask graph 19 chunks in 2 graph layers Data type float32 numpy.ndarray",288  192  6000,

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,69.19 MiB
Shape,"(6000, 192, 288)","(328, 192, 288)"
Dask graph,19 chunks in 2 graph layers,19 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,105.05 MiB
Shape,"(6000, 192, 288)","(498, 192, 288)"
Dask graph,13 chunks in 2 graph layers,13 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.24 GiB 105.05 MiB Shape (6000, 192, 288) (498, 192, 288) Dask graph 13 chunks in 2 graph layers Data type float32 numpy.ndarray",288  192  6000,

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,105.05 MiB
Shape,"(6000, 192, 288)","(498, 192, 288)"
Dask graph,13 chunks in 2 graph layers,13 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,85.85 MiB
Shape,"(6000, 192, 288)","(407, 192, 288)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.24 GiB 85.85 MiB Shape (6000, 192, 288) (407, 192, 288) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",288  192  6000,

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,85.85 MiB
Shape,"(6000, 192, 288)","(407, 192, 288)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,74.88 MiB
Shape,"(6000, 192, 288)","(355, 192, 288)"
Dask graph,17 chunks in 2 graph layers,17 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.24 GiB 74.88 MiB Shape (6000, 192, 288) (355, 192, 288) Dask graph 17 chunks in 2 graph layers Data type float32 numpy.ndarray",288  192  6000,

Unnamed: 0,Array,Chunk
Bytes,1.24 GiB,74.88 MiB
Shape,"(6000, 192, 288)","(355, 192, 288)"
Dask graph,17 chunks in 2 graph layers,17 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
# Evaluate all delayed opens in one dask.compute call; dict(dsets) turns the defaultdict
# into a plain dict, and [0] unpacks the one-element tuple that dask.compute returns
dsets_ = dask.compute(dict(dsets))[0]

## Section 4: Data Analysis
- Reduce data via Global Mean
- Grab some observations ?

In [18]:
def get_lat_name(ds):
    """Return the name of the latitude coordinate of ``ds``.

    'lat' is tried first, then 'latitude'.

    Raises
    ------
    RuntimeError
        If neither name is among ``ds.coords``.
    """
    candidates = ('lat', 'latitude')
    found = next((name for name in candidates if name in ds.coords), None)
    if found is None:
        raise RuntimeError("Couldn't find a latitude coordinate")
    return found

def global_mean(ds):
    """Return the cosine-of-latitude weighted mean of a whole dataset.

    The mean is taken over every dimension of ``ds`` except 'time'.
    """
    latitudes = ds[get_lat_name(ds)]
    area_weight = np.cos(np.deg2rad(latitudes))
    # Normalise in place so that the weights average to one
    area_weight /= area_weight.mean()
    reduce_dims = set(ds.dims).difference({'time'})
    weighted = ds * area_weight
    return weighted.mean(reduce_dims)

In [19]:
expts = ['piControl', 'abrupt-4xCO2']
expt_da = xr.DataArray(expts, dims='experiment_id', coords={'experiment_id': expts})

dsets_aligned = {}

for source_id, by_expt in tqdm(dsets_.items()):
    # Skip any model for which at least one experiment is None
    if any(ds is None for ds in by_expt.values()):
        print(f"Missing experiment for {source_id}")
        continue

    # Add a 'year' coordinate counted from the first time step of each experiment
    for ds in by_expt.values():
        calendar_year = ds.time.dt.year
        ds.coords['year'] = calendar_year - calendar_year[0]

    # workaround for
    # https://github.com/pydata/xarray/issues/2237#issuecomment-620961663
    # Global mean, re-indexed by 'year', then averaged in blocks of 12 time steps
    annual_means = [
        global_mean(by_expt[expt])
        .swap_dims({'time': 'year'})
        .drop_vars('time')
        .coarsen(year=12)
        .mean()
        for expt in expts
    ]

    # align everything with the 4xCO2 experiment (last in `expts`, hence join='right')
    dsets_aligned[source_id] = xr.concat(annual_means, join='right', dim=expt_da)

  0%|          | 0/41 [00:00<?, ?it/s]

Missing experiment for ACCESS-ESM1-5
Missing experiment for CAS-ESM2-0
Missing experiment for EC-Earth3-Veg
Missing experiment for FIO-ESM-2-0
Missing experiment for GFDL-CM4
Missing experiment for MPI-ESM-1-2-HAM


In [20]:
%%time
# Run the annual-mean computations built in the previous cell; [0] unpacks the
# one-element tuple that dask.compute returns
dsets_aligned_ = dask.compute(dsets_aligned)[0]

CPU times: user 3min 22s, sys: 16.7 s, total: 3min 39s
Wall time: 30min 45s


In [21]:
# Stack the per-model datasets along a new 'source_id' dimension
source_ids = list(dsets_aligned_)
source_da = xr.DataArray(source_ids, dims='source_id', coords={'source_id': source_ids})

# Drop non-index coordinates before concatenating
per_model = [ds.reset_coords(drop=True) for ds in dsets_aligned_.values()]
big_ds = xr.concat(per_model, dim=source_da)
big_ds

### Calculate Derived Variables

In [22]:
# Radiative imbalance: rsdt minus rsut minus rlut
big_ds['imbalance'] = big_ds['rsdt'] - big_ds['rsut'] - big_ds['rlut']

# Anomalies relative to the mean over 'year' of the piControl experiment
anom_vars = ['tas', 'imbalance']
ds_mean = big_ds[anom_vars].sel(experiment_id='piControl').mean(dim='year')
ds_anom = big_ds[anom_vars] - ds_mean

# add some metadata
ds_anom.tas.attrs.update({
    'long_name': 'Global Mean Surface Temp Anom',
    'units': 'K',
})
ds_anom.imbalance.attrs.update({
    'long_name': 'Global Mean Radiative Imbalance',
    'units': 'W m$^{-2}$',
})

ds_anom

In [23]:
###############################################################################

In [24]:
# %%time
# dsets_osdf  = col_subset.to_dataset_dict()
# print(f"\nDataset dictionary keys:\n {dsets_osdf.keys()}")

In [25]:
# Close the client before the cluster so no client is left connected
# to a scheduler that has already been shut down
client.close()
cluster.close()