# Access CMIP6 zarr data from AWS using the osdf protocol and plot surface air temperature
- This workflow is inspired by https://gallery.pangeo.io/repos/pangeo-gallery/cmip6/global_mean_surface_temp.html

In [None]:
from matplotlib import pyplot as plt
import xarray as xr
import numpy as np
import dask
from dask.diagnostics import progress
from tqdm.autonotebook import tqdm
import intake
import fsspec
import seaborn as sns
import re
import aiohttp
from dask_jobqueue import PBSCluster

In [33]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import OSDFFileSystem,PelicanMap 

In [34]:
# Scratch directory on GPFS; the cluster cell places Dask spill files and logs under it.
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
# Base URL of the RDA data server and the intake-esm catalog (JSON) hosted on it.
rda_url = 'https://data.rda.ucar.edu/'
cat_url = f'{rda_url}harshah/intake_catalogs/cmip6-aws/cmip6-osdf-zarr.json'

### Spin up cluster

In [35]:
# Create a PBS cluster object.
# Keywords are grouped by purpose: job identity/scheduling, per-job resources,
# networking, and on-disk locations under the scratch directory.
cluster = PBSCluster(
    # --- scheduling ---
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='1:00:00',
    resource_spec='select=1:ncpus=1:mem=4GB',
    # --- resources seen by Dask ---
    cores=1,
    processes=1,
    memory='4GiB',
    # --- networking ('ib0' is the alternative interface tried previously) ---
    interface='ext',
    # --- scratch locations for spilled data and worker logs ---
    local_directory=f'{rda_scratch}/dask/spill',
    log_directory=f'{rda_scratch}/dask/logs',
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43607 instead


In [36]:
# Scale the cluster to 4 workers, then display the cluster summary.
# The recorded output below still shows 0 workers, captured right after the request.
cluster.scale(4)
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43607/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.96:42247,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43607/status,Total threads: 0
Started: Just now,Total memory: 0 B


### Load catalog and select data subset

In [37]:
# Open the intake-esm datastore described by the JSON catalog at cat_url;
# the bare name on the last line displays its summary of unique values.
col = intake.open_esm_datastore(cat_url)
col

Unnamed: 0,unique
activity_id,18
institution_id,36
source_id,88
experiment_id,170
member_id,657
table_id,37
variable_id,709
grid_label,10
zstore,522217
dcpp_init_year,60


In [38]:
# List every experiment_id in the catalog whose name contains 'ssp'.
[eid for eid in col.df['experiment_id'].unique() if 'ssp' in eid]

['esm-ssp585-ssp126Lu',
 'ssp126-ssp370Lu',
 'ssp370-ssp126Lu',
 'ssp585',
 'ssp245',
 'ssp370-lowNTCF',
 'ssp370SST-ssp126Lu',
 'ssp370SST',
 'ssp370pdSST',
 'ssp370SST-lowCH4',
 'ssp370SST-lowNTCF',
 'ssp126',
 'ssp119',
 'ssp370',
 'esm-ssp585',
 'ssp245-nat',
 'ssp245-GHG',
 'ssp460',
 'ssp434',
 'ssp534-over',
 'ssp245-aer',
 'ssp245-stratO3',
 'ssp245-cov-fossil',
 'ssp245-cov-modgreen',
 'ssp245-cov-strgreen',
 'ssp245-covid',
 'ssp585-bgc']

In [47]:
# There is currently a significant amount of data for these runs.
expts = ['historical', 'ssp245', 'ssp370']

# Search criteria passed to the catalog as keyword arguments.
# NOTE(review): the notebook title mentions surface air temperature, but the
# active selection is 'hfls'; ['tas'] is the previously used alternative.
# An 'activity_id' filter (e.g. 'CMIP') is intentionally not applied.
query = {
    'experiment_id': expts,
    'table_id': 'Amon',
    'variable_id': ['hfls'],
    'member_id': 'r1i1p1f1',
}

# require_all_on=['source_id']: presumably keeps only source_ids that match
# every requested value (the summary below shows 3 experiments per source_id).
col_subset = col.search(require_all_on=['source_id'], **query)
col_subset

Unnamed: 0,unique
activity_id,2
institution_id,20
source_id,25
experiment_id,3
member_id,1
table_id,1
variable_id,1
grid_label,3
zstore,75
dcpp_init_year,0


In [48]:
# For each source_id, count the distinct experiment, variable, table and
# activity ids present in the filtered catalog.
col_subset.df.groupby("source_id")[
    ["experiment_id", "variable_id", "table_id","activity_id"]
].nunique()

Unnamed: 0_level_0,experiment_id,variable_id,table_id,activity_id
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACCESS-CM2,3,1,1,2
AWI-CM-1-1-MR,3,1,1,2
BCC-CSM2-MR,3,1,1,2
CAMS-CSM1-0,3,1,1,2
CESM2-WACCM,3,1,1,2
CMCC-CM2-SR5,3,1,1,2
CanESM5,3,1,1,2
EC-Earth3,3,1,1,2
EC-Earth3-Veg,3,1,1,2
EC-Earth3-Veg-LR,3,1,1,2


In [49]:
# Display the DataFrame backing the filtered catalog (zstore holds osdf:// paths).
col_subset.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
0,CMIP,CSIRO-ARCCSS,ACCESS-CM2,historical,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20191108
1,ScenarioMIP,CSIRO-ARCCSS,ACCESS-CM2,ssp370,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20191108
2,ScenarioMIP,CSIRO-ARCCSS,ACCESS-CM2,ssp245,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20191108
3,ScenarioMIP,AWI,AWI-CM-1-1-MR,ssp245,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20190529
4,ScenarioMIP,AWI,AWI-CM-1-1-MR,ssp370,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20190529
...,...,...,...,...,...,...,...,...,...,...,...
70,ScenarioMIP,NCC,NorESM2-MM,ssp370,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20191108
71,ScenarioMIP,NCC,NorESM2-MM,ssp245,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20191108
72,CMIP,AS-RCEC,TaiESM1,historical,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20200623
73,ScenarioMIP,AS-RCEC,TaiESM1,ssp370,r1i1p1f1,Amon,hfls,gn,osdf:///aws-opendata/us-west-2/cmip6-pds/CMIP6...,,20201014


In [50]:
# Load every asset of the filtered catalog into a dict of xarray datasets and
# print the resulting keys.
# NOTE(review): the recorded output shows this call failing with
# ESMDataSourceError for key 'ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp245.Amon.gr1';
# the root cause is not visible here — inspect that key's assets before relying
# on dsets_osdf downstream.
dsets_osdf  = col_subset.to_dataset_dict()
print(f"\nDataset dictionary keys:\n {dsets_osdf.keys()}")


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


ESMDataSourceError: Failed to load dataset with key='ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp245.Amon.gr1'
                 You can use `cat['ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp245.Amon.gr1'].df` to inspect the assets/files for this key.
                 

In [51]:
# Try with a single zarr store, bypassing the intake catalog, to check that a
# store can be opened directly through pelicanfs.
osdf_fs   = OSDFFileSystem(direct_reads = True) # OSDFFileSystem is already aware of the OSDF discovery URL
# Alternative store tried earlier (CESM2 large ensemble, daily FLNS):
# zarr_path = '/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr'
# Note: this path points at the TaiESM1 '1pctCO2' experiment, which is not one
# of the experiments selected in `expts`.
zarr_path = '/aws-opendata/us-west-2/cmip6-pds/CMIP6/CMIP/AS-RCEC/TaiESM1/1pctCO2/r1i1p1f1/Amon/hfls/gn/v20200225/'
#
# Wrap the path and filesystem in a PelicanMap and hand it to xarray's zarr reader.
pel_zarr  = PelicanMap(zarr_path, osdf_fs)
ds_test   = xr.open_zarr(pel_zarr)
ds_test

Unnamed: 0,Array,Chunk
Bytes,3.00 kiB,3.00 kiB
Shape,"(192, 2)","(192, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 3.00 kiB 3.00 kiB Shape (192, 2) (192, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  192,

Unnamed: 0,Array,Chunk
Bytes,3.00 kiB,3.00 kiB
Shape,"(192, 2)","(192, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.50 kiB,4.50 kiB
Shape,"(288, 2)","(288, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.50 kiB 4.50 kiB Shape (288, 2) (288, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  288,

Unnamed: 0,Array,Chunk
Bytes,4.50 kiB,4.50 kiB
Shape,"(288, 2)","(288, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,28.12 kiB,28.12 kiB
Shape,"(1800, 2)","(1800, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 28.12 kiB 28.12 kiB Shape (1800, 2) (1800, 2) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",2  1800,

Unnamed: 0,Array,Chunk
Bytes,28.12 kiB,28.12 kiB
Shape,"(1800, 2)","(1800, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,379.69 MiB,61.59 MiB
Shape,"(1800, 192, 288)","(292, 192, 288)"
Dask graph,7 chunks in 2 graph layers,7 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 379.69 MiB 61.59 MiB Shape (1800, 192, 288) (292, 192, 288) Dask graph 7 chunks in 2 graph layers Data type float32 numpy.ndarray",288  192  1800,

Unnamed: 0,Array,Chunk
Bytes,379.69 MiB,61.59 MiB
Shape,"(1800, 192, 288)","(292, 192, 288)"
Dask graph,7 chunks in 2 graph layers,7 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [44]:
# dsets_ = dask.compute(dict(dsets))[0]

In [15]:
#calculate global means

def get_lat_name(ds):
    """Return the name of the latitude coordinate found in ``ds.coords``.

    The candidates are tried in order: ``'lat'`` first, then ``'latitude'``.

    Raises
    ------
    RuntimeError
        If neither candidate is present in ``ds.coords``.
    """
    candidates = ('lat', 'latitude')
    match = next((name for name in candidates if name in ds.coords), None)
    if match is None:
        raise RuntimeError("Couldn't find a latitude coordinate")
    return match

def global_mean(ds):
    """Average ``ds`` over every dimension except ``'time'``, weighted by cos(latitude).

    The latitude coordinate is located with ``get_lat_name``; its values are
    converted from degrees to radians before taking the cosine. The weights are
    rescaled so that their mean is 1, multiplied into ``ds``, and the product is
    averaged over all non-time dimensions.

    NOTE(review): dividing by ``weight.mean()`` presumably matches a true
    weighted average only when the data has no missing values — confirm for
    fields with NaNs.
    """
    lat_key = get_lat_name(ds)
    weight = np.cos(np.deg2rad(ds[lat_key]))
    # Normalise in place so the weights average to one.
    weight /= weight.mean()
    # Reduce over everything but time, leaving one value per time step.
    reduce_over = set(ds.dims) - {'time'}
    weighted = ds * weight
    return weighted.mean(reduce_over)