# NA CORDEX diagnostic plots

- This notebook is adapted from the NA CORDEX notebook on AWS
- http://ncar-aws-www.s3-website-us-west-2.amazonaws.com/plot-zarr-diagnostics.html

### Input Data Access

- This notebook loads NA-CORDEX data from NCAR's RDA through an intake catalog. The later cells (not yet executed) illustrate how to compute surface ocean heat content using potential temperature data from the CESM2 Large Ensemble Dataset (https://www.cesm.ucar.edu/community-projects/lens2) hosted on NCAR's GLADE storage.
- This data is open access and is accessed via OSDF

In [1]:
# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import intake
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
import re
# import nest_asyncio
# nest_asyncio.apply()
# import xesmf as xe
import matplotlib.pyplot as plt

In [2]:
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 
# import cf_units as cf

In [3]:
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

In [4]:
# Year boundaries, stored as strings.
# NOTE(review): presumably the first/last year of an initial period (1991-2020)
# and of a final period (2071-2100); none of these names is used in the cells
# visible here - confirm the intended use.
init_year0  = '1991'
init_year1  = '2020'
final_year0 = '2071'
final_year1 = '2100'

In [5]:
def to_daily(ds):
    """Split the ``time`` dimension of ``ds`` into ``year`` and ``day`` dimensions.

    Each time step is labelled with its calendar year and its day-of-year;
    ``time`` is then re-indexed by that (year, day) pair and unstacked.

    Parameters
    ----------
    ds : xarray object with a datetime-like ``time`` coordinate
        (anything exposing ``ds.time.dt``).

    Returns
    -------
    The same data with ``time`` replaced by the two dimensions ``year`` and ``day``.
    """
    time_parts = ds.time.dt

    # Attach both labels along the existing time axis.
    labelled = ds.assign_coords(
        year=("time", time_parts.year.data),
        day=("time", time_parts.dayofyear.data),
    )

    # Index time by (year, day), then unstack that index into two dimensions.
    indexed = labelled.set_index(time=("year", "day"))
    return indexed.unstack("time")

In [6]:
# Scratch area used for dask spill/log files.
# Previous location: '/gpfs/csfs1/collections/rda/scratch/harshah'
rda_scratch = '/glade/campaign/collections/rda/scratch/harshah'

# Base URL of the RDA data server and the NA-CORDEX intake catalog hosted on it.
rda_url = 'https://data.rda.ucar.edu/'
cat_url = f"{rda_url}harshah/intake_catalogs/osdf/na-cordex/na-cordex.json"
print(cat_url)

https://data.rda.ucar.edu/harshah/intake_catalogs/osdf/na-cordex/na-cordex.json


## Create a PBS cluster

In [7]:
# Create a PBS cluster object.
# Each job asks PBS for 1 CPU and 8GB (resource_spec) on the 'casper' queue
# with a 5-hour walltime; dask is configured with 1 core, 1 process and an
# 8GiB memory setting per job.
# Spill files and worker logs are written under `rda_scratch`.
# No workers are requested here; that happens when the cluster is scaled.
cluster = PBSCluster(
    job_name = 'dask-wk24-hpc',
    cores = 1,
    memory = '8GiB',
    processes = 1,
    local_directory = rda_scratch+'/dask/spill',
    log_directory = rda_scratch + '/dask/logs/',
    resource_spec = 'select=1:ncpus=1:mem=8GB',
    queue = 'casper',
    walltime = '5:00:00',
    # Alternative network interface that was used previously:
    #interface = 'ib0'
    interface = 'ext'
)

In [8]:
cluster.scale(5)

In [9]:
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/39741/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.97:35679,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/39741/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Load NA CORDEX data from RDA using an intake catalog

In [10]:
# Open the NA-CORDEX intake-esm catalog located at `cat_url`.
#
# Disabled alternative: have the catalog interpret the "na-cordex-models"
# column as a list of values. It needs `import ast`, which this notebook does
# not import.
# col = intake.open_esm_datastore(cat_url, csv_kwargs={"converters": {"na-cordex-models": ast.literal_eval}},)
# col
#
col = intake.open_esm_datastore(cat_url)
col

Unnamed: 0,unique
Unnamed: 0,330
variable,15
standard_name,10
long_name,18
units,10
spatial_domain,1
grid,2
spatial_resolution,2
scenario,6
start_time,3


In [11]:
# # Produce a catalog content summary.
# import pprint

# uniques = col.unique(
#     columns=["variable", "scenario", "grid", "na-cordex-models", "bias_correction"]
# )
# pprint.pprint(uniques, compact=True, indent=4)

### Load data into xarray

In [12]:
# Variable to analyse.
data_var = 'tmax'

# Narrow the catalog to that variable on the NAM-44i grid, for the "eval"
# scenario and the "raw" bias-correction setting.
col_subset = col.search(
    **{
        "variable": data_var,
        "grid": "NAM-44i",
        "scenario": "eval",
        "bias_correction": "raw",
    }
)

col_subset

Unnamed: 0,unique
Unnamed: 0,1
variable,1
standard_name,1
long_name,1
units,1
spatial_domain,1
grid,1
spatial_resolution,1
scenario,1
start_time,1


In [13]:
col_subset.df['path'].values

array(['osdf:///ncar/rda/d316009/day/tmax.eval.day.NAM-44i.raw.zarr'],
      dtype=object)

In [14]:
# Load catalog entries for subset into a dictionary of xarray datasets, and open the first one.
#
# - `xarray_open_kwargs` is the current intake-esm keyword; `zarr_kwargs` is deprecated.
# - The previous `storage_options={"anon": True}` was inherited from the AWS (S3)
#   version of this notebook. The assets here are `osdf://` paths, and `anon` is an
#   S3 option, so it is no longer passed.
#   NOTE(review): this is the likely cause of the ESMDataSourceError recorded for
#   this cell; the traceback is not shown, so confirm by re-running.
dsets = col_subset.to_dataset_dict(xarray_open_kwargs={"consolidated": True})
print(f"\nDataset dictionary keys:\n {dsets.keys()}")

# Load the first dataset and display a summary.
dataset_key = list(dsets.keys())[0]
store_name = dataset_key + ".zarr"

ds = dsets[dataset_key]
ds

# Note that the summary includes a 'member_id' coordinate, which is a renaming of the 
# 'na-cordex-models' column in the catalog.


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.frequency.scenario.grid.bias_correction'


ESMDataSourceError: Failed to load dataset with key='tmax.day.eval.NAM-44i.raw'
                 You can use `cat['tmax.day.eval.NAM-44i.raw'].df` to inspect the assets/files for this key.
                 

In [17]:
col_subset['tmax.day.eval.NAM-44i.raw'].df

Unnamed: 0.1,Unnamed: 0,variable,standard_name,long_name,units,spatial_domain,grid,spatial_resolution,scenario,start_time,end_time,frequency,vertical_levels,bias_correction,na-cordex-models,path,_data_format_
0,223,tmax,air_temperature,Daily Maximum Near-Surface Air Temperature,degC,north_america,NAM-44i,0.50 deg,eval,1979-01-01T12:00:00,2015-12-31T12:00:00,day,1,raw,"['ERA-Int.CRCM5-UQAM', 'ERA-Int.RegCM4', 'ERA-...",osdf:///ncar/rda/d316009/day/tmax.eval.day.NAM...,zarr


In [18]:
col_subset['tmax.day.eval.NAM-44i.raw'].df['path'].values

array(['osdf:///ncar/rda/d316009/day/tmax.eval.day.NAM-44i.raw.zarr'],
      dtype=object)

In [None]:
##################################################################

In [None]:
# Search the CESM catalog for monthly TEMP entries.
# NOTE(review): `cesm_cat` is not defined in any cell visible in this notebook
# (only the NA-CORDEX catalog `col` is opened), so this cell raises NameError
# on a fresh kernel until a CESM catalog is opened first.
cesm_temp = cesm_cat.search(variable ='TEMP', frequency ='monthly')
cesm_temp

In [None]:
cesm_temp.df['path'].values

In [None]:
dsets_cesm = cesm_temp.to_dataset_dict()

In [None]:
cesm_temp.keys()

In [None]:
historical       = dsets_cesm['ocn.historical.monthly.cmip6']
future_smbb      = dsets_cesm['ocn.ssp370.monthly.smbb']
future_cmip6     = dsets_cesm['ocn.ssp370.monthly.cmip6']

In [None]:
# %%time
# merge_ds_cmip6 = xr.concat([historical, future_cmip6], dim='time')
# merge_ds_cmip6 = merge_ds_cmip6.dropna(dim='member_id')

In [None]:
historical

#### Change units

In [None]:
# Parse the units attribute of the depth coordinate z_t.
# NOTE(review): `cf` is presumably `cf_units`; the import cell only contains
# the commented-out line `# import cf_units as cf`, so `cf` is undefined on a
# fresh kernel.
orig_units = cf.Unit(historical.z_t.attrs['units'])
orig_units

In [None]:
def change_units(ds, variable_str, variable_bounds_str, target_unit_str):
    """Convert one variable of ``ds`` to another unit.

    Parameters
    ----------
    ds : dataset-like object indexable by variable name.
    variable_str : name of the variable whose ``attrs['units']`` gives the source unit.
    variable_bounds_str : name of the variable whose values are converted
        (callers in this notebook pass the same name as ``variable_str``).
    target_unit_str : unit string to convert to, e.g. ``'m'`` or ``'degK'``.

    Returns
    -------
    The converted values as returned by ``xr.apply_ufunc``. The ``units``
    attribute is not updated by this function.

    Notes
    -----
    Relies on the module-level names ``cf`` and ``xr``.
    NOTE(review): ``cf`` is presumably ``cf_units``, whose import is commented
    out in this notebook - confirm it is imported before calling.
    """
    source_unit = cf.Unit(ds[variable_str].attrs['units'])
    destination_unit = cf.Unit(target_unit_str)
    values = ds[variable_bounds_str]

    # Apply the conversion through apply_ufunc, keeping the input dtype.
    return xr.apply_ufunc(
        source_unit.convert,
        values,
        destination_unit,
        dask='parallelized',
        output_dtypes=[values.dtype],
    )

In [None]:
historical['z_t']

In [None]:
# Convert the depth levels to metres and the temperatures to kelvin.
depth_levels_in_m = change_units(historical, 'z_t', 'z_t', 'm')
hist_temp_in_degK = change_units(historical, 'TEMP', 'TEMP', 'degK')
fut_cmip6_temp_in_degK = change_units(future_cmip6, 'TEMP', 'TEMP', 'degK')
fut_smbb_temp_in_degK = change_units(future_smbb, 'TEMP', 'TEMP', 'degK')
#
# Re-attach the depth coordinate to the converted temperature.
# `depth_levels_in_m['z_t']` is the *coordinate* of the converted array, which
# still carries the original (cm) level values; the metre values live in
# `depth_levels_in_m.data`. The later cells select the top 100 m with
# `z_t < 1e4`, i.e. they rely on this coordinate being in cm.
# The units attribute is therefore set to "cm" (it was previously set to "m",
# which mislabelled the values).
hist_temp_in_degK  = hist_temp_in_degK.assign_coords(z_t=("z_t", depth_levels_in_m['z_t'].data))
hist_temp_in_degK["z_t"].attrs["units"] = "cm"
hist_temp_in_degK

In [None]:
depth_levels_in_m.isel(z_t=slice(0, -1))

In [None]:
# Spacing between consecutive z_t levels: (level k+1) minus (level k), computed
# on the plain values and wrapped in an xarray DataArray whose z_t coordinate is
# the shallower level of each pair (the last level has no delta).
depth_level_deltas = xr.DataArray(
    depth_levels_in_m.isel(z_t=slice(1, None)).values
    - depth_levels_in_m.isel(z_t=slice(0, -1)).values,
    dims=["z_t"],
    coords={"z_t": depth_levels_in_m.z_t.isel(z_t=slice(0, -1))},
)
depth_level_deltas

# Compute Ocean Heat content for ocean surface
- Ocean surface is considered to be the top 100m
- The formula for this is: $$ H = \rho C \int_0^z T(z) dz $$


where $H$ is the ocean heat content per unit area (in $\mathrm{J/m^2}$), the value we are trying to calculate,

$\rho$ is the density of sea water, $1026\ \mathrm{kg/m^3}$,

$C$ is the specific heat of sea water, $3990\ \mathrm{J/(kg\,K)}$,

$z$ is the depth limit of the calculation in meters,

and $T(z)$ is the temperature at each depth in kelvin.

In [None]:
def calc_ocean_heat(delta_level, temperature, rho=1026, c_p=3990):
    """Depth-integrate temperature to obtain ocean heat content.

    Computes ``rho * c_p * sum_over_z_t(delta_level * temperature)``, a
    discrete form of ``H = rho * C * integral(T dz)``.

    Parameters
    ----------
    delta_level : thickness of each depth level along ``z_t``.
    temperature : temperature on the same ``z_t`` levels.
    rho : sea-water density in kg/m^3 (default 1026).
    c_p : specific heat of sea water in J/(kg K) (default 3990).

    Returns
    -------
    The heat content with the ``z_t`` dimension summed out. Assuming
    ``delta_level`` is in metres and ``temperature`` in kelvin, the result is
    in J/m^2.
    """
    # Weight each level's temperature by the thickness of that level.
    weighted_temperature = delta_level * temperature
    # Multiplication order is kept as (sum * rho) * c_p so floating-point
    # results match the earlier hard-coded version exactly.
    heat = weighted_temperature.sum(dim="z_t") * rho * c_p
    return heat

In [None]:
# Keep only the temperature levels in the top 100 m of the ocean.
# Remember that the coordinate z_t still has values in cm, so the 100 m limit
# is written as 1e4 (cm). Levels at or below that depth are dropped.
hist_temp_ocean_surface = hist_temp_in_degK.where(hist_temp_in_degK['z_t'] < 1e4,drop=True)
hist_temp_ocean_surface

In [None]:
# Apply the same z_t < 1e4 selection to the level thicknesses so that they
# cover the same levels as the surface temperature subset.
depth_level_deltas_surface = depth_level_deltas.where(depth_level_deltas['z_t'] <1e4, drop= True)
depth_level_deltas_surface

In [None]:
# Heat content of the surface layer for the historical run: the surface-level
# thicknesses and temperatures are combined and summed over z_t.
hist_ocean_heat = calc_ocean_heat(depth_level_deltas_surface,hist_temp_ocean_surface)
hist_ocean_heat