This notebook calculates the correlation between each selected ERA5 variable and the NAM onset date, and saves each correlation field to its own NetCDF file.

In [1]:
# import functions
# OS interaction and time
import os
import sys
import cftime
import datetime
import time
import glob
import dask
import dask.bag as db
import calendar

# math and data
import numpy as np
import netCDF4 as nc
import xarray as xr
import scipy as sp
import pandas as pd
import pickle as pickle
from sklearn import linear_model
import matplotlib.patches as mpatches
from shapely.geometry.polygon import LinearRing
import statsmodels.stats.multitest as multitest

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.colors as mcolors
from matplotlib.gridspec import GridSpec

from matplotlib.ticker import FormatStrFormatter
from mpl_toolkits.axes_grid1.axes_divider import HBoxDivider
import mpl_toolkits.axes_grid1.axes_size as Size
from mpl_toolkits.axes_grid1 import make_axes_locatable

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.util import add_cyclic_point

# random
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

In [2]:
# specify directories
# All paths are absolute and end with a trailing slash.  In the cells shown in
# this notebook only my_era5_path and corr_out_path are read (by
# calc_onset_correlation); the remaining paths are defined but not referenced here.
my_era5_path = '/glade/u/home/zcleveland/scratch/ERA5/'  # root of the subset ERA5 data (contains the 'dsw/' and 'cp/' folders read below)
cp_in_path = '/glade/u/home/zcleveland/scratch/ERA5/cp/'  # path to subset CP data (same location as my_era5_path + 'cp/')
corr_out_path = '/glade/u/home/zcleveland/scratch/ERA5/correlations/'  # root folder where correlation NetCDF files are written
sub_script_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/scripts/subsetting/'  # path to subsetting scripts
plot_script_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/scripts/plotting/'  # path to plotting scripts
fig_out_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/plots/'  # path to generated figures
temp_scratch_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/temp/'  # path to temp directory -- NOTE(review): this is under ERA5_analysis/, not under scratch/ as the name suggests; confirm

In [3]:
# define list of variables
# Every uncommented entry is passed as `var` to calc_onset_correlation by the
# driver loops further down; comment/uncomment entries to choose which
# correlations get computed on the next run.
# The trailing tags look like shorthand for the field type
# ('accumu' = accumulated, 'instan' = instantaneous) -- presumably following the
# ERA5 variable documentation; confirm before relying on them.
var_list = [
    # 'lsp',  # large scale precipitation (m of water) - accumu
    # 'cp',  # convective precipitation (m of water) - accumu
    # 'tp',  # total precipitation (m of water) - accumu -- DERIVED
    # 'sd',  # snow depth  (m of water equivalent) - instan
    # 'msl',  # mean sea level pressure (Pa) - instan
    # 'tcc',  # total cloud cover (0-1) - instan
    # 'stl1',  # soil temp layer 1 (K) - instan
    # 'stl2',  # soil temp layer 2 (K) - instan
    # 'stl3',  # soil temp layer 3 (K) - instan
    # 'stl4',  # soil temp layer 4 (K) - instan
    # 'swvl1',  # soil volume water content layer 1 (m^3 m^-3) - instan
    # 'swvl2',  # soil volume water content layer 2 (m^3 m^-3) - instan
    # 'swvl3',  # soil volume water content layer 3 (m^3 m^-3) - instan
    # 'swvl4',  # soil volume water content layer 4 (m^3 m^-3) - instan
    # '2t',  # 2 meter temp (K) - instan
    # '2d',  # 2 meter dew point (K) - instan
    # 'ishf',  # instant surface heat flux (W m^-2) - instan
    # 'ie',  # instant moisture flux (kg m^-2 s^-1) - instan
    # 'sshf',  # surface sensible heat flux (J m^-2) - accumu
    # 'slhf',  # surface latent heat flux (J m^-2) - accumu
    # 'ssr',  # surface net solar radiation (J m^-2) - accumu
    # 'str',  # surface net thermal radiation (J m^-2) - accumu
    # 'sro',  # surface runoff (m) - accumu
    # 'sf',  # total snowfall (m of water equivalent) - accumu
    # 'cape',  # convective available potential energy (J kg^-1) - instan
    # 'tcw',  # total column water (kg m^-2) - sfc (sum total of solid, liquid, and vapor in a column)
    'ssrd',  # surface solar radiation downwards (J m^-2) - accumu
    'strd',  # surface thermal radiation downwards (J m^-2) - accumu
]

In [4]:
# define a function to calculate the correlation between
# any given parameter and the NAM onset date
def calc_onset_correlation(var='swvl1',months=345, cp_flag=False):
    """Correlate a seasonal aggregate of ``var`` with the NAM onset day of year.

    The variable is reduced to one value per year over the requested months,
    correlated with the onset day of year along the ``year`` dimension, and the
    result is written to
    ``<corr_out_path>/<region>/corr_<var>_onset_<months>_<region>.nc``.
    If that file already exists, a message is printed and nothing is computed.

    Parameters
    ----------
    var : str
        Short variable name as it appears in the input file names
        (e.g. ``'swvl1'``).  The data variable is looked up by ``var.upper()``.
    months : int
        Months to aggregate over, written as one digit per month
        (e.g. ``345`` -> March, April, May -> ``'MAM'``).  Because every digit
        is read as a separate month, October-December cannot be expressed and
        a ``0`` digit raises ``IndexError``.
    cp_flag : bool
        If True, inputs are read from ``<my_era5_path>/cp/`` and outputs go to
        the ``cp`` sub-folder; otherwise inputs are read from one directory
        level below ``<my_era5_path>/dsw/`` and outputs go to ``dsw``.

    Returns
    -------
    None
        The correlation is saved to disk, not returned.

    Raises
    ------
    FileNotFoundError
        If no input file matches ``var`` (previously surfaced as xarray's
        generic ``OSError``; ``FileNotFoundError`` is a subclass of it).
    IndexError
        If the opened files hold no data variable whose name contains
        ``var.upper()``.
    """

    # region name, used for both the input folder and the output folder/file name
    var_region = 'cp' if cp_flag else 'dsw'

    # create list of months over which to average
    var_months_list = [int(m) for m in str(months)]  # turn var integer into list (e.g. 678 -> [6,7,8])
    # make string for month letters from var_range (e.g. [6,7,8] -> 'JJA')
    var_months = ''.join([calendar.month_name[m][0] for m in var_months_list])

    # path of the NetCDF file the correlation is saved to
    out_fn = f'corr_{var}_onset_{var_months}_{var_region}.nc'
    out_dir = os.path.join(corr_out_path, var_region)
    out_fp = os.path.join(out_dir, out_fn)

    # check existence of file already
    if os.path.exists(out_fp):
        print(f'File already exists for: {out_fn}')
        print('\nSkipping . . .')
        return

    # locate the input files for this variable
    # NOTE(review): the pattern is a plain substring match, so e.g. var='ssr'
    # would also pick up 'ssrd' files if both live in the same folder -- confirm
    # against the actual file naming.
    if cp_flag:
        var_pattern = f'{my_era5_path}{var_region}/*{var}*.nc'
    else:
        var_pattern = f'{my_era5_path}{var_region}/*/*{var}*.nc'
    var_files = sorted(glob.glob(var_pattern))  # sorted only to make the open order deterministic
    if not var_files:
        raise FileNotFoundError(f'No input files found for {var} matching: {var_pattern}')

    onset_fp = os.path.join(my_era5_path, 'dsw/NAM_onset.nc')

    # Both datasets are opened as context managers so their file handles are
    # released even if something below fails.  The multi-file dataset is lazy,
    # so everything up to and including the write must happen inside the block.
    with xr.open_dataset(onset_fp) as onset_ds, xr.open_mfdataset(var_files) as var_ds:
        # onset day of year, indexed by integer year so it lines up with the
        # 'year' coordinate produced by groupby('time.year') below
        onset_ds['year'] = onset_ds['year'].dt.year # convert to only year.  e.g. 2012-01-01 -> 2012
        onset_da = onset_ds['date']
        onset_data = onset_da.dt.dayofyear.astype('float32')

        # pull out actual variable name in the dataset since they can be different names/capitalized
        matching_names = [v for v in var_ds.data_vars.keys() if f'{var.upper()}' in v]
        if not matching_names:
            raise IndexError(
                f'No data variable containing "{var.upper()}" found for {var}; '
                f'available: {list(var_ds.data_vars)}'
            )
        var_name = matching_names[0]
        var_da = var_ds[var_name]

        # reduce to one value per year over the selected months:
        # variables with 'AVG' in their name are averaged, everything else is summed
        use_mean = 'AVG' in var_name
        monthly_bins = var_da.resample(time='1M')
        monthly = monthly_bins.mean() if use_mean else monthly_bins.sum()
        monthly_sel = monthly.sel(time=monthly['time.month'].isin(var_months_list))
        yearly_groups = monthly_sel.groupby('time.year')
        if use_mean:
            var_data = yearly_groups.mean(dim='time')
        else:
            var_data = yearly_groups.sum(dim='time')

        # calculate correlation
        var_corr = xr.corr(onset_data, var_data, dim='year')

        # make sure the region folder exists so the write cannot fail on a
        # missing directory after the computation has been set up
        os.makedirs(out_dir, exist_ok=True)

        # save correlation as netcdf file
        var_corr.to_netcdf(out_fp)


In [62]:
# calculate correlations for dsw
# Runs every entry of var_list through calc_onset_correlation with
# cp_flag=False (inputs and outputs under the 'dsw' folders), aggregating over
# months=345, i.e. March, April and May.  Variables whose output file already
# exists are skipped inside the function, so this cell is safe to re-run.
for var in var_list:
    calc_onset_correlation(var=var, months=345, cp_flag=False)

  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))
  return func(*(_execute_task(a, cache) for a in args))


In [5]:
# calculate correlations for cp
# Same as the dsw run, but with cp_flag=True so the variable files are read
# from the 'cp' folder and results are written to the 'cp' output folder.
# Still uses months=345 (March, April, May); existing output files are skipped.
for var in var_list:
    calc_onset_correlation(var=var, months=345, cp_flag=True)