In [1]:
# import functions
# OS interaction and time
import os
import sys
import cftime
import datetime
import time
import glob
import dask
import dask.bag as db
import calendar

# math and data
import numpy as np
import netCDF4 as nc
import xarray as xr
import scipy as sp
import scipy.linalg
from scipy.signal import detrend
import pandas as pd
import pickle as pickle
from sklearn import linear_model
import matplotlib.patches as mpatches
from shapely.geometry.polygon import LinearRing
import statsmodels.stats.multitest as multitest

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.colors as mcolors
from matplotlib.gridspec import GridSpec
import matplotlib.image as mpimg

from matplotlib.ticker import FormatStrFormatter
from mpl_toolkits.axes_grid1.axes_divider import HBoxDivider
import mpl_toolkits.axes_grid1.axes_size as Size
from mpl_toolkits.axes_grid1 import make_axes_locatable

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.util import add_cyclic_point

# random
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

In [2]:
# Directory configuration.
# Every path ends with a trailing slash: the helpers below build glob patterns
# by plain string formatting (e.g. f'{my_era5_path}dsw/...'), not os.path.join.
my_era5_path = '/glade/u/home/zcleveland/scratch/ERA5/'  # root of the subset ERA5 data (get_var_files globs beneath it)
misc_data_path = '/glade/u/home/zcleveland/scratch/misc_data/'  # root of the misc (non-ERA5) data, one sub-directory per variable
sub_script_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/scripts/subsetting/'  # path to subsetting scripts
plot_script_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/scripts/plotting/'  # path to plotting scripts
plot_out_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/plots/'  # path to generated plots
temp_scratch_path = '/glade/u/home/zcleveland/NAM_soil-moisture/ERA5_analysis/temp/'  # path to temp directory -- NOTE(review): this lives under home, not scratch; confirm whether it is a link into scratch

In [3]:
# Variable lists
# These lists drive file discovery (get_var_files) and subsetting
# (subset_var_data); membership decides which branch a variable takes.
# All entries are lower case.

# surface instantaneous variables
sfc_instan_list = [
    'sd',  # snow depth  (m of water equivalent)
    'msl',  # mean sea level pressure (Pa)
    'tcc',  # total cloud cover (0-1)
    'stl1',  # soil temp layer 1 (K)
    'stl2',  # soil temp layer 2 (K)
    'stl3',  # soil temp layer 3 (K)
    'stl4',  # soil temp layer 4 (K)
    'swvl1',  # soil volume water content layer 1 (m^3 m^-3)
    'swvl2',  # soil volume water content layer 2 (m^3 m^-3)
    'swvl3',  # soil volume water content layer 3 (m^3 m^-3)
    'swvl4',  # soil volume water content layer 4 (m^3 m^-3)
    '2t',  # 2 meter temp (K)
    '2d',  # 2 meter dew point (K)
    'ishf',  # instant surface heat flux (W m^-2)
    'ie',  # instant moisture flux (kg m^-2 s^-1)
    'cape',  # convective available potential energy (J kg^-1)
    'tcw',  # total column water (kg m^-2) -- sum total of solid, liquid, and vapor in a column
    'sstk',  # sea surface temperature (K)
    'vipile',  # vertical integral of potential, internal, and latent energy (J m^-2)
    'viwve',  # vertical integral of eastward water vapour flux (kg m^-1 s^-1) - positive west -> east
    'viwvn',  # vertical integral of northward water vapour flux (kg m^-1 s^-1) - positive south -> north
    'viwvd',  # vertical integral of divergence of moisture flux (kg m^-2 s^-1) - positive divergence
    'z_thick_1000-500',  # geopotential height thickness (m) - difference between two height levels
]

# surface accumulation variables
# (subset_var_data sums these over time instead of averaging them)
sfc_accumu_list = [
    'lsp',  # large scale precipitation (m of water)
    'cp',  # convective precipitation (m of water)
    'tp',  # total precipitation (m of water) -- DERIVED
    'sshf',  # surface sensible heat flux (J m^-2)
    'slhf',  # surface latent heat flux (J m^-2)
    'ssr',  # surface net solar radiation (J m^-2)
    'str',  # surface net thermal radiation (J m^-2)
    'sro',  # surface runoff (m)
    'sf',  # total snowfall (m of water equivalent)
    'ssrd',  # surface solar radiation downwards (J m^-2)
    'strd',  # surface thermal radiation downwards (J m^-2)
    'ttr',  # top net thermal radiation (OLR, J m^-2) -- divide by time (s) for W m^-2
]

# pressure level variables
pl_var_list = [
    # 'pv',  # potential vorticity (K m^2 kg^-1 s^-1)
    # 'crwc',  # specific rain water content (kg kg^-1)
    # 'cswc',  # specific snow water content (kg kg^-1)
    'z',  # geopotential (m^2 s^-2)
    'z_height',  # geopotential height (m)
    't',  # temperature (K)
    'u',  # u component of wind (m s^-1)
    'v',  # v component of wind (m s^-1)
    'q',  # specific humidity (kg kg^-1)
    'w',  # vertical velocity (Pa s^-1)
    # 'vo',  # vorticity - relative (s^-1)
    # 'd',  # divergence (s^-1)
    'r',  # relative humidity (%)
    # 'clwc',  # specific cloud liquid water content
    # 'ciwc',  # specific cloud ice water content
    # 'cc',  # fraction of cloud cover (0-1)
]

# invariant data
invar_var_list = [
    'cl',  # lake cover (0-1)
    'dl',  # lake depth (m)
    'cvl',  # low vegetation cover (0-1)
    'cvh',  # high vegetation cover (0-1)
    'tvl',  # type of low vegetation ~
    'tvh',  # type of high vegetation ~
    'slt',  # soil type ~
    'sdfor',  # standard deviation of filtered subgrid orography (m)
    'z_sfc',  # geopotential of surface (m^2 s^-2)
    'sdor',  # standard deviation of orography ~
    'isor',  # anisotropy of subgridscale orography ~
    'anor',  # angle of subgridscale orography (radians)
    'slor',  # slope of subgridscale orography ~
    'lsm',  # land-sea mask (0-1)
    'elevation',  # elevation of terrain (m)
]

# NAM variables
# ('onset' and 'retreat' are converted to day-of-year by subset_var_data)
NAM_var_list = [
    'onset',
    'retreat',
    'length',
    'precipitation',
    'precipitation-rate'
]

# surface + pressure-level variables in one list
# (invariant, NAM, and misc variables are NOT included)
var_list = sfc_instan_list + sfc_accumu_list + pl_var_list

# region average list
# NOTE: 'cp' here means the Colorado Plateau region; the same string is also
# the convective-precipitation variable in sfc_accumu_list.
# NOTE(review): 'baja' has entries in the region dictionaries but is not listed
# here, so subset_var_data never computes a regional mean for it -- confirm intended.
region_avg_list = [
    'cp',
    'mr',
    'son',
    'chi',
    'moj',
    'MeNmAz',
]

# variables that are fluxes and need to be multiplied by -1 for easier understanding
flux_var_list = [
    'sshf',  # surface sensible heat flux (J m^-2)
    'slhf',  # surface latent heat flux (J m^-2)
    'ttr',  # top net thermal radiation (OLR, J m^-2) -- divide by time (s) for W m^-2
    'ishf',  # instant surface heat flux (W m^-2)
    'ie',  # instant moisture flux (kg m^-2 s^-1)
    'str',  # surface net thermal radiation (J m^-2)
]

# misc variables
misc_var_list = [
    'nino-3',
]

In [4]:
# Variable dictionaries

# dictionary of variables and their display names (used for plot titles/labels)
# NOTE(review): the thickness variable is keyed 'z_thick' here but listed as
# 'z_thick_1000-500' in sfc_instan_list -- confirm how callers look it up.
var_dict = {
    'sd': 'Snow Depth',
    'msl': 'Mean Sea Level Pressure',
    'tcc': 'Total Cloud Cover',
    'stl1': 'Soil Temp Layer 1',
    'stl2': 'Soil Temp Layer 2',
    'stl3': 'Soil Temp Layer 3',
    'stl4': 'Soil Temp Layer 4',
    'swvl1': 'Soil Volume Water Content Layer 1',
    'swvl2': 'Soil Volume Water Content Layer 2',
    'swvl3': 'Soil Volume Water Content Layer 3',
    'swvl4': 'Soil Volume Water Content Layer 4',
    '2t': '2 Meter Temp',
    '2d': '2 Meter Dew Point',
    'ishf': 'Instant Surface Heat Flux',
    'ie': 'Instant Moisture Flux',
    'cape': 'Convective Available Potential Energy',
    'tcw': 'Total Column Water',
    'sstk': 'Sea Surface Temperature',
    'vipile': 'vertical integral of potential, internal, and latent energy',
    'viwve': 'vertical integral of eastward water vapour flux',
    'viwvn': 'vertical integral of northward water vapour flux',
    'viwvd': 'vertical integral of divergence of moisture flux',
    'lsp': 'Large Scale Precipitation',
    'cp': 'Convective Precipitation',
    'tp': 'Total Precipitation',
    'sshf': 'Surface Sensible Heat Flux',
    'slhf': 'Surface Latent Heat Flux',
    'ssr': 'Surface Net Solar Radiation',
    'str': 'Surface Net Thermal Radiation',
    'sro': 'Surface Runoff',
    'sf': 'Total Snowfall',
    'ssrd': 'Surface Solar Radiation Downwards',
    'strd': 'Surface Thermal Radiation Downwards',
    'ttr': 'Top Net Thermal Radiation (OLR)',
    'z': 'Geopotential',
    'z_height': 'Geopotential Height',
    'z_thick': 'Geopotential Height Thickness',
    't': 'Temperature',
    'u': 'U Component of Wind',
    'v': 'V Component of Wind',
    'q': 'Specific Humidity',
    'w': 'Vertical Velocity',
    'r': 'Relative Humidity',
    'onset': 'NAM Onset',
    'retreat': 'NAM Retreat',
    'length': 'NAM Length',
    'precipitation': 'Yearly NAM Season Precipitation',
    'precipitation-rate': 'NAM Precipitation Rate',
    # the mathtext accent command is \tilde; '\tilda' is not a known command
    # and makes matplotlib fail when it parses the label
    'nino-3': r'Ni$\tilde{n}$o-3 Index',
}

# variable units in latex (matplotlib mathtext) format for plotting
# Anything that needs a superscript or an accent must sit inside $...$,
# otherwise matplotlib prints the raw '^{-1}' text.
var_units = {
    'sd': r'(m)',
    'msl': r'(Pa)',
    'tcc': r'(0-1)',
    'stl1': r'(K)',
    'stl2': r'(K)',
    'stl3': r'(K)',
    'stl4': r'(K)',
    'swvl1': r'$(m^3 m^{-3})$',
    'swvl2': r'$(m^3 m^{-3})$',
    'swvl3': r'$(m^3 m^{-3})$',
    'swvl4': r'$(m^3 m^{-3})$',
    '2t': r'(K)',
    '2d': r'(K)',
    'ishf': r'$(W m^{-2})$',
    'ie': r'$(kg m^{-2} s^{-1})$',
    'cape': r'$(J kg^{-1})$',
    'tcw': r'$(kg m^{-2})$',
    'sstk': r'(K)',
    'vipile': r'$(J m^{-2})$',
    'viwve': r'$(kg m^{-1} s^{-1})$',
    'viwvn': r'$(kg m^{-1} s^{-1})$',
    'viwvd': r'$(kg m^{-2} s^{-1})$',
    'lsp': r'(m)',
    'cp': r'(m)',
    'tp': r'(m)',
    'sshf': r'$(J m^{-2})$',
    'slhf': r'$(J m^{-2})$',
    'ssr': r'$(J m^{-2})$',
    'str': r'$(J m^{-2})$',
    'sro': r'(m)',
    'sf': r'(m)',
    'ssrd': r'$(J m^{-2})$',
    'strd': r'$(J m^{-2})$',
    'ttr': r'$(J m^{-2})$',
    'z': r'$(m^2 s^{-2})$',
    'z_height': '$(m)$',
    'z_thick': '$(m)$',
    't': r'(K)',
    'u': r'$(m s^{-1})$',
    'v': r'$(m s^{-1})$',
    'q': r'$(kg kg^{-1})$',
    'w': r'$(Pa s^{-1})$',
    'r': r'(%)',
    'onset': '',
    'retreat': '',
    'length': r'# of days',
    'precipitation': r'(m)',
    # the exponent is wrapped in $...$ so it renders as a superscript
    'precipitation-rate': r'(m day$^{-1}$, NAM Season Precip / NAM Length)',
    # \tilde (not '\tilda') is the mathtext accent command
    'nino-3': r'(Ni$\tilde{n}$o-3 Index Anomaly)',
}

# dictionary of regions and their names
region_avg_dict = {
    'cp': 'Colorado Plateau',
    'mr': 'Mogollon Rim',
    'son': 'Sonoran Desert',
    'chi': 'Chihuahuan Desert',
    'moj': 'Mojave Desert',
    'MeNmAz': 'MEX, NM, AZ Border',
    'baja': r'Coast of Baja, CA (5$\degree$ x 5$\degree$)',
}

# dictionary of regions and their coordinate boundaries
# [WEST, EAST, NORTH, SOUTH] -- WEST and EAST are on the 0-360 longitude grid system
# Every entry must follow this order: subset_var_data builds its latitude
# slice from positions 2 and 3, and a reversed pair selects nothing.
# ('mr', 'son', 'chi' and 'moj' were previously stored as [.., SOUTH, NORTH].)
region_avg_coords = {
    'cp': [249, 253, 39, 35],
    'mr': [249, 251, 34, 33],
    'son': [246, 250, 32, 28],
    'chi': [252, 256, 33, 29],
    'moj': [243, 247, 37, 33],
    'MeNmAz': [246, 256, 38, 28],
    'baja': [242, 247, 27, 22],
}

# dictionary of colors for the plot of each region
# ('dsw' has a color but no entry in the name/coordinate dictionaries above)
region_colors_dict = {
    'cp': 'blue',
    'mr': 'darkorange',
    'son': 'green',
    'chi': 'red',
    'moj': 'purple',
    'MeNmAz': 'brown',
    'baja': 'yellow',
    'dsw': 'black'
}

In [5]:
# define a function to get var files, open dataset, and subset if needed
def get_var_data(var, region='dsw', months=None, **kwargs):
    r"""
    Retrieve the data for a given variable from the subset ERA5 dataset.

    Finds the files for `var`, opens them, and optionally subsets the result
    by region, pressure level and months.

    Parameters
    ----------
    var : str
            The variable desired
    region : str
            The region desired.  Defaults to 'dsw'
    months : list of int, optional
            A list of months desired [1, 2, ..., 12].  None (the default)
            means all twelve months

    Returns
    -------
    var_data : xarray Dataset or Data Array
            Whatever open_var_data returns when `subset_flag` is falsy,
            otherwise the result of subset_var_data

    Raises
    ------
    FileNotFoundError
            If no file matches `var` / `region`

    Kwargs
    ------
    subset_flag : bool
            True or False.  Whether to subset the data or not (default False)
    level : int
            The pressure level desired.  Only applied for pressure level data
    type : str
            'ds' to open a dataset, anything else (default 'da') for a data array
    mean_flag : bool
            True or False.  Whether to compute the mean (or sum) over the specified months
    group_type : str
            How to group data prior to computing mean or sum across time.
            Options include 'year', 'month', 'dayofyear', etc.

    See Also
    --------
    get_var_files : returns all files for specified variable
    open_var_data : opens the variable dataset or data array
    subset_var_data : subsets data array based on user input
    """
    # build the default per call so no list object is shared between calls
    if months is None:
        months = list(range(1, 13))

    files = get_var_files(var, region, **kwargs)
    if not files:
        # fail here with the variable/region in the message instead of letting
        # the open step fail on an empty list; FileNotFoundError is an OSError
        # subclass, so callers catching OSError still work
        raise FileNotFoundError(
            f"no files found for variable '{var}' (region '{region}')"
        )

    var_data = open_var_data(files, var, **kwargs)
    if kwargs.get('subset_flag', False):
        return subset_var_data(var_data, var, months, region, **kwargs)
    return var_data

In [6]:
# define a function to get the files for a given variable/region
def get_var_files(var, region, **kwargs):
    """
    Return the sorted list of files that hold `var`.

    The list a variable belongs to decides which glob pattern is used.
    `region` only matters for surface variables, where 'global' switches the
    search from the 'dsw' directory to the 'global' directory.  An
    unrecognised variable prints a message and yields an empty list.
    `kwargs` is accepted for call compatibility and is not used.
    """
    is_surface_var = (var in sfc_instan_list) or (var in sfc_accumu_list)

    if is_surface_var and region == 'global':
        pattern = f'{my_era5_path}global/*/{var.lower()}_*_dsw.nc'
    elif is_surface_var:
        # every region other than 'global' reads from the dsw subset
        pattern = f'{my_era5_path}dsw/*/{var.lower()}_*_dsw.nc'
    elif var in pl_var_list:
        pattern = f'{my_era5_path}dsw/*/pl/{var.lower()}_*_dsw.nc'
    elif var in NAM_var_list:
        pattern = f'{my_era5_path}dsw/NAM_{var}.nc'
    elif var in misc_var_list:
        pattern = f'{misc_data_path}{var}/{var}*.nc'
    elif var in invar_var_list:
        pattern = f'{my_era5_path}invariants/{var}_invariant.nc'
    else:
        # variable is in none of the known lists
        print('something went wrong finding files')
        return []

    return sorted(glob.glob(pattern))

In [7]:
# find the name of the data variable in a dataset that corresponds to `var`
def _match_data_var_name(ds, var):
    """
    Return the name of the data variable in `ds` that corresponds to `var`.

    Names are compared case-insensitively.  An exact match wins; only when
    there is none does a substring match count (e.g. '2t' inside 'VAR_2T').
    Preferring the exact match keeps short names such as 't' or 'u' from
    being captured by an unrelated variable that merely contains that letter.

    Raises
    ------
    IndexError
            If no data variable matches `var`
    """
    names = list(ds.data_vars)
    target = var.upper()

    exact = [name for name in names if name.upper() == target]
    if exact:
        return exact[0]

    partial = [name for name in names if target in name.upper()]
    if partial:
        return partial[0]

    raise IndexError(f"no data variable matching '{var}' found in {names}")


# define a function to open variable datasets
def open_var_data(files, var, **kwargs):
    """
    Open `files` as one multi-file dataset and return it, or the single
    data array for `var`.

    Parameters
    ----------
    files : list of str
            Files to open with xr.open_mfdataset
    var : str
            The variable desired

    Kwargs
    ------
    type : str
            'ds' returns the whole dataset; any other value (default 'da')
            returns the data array matching `var`

    Raises
    ------
    IndexError
            If a data array is requested and no data variable matches `var`
    """
    # kept as the 'type' kwarg for callers; named differently locally so the
    # builtin `type` is not shadowed
    return_type = kwargs.get('type', 'da')

    # open dataset
    ds = xr.open_mfdataset(files)
    if return_type == 'ds':
        return ds

    # names in the files can differ from `var` in case and prefix
    return ds[_match_data_var_name(ds, var)]

In [8]:
# define a function to subset an input data set (or array) by:
# latitude/longitude
# pressure level
# time
# averages
def subset_var_data(var_data, var, months, region, **kwargs):
    """
    Subset `var_data` by region, pressure level and months, optionally
    reducing over time.

    Parameters
    ----------
    var_data : xarray Dataset or Data Array
            The opened data.  For a dataset the data array matching `var` is
            pulled out first
    var : str
            The variable name; its list membership decides how it is treated
    months : list of int or None
            Months to keep [1, 2, ..., 12].  None keeps all twelve
    region : str
            If listed in region_avg_list, the data is averaged over that
            region's box from region_avg_coords

    Kwargs
    ------
    level : int
            Pressure level to select (pressure level variables only)
    mean_flag : bool
            Whether to reduce over time after grouping (default False)
    group_type : str
            Time component to group by before reducing (default 'year')

    Returns
    -------
    xarray Data Array, or None when `var_data` is neither a dataset nor a
    data array.  NAM variables are returned without any month subsetting;
    'onset' and 'retreat' are returned as day-of-year.
    """
    if isinstance(var_data, xr.Dataset):
        # names in the files can differ from `var` in case and prefix; take an
        # exact case-insensitive match first so a short name such as 't' is not
        # captured by an unrelated variable that merely contains that letter
        names = list(var_data.data_vars)
        target = var.upper()
        matches = ([name for name in names if name.upper() == target]
                   or [name for name in names if target in name.upper()])
        da = var_data[matches[0]]
    elif isinstance(var_data, xr.DataArray):
        da = var_data
    else:
        print('something wrong with var_data in subset_var_data')
        return None

    if months is None:
        months = list(range(1, 13))

    # subset to regional average if region is specified
    if region in region_avg_list:
        west, east, lat_a, lat_b = region_avg_coords[region]
        north, south = max(lat_a, lat_b), min(lat_a, lat_b)

        # a label slice only selects data when its bounds run in the same
        # direction as the coordinate, so orient it to the data rather than
        # trusting the order stored in region_avg_coords
        lat_values = da['latitude'].values
        lat_descending = lat_values.size > 1 and lat_values[0] > lat_values[-1]
        lats = slice(north, south) if lat_descending else slice(south, north)
        lons = slice(west, east)
        da = da.sel(latitude=lats, longitude=lons).mean(dim=['latitude', 'longitude'], skipna=True)

    # subset to level if var is a pl var
    if var.lower() in pl_var_list:
        level = kwargs.get('level', None)
        if level is not None:
            da = da.sel(level=level)

    # just return da if var is NAM var, convert to dayofyear for onset and retreat dates
    if var.lower() in NAM_var_list:
        if var.lower() in ('onset', 'retreat'):
            return da.dt.dayofyear
        return da

    # subset the data specified by months
    da_sub = da.sel(time=da['time.month'].isin(months))

    # without mean_flag, just return the month-subset data array
    if not kwargs.get('mean_flag', False):
        return da_sub

    # accumulated variables are summed within each group, everything else is averaged
    groupby_type = f"time.{kwargs.get('group_type', 'year')}"
    if var.lower() in sfc_accumu_list:
        return da_sub.groupby(groupby_type).sum(dim='time')
    return da_sub.groupby(groupby_type).mean(dim='time')

In [9]:
# define a function to check if inputs are list or not
def ensure_var_list(x):
    """Return `x` itself when it is a list, otherwise `x` wrapped in a one-element list."""
    return x if isinstance(x, list) else [x]

In [10]:
# define a function to turn a list of integers into months
def month_num_to_name(var, months, **kwargs):
    """
    Build the month label used for `var`.

    Parameters
    ----------
    var : str
            The variable; NAM variables never get a month label
    months : list of int
            Month numbers, 1 = January ... 12 = December

    Kwargs
    ------
    mean_flag : bool
            When truthy (default True) a full 12-month list is labelled 'YEAR'

    Returns
    -------
    str
            '' for NAM variables and for an empty `months`;
            the full month name for a single month;
            'YEAR' for twelve months with mean_flag;
            otherwise the month initials joined together ([6, 7, 8] -> 'JJA')
    """
    if var in NAM_var_list:
        return ''  # don't use months for onset, retreat, length

    if len(months) == 1:
        return calendar.month_name[months[0]]  # use full month name if only 1 month

    if len(months) == 12 and kwargs.get('mean_flag', True):
        return 'YEAR'

    # every remaining length is handled here so a label is always returned;
    # an empty list joins to ''
    return ''.join(calendar.month_name[m][0] for m in months)

In [11]:
# define a function to detrend the data

# MANUALLY DETREND WITH LINEAR REGRESSION
def detrend_data(arr):
    """
    Remove the least-squares linear trend from a 1-D series.

    The trend is fitted against the sample index (0, 1, 2, ...) using only
    the finite values, then subtracted from every sample, so non-finite
    inputs stay non-finite in the output.

    Parameters
    ----------
    arr : 1-D array-like
            The series to detrend

    Returns
    -------
    numpy array
            The residuals, same length as `arr`.  All NaN when fewer than two
            values are finite, because no line can be fitted
    """
    arr = np.asarray(arr)

    # sample index used as the regression's x axis
    arr_time = np.arange(0, len(arr))

    # only finite samples take part in the fit
    finite = np.isfinite(arr)

    # a line needs at least two points; linregress raises on fewer
    if np.count_nonzero(finite) < 2:
        return np.full(len(arr), np.nan)

    # compute linear regression
    fit = sp.stats.linregress(arr_time[finite], arr[finite])

    # detrend the data
    return arr - (fit.slope * arr_time + fit.intercept)


# define a function to detrend a data array along one dimension
def apply_detrend(da, **kwargs):
    """
    Apply detrend_data along one dimension of `da`.

    Parameters
    ----------
    da : xarray Data Array
            The data to detrend.  It is loaded into memory in place

    Kwargs
    ------
    input_dims : str
            Name of the dimension to detrend along (default 'time')

    Returns
    -------
    xarray Data Array
            The detrended data, always of a floating dtype
    """
    input_dims = kwargs.get('input_dims', 'time')
    # load data
    da.load()

    # residuals are fractional, so the output must be floating point: declaring
    # an integer dtype (e.g. for day-of-year input) would truncate them.
    # Floating inputs keep their own precision.
    out_dtype = np.result_type(da.dtype, np.float32)

    da_detrend = xr.apply_ufunc(
        detrend_data, da,
        input_core_dims=[[input_dims]],
        output_core_dims=[[input_dims]],
        vectorize=True,
        dask='parallelized',
        output_dtypes=[out_dtype]
    )

    return da_detrend

In [12]:
# define a function to regress data
def regress_data(arr1, arr2):
    """
    Least-squares linear regression of `arr2` (y) on `arr1` (x).

    Only positions where both series are finite are used.

    Parameters
    ----------
    arr1, arr2 : 1-D numpy arrays of equal length
            Predictor and response

    Returns
    -------
    tuple of 6 floats
            (slope, intercept, rvalue, pvalue, stderr, intercept_stderr).
            All six are NaN when fewer than two valid pairs remain or when
            the predictor is constant, since no slope is defined then
    """
    no_fit = (np.nan,) * 6

    # keep only pairs where both values are finite
    mask = np.isfinite(arr1) & np.isfinite(arr2)
    x = arr1[mask]
    y = arr2[mask]

    if len(x) < 2:  # check if there are enough data points
        return no_fit

    # linregress raises when every x value is identical; return NaNs so one
    # flat series does not abort a whole vectorized calculation
    if np.all(x == x[0]):
        return no_fit

    # compute linear regression
    res = sp.stats.linregress(x, y)
    return res.slope, res.intercept, res.rvalue, res.pvalue, res.stderr, res.intercept_stderr


# define a function to regress one data array on another along a dimension
def apply_regression(da1, da2, **kwargs):
    """
    Run regress_data along one dimension of two data arrays.

    Both arrays are loaded into memory in place.  `da1` is the predictor and
    `da2` the response.

    Kwargs
    ------
    input_dims : str
            Name of the dimension to regress along (default 'time')

    Returns
    -------
    xarray Dataset
            Variables 'slope', 'intercept', 'rvalue', 'pvalue', 'stderr' and
            'intercept_stderr', in that order
    """
    core_dim = kwargs.get('input_dims', 'time')

    # load data
    for da in (da1, da2):
        da.load()

    # one scalar output per statistic, in the order regress_data returns them
    stat_names = ('slope', 'intercept', 'rvalue', 'pvalue', 'stderr', 'intercept_stderr')

    stats = xr.apply_ufunc(
        regress_data, da1, da2,
        input_core_dims=[[core_dim], [core_dim]],
        output_core_dims=[[] for _ in stat_names],
        vectorize=True,
        dask='parallelized',
        output_dtypes=[float] * len(stat_names)
    )

    return xr.Dataset(dict(zip(stat_names, stats)))

In [13]:
# define a function to calculate the Pearson correlation and p-value statistic
def compute_corr_pval(arr1, arr2):
    """
    Pearson correlation between two series, ignoring non-finite pairs.

    Returns
    -------
    (corr, pval) : tuple
            The correlation coefficient and its p-value from
            scipy.stats.pearsonr, or (nan, nan) when fewer than two pairs
            have both values finite
    """
    # a pair counts only when both of its values are finite
    both_finite = np.logical_and(np.isfinite(arr1), np.isfinite(arr2))

    # check if there are enough data points
    if np.count_nonzero(both_finite) < 2:
        return np.nan, np.nan

    r_value, p_value = sp.stats.pearsonr(arr1[both_finite], arr2[both_finite])
    return r_value, p_value


# define a function to apply the correlation ufunc to the data
def apply_correlation(da1, da2, **kwargs):
    """
    Run compute_corr_pval along one dimension of two data arrays.

    Both arrays are loaded into memory in place.

    Kwargs
    ------
    input_dims : str
            Name of the dimension to correlate along.  Defaults to 'year'
            (the previously hard-coded dimension); pass e.g. 'time' for data
            that has not been grouped by year

    Returns
    -------
    xarray Dataset
            Variables 'pearson_r' and 'p_value'
    """
    input_dims = kwargs.get('input_dims', 'year')

    da1.load()
    da2.load()
    result = xr.apply_ufunc(
        compute_corr_pval, da1, da2,
        input_core_dims=[[input_dims], [input_dims]],
        output_core_dims=[[], []],
        vectorize=True,
        dask='parallelized',
        output_dtypes=[float, float]
    )
    corr_da = result[0]
    pval_da = result[1]

    corr_ds = xr.merge([corr_da.rename('pearson_r'), pval_da.rename('p_value')])
    return corr_ds

In [14]:
# define a function to calculate the magnitude-squared coherence of two series
def compute_coherence(arr1, arr2, **kwargs):
    """
    Magnitude-squared coherence between two series, ignoring non-finite pairs.

    Parameters
    ----------
    arr1, arr2 : 1-D numpy arrays of equal length
            The two series
    **kwargs
            Passed straight to scipy.signal.coherence (e.g. fs, nperseg)

    Returns
    -------
    (freqs, cxy) : tuple
            The sample frequencies and the coherence at each of them -- both
            arrays -- or (nan, nan) when fewer than two pairs have both
            values finite.

    Notes
    -----
    Dropping non-finite pairs shortens the series and closes the gaps, so the
    frequency axis of a gappy series is only approximate.
    """
    # mask out nan and inf values
    mask = np.isfinite(arr1) & np.isfinite(arr2)
    filtered_arr1 = arr1[mask]
    filtered_arr2 = arr2[mask]

    if len(filtered_arr1) < 2:  # check if there are enough data points
        return np.nan, np.nan

    freqs, cxy = sp.signal.coherence(filtered_arr1, filtered_arr2, **kwargs)
    return freqs, cxy


# define a function to apply the coherence ufunc to the data
def apply_coherence(da1, da2, **kwargs):
    """
    Compute the coherence spectrum between two data arrays along one dimension.

    Both arrays are loaded into memory in place.  Every series produces a
    whole spectrum, so the result gains a 'frequency' dimension in place of
    the input dimension.

    Kwargs
    ------
    input_dims : str
            Name of the dimension to analyse along (default 'year')
    fs : float
            Sampling frequency of the series (default 1.0, i.e. frequencies
            are in cycles per sample)
    nperseg : int
            Segment length for the Welch estimate (default 256, capped at the
            series length).  It must be shorter than the series for the
            result to be informative: with a single segment the coherence is
            1 at every frequency

    Returns
    -------
    xarray Dataset
            Variable 'coherence' with a 'frequency' coordinate.  Series that
            contain any non-finite value give an all-NaN spectrum, because
            removing samples would change the length of the spectrum
    """
    input_dims = kwargs.get('input_dims', 'year')
    fs = kwargs.get('fs', 1.0)

    # fix the segment length up front so every series yields a spectrum of
    # the same length
    n_samples = da1.sizes[input_dims]
    nperseg = max(1, min(kwargs.get('nperseg', 256), n_samples))

    # one-sided frequency axis of a real series with segments of length nperseg
    freqs = np.fft.rfftfreq(nperseg, d=1.0 / fs)

    def _cxy_fixed_length(arr1, arr2):
        # return only the coherence values; the frequency axis is shared
        usable = (arr1.size >= 2
                  and np.isfinite(arr1).all()
                  and np.isfinite(arr2).all())
        if not usable:
            return np.full(freqs.size, np.nan)
        return compute_coherence(arr1, arr2, fs=fs, nperseg=nperseg)[1]

    da1.load()
    da2.load()
    cxy_da = xr.apply_ufunc(
        _cxy_fixed_length, da1, da2,
        input_core_dims=[[input_dims], [input_dims]],
        output_core_dims=[['frequency']],
        vectorize=True,
        output_dtypes=[float]
    )

    cxy_da = cxy_da.assign_coords(frequency=freqs)
    return cxy_da.rename('coherence').to_dataset()

In [62]:
# define a function to calculate the principal components of a data array
def calc_pcs(da, **kwargs):
    """
    Principal component analysis by eigen-decomposition of the covariance matrix.

    Parameters
    ----------
    da : 2-D array-like, shape (n_samples, n_features)
            Rows are observations (e.g. time steps), columns are variables
            (e.g. stacked grid points)
    **kwargs
            Accepted for call compatibility; not used

    Returns
    -------
    pcs : numpy array, shape (n_samples, n_features)
            The column-centred data projected onto the eigenvectors, ordered
            by decreasing eigenvalue
    eigenvalues : numpy array, shape (n_features,)
            Eigenvalues of the covariance matrix, largest first
    eigenvectors : numpy array, shape (n_features, n_features)
            The matching eigenvectors, one per column

    Notes
    -----
    The data is centred but not scaled.  Non-finite values are not handled.
    """
    data = np.asarray(da)

    # np.cov centres each column internally, so the same anomalies must be
    # projected below; projecting the raw data would offset every PC by a
    # constant.  Data that is already mean-free is left unchanged by this.
    anomalies = data - data.mean(axis=0)

    # calculate covariance matrix
    covariance = np.cov(anomalies, rowvar=False)

    # perform eigen decomposition
    eigenvalues, eigenvectors = sp.linalg.eigh(covariance)

    # sort eigenvalues and eigenvectors in descending order
    order = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # calculate principal components
    pcs = np.dot(anomalies, eigenvectors)

    return pcs, eigenvalues, eigenvectors


# define a function to calculate the explained variance of one variable by another
def calc_explained_variance(da, pcs, **kwargs):
    """
    Ratio of the variance of `da` as fitted by `pcs` to the variance of each PC.

    `da` is fitted by least squares as a linear combination of the columns of
    `pcs` (no intercept term).  The variance of that fit is then divided by
    the variance of each column of `pcs`.

    Parameters
    ----------
    da : array-like, first axis matching the rows of `pcs`
            The series to fit
    pcs : 2-D numpy array, shape (n_samples, n_pcs)
            The principal components
    **kwargs
            Accepted for call compatibility; not used

    Returns
    -------
    numpy array
            One ratio per column of `pcs` when `da` is 1-D
    """
    # least-squares coefficients of da on the PCs
    coefficients = np.linalg.lstsq(pcs, da, rcond=None)[0]

    # the part of da reproduced by the PCs
    fitted = np.dot(pcs, coefficients)

    # variance of the fit relative to the variance of each PC
    return np.var(fitted, axis=0) / np.var(pcs, axis=0)


# define the main function to calculate the EOF that identifies the
# variance of da2 explained by da1
def calc_eof(da1, da2, **kwargs):
    """
    Compute the explained-variance ratio of `da1` against the principal
    components of `da2` and return it on the latitude/longitude grid of `da2`.

    Parameters
    ----------
    da1 : array-like
            The series passed to calc_explained_variance as the data to fit
    da2 : xarray Data Array with 'latitude' and 'longitude' dimensions
            The field whose principal components are computed
    **kwargs
            Accepted for call compatibility; not used

    Returns
    -------
    xarray Data Array
            Dimensions ('latitude', 'longitude'), coordinates taken from `da2`

    Notes
    -----
    NOTE(review): calc_explained_variance returns one value per principal
    component, ordered by decreasing eigenvalue.  Reshaping that vector onto
    (latitude, longitude) therefore places PC number k at grid position k
    rather than giving a per-grid-cell quantity -- confirm this is intended.
    """
    # collapse the two spatial dimensions into one 'space' dimension
    # (assumes the remaining leading dimension is the sample/time axis -- TODO confirm)
    stacked = da2.stack(space=('latitude', 'longitude'))

    # only the principal components are needed from the decomposition
    pcs = calc_pcs(stacked)[0]

    # get explained variance ratio
    ratio = calc_explained_variance(da1, pcs)

    # reshape variance ratio back to spatial dimensions
    grid_shape = (da2.sizes['latitude'], da2.sizes['longitude'])
    grid_coords = {
        'latitude': da2.coords['latitude'],
        'longitude': da2.coords['longitude'],
    }
    return xr.DataArray(ratio.reshape(grid_shape),
                        dims=['latitude', 'longitude'],
                        coords=grid_coords)