In [None]:
## example code from Hendrik

import xarray as xr
import numpy as np
from scipy import stats
import dask

# stats.pearsonr works on a single DataArray, not on a whole Dataset, so pick
# the variable of interest ('sst') right after opening the file.
#
# For huge datasets, opening lazily with open_mfdataset may help and allows
# parallelization. 39 is the length of the time axis here; all time steps
# should end up in one chunk:
# data = xr.open_mfdataset('subset.nc',chunks={'time':39,'lat':100,'lon':100}).sst
#
# stats.pearsonr does not accept NaN input, so NaNs are replaced by 0; series
# that are constant as a result yield NaN again for the correlation.
data = xr.open_dataset('subset.nc')['sst'].fillna(0)

def correlation(data1,data2):
    """Return Pearson's r and its p-value for two 1-D series as one array.

    Written to be used as the core function of ``xr.apply_ufunc``:
    ``stats.pearsonr`` returns a result object (or tuple) that apply_ufunc
    cannot handle, so both values are packed into a plain NumPy array.

    Parameters
    ----------
    data1 : array-like
        Series taken from the spatial field (one grid point at a time when
        the call is vectorized).
    data2 : array-like
        The single reference time series to correlate against.

    Returns
    -------
    numpy.ndarray
        ``[r, p_value]``. For equally-shaped 1-D inputs, sample pairs where
        either value is NaN/inf are ignored; if fewer than two valid pairs
        remain, or one of the series is constant, ``[nan, nan]`` is returned.
    """
    x = np.asarray(data1, dtype=float)
    y = np.asarray(data2, dtype=float)

    # Only pre-filter the plain 1-D, equal-length case; anything else is
    # passed through so stats.pearsonr applies its own validation.
    if x.ndim == 1 and x.shape == y.shape:
        # Pairwise deletion of missing samples, so callers do not have to
        # replace NaNs with an arbitrary fill value first.
        paired = np.isfinite(x) & np.isfinite(y)
        x, y = x[paired], y[paired]

        # The correlation is undefined for constant or too-short input.
        # Returning NaN directly avoids one SciPy warning per such grid point.
        if x.size < 2 or np.all(x == x[0]) or np.all(y == y[0]):
            return np.full(2, np.nan)

    result = stats.pearsonr(x, y)
    return np.stack((result[0], result[1]), axis=-1)

# xr.apply_ufunc receives the function to apply, followed by that function's
# input arguments: the spatial field and the single reference time series
# (here simply one point picked from the same field).
# - input_core_dims: the dimension each input is consumed along ('time').
# - output_core_dims: required because every call returns two values
#   (r and p), which end up on a new 'statistic' dimension.
# - vectorize=True: the function is called once per remaining grid point.
# - dask='parallelized': enables chunk-wise execution, but then the output
#   dtype and the size of the new dimension must be declared explicitly.
result = xr.apply_ufunc(
    correlation,
    data,
    data.isel(lat=50, lon=50),
    input_core_dims=[['time'], ['time']],
    output_core_dims=[['statistic']],
    vectorize=True,
    dask='parallelized',
    output_dtypes=[np.dtype(float)],
    dask_gufunc_kwargs={'output_sizes': {'statistic': 2}},
)

# The output carries r and p along the extra 'statistic' dimension, so split
# them into two named variables of a Dataset (index 0 = r, index 1 = p).
# Selecting by dimension name instead of positional result[:,:,0] keeps this
# working however many non-core dimensions the input field has.
statistics = xr.Dataset(
    coords={'lat': result.lat, 'lon': result.lon},
    data_vars={
        'corrcoef': result.isel(statistic=0),
        'p_value': result.isel(statistic=1),
    },
)

# Necessary if you use open_mfdataset (lazy, chunked data): this finally
# computes the correlation for each chunk.
# statistics = statistics.compute()

statistics