In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy.stats import pearsonr

# Influence of smoothing on correlations

To illustrate the effect of smoothing on the null-distribution of correlation coefficients we can run a small simulation study:

We generate a number of white noise samples that we correlate to generate a baseline null-distribution that emulates what a classical t-test would test against. We then filter the same time series with a Gaussian kernel of increasing width $\sigma$ and recalculate the correlations to obtain null-distributions for the smoothed data.

In [None]:
n_obs = 2000   # number of observations per time series
n_rep = 10000  # number of repetitions (independent pairs of time series)

# Seed the global NumPy RNG so that "Restart & Run All" reproduces the same numbers
np.random.seed(42)


def _rowwise_pearsonr(a, b):
    """Return Pearson's r for each pair of corresponding rows of `a` and `b`.

    Parameters
    ----------
    a, b : 2-D arrays with the same shape, one time series per row.

    Returns
    -------
    1-D array with one correlation coefficient per row pair.
    """
    return np.array([pearsonr(ai, bi)[0] for ai, bi in zip(a, b)])


# Generate n_rep white noise time series with length n_obs
x = np.random.randn(n_rep, n_obs)
y = np.random.randn(n_rep, n_obs)

# Calculate correlation between pairs of unfiltered time series
# This will serve as our baseline
r_null_unfilted = _rowwise_pearsonr(x, y)

# Filter widths (sigma of the gaussian kernel, in samples) we want to look at
sigma_filt = np.array((2, 5, 10, 20))
# Output array: one row of n_rep correlation coefficients per filter width
r_null = np.zeros((len(sigma_filt), n_rep))

# Filter the *same* white noise series with the different filter widths and
# save the correlation coefficients for later. Reusing x and y (instead of
# drawing fresh noise) means that smoothing is the only thing that differs
# between the baseline and each filtered null-distribution.
for i, s in enumerate(sigma_filt):
    # axis=-1: smooth along the time axis of every row
    xf = gaussian_filter1d(x, s, axis=-1)
    yf = gaussian_filter1d(y, s, axis=-1)
    r_null[i] = _rowwise_pearsonr(xf, yf)

First, let's take a look at histograms for the simulations:

In [None]:
# Shared bin edges so that all histograms are directly comparable
bins = np.linspace(-0.5, 0.5, 50)

fig, ax = plt.subplots()

# Baseline: null-distribution of r for the unfiltered white noise
ax.hist(r_null_unfilted, bins, histtype='step', density=True, color='k', label='unfiltered')

# One histogram per filter width; raw string so that "\s" in "\sigma" is not
# interpreted as an (invalid) Python escape sequence
for i, s in enumerate(sigma_filt):
    ax.hist(r_null[i], bins, histtype='step', density=True, label=r'$\sigma=%.1f$' % s)

ax.set_xlabel('$r$')
ax.set_ylabel('Probability density')
ax.legend()
ax.set_title('Null-distribution for $r$, gaussian smoothing');

We can use the empirical distributions to calculate the value for $r$ that is the 95th percentile in the unsmoothed data. This value needs to be exceeded for the correlation to be significant at the (1 - 0.95) = 0.05 significance level (one-sided).

In [None]:
# One-sided critical value: 95th percentile of the unfiltered null-distribution
r_95 = np.percentile(r_null_unfilted, 95)
# Use the % operator so the value is actually interpolated into the format string
print('Empirical r_crit for unsmoothed data: %.3f' % r_95)

Using the samples we just generated, we can check the fraction of correlations that exceeds this threshold. This gives an indication of how often we would call a random correlation significant if we were to use the threshold of the white noise hypothesis. We call this the false-positive rate.

In [None]:
# Fraction of smoothed-noise correlations at or above the white-noise threshold,
# one value per filter width (axis=1 averages over the repetitions).
# r_95 is a one-sided threshold (95th percentile of r), so the comparison is
# one-sided as well; comparing |r| against it would test at a nominal 10% level.
fpr = np.mean(r_null >= r_95, axis=1)
print('P(r>=r_95) for smoothed data:', fpr)

To illustrate the rapid increase in the false-positive rate we can plot these values against the filter widths:

In [None]:
# False-positive rate per filter width, using the same one-sided criterion
# (r >= r_95) as the 0.05 significance level named on the y-axis
plt.plot(sigma_filt, np.mean(r_null >= r_95, axis=1), 'k.-')
# Raw string so that "\s" in "\sigma" is not treated as an invalid escape sequence
plt.xlabel(r'Width of gaussian filter ($\sigma$)')
plt.ylabel('False positive rate at $\\alpha=%.2f$' % (0.05));