Commit
Add example for the t-test.
vnmabus committed Dec 21, 2022
1 parent dd5a956 commit 6b31d5d
Showing 2 changed files with 141 additions and 0 deletions.
18 changes: 18 additions & 0 deletions docs/refs.bib
@@ -116,3 +116,21 @@ @article{szekely++_2007_measuring
zmnumber = {1129.62059},
keywords = {distance correlation,distance covariance,multivariate independence}
}

@article{szekely+rizzo_2013_distance,
title = {The Distance Correlation T-Test of Independence in High Dimension},
author = {Sz{\'e}kely, G{\'a}bor J. and Rizzo, Maria L.},
year = {2013},
month = may,
journal = {Journal of Multivariate Analysis},
volume = {117},
pages = {193--213},
issn = {0047-259X},
doi = {10.1016/j.jmva.2013.02.012},
url = {https://www.sciencedirect.com/science/article/pii/S0047259X13000262},
urldate = {2022-05-27},
abstract = {Distance correlation is extended to the problem of testing the independence of random vectors in high dimension. Distance correlation characterizes independence and determines a test of multivariate independence for random vectors in arbitrary dimension. In this work, a modified distance correlation statistic is proposed, such that under independence the distribution of a transformation of the statistic converges to Student t, as dimension tends to infinity. Thus we obtain a distance correlation t-test for independence of random vectors in arbitrarily high dimension, applicable under standard conditions on the coordinates that ensure the validity of certain limit theorems. This new test is based on an unbiased estimator of distance covariance, and the resulting t-test is unbiased for every sample size greater than three and all significance levels. The transformed statistic is approximately normal under independence for sample size greater than nine, providing an informative sample coefficient that is easily interpretable for high dimensional data.},
langid = {english},
keywords = {dCor,dCov,Distance correlation,Distance covariance,High dimension,Multivariate independence}
}

123 changes: 123 additions & 0 deletions examples/plot_dcor_t_test.py
@@ -0,0 +1,123 @@
"""
The distance correlation t-test of independence
===============================================
Example that shows the usage of the distance correlation t-test.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats

import dcor

# sphinx_gallery_thumbnail_number = 3

# %%
# Given matching samples of two random vectors with arbitrary dimensions, the
# distance covariance can be used to construct an asymptotic test of
# independence.
# For an introduction to independence tests, see
# :ref:`sphx_glr_auto_examples_plot_dcov_test.py`.

# %%
# We can consider the same case with independent observations:

n_samples = 1000
random_state = np.random.default_rng(83110)

x = random_state.uniform(0, 1, size=n_samples)
y = random_state.normal(0, 1, size=n_samples)

plt.scatter(x, y, s=1)
plt.show()

dcor.independence.distance_correlation_t_test(x, y)

# %%
# We can also consider the case with nonlinear dependencies:

u = random_state.uniform(-1, 1, size=n_samples)

y = (
    np.cos(u * np.pi)
    + random_state.normal(0, 0.01, size=n_samples)
)
x = (
    np.sin(u * np.pi)
    + random_state.normal(0, 0.01, size=n_samples)
)

plt.scatter(x, y, s=1)
plt.show()

dcor.independence.distance_correlation_t_test(x, y)

# %%
# As we can observe, this test correctly rejects the null hypothesis of
# independence in the second case, but not in the first.

# %%
# The test illustrated here is an asymptotic test, which relies on the
# convergence of the distribution of (a transformation of) the statistic to
# the Student's t-distribution under the null hypothesis, as the dimension
# of the data goes to infinity.
# This test is thus faster than permutation tests, as it does not require
# recomputing the statistic over permutations of the data, and it is also
# deterministic for a given dataset.
# However, at least in theory, the test should be applied only to
# high-dimensional data.

# %%
# We will now plot, for the case of independent normal distributions, the
# histogram of the statistic, and compute the Type I error, as seen in
# :footcite:t:`szekely+rizzo_2013_distance`.
# The number of tests is kept small here to limit the computation time.
# Users are encouraged to download this example and increase ``n_tests`` to
# obtain better estimates of the Type I error.
# In order to replicate the original results, one should set the value of
# ``n_tests`` to 1000.
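Note that rejecting at level ``significance`` when the p-value falls below it is equivalent to the transformed statistic exceeding the corresponding upper quantile of the reference Student's t distribution; a quick sanity check with SciPy, using v = n(n - 3)/2 degrees of freedom as in the code below:

```python
import scipy.stats

n_obs = 25  # smallest sample size considered below
significance = 0.1
df = n_obs * (n_obs - 3) / 2  # degrees of freedom of the reference t
critical_value = scipy.stats.t(df=df).ppf(1 - significance)
print(critical_value)  # close to the normal 0.9 quantile, as df is large
```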

n_tests = 100
dim = 30
significance = 0.1
n_obs_list = [25, 30, 35, 50, 70, 100]

table = pd.DataFrame()
table["n_obs"] = n_obs_list

dist_results = []
for n_obs in n_obs_list:
    n_errors = 0
    statistics = []
    for _ in range(n_tests):
        # Generate independent samples of size n_obs (not n_samples), so
        # that the Type I error actually depends on the sample size.
        x = random_state.normal(0, 1, size=(n_obs, dim))
        y = random_state.normal(0, 1, size=(n_obs, dim))

        test_result = dcor.independence.distance_correlation_t_test(x, y)
        statistics.append(test_result.statistic)
        if test_result.pvalue < significance:
            n_errors += 1

    error_prob = n_errors / n_tests
    dist_results.append(error_prob)

table["Type I error"] = dist_results

# Plot the distribution of the statistic obtained in the last iteration,
# together with the density of the reference Student's t distribution.
df = len(x) * (len(x) - 3) / 2  # degrees of freedom: n(n - 3) / 2

plt.hist(statistics, bins=12, density=True)

distribution = scipy.stats.t(df=df)
u = np.linspace(distribution.ppf(0.01), distribution.ppf(0.99), 100)
plt.plot(u, distribution.pdf(u))
plt.show()

table

# %%
# Bibliography
# ------------
# .. footbibliography::
