Fix hypothesis module.

vnmabus · May 17, 2022 · 42819e2 · 42819e2
1 parent 841e08d
commit 42819e2
Show file tree

Hide file tree

Showing 6 changed files with 123 additions and 71 deletions.
diff --git a/dcor/_hypothesis.py b/dcor/_hypothesis.py
@@ -1,54 +1,107 @@
-import collections
+from __future__ import annotations
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Callable, Iterator
 
 import numpy as np
+from dcor._utils import ArrayType
 from joblib import Parallel, delayed
+
 from ._utils import _random_state_init
 
-HypothesisTest = collections.namedtuple('HypothesisTest', ['p_value',
-                                        'statistic'])
 
+@dataclass
+class HypothesisTest():
+    pvalue: float
+    statistic: ArrayType
+
+    @property
+    def p_value(self) -> float:
+        """Old name for pvalue."""
+        warnings.warn(
+            "Attribute \"p_value\" deprecated, use \"pvalue\" instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.pvalue
+
+    def __iter__(self) -> Iterator[Any]:
+        warnings.warn(
+            "HypothesisTest will cease to be iterable.",
+            DeprecationWarning,
+        )
+        return iter((self.pvalue, self.statistic))
+
+    def __len__(self) -> int:
+        warnings.warn(
+            "HypothesisTest will cease to be sized.",
+            DeprecationWarning,
+        )
+        return 2
+
+
+def _permuted_statistic(
+    matrix: ArrayType,
+    statistic_function: Callable[[ArrayType], ArrayType],
+    permutation: np.typing.NDArray[int],
+) -> ArrayType:
+
+    permuted_matrix = matrix[np.ix_(permutation, permutation)]
+
+    return statistic_function(permuted_matrix)
 
-def _permutation_test_with_sym_matrix(matrix, statistic_function,
-                                      num_resamples, random_state,n_jobs=1):
+
+def _permutation_test_with_sym_matrix(
+    matrix: ArrayType,
+    *,
+    statistic_function: Callable[[ArrayType], ArrayType],
+    num_resamples: int,
+    random_state: np.random.RandomState | np.random.Generator | int | None,
+    n_jobs: int | None = None,
+) -> HypothesisTest:
     """
     Execute a permutation test in a symmetric matrix.
 
-    Parameters
-    ----------
-    matrix: array_like
-        Matrix that will perform the permutation test.
-    statistic_function: callable
-        Function that computes the desired statistic from the matrix.
-    num_resamples: int
-        Number of permutations resamples to take in the permutation test.
-    random_state: {None, int, array_like, numpy.random.RandomState}
-        Random state to generate the permutations.
-
-    Returns
-    -------
-    HypothesisTest
+    Parameters:
+        matrix: Matrix on which the permutation test is performed.
+        statistic_function: Function that computes the desired statistic from
+            the matrix.
+        num_resamples: Number of permutation resamples to take in the
+            permutation test.
+        random_state: Random state to generate the permutations.
+        n_jobs: Number of jobs executed in parallel by Joblib.
+
+    Returns:
         Results of the hypothesis test.
+
     """
     matrix = np.asarray(matrix)
     random_state = _random_state_init(random_state)
 
     statistic = statistic_function(matrix)
 
-    def bootstrapPerms(mat):
-        permuted_index = random_state.permutation(mat.shape[0])
-
-        permuted_matrix = mat[
-            np.ix_(permuted_index, permuted_index)]
-
-        return statistic_function(permuted_matrix)
+    permutations = (
+        random_state.permutation(matrix.shape[0])
+        for _ in range(num_resamples)
+    )
 
-    bootstrap_statistics = Parallel(n_jobs=n_jobs)(delayed(bootstrapPerms)(matrix) for bootstrap in range(num_resamples))
-    bootstrap_statistics = np.array(bootstrap_statistics, dtype=statistic.dtype)
+    bootstrap_statistics = Parallel(n_jobs=n_jobs)(
+        delayed(_permuted_statistic)(
+            matrix,
+            statistic_function,
+            permutation,
+        ) for permutation in permutations
+    )
+    bootstrap_statistics = np.array(
+        bootstrap_statistics,
+        dtype=statistic.dtype,
+    )
 
     extreme_results = bootstrap_statistics > statistic
-    p_value = (np.sum(extreme_results) + 1.0) / (num_resamples + 1)
+    pvalue = (np.sum(extreme_results) + 1.0) / (num_resamples + 1)
 
     return HypothesisTest(
-        p_value=p_value,
-        statistic=statistic
+        pvalue=pvalue,
+        statistic=statistic,
     )
diff --git a/dcor/homogeneity.py b/dcor/homogeneity.py
@@ -8,10 +8,9 @@
 
 import numpy as _np
 
-from . import _energy, _hypothesis
-from . import distances as _distances
-from ._utils import _transform_to_2d
+from . import _energy, _hypothesis, distances as _distances
 from ._energy import EstimationStatistic
+from ._utils import _transform_to_2d
 
 
 def _energy_test_statistic_coefficient(n, m):
@@ -132,7 +131,7 @@ def _energy_test_statistic_multivariate_from_distance_matrix(
                 distance_xx=distance_xx, distance_yy=distance_yy,
                 distance_xy=distance_xy, n=n, m=m, average=average,
                 estimation_stat=estimation_stat
-                )
+            )
 
             energy += pairwise_energy
 
@@ -202,23 +201,23 @@ def energy_test(
     ...               [0, 1000, 1000, 1000],
     ...               [1000, 1000, 1000, 1000]])
     >>> dcor.homogeneity.energy_test(a, a)
-    HypothesisTest(p_value=1.0, statistic=0.0)
+    HypothesisTest(pvalue=1.0, statistic=0.0)
     >>> dcor.homogeneity.energy_test(a, b) # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=35.2766732...)
+    HypothesisTest(pvalue=1.0, statistic=35.2766732...)
     >>> dcor.homogeneity.energy_test(b, b)
-    HypothesisTest(p_value=1.0, statistic=0.0)
+    HypothesisTest(pvalue=1.0, statistic=0.0)
     >>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=0)
-    HypothesisTest(p_value=0.1666666..., statistic=35.2766732...)
+    HypothesisTest(pvalue=0.1666666..., statistic=35.2766732...)
     >>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=6)
-    HypothesisTest(p_value=0.3333333..., statistic=35.2766732...)
+    HypothesisTest(pvalue=0.3333333..., statistic=35.2766732...)
     >>> dcor.homogeneity.energy_test(a, c, num_resamples=7, random_state=0)
-    HypothesisTest(p_value=0.125, statistic=4233.8935035...)
+    HypothesisTest(pvalue=0.125, statistic=4233.8935035...)
 
     A different exponent for the Euclidean distance in the range
     :math:`(0, 2)` can be used:
 
     >>> dcor.homogeneity.energy_test(a, b, exponent=1.5) # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=171.0623923...)
+    HypothesisTest(pvalue=1.0, statistic=171.0623923...)
 
     """
 
@@ -254,7 +253,7 @@ def statistic_function(distance_matrix):
             sizes=sample_sizes,
             average=average,
             estimation_stat=estimation_stat
-            )
+        )
 
     return _hypothesis._permutation_test_with_sym_matrix(
         sample_distances,

diff --git a/dcor/independence.py b/dcor/independence.py
@@ -69,21 +69,21 @@ def distance_covariance_test(
     ...               [1, 1, 1, 1],
     ...               [1, 1, 0, 1]])
     >>> dcor.independence.distance_covariance_test(a, a)
-    HypothesisTest(p_value=1.0, statistic=208.0)
+    HypothesisTest(pvalue=1.0, statistic=208.0)
     >>> dcor.independence.distance_covariance_test(a, b)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=11.75323056...)
+    HypothesisTest(pvalue=1.0, statistic=11.75323056...)
     >>> dcor.independence.distance_covariance_test(b, b)
-    HypothesisTest(p_value=1.0, statistic=1.3604610...)
+    HypothesisTest(pvalue=1.0, statistic=1.3604610...)
     >>> dcor.independence.distance_covariance_test(a, b,
     ... num_resamples=5, random_state=0)
-    HypothesisTest(p_value=0.5, statistic=11.7532305...)
+    HypothesisTest(pvalue=0.5, statistic=11.7532305...)
     >>> dcor.independence.distance_covariance_test(a, b,
     ... num_resamples=5, random_state=13)
-    HypothesisTest(p_value=0.3333333..., statistic=11.7532305...)
+    HypothesisTest(pvalue=0.3333333..., statistic=11.7532305...)
     >>> dcor.independence.distance_covariance_test(a, a,
     ... num_resamples=7, random_state=0)
-    HypothesisTest(p_value=0.125, statistic=208.0)
+    HypothesisTest(pvalue=0.125, statistic=208.0)
 
     """
     x = _transform_to_2d(x)
@@ -179,22 +179,22 @@ def partial_distance_covariance_test(
     ...               [1000, 1000, 0, 1000]])
     >>> dcor.independence.partial_distance_covariance_test(a, a, b)
     ...                                       # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=142.6664416...)
+    HypothesisTest(pvalue=1.0, statistic=142.6664416...)
     >>> dcor.independence.partial_distance_covariance_test(a, b, c)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=7.2690070...e-15)
+    HypothesisTest(pvalue=1.0, statistic=7.2690070...e-15)
     >>> dcor.independence.partial_distance_covariance_test(b, b, c)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=1.0, statistic=2.2533380...e-30)
+    HypothesisTest(pvalue=1.0, statistic=2.2533380...e-30)
     >>> dcor.independence.partial_distance_covariance_test(a, b, c,
     ... num_resamples=5, random_state=0)
-    HypothesisTest(p_value=0.1666666..., statistic=7.2690070...e-15)
+    HypothesisTest(pvalue=0.1666666..., statistic=7.2690070...e-15)
     >>> dcor.independence.partial_distance_covariance_test(a, b, c,
     ... num_resamples=5, random_state=13)
-    HypothesisTest(p_value=0.1666666..., statistic=7.2690070...e-15)
+    HypothesisTest(pvalue=0.1666666..., statistic=7.2690070...e-15)
     >>> dcor.independence.partial_distance_covariance_test(a, c, b,
     ... num_resamples=7, random_state=0)
-    HypothesisTest(p_value=1.0, statistic=-7.5701764...e-12)
+    HypothesisTest(pvalue=1.0, statistic=-7.5701764...e-12)
 
     """
     random_state = _random_state_init(random_state)
@@ -316,14 +316,14 @@ def distance_correlation_t_test(x, y):
     >>> with np.errstate(divide='ignore'):
     ...     dcor.independence.distance_correlation_t_test(a, a)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=0.0, statistic=inf)
+    HypothesisTest(pvalue=0.0, statistic=inf)
     >>> dcor.independence.distance_correlation_t_test(a, b)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=0.6327451..., statistic=-0.4430164...)
+    HypothesisTest(pvalue=0.6327451..., statistic=-0.4430164...)
     >>> with np.errstate(divide='ignore'):
     ...     dcor.independence.distance_correlation_t_test(b, b)
     ...                                      # doctest: +ELLIPSIS
-    HypothesisTest(p_value=0.0, statistic=inf)
+    HypothesisTest(pvalue=0.0, statistic=inf)
 
     """
     t_test = distance_correlation_t_statistic(x, y)
@@ -334,4 +334,4 @@ def distance_correlation_t_test(x, y):
 
     p_value = 1 - scipy.stats.t.cdf(t_test, df=df)
 
-    return _hypothesis.HypothesisTest(p_value=p_value, statistic=t_test)
+    return _hypothesis.HypothesisTest(pvalue=p_value, statistic=t_test)
diff --git a/dcor/tests/test_homogeneity.py b/dcor/tests/test_homogeneity.py
@@ -37,7 +37,7 @@ def test_same_distribution_same_parameters(self):
         result = dcor.homogeneity.energy_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertGreater(result.p_value, significance)
+        self.assertGreater(result.pvalue, significance)
 
     def test_same_distribution_different_means(self):
         """
@@ -66,7 +66,7 @@ def test_same_distribution_different_means(self):
         result = dcor.homogeneity.energy_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
 
     def test_same_distribution_different_covariances(self):
         """
@@ -95,7 +95,7 @@ def test_same_distribution_different_covariances(self):
         result = dcor.homogeneity.energy_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
 
     def test_different_distributions(self):
         """
@@ -118,7 +118,7 @@ def test_different_distributions(self):
         result = dcor.homogeneity.energy_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
 
     def test_different_means_median(self):
         """
@@ -158,7 +158,7 @@ def test_different_means_median(self):
         )
 
         # Check that we detected the heterogeneity
-        self.assertLess(median_result.p_value, significance)
+        self.assertLess(median_result.pvalue, significance)
 
     def test_different_distributions_median(self):
         """
@@ -182,4 +182,4 @@ def test_different_distributions_median(self):
             random_state=random_state
         )
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
diff --git a/dcor/tests/test_independence.py b/dcor/tests/test_independence.py
@@ -37,7 +37,7 @@ def test_independent_variables(self):
         result = dcor.independence.distance_covariance_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertGreater(result.p_value, significance)
+        self.assertGreater(result.pvalue, significance)
 
     def test_same_variable(self):
         """
@@ -65,7 +65,7 @@ def test_same_variable(self):
         result = dcor.independence.distance_covariance_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
 
     def test_function_variable(self):
         """
@@ -94,7 +94,7 @@ def test_function_variable(self):
         result = dcor.independence.distance_covariance_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
 
     def test_dependent_variables(self):
         """
@@ -124,4 +124,4 @@ def test_dependent_variables(self):
         result = dcor.independence.distance_covariance_test(
             a, b, num_resamples=num_resamples, random_state=random_state)
 
-        self.assertLess(result.p_value, significance)
+        self.assertLess(result.pvalue, significance)
diff --git a/setup.py b/setup.py
@@ -78,8 +78,7 @@
         'Natural Language :: English',
         'Operating System :: OS Independent',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
@@ -95,6 +94,7 @@
         'numpy',
         'numba>=0.51',
         'scipy',
+        'joblib',
     ],
     setup_requires=pytest_runner,
     tests_require=[