Skip to content

Commit

Permalink
Fix hypothesis module.
Browse files Browse the repository at this point in the history
  • Loading branch information
vnmabus committed May 17, 2022
1 parent 841e08d commit 42819e2
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 71 deletions.
115 changes: 84 additions & 31 deletions dcor/_hypothesis.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,107 @@
import collections
from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Any, Callable, Iterator

import numpy as np
from dcor._utils import ArrayType
from joblib import Parallel, delayed

from ._utils import _random_state_init

HypothesisTest = collections.namedtuple('HypothesisTest', ['p_value',
'statistic'])

@dataclass
class HypothesisTest:
    """
    Result of a hypothesis test.

    Attributes:
        pvalue: p-value of the test.
        statistic: Value of the test statistic.

    Reading ``p_value``, iterating over the object and calling ``len()`` on
    it are kept for backwards compatibility with the former named tuple
    (fields ``p_value``, ``statistic``); each emits a
    ``DeprecationWarning``.
    """

    pvalue: float
    statistic: ArrayType

    @property
    def p_value(self) -> float:
        """Old name for pvalue."""
        warnings.warn(
            "Attribute \"p_value\" deprecated, use \"pvalue\" instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.pvalue

    def __iter__(self) -> Iterator[Any]:
        """Iterate over ``(pvalue, statistic)``, as the old tuple did."""
        warnings.warn(
            "HypothesisTest will cease to be iterable.",
            DeprecationWarning,
            # Attribute the warning to the code that iterates, not to
            # this method, consistently with ``p_value``.
            stacklevel=2,
        )
        return iter((self.pvalue, self.statistic))

    def __len__(self) -> int:
        """Return the number of fields of the old tuple (always 2)."""
        warnings.warn(
            "HypothesisTest will cease to be sized.",
            DeprecationWarning,
            # Same reasoning as in ``__iter__``.
            stacklevel=2,
        )
        return 2


def _permuted_statistic(
matrix: ArrayType,
statistic_function: Callable[[ArrayType], ArrayType],
permutation: np.typing.NDArray[int],
) -> ArrayType:

permuted_matrix = matrix[np.ix_(permutation, permutation)]

return statistic_function(permuted_matrix)

def _permutation_test_with_sym_matrix(
    matrix: ArrayType,
    *,
    statistic_function: Callable[[ArrayType], ArrayType],
    num_resamples: int,
    random_state: np.random.RandomState | np.random.Generator | int | None,
    n_jobs: int | None = None,
) -> HypothesisTest:
    """
    Execute a permutation test in a symmetric matrix.

    Parameters:
        matrix: Matrix on which the permutation test is performed.
        statistic_function: Function that computes the desired statistic from
            the matrix.
        num_resamples: Number of permutation resamples to take in the
            permutation test.
        random_state: Random state to generate the permutations.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    """
    matrix = np.asarray(matrix)
    random_state = _random_state_init(random_state)

    # Statistic of the original (unpermuted) matrix.
    statistic = statistic_function(matrix)

    # Every task receives an already-drawn permutation, so the random state
    # itself is never handed to the parallel workers.
    permutations = (
        random_state.permutation(matrix.shape[0])
        for _ in range(num_resamples)
    )

    bootstrap_statistics = Parallel(n_jobs=n_jobs)(
        delayed(_permuted_statistic)(
            matrix,
            statistic_function,
            permutation,
        ) for permutation in permutations
    )
    bootstrap_statistics = np.array(
        bootstrap_statistics,
        dtype=statistic.dtype,
    )

    # Resamples whose statistic is strictly greater than the observed one.
    extreme_results = bootstrap_statistics > statistic
    # The added 1 in numerator and denominator keeps the estimated p-value
    # strictly positive, even when no resample is more extreme.
    pvalue = (np.sum(extreme_results) + 1.0) / (num_resamples + 1)

    return HypothesisTest(
        pvalue=pvalue,
        statistic=statistic,
    )
23 changes: 11 additions & 12 deletions dcor/homogeneity.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@

import numpy as _np

from . import _energy, _hypothesis
from . import distances as _distances
from ._utils import _transform_to_2d
from . import _energy, _hypothesis, distances as _distances
from ._energy import EstimationStatistic
from ._utils import _transform_to_2d


def _energy_test_statistic_coefficient(n, m):
Expand Down Expand Up @@ -132,7 +131,7 @@ def _energy_test_statistic_multivariate_from_distance_matrix(
distance_xx=distance_xx, distance_yy=distance_yy,
distance_xy=distance_xy, n=n, m=m, average=average,
estimation_stat=estimation_stat
)
)

energy += pairwise_energy

Expand Down Expand Up @@ -202,23 +201,23 @@ def energy_test(
... [0, 1000, 1000, 1000],
... [1000, 1000, 1000, 1000]])
>>> dcor.homogeneity.energy_test(a, a)
HypothesisTest(p_value=1.0, statistic=0.0)
HypothesisTest(pvalue=1.0, statistic=0.0)
>>> dcor.homogeneity.energy_test(a, b) # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=35.2766732...)
HypothesisTest(pvalue=1.0, statistic=35.2766732...)
>>> dcor.homogeneity.energy_test(b, b)
HypothesisTest(p_value=1.0, statistic=0.0)
HypothesisTest(pvalue=1.0, statistic=0.0)
>>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=0)
HypothesisTest(p_value=0.1666666..., statistic=35.2766732...)
HypothesisTest(pvalue=0.1666666..., statistic=35.2766732...)
>>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=6)
HypothesisTest(p_value=0.3333333..., statistic=35.2766732...)
HypothesisTest(pvalue=0.3333333..., statistic=35.2766732...)
>>> dcor.homogeneity.energy_test(a, c, num_resamples=7, random_state=0)
HypothesisTest(p_value=0.125, statistic=4233.8935035...)
HypothesisTest(pvalue=0.125, statistic=4233.8935035...)
A different exponent for the Euclidean distance in the range
:math:`(0, 2)` can be used:
>>> dcor.homogeneity.energy_test(a, b, exponent=1.5) # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=171.0623923...)
HypothesisTest(pvalue=1.0, statistic=171.0623923...)
"""

Expand Down Expand Up @@ -254,7 +253,7 @@ def statistic_function(distance_matrix):
sizes=sample_sizes,
average=average,
estimation_stat=estimation_stat
)
)

return _hypothesis._permutation_test_with_sym_matrix(
sample_distances,
Expand Down
32 changes: 16 additions & 16 deletions dcor/independence.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,21 @@ def distance_covariance_test(
... [1, 1, 1, 1],
... [1, 1, 0, 1]])
>>> dcor.independence.distance_covariance_test(a, a)
HypothesisTest(p_value=1.0, statistic=208.0)
HypothesisTest(pvalue=1.0, statistic=208.0)
>>> dcor.independence.distance_covariance_test(a, b)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=11.75323056...)
HypothesisTest(pvalue=1.0, statistic=11.75323056...)
>>> dcor.independence.distance_covariance_test(b, b)
HypothesisTest(p_value=1.0, statistic=1.3604610...)
HypothesisTest(pvalue=1.0, statistic=1.3604610...)
>>> dcor.independence.distance_covariance_test(a, b,
... num_resamples=5, random_state=0)
HypothesisTest(p_value=0.5, statistic=11.7532305...)
HypothesisTest(pvalue=0.5, statistic=11.7532305...)
>>> dcor.independence.distance_covariance_test(a, b,
... num_resamples=5, random_state=13)
HypothesisTest(p_value=0.3333333..., statistic=11.7532305...)
HypothesisTest(pvalue=0.3333333..., statistic=11.7532305...)
>>> dcor.independence.distance_covariance_test(a, a,
... num_resamples=7, random_state=0)
HypothesisTest(p_value=0.125, statistic=208.0)
HypothesisTest(pvalue=0.125, statistic=208.0)
"""
x = _transform_to_2d(x)
Expand Down Expand Up @@ -179,22 +179,22 @@ def partial_distance_covariance_test(
... [1000, 1000, 0, 1000]])
>>> dcor.independence.partial_distance_covariance_test(a, a, b)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=142.6664416...)
HypothesisTest(pvalue=1.0, statistic=142.6664416...)
>>> dcor.independence.partial_distance_covariance_test(a, b, c)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=7.2690070...e-15)
HypothesisTest(pvalue=1.0, statistic=7.2690070...e-15)
>>> dcor.independence.partial_distance_covariance_test(b, b, c)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=1.0, statistic=2.2533380...e-30)
HypothesisTest(pvalue=1.0, statistic=2.2533380...e-30)
>>> dcor.independence.partial_distance_covariance_test(a, b, c,
... num_resamples=5, random_state=0)
HypothesisTest(p_value=0.1666666..., statistic=7.2690070...e-15)
HypothesisTest(pvalue=0.1666666..., statistic=7.2690070...e-15)
>>> dcor.independence.partial_distance_covariance_test(a, b, c,
... num_resamples=5, random_state=13)
HypothesisTest(p_value=0.1666666..., statistic=7.2690070...e-15)
HypothesisTest(pvalue=0.1666666..., statistic=7.2690070...e-15)
>>> dcor.independence.partial_distance_covariance_test(a, c, b,
... num_resamples=7, random_state=0)
HypothesisTest(p_value=1.0, statistic=-7.5701764...e-12)
HypothesisTest(pvalue=1.0, statistic=-7.5701764...e-12)
"""
random_state = _random_state_init(random_state)
Expand Down Expand Up @@ -316,14 +316,14 @@ def distance_correlation_t_test(x, y):
>>> with np.errstate(divide='ignore'):
... dcor.independence.distance_correlation_t_test(a, a)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=0.0, statistic=inf)
HypothesisTest(pvalue=0.0, statistic=inf)
>>> dcor.independence.distance_correlation_t_test(a, b)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=0.6327451..., statistic=-0.4430164...)
HypothesisTest(pvalue=0.6327451..., statistic=-0.4430164...)
>>> with np.errstate(divide='ignore'):
... dcor.independence.distance_correlation_t_test(b, b)
... # doctest: +ELLIPSIS
HypothesisTest(p_value=0.0, statistic=inf)
HypothesisTest(pvalue=0.0, statistic=inf)
"""
t_test = distance_correlation_t_statistic(x, y)
Expand All @@ -334,4 +334,4 @@ def distance_correlation_t_test(x, y):

p_value = 1 - scipy.stats.t.cdf(t_test, df=df)

return _hypothesis.HypothesisTest(p_value=p_value, statistic=t_test)
return _hypothesis.HypothesisTest(pvalue=p_value, statistic=t_test)
12 changes: 6 additions & 6 deletions dcor/tests/test_homogeneity.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_same_distribution_same_parameters(self):
result = dcor.homogeneity.energy_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertGreater(result.p_value, significance)
self.assertGreater(result.pvalue, significance)

def test_same_distribution_different_means(self):
"""
Expand Down Expand Up @@ -66,7 +66,7 @@ def test_same_distribution_different_means(self):
result = dcor.homogeneity.energy_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)

def test_same_distribution_different_covariances(self):
"""
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_same_distribution_different_covariances(self):
result = dcor.homogeneity.energy_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)

def test_different_distributions(self):
"""
Expand All @@ -118,7 +118,7 @@ def test_different_distributions(self):
result = dcor.homogeneity.energy_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)

def test_different_means_median(self):
"""
Expand Down Expand Up @@ -158,7 +158,7 @@ def test_different_means_median(self):
)

# Check that we detected the heterogeneity
self.assertLess(median_result.p_value, significance)
self.assertLess(median_result.pvalue, significance)

def test_different_distributions_median(self):
"""
Expand All @@ -182,4 +182,4 @@ def test_different_distributions_median(self):
random_state=random_state
)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)
8 changes: 4 additions & 4 deletions dcor/tests/test_independence.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_independent_variables(self):
result = dcor.independence.distance_covariance_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertGreater(result.p_value, significance)
self.assertGreater(result.pvalue, significance)

def test_same_variable(self):
"""
Expand Down Expand Up @@ -65,7 +65,7 @@ def test_same_variable(self):
result = dcor.independence.distance_covariance_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)

def test_function_variable(self):
"""
Expand Down Expand Up @@ -94,7 +94,7 @@ def test_function_variable(self):
result = dcor.independence.distance_covariance_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)

def test_dependent_variables(self):
"""
Expand Down Expand Up @@ -124,4 +124,4 @@ def test_dependent_variables(self):
result = dcor.independence.distance_covariance_test(
a, b, num_resamples=num_resamples, random_state=random_state)

self.assertLess(result.p_value, significance)
self.assertLess(result.pvalue, significance)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@
'Natural Language :: English',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Software Development :: Libraries :: Python Modules',
],
Expand All @@ -95,6 +94,7 @@
'numpy',
'numba>=0.51',
'scipy',
'joblib',
],
setup_requires=pytest_runner,
tests_require=[
Expand Down

0 comments on commit 42819e2

Please sign in to comment.