Skip to content

Commit

Permalink
Merge pull request #27 from TMiguelT/u-statistics
Browse files Browse the repository at this point in the history
Add u-statistic argument, with test
  • Loading branch information
vnmabus committed Mar 3, 2021
2 parents a5f12a7 + cabc8e8 commit bb12b34
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 10 deletions.
2 changes: 1 addition & 1 deletion dcor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
mean_product, u_product,
u_projection,
u_complementary_projection)
from ._energy import energy_distance # noqa
from ._energy import energy_distance, EstimationStatistic # noqa
from ._partial_dcor import (partial_distance_covariance, # noqa
partial_distance_correlation)
from ._rowwise import rowwise, RowwiseMode
Expand Down
64 changes: 61 additions & 3 deletions dcor/_energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,45 @@

import numpy as np

from enum import Enum, auto

from . import distances
from ._utils import _transform_to_2d


class EstimationStatistic(Enum):
"""
A type of estimation statistic used for calculating energy distance.
"""
@classmethod
def from_string(cls, item):
"""
Allows EstimationStatistic.from_string('u'),
EstimationStatistic.from_string('V'),
EstimationStatistic.from_string('V_STATISTIC'),
EstimationStatistic.from_string('u_statistic') etc
"""
upper = item.upper()
if upper == 'U':
return cls.U_STATISTIC
elif upper == 'V':
return cls.V_STATISTIC
else:
return cls[upper]

U_STATISTIC = auto()
"""
Hoeffding's unbiased U-statistics
(does not include the distance from each point to itself)
"""

V_STATISTIC = auto()
"""
von Mises's biased V-statistics
(does include the distance from each point to itself)
"""


def _check_valid_energy_exponent(exponent):
if not 0 < exponent < 2:
warning_msg = ('The energy distance is not guaranteed to be '
Expand All @@ -19,7 +54,8 @@ def _check_valid_energy_exponent(exponent):


def _energy_distance_from_distance_matrices(
distance_xx, distance_yy, distance_xy, average=None):
distance_xx, distance_yy, distance_xy, average=None,
estimation_stat=EstimationStatistic.V_STATISTIC):
"""
Compute energy distance with precalculated distance matrices.
Expand All @@ -28,19 +64,34 @@ def _energy_distance_from_distance_matrices(
average: Callable[[ArrayLike], float]
A function that will be used to calculate an average of distances.
This defaults to np.mean.
estimation_stat: Union[str, EstimationStatistic]
If EstimationStatistic.U_STATISTIC, calculate energy distance using
Hoeffding's unbiased U-statistics. Otherwise, use von Mises's biased
V-statistics.
If this is provided as a string, it will first be converted to
an EstimationStatistic enum instance.
"""
if isinstance(estimation_stat, str):
estimation_stat = EstimationStatistic.from_string(estimation_stat)

if average is None:
average = np.mean

if estimation_stat == EstimationStatistic.U_STATISTIC:
# If using u-statistics, we exclude the central diagonal of 0s for the
# within-sample distances
distance_xx = distance_xx[np.triu_indices_from(distance_xx, k=1)]
distance_yy = distance_yy[np.triu_indices_from(distance_yy, k=1)]

return (
2 * average(distance_xy) -
average(distance_xx) -
average(distance_yy)
)


def energy_distance(x, y, *, average=None, exponent=1):
def energy_distance(x, y, *, average=None, exponent=1,
estimation_stat=EstimationStatistic.V_STATISTIC):
"""
Estimator for energy distance.
Expand All @@ -61,6 +112,12 @@ def energy_distance(x, y, *, average=None, exponent=1):
average: Callable[[ArrayLike], float]
A function that will be used to calculate an average of distances.
This defaults to np.mean.
estimation_stat: Union[str, EstimationStatistic]
If EstimationStatistic.U_STATISTIC, calculate energy distance using
Hoeffding's unbiased U-statistics. Otherwise, use von Mises's biased
V-statistics.
If this is provided as a string, it will first be converted to
an EstimationStatistic enum instance.
Returns
-------
Expand Down Expand Up @@ -111,4 +168,5 @@ def energy_distance(x, y, *, average=None, exponent=1):
distance_yy=distance_yy,
distance_xy=distance_xy,
average=average,
estimation_stat=estimation_stat
)
40 changes: 34 additions & 6 deletions dcor/homogeneity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from . import _energy, _hypothesis
from . import distances as _distances
from ._utils import _transform_to_2d
from ._energy import EstimationStatistic


def _energy_test_statistic_coefficient(n, m):
Expand All @@ -19,17 +20,26 @@ def _energy_test_statistic_coefficient(n, m):


def _energy_test_statistic_from_distance_matrices(
distance_xx, distance_yy, distance_xy, n, m, average=None):
distance_xx, distance_yy, distance_xy, n, m, average=None,
estimation_stat=_energy.EstimationStatistic.V_STATISTIC):
"""Test statistic with precomputed distance matrices."""
energy_distance = _energy._energy_distance_from_distance_matrices(
distance_xx=distance_xx, distance_yy=distance_yy,
distance_xy=distance_xy, average=average
distance_xy=distance_xy, average=average,
estimation_stat=estimation_stat
)

return _energy_test_statistic_coefficient(n, m) * energy_distance


def energy_test_statistic(x, y, *, exponent=1, average=None):
def energy_test_statistic(
x,
y,
*,
exponent=1,
average=None,
estimation_stat=EstimationStatistic.V_STATISTIC
):
"""
Homogeneity statistic.
Expand All @@ -49,6 +59,12 @@ def energy_test_statistic(x, y, *, exponent=1, average=None):
average: Callable[[ArrayLike], float]
A function that will be used to calculate an average of distances.
This defaults to np.mean.
estimation_stat: Union[str, EstimationStatistic]
If EstimationStatistic.U_STATISTIC, calculate energy distance using
Hoeffding's unbiased U-statistics. Otherwise, use von Mises's biased
V-statistics.
If this is provided as a string, it will first be converted to
an EstimationStatistic enum instance.
Returns
-------
Expand Down Expand Up @@ -91,11 +107,12 @@ def energy_test_statistic(x, y, *, exponent=1, average=None):
y,
exponent=exponent,
average=average,
estimation_stat=estimation_stat
)


def _energy_test_statistic_multivariate_from_distance_matrix(
distance, indexes, sizes, average=None):
distance, indexes, sizes, average=None, estimation_stat='v'):
"""Statistic for several random vectors given the distance matrix."""
energy = 0.0

Expand All @@ -113,7 +130,9 @@ def _energy_test_statistic_multivariate_from_distance_matrix(

pairwise_energy = _energy_test_statistic_from_distance_matrices(
distance_xx=distance_xx, distance_yy=distance_yy,
distance_xy=distance_xy, n=n, m=m, average=average)
distance_xy=distance_xy, n=n, m=m, average=average,
estimation_stat=estimation_stat
)

energy += pairwise_energy

Expand All @@ -126,6 +145,7 @@ def energy_test(
exponent=1,
random_state=None,
average=None,
estimation_stat=_energy.EstimationStatistic.V_STATISTIC
):
"""
Test of homogeneity based on the energy distance.
Expand All @@ -150,6 +170,12 @@ def energy_test(
average: Callable[[ArrayLike], float]
A function that will be used to calculate an average of distances.
This defaults to np.mean.
estimation_stat: Union[str, EstimationStatistic]
If EstimationStatistic.U_STATISTIC, calculate energy distance using
Hoeffding's unbiased U-statistics. Otherwise, use von Mises's biased
V-statistics.
If this is provided as a string, it will first be converted to
an EstimationStatistic enum instance.
Returns
-------
Expand Down Expand Up @@ -225,7 +251,9 @@ def statistic_function(distance_matrix):
distance=distance_matrix,
indexes=sample_indexes,
sizes=sample_sizes,
average=average)
average=average,
estimation_stat=estimation_stat
)

return _hypothesis._permutation_test_with_sym_matrix(
sample_distances,
Expand Down
65 changes: 65 additions & 0 deletions dcor/tests/test_energy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
from dcor._energy import _energy_distance_from_distance_matrices
from dcor import EstimationStatistic
import unittest


class TestEnergyTest(unittest.TestCase):
"""Tests for energy statistics"""

def test_u_v_statistics(self):
"""
Test that we are correctly applying the provided type of estimation
statistics
Also tests that we can choose a statistic type using either a string
or an enum instance
"""
distance_within = np.array([
[0, 1, 1],
[1, 0, 1],
[1, 1, 0],
])
distance_between = np.array([
[1, 1, 1],
[1, 1, 1],
[1, 1, 1],
])

u_dist = _energy_distance_from_distance_matrices(
distance_xx=distance_within,
distance_yy=distance_within,
distance_xy=distance_between,
estimation_stat=EstimationStatistic.U_STATISTIC
)
u_dist_2 = _energy_distance_from_distance_matrices(
distance_xx=distance_within,
distance_yy=distance_within,
distance_xy=distance_between,
estimation_stat='U_STATISTIC'
)
u_dist_3 = _energy_distance_from_distance_matrices(
distance_xx=distance_within,
distance_yy=distance_within,
distance_xy=distance_between,
estimation_stat='u_statistic'
)
v_dist = _energy_distance_from_distance_matrices(
distance_xx=distance_within,
distance_yy=distance_within,
distance_xy=distance_between,
estimation_stat='v'
)

# The first 3 invocations should be identical
self.assertEqual(u_dist, u_dist_2)
self.assertEqual(u_dist, u_dist_3)

# V-statistics count the 0 terms on the diagonal, so the impact of the
# within-sample distance will be smaller, making the overall distance
# larger.
self.assertGreater(v_dist, u_dist)

# Also test for exact values
self.assertEqual(u_dist, 0)
self.assertAlmostEqual(v_dist, 2/3)

0 comments on commit bb12b34

Please sign in to comment.