Skip to content

Commit

Permalink
Added better scaling of kernels functions, more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
tommyod committed Apr 1, 2018
1 parent 5976c3f commit 51bcf3e
Show file tree
Hide file tree
Showing 6 changed files with 657 additions and 75 deletions.
57 changes: 44 additions & 13 deletions KDEpy/kde.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@author: tommy
"""
import pytest
import numbers
import numpy as np
from KDEpy.kernel_funcs import _kernel_functions
Expand Down Expand Up @@ -94,7 +95,8 @@ def evaluate_naive(self, grid_points, weights=None):

return evaluated

def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data,
tolerance):
"""
Evaluate the sorted weights.
Expand All @@ -103,18 +105,40 @@ def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
Runs in O(log(`data`)) + O(`datapoints close to grid_point`).
"""

# The relationship between the desired bandwidth and the original
# bandwidth of the kernel function
bw_scale = self.bw / abs(self.kernel.right_bw - self.kernel.left_bw)
# ---------------------------------------------------------------------
# -------- Compute the support to the left and right of the kernel ----
# ---------------------------------------------------------------------

# Compute the bandwidth to the left and to the right of the center
left_bw = self.kernel.left_bw * bw_scale
right_bw = self.kernel.right_bw * bw_scale

j = np.searchsorted(data_sorted, grid_point + right_bw, side='right')
# If the kernel has finite support, find the support by scaling the
# variance. This is done when a call is made later on.
if self.kernel.finite_support:
left_support = self.kernel.support[0]
right_support = self.kernel.support[1]
else:
# Compute the support up to a tolerance
# This code assumes the kernel is symmetric about 0
# TODO: Extend this code for non-symmetric kernels

# Scale relative tolerance to the height of the kernel at 0
tolerance = self.kernel(0, bw=self.bw) * tolerance
# Sample the x values and the function to the left of 0
x_samples = np.linspace(-self.kernel.var * 10, 0, num=2**10)
sampled_func = self.kernel(x_samples, bw=self.bw) - tolerance
# Use binary search to find when the function equals the tolerance
i = np.searchsorted(sampled_func, 0, side='right') - 1
left_support, right_support = x_samples[i], abs(x_samples[i])
assert self.kernel(x_samples[i], bw=self.bw) <= tolerance

# ---------------------------------------------------------------------
# -------- Use binary search to only compute for points close by ------
# ---------------------------------------------------------------------

j = np.searchsorted(data_sorted, grid_point + right_support,
side='right')
# i = np.searchsorted(data_sorted[:j], grid_point - left_bw,
# side='left')
i = np.searchsorted(data_sorted, grid_point - left_bw, side='left')
i = np.searchsorted(data_sorted, grid_point + left_support,
side='left')
i = np.maximum(0, i)
j = np.minimum(len_data, j)

Expand All @@ -128,7 +152,7 @@ def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
weighted_estimates = np.dot(kernel_estimates, weights_subset)
return np.sum(weighted_estimates)

def evaluate_sorted(self, grid_points, weights=None):
def evaluate_sorted(self, grid_points, weights=None, tolerance=10e-6):
"""
Evaluated by sorting and using binary search.
Expand All @@ -150,7 +174,7 @@ def evaluate_sorted(self, grid_points, weights=None):

for i, grid_point in enumerate(grid_points):
evaluated[i] = self._eval_sorted(data_sorted, weights_sorted,
grid_point, len_data)
grid_point, len_data, tolerance)

# Normalize, but do not divide by zero
return evaluated
Expand All @@ -170,11 +194,18 @@ def main():

x = np.linspace(-5, 5)
data = np.random.random(10)
KDE(bw='silverman').fit(data).evaluate_naive(x)
y = KDE('box', bw=1).fit(data).evaluate_sorted(x)

print(y)


if __name__ == '__main__':
main()


if __name__ == "__main__":
# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=['.', '--doctest-modules', '-v'])



19 changes: 13 additions & 6 deletions KDEpy/kernel_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,13 @@ def __init__(self, function, var=1, support=(-3, 3)):
"""
self.function = function
self.var = var
self.support = support
self.finite_support = np.all(np.isfinite(np.array(self.support)))
self.finite_support = np.all(np.isfinite(np.array(support)))

# If the function has finite support, scale the support so that it
# corresponds to the support of the function when it is scaled to have
# unit variance.
self.support = tuple(supp / np.sqrt(self.var) for supp in support)

assert self.support[0] < self.support[1]

def evaluate(self, x, bw=1):
Expand All @@ -108,12 +113,12 @@ def evaluate(self, x, bw=1):
else:
x = np.asarray_chkfinite(x)

# Scale the function
# Scale the function, such that bw=1 corresponds to the function having
# a standard deviation (or variance) equal to 1
real_bw = bw / np.sqrt(self.var)
return self.function(x / real_bw) / real_bw

def __call__(self, *args, **kwargs):
return self.evaluate(*args, **kwargs)
__call__ = evaluate


gaussian = Kernel(gaussian, var=1, support=(-np.inf, np.inf))
Expand Down Expand Up @@ -142,4 +147,6 @@ def __call__(self, *args, **kwargs):
if __name__ == "__main__":
import pytest
# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=['.', '--doctest-modules', '-v'])
pytest.main(args=['.', '--doctest-modules', '-v'])

print(box.support)
56 changes: 54 additions & 2 deletions KDEpy/tests/test_kde.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
"""
import numpy as np
from KDEpy import KDE
import itertools
import pytest

kernel_funcs = list(KDE._available_kernels.values())


class TestNaiveKDE():

Expand Down Expand Up @@ -41,6 +44,7 @@ class TestNaiveKDE():
def test_against_R_density(self, kernel, bw, n, expected_result):
"""
Test against the following function call in R:
d <- density(c(0, 0.1, 1), kernel="{kernel}", bw={bw},
n={n}, from=-1, to=1);
d$y
Expand All @@ -50,7 +54,55 @@ def test_against_R_density(self, kernel, bw, n, expected_result):
y = KDE(kernel, bw=bw).fit(data).evaluate_naive(x)
assert np.allclose(y, expected_result, atol=10**(-2.7))


@pytest.mark.parametrize("bw, n, expected_result",
[(1, 3, np.array([0.17127129,
0.34595518,
0.30233275])),
(0.1, 5, np.array([2.56493684e-22,
4.97598466e-06,
2.13637668e+00,
4.56012216e-04,
1.32980760e+00])),
(0.01, 3, np.array([0.,
13.29807601,
13.29807601]))])
def test_against_scipy_density(self, bw, n, expected_result):
"""
Test against the following function call in SciPy:
data = np.array([0, 0.1, 1])
x = np.linspace(-1, 1, {n})
bw = {bw}/np.asarray(data).std(ddof=1)
density_estimate = gaussian_kde(dataset = data, bw_method = bw)
y = density_estimate.evaluate(x)
# Note that scipy weights its bandwidth by the covariance of the
# input data. To make the results comparable to the other methods,
# we divide the bandwidth by the sample standard deviation here.
"""
data = np.array([0, 0.1, 1])
x = np.linspace(-1, 1, num=n)
y = KDE(kernel='gaussian', bw=bw).fit(data).evaluate_naive(x)
assert np.allclose(y, expected_result)

@pytest.mark.parametrize("data, bw, n, kernel",
list(itertools.product([np.array([0, 0.1, 1]),
np.array([0, 10, 20])],
[0.1, 0.5, 1],
[1, 5, 10],
kernel_funcs)))
def test_naive_vs_sorted_eval(self, data, bw, n, kernel):
"""
Test the naive algorithm versus the sorted evaluation.
"""
x = np.linspace(np.min(data), np.max(data), num=n)
kde = KDE(kernel=kernel, bw=bw).fit(data)
assert np.allclose(kde.evaluate_naive(x),
kde.evaluate_sorted(x, tolerance=10e-3),
atol=10e-2, rtol=0)


if __name__ == "__main__":
# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=['.', '--doctest-modules', '-v'])
pytest.main(args=['.', '--doctest-modules', '-v'])

18 changes: 6 additions & 12 deletions KDEpy/tests/test_kernel_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,9 @@ def test_integral_unity(self, fname, function):
"""

if function.finite_support:
# When bw=1, the function is scaled by the standard deviation
# so that std(f) = 1. Divide by standard deviation to get
# the integration limits.
a, b = tuple(s / np.sqrt(function.var) for s in function.support)
a, b = function.support
else:
a, b = -30, 30
a, b = -5 * function.var, 5 * function.var
integral, abserr = quad(function, a=a, b=b)
assert np.isclose(integral, 1)

Expand All @@ -38,10 +35,9 @@ def test_monotonic_decreasing(self, fname, function):
"""

if function.finite_support:
a, b = tuple(s / np.sqrt(function.var) for s in function.support)
x = np.linspace(a, b)
x = np.linspace(*function.support)
else:
x = np.linspace(-50, 50)
x = np.linspace(-5 * function.var, 5 * function.var)
y = function(x)
diffs_left = np.diff(y[x <= 0])
diffs_right = np.diff(y[x >= 0])
Expand All @@ -56,15 +52,13 @@ def test_non_negative(self, fname, function):
"""

if function.finite_support:
a, b = tuple(s / np.sqrt(function.var) for s in function.support)
x = np.linspace(a, b)
x = np.linspace(*function.support)
else:
x = np.linspace(-20, 20)
x = np.linspace(-5 * function.var, 5 * function.var)
y = function(x)
assert np.all(y >= 0)


if __name__ == "__main__":
import pytest
# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=['.', '--doctest-modules', '-v'])
2 changes: 1 addition & 1 deletion sandbox/Existing implementations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down
580 changes: 539 additions & 41 deletions sandbox/Testing.ipynb

Large diffs are not rendered by default.

0 comments on commit 51bcf3e

Please sign in to comment.