Added better scaling of kernels functions, more tests

tommyod · Apr 1, 2018 · 51bcf3e · 51bcf3e
1 parent 5976c3f
commit 51bcf3e
Show file tree

Hide file tree

Showing 6 changed files with 657 additions and 75 deletions.
diff --git a/KDEpy/kde.py b/KDEpy/kde.py
@@ -5,6 +5,7 @@
 
 @author: tommy
 """
+import pytest
 import numbers
 import numpy as np
 from KDEpy.kernel_funcs import _kernel_functions
@@ -94,7 +95,8 @@ def evaluate_naive(self, grid_points, weights=None):
 
         return evaluated 
 
-    def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
+    def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data,
+                     tolerance):
         """
         Evaluate the sorted weights.
         
@@ -103,18 +105,40 @@ def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
         Runs in O(log(`data`)) + O(`datapoints close to grid_point`).
         """
 
-        # The relationship between the desired bandwidth and the original
-        # bandwidth of the kernel function
-        bw_scale = self.bw / abs(self.kernel.right_bw - self.kernel.left_bw)
+        # ---------------------------------------------------------------------
+        # -------- Compute the support to the left and right of the kernel ----
+        # ---------------------------------------------------------------------
 
-        # Compute the bandwidth to the left and to the right of the center
-        left_bw = self.kernel.left_bw * bw_scale
-        right_bw = self.kernel.right_bw * bw_scale
-
-        j = np.searchsorted(data_sorted, grid_point + right_bw, side='right')
+        # If the kernel has finite support, find the support by scaling the 
+        # variance. This is done when a call is made later on.
+        if self.kernel.finite_support:
+            left_support = self.kernel.support[0] 
+            right_support = self.kernel.support[1] 
+        else:
+            # Compute the support up to a tolerance
+            # This code assumes the kernel is symmetric about 0
+            # TODO: Extend this code for non-symmetric kernels
+
+            # Scale relative tolerance to the height of the kernel at 0
+            tolerance = self.kernel(0, bw=self.bw) * tolerance
+            # Sample the x values and the function to the left of 0
+            x_samples = np.linspace(-self.kernel.var * 10, 0, num=2**10)
+            sampled_func = self.kernel(x_samples, bw=self.bw) - tolerance
+            # Use binary search to find when the function equals the tolerance
+            i = np.searchsorted(sampled_func, 0, side='right') - 1
+            left_support, right_support = x_samples[i], abs(x_samples[i])
+            assert self.kernel(x_samples[i], bw=self.bw) <= tolerance
+
+        # ---------------------------------------------------------------------
+        # -------- Use binary search to only compute for points close by ------
+        # ---------------------------------------------------------------------
+
+        j = np.searchsorted(data_sorted, grid_point + right_support, 
+                            side='right')
         # i = np.searchsorted(data_sorted[:j], grid_point - left_bw, 
         # side='left')
-        i = np.searchsorted(data_sorted, grid_point - left_bw, side='left')
+        i = np.searchsorted(data_sorted, grid_point + left_support, 
+                            side='left')
         i = np.maximum(0, i)
         j = np.minimum(len_data, j)
 
@@ -128,7 +152,7 @@ def _eval_sorted(self, data_sorted, weights_sorted, grid_point, len_data):
         weighted_estimates = np.dot(kernel_estimates, weights_subset)
         return np.sum(weighted_estimates)
 
-    def evaluate_sorted(self, grid_points, weights=None):
+    def evaluate_sorted(self, grid_points, weights=None, tolerance=10e-6):
         """
         Evaluated by sorting and using binary search.
         
@@ -150,7 +174,7 @@ def evaluate_sorted(self, grid_points, weights=None):
 
         for i, grid_point in enumerate(grid_points):
             evaluated[i] = self._eval_sorted(data_sorted, weights_sorted, 
-                                             grid_point, len_data)
+                                             grid_point, len_data, tolerance)
 
         # Normalize, but do not divide by zero
         return evaluated
@@ -170,11 +194,18 @@ def main():
 
     x = np.linspace(-5, 5)
     data = np.random.random(10)
-    KDE(bw='silverman').fit(data).evaluate_naive(x)
+    y = KDE('box', bw=1).fit(data).evaluate_sorted(x)
 
+    print(y)
 
+
 if __name__ == '__main__':
     main()
 
+
+if __name__ == "__main__":
+    # --durations=10  <- May be used to show potentially slow tests
+    pytest.main(args=['.', '--doctest-modules', '-v'])
+
 
 
diff --git a/KDEpy/kernel_funcs.py b/KDEpy/kernel_funcs.py
@@ -93,8 +93,13 @@ def __init__(self, function, var=1, support=(-3, 3)):
         """
         self.function = function
         self.var = var
-        self.support = support
-        self.finite_support = np.all(np.isfinite(np.array(self.support)))
+        self.finite_support = np.all(np.isfinite(np.array(support)))
+
+        # If the function has finite support, scale the support so that it
+        # corresponds to the support of the function when it is scaled to have
+        # unit variance.
+        self.support = tuple(supp / np.sqrt(self.var) for supp in support)
+
         assert self.support[0] < self.support[1]
 
     def evaluate(self, x, bw=1):
@@ -108,12 +113,12 @@ def evaluate(self, x, bw=1):
         else:
             x = np.asarray_chkfinite(x)
 
-        # Scale the function
+        # Scale the function, such that bw=1 corresponds to the function having
+        # a standard deviation (or variance) equal to 1
         real_bw = bw / np.sqrt(self.var)
         return self.function(x / real_bw) / real_bw
 
-    def __call__(self, *args, **kwargs):
-        return self.evaluate(*args, **kwargs)
+    __call__ = evaluate
 
 
 gaussian = Kernel(gaussian, var=1, support=(-np.inf, np.inf))
@@ -142,4 +147,6 @@ def __call__(self, *args, **kwargs):
 if __name__ == "__main__":
     import pytest
     # --durations=10  <- May be used to show potentially slow tests
-    pytest.main(args=['.', '--doctest-modules', '-v'])
+    pytest.main(args=['.', '--doctest-modules', '-v'])
+
+    print(box.support)
diff --git a/KDEpy/tests/test_kde.py b/KDEpy/tests/test_kde.py
@@ -5,8 +5,11 @@
 """
 import numpy as np
 from KDEpy import KDE
+import itertools
 import pytest
 
+kernel_funcs = list(KDE._available_kernels.values())
+
 
 class TestNaiveKDE():
 
@@ -41,6 +44,7 @@ class TestNaiveKDE():
     def test_against_R_density(self, kernel, bw, n, expected_result):
         """
         Test against the following function call in R:
+            
             d <- density(c(0, 0.1, 1), kernel="{kernel}", bw={bw}, 
             n={n}, from=-1, to=1);
             d$y
@@ -50,7 +54,55 @@ def test_against_R_density(self, kernel, bw, n, expected_result):
         y = KDE(kernel, bw=bw).fit(data).evaluate_naive(x)
         assert np.allclose(y, expected_result, atol=10**(-2.7))
 
-
+    @pytest.mark.parametrize("bw, n, expected_result", 
+                             [(1, 3, np.array([0.17127129, 
+                                               0.34595518, 
+                                               0.30233275])),
+                              (0.1, 5, np.array([2.56493684e-22, 
+                                                 4.97598466e-06, 
+                                                 2.13637668e+00, 
+                                                 4.56012216e-04,
+                                                 1.32980760e+00])),
+                              (0.01, 3, np.array([0., 
+                                                  13.29807601, 
+                                                  13.29807601]))])
+    def test_against_scipy_density(self, bw, n, expected_result):
+        """
+        Test against the following function call in SciPy:
+            
+            data = np.array([0, 0.1, 1])
+            x = np.linspace(-1, 1, {n})
+            bw = {bw}/np.asarray(data).std(ddof=1)
+            density_estimate = gaussian_kde(dataset = data, bw_method = bw)
+            y = density_estimate.evaluate(x)
+            
+        # Note that scipy weights its bandwidth by the covariance of the
+        # input data.  To make the results comparable to the other methods,
+        # we divide the bandwidth by the sample standard deviation here.
+        """
+        data = np.array([0, 0.1, 1])
+        x = np.linspace(-1, 1, num=n)
+        y = KDE(kernel='gaussian', bw=bw).fit(data).evaluate_naive(x)
+        assert np.allclose(y, expected_result)
+
+    @pytest.mark.parametrize("data, bw, n, kernel",
+                             list(itertools.product([np.array([0, 0.1, 1]), 
+                                                     np.array([0, 10, 20])],
+                                                    [0.1, 0.5, 1], 
+                                                    [1, 5, 10],
+                                                    kernel_funcs)))
+    def test_naive_vs_sorted_eval(self, data, bw, n, kernel):
+        """
+        Test the naive algorithm versus the sorted evaluation.
+        """
+        x = np.linspace(np.min(data), np.max(data), num=n)
+        kde = KDE(kernel=kernel, bw=bw).fit(data)
+        assert np.allclose(kde.evaluate_naive(x), 
+                           kde.evaluate_sorted(x, tolerance=10e-3), 
+                           atol=10e-2, rtol=0)
+
+
 if __name__ == "__main__":
     # --durations=10  <- May be used to show potentially slow tests
-    pytest.main(args=['.', '--doctest-modules', '-v'])
+    pytest.main(args=['.', '--doctest-modules', '-v'])
+
diff --git a/KDEpy/tests/test_kernel_funcs.py b/KDEpy/tests/test_kernel_funcs.py
@@ -20,12 +20,9 @@ def test_integral_unity(self, fname, function):
         """
 
         if function.finite_support:
-            # When bw=1, the function is scaled by the standard deviation
-            # so that std(f) = 1. Divide by standard deviation to get
-            # the integration limits.
-            a, b = tuple(s / np.sqrt(function.var) for s in function.support)
+            a, b = function.support
         else:
-            a, b = -30, 30
+            a, b = -5 * function.var, 5 * function.var
         integral, abserr = quad(function, a=a, b=b)
         assert np.isclose(integral, 1)
 
@@ -38,10 +35,9 @@ def test_monotonic_decreasing(self, fname, function):
         """
 
         if function.finite_support:
-            a, b = tuple(s / np.sqrt(function.var) for s in function.support)
-            x = np.linspace(a, b)
+            x = np.linspace(*function.support)
         else:
-            x = np.linspace(-50, 50)
+            x = np.linspace(-5 * function.var, 5 * function.var)
         y = function(x)
         diffs_left = np.diff(y[x <= 0])
         diffs_right = np.diff(y[x >= 0])
@@ -56,15 +52,13 @@ def test_non_negative(self, fname, function):
         """
 
         if function.finite_support:
-            a, b = tuple(s / np.sqrt(function.var) for s in function.support)
-            x = np.linspace(a, b)
+            x = np.linspace(*function.support)
         else:
-            x = np.linspace(-20, 20)
+            x = np.linspace(-5 * function.var, 5 * function.var)
         y = function(x)
         assert np.all(y >= 0)
 
 
 if __name__ == "__main__":
-    import pytest
     # --durations=10  <- May be used to show potentially slow tests
     pytest.main(args=['.', '--doctest-modules', '-v'])
diff --git a/sandbox/Existing implementations.ipynb b/sandbox/Existing implementations.ipynb
@@ -359,7 +359,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.4"
+   "version": "3.6.5"
   }
  },
  "nbformat": 4,

diff --git a/sandbox/Testing.ipynb b/sandbox/Testing.ipynb