Merge pull request #52 from dougiesquire/BUG_density_with_nans

Fix bug with density calculation when NaNs are present
xgcm · Jun 15, 2021 · cfefa63 · cfefa63
2 parents 8a6765a + 9769f83
commit cfefa63
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 13 deletions.
diff --git a/doc/contributing.rst b/doc/contributing.rst
@@ -140,8 +140,12 @@ Preparing Pull Requests
 Release History
 ---------------
 
+
 v0.2.1 (not yet released)
 ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Fixed bug with density calculation when NaNs are present :issue:`51`.
+  By `Dougie Squire <https://github.com/dougiesquire>`_.
 - Implemented various options for users for providing bins to
   xhistogram that mimic the numpy histogram API. This included
   adding a range argument to the xhistogram API :issue:`13`.

diff --git a/xhistogram/core.py b/xhistogram/core.py
@@ -436,7 +436,13 @@ def histogram(
             # Slower, but N-dimensional logic
             bin_areas = np.prod(np.ix_(*bin_widths))
 
-        h = bin_counts / bin_areas / bin_counts.sum()
+        # Sum over the last n_inputs axes, which correspond to the bins. All other axes
+        # are "bystander" axes. Sums must be done independently for each bystander axes
+        # so that nans are dealt with correctly (#51)
+        bin_axes = tuple(_range(-n_inputs, 0))
+        bin_count_sums = bin_counts.sum(axis=bin_axes)
+        bin_count_sums_shape = bin_count_sums.shape + len(bin_axes) * (1,)
+        h = bin_counts / bin_areas / reshape(bin_count_sums, bin_count_sums_shape)
     else:
         h = bin_counts
 

diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py
@@ -24,12 +24,17 @@
 @pytest.mark.parametrize("axis", [1, None])
 @pytest.mark.parametrize("bins", [10, np.linspace(-4, 4, 10), "auto"])
 @pytest.mark.parametrize("range_", [None, (-4, 4)])
-def test_histogram_results_1d(block_size, density, axis, bins, range_):
+@pytest.mark.parametrize("add_nans", [False, True])
+def test_histogram_results_1d(block_size, density, axis, bins, range_, add_nans):
     nrows, ncols = 5, 20
     # Setting the random seed here prevents np.testing.assert_allclose
     # from failing beow. We should investigate this further.
     np.random.seed(2)
     data = np.random.randn(nrows, ncols)
+    if add_nans:
+        N_nans = 20
+        data.ravel()[np.random.choice(data.size, N_nans, replace=False)] = np.nan
+    bins = np.linspace(-4, 4, 10)
 
     h, bin_edges = histogram(
         data, bins=bins, range=range_, axis=axis, block_size=block_size, density=density
@@ -53,12 +58,11 @@ def test_histogram_results_1d(block_size, density, axis, bins, range_):
         )
     else:
         expected = np.histogram(data, bins=bins, range=range_, density=density)[0]
-    norm = nrows if (density and axis) else 1
-    np.testing.assert_allclose(h, expected / norm)
+    np.testing.assert_allclose(h, expected)
 
     if density:
-        widths = np.diff(bin_edges)
-        integral = np.sum(h * widths)
+        widths = np.diff(bins)
+        integral = np.sum(h * widths, axis)
         np.testing.assert_allclose(integral, 1.0)
 
 
@@ -150,10 +154,15 @@ def test_histogram_results_2d_broadcasting(dask):
     np.testing.assert_array_equal(hist, h)
 
 
-def test_histogram_results_2d_density():
+@pytest.mark.parametrize("add_nans", [False, True])
+def test_histogram_results_2d_density(add_nans):
     nrows, ncols = 5, 20
     data_a = np.random.randn(nrows, ncols)
     data_b = np.random.randn(nrows, ncols)
+    if add_nans:
+        N_nans = 20
+        data_a.ravel()[np.random.choice(data_a.size, N_nans, replace=False)] = np.nan
+        data_b.ravel()[np.random.choice(data_b.size, N_nans, replace=False)] = np.nan
     nbins_a = 9
     bins_a = np.linspace(-4, 4, nbins_a + 1)
     nbins_b = 10
@@ -175,11 +184,17 @@ def test_histogram_results_2d_density():
     np.testing.assert_allclose(integral, 1.0)
 
 
-def test_histogram_results_3d_density():
+@pytest.mark.parametrize("add_nans", [False, True])
+def test_histogram_results_3d_density(add_nans):
     nrows, ncols = 5, 20
     data_a = np.random.randn(nrows, ncols)
     data_b = np.random.randn(nrows, ncols)
     data_c = np.random.randn(nrows, ncols)
+    if add_nans:
+        N_nans = 20
+        data_a.ravel()[np.random.choice(data_a.size, N_nans, replace=False)] = np.nan
+        data_b.ravel()[np.random.choice(data_b.size, N_nans, replace=False)] = np.nan
+        data_c.ravel()[np.random.choice(data_c.size, N_nans, replace=False)] = np.nan
     nbins_a = 9
     bins_a = np.linspace(-4, 4, nbins_a + 1)
     nbins_b = 10

diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py
@@ -197,11 +197,6 @@ def histogram(
 
     da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords, name=output_name)
 
-    if density:
-        # correct for overcounting the bins which weren't histogrammed along
-        n_bins_bystander_dims = da_out.isel(**{bd: 0 for bd in new_dims}).size
-        da_out = da_out * n_bins_bystander_dims
-
     return da_out
 
     # we need weights to be passed through apply_func's alignment algorithm,