Skip to content

Commit

Permalink
Merge pull request #52 from dougiesquire/BUG_density_with_nans
Browse files Browse the repository at this point in the history
Fix bug with density calculation when NaNs are present
  • Loading branch information
TomNicholas committed Jun 15, 2021
2 parents 8a6765a + 9769f83 commit cfefa63
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 13 deletions.
4 changes: 4 additions & 0 deletions doc/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ Preparing Pull Requests
Release History
---------------


v0.2.1 (not yet released)
~~~~~~~~~~~~~~~~~~~~~~~~~

- Fixed bug with density calculation when NaNs are present :issue:`51`.
By `Dougie Squire <https://github.com/dougiesquire>`_.
- Implemented various options for users for providing bins to
xhistogram that mimic the numpy histogram API. This included
adding a range argument to the xhistogram API :issue:`13`.
Expand Down
8 changes: 7 additions & 1 deletion xhistogram/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,13 @@ def histogram(
# Slower, but N-dimensional logic
bin_areas = np.prod(np.ix_(*bin_widths))

h = bin_counts / bin_areas / bin_counts.sum()
# Sum over the last n_inputs axes, which correspond to the bins. All other axes
# are "bystander" axes. Sums must be done independently for each bystander axes
# so that nans are dealt with correctly (#51)
bin_axes = tuple(_range(-n_inputs, 0))
bin_count_sums = bin_counts.sum(axis=bin_axes)
bin_count_sums_shape = bin_count_sums.shape + len(bin_axes) * (1,)
h = bin_counts / bin_areas / reshape(bin_count_sums, bin_count_sums_shape)
else:
h = bin_counts

Expand Down
29 changes: 22 additions & 7 deletions xhistogram/test/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,17 @@
@pytest.mark.parametrize("axis", [1, None])
@pytest.mark.parametrize("bins", [10, np.linspace(-4, 4, 10), "auto"])
@pytest.mark.parametrize("range_", [None, (-4, 4)])
def test_histogram_results_1d(block_size, density, axis, bins, range_):
@pytest.mark.parametrize("add_nans", [False, True])
def test_histogram_results_1d(block_size, density, axis, bins, range_, add_nans):
nrows, ncols = 5, 20
# Setting the random seed here prevents np.testing.assert_allclose
# from failing beow. We should investigate this further.
np.random.seed(2)
data = np.random.randn(nrows, ncols)
if add_nans:
N_nans = 20
data.ravel()[np.random.choice(data.size, N_nans, replace=False)] = np.nan
bins = np.linspace(-4, 4, 10)

h, bin_edges = histogram(
data, bins=bins, range=range_, axis=axis, block_size=block_size, density=density
Expand All @@ -53,12 +58,11 @@ def test_histogram_results_1d(block_size, density, axis, bins, range_):
)
else:
expected = np.histogram(data, bins=bins, range=range_, density=density)[0]
norm = nrows if (density and axis) else 1
np.testing.assert_allclose(h, expected / norm)
np.testing.assert_allclose(h, expected)

if density:
widths = np.diff(bin_edges)
integral = np.sum(h * widths)
widths = np.diff(bins)
integral = np.sum(h * widths, axis)
np.testing.assert_allclose(integral, 1.0)


Expand Down Expand Up @@ -150,10 +154,15 @@ def test_histogram_results_2d_broadcasting(dask):
np.testing.assert_array_equal(hist, h)


def test_histogram_results_2d_density():
@pytest.mark.parametrize("add_nans", [False, True])
def test_histogram_results_2d_density(add_nans):
nrows, ncols = 5, 20
data_a = np.random.randn(nrows, ncols)
data_b = np.random.randn(nrows, ncols)
if add_nans:
N_nans = 20
data_a.ravel()[np.random.choice(data_a.size, N_nans, replace=False)] = np.nan
data_b.ravel()[np.random.choice(data_b.size, N_nans, replace=False)] = np.nan
nbins_a = 9
bins_a = np.linspace(-4, 4, nbins_a + 1)
nbins_b = 10
Expand All @@ -175,11 +184,17 @@ def test_histogram_results_2d_density():
np.testing.assert_allclose(integral, 1.0)


def test_histogram_results_3d_density():
@pytest.mark.parametrize("add_nans", [False, True])
def test_histogram_results_3d_density(add_nans):
nrows, ncols = 5, 20
data_a = np.random.randn(nrows, ncols)
data_b = np.random.randn(nrows, ncols)
data_c = np.random.randn(nrows, ncols)
if add_nans:
N_nans = 20
data_a.ravel()[np.random.choice(data_a.size, N_nans, replace=False)] = np.nan
data_b.ravel()[np.random.choice(data_b.size, N_nans, replace=False)] = np.nan
data_c.ravel()[np.random.choice(data_c.size, N_nans, replace=False)] = np.nan
nbins_a = 9
bins_a = np.linspace(-4, 4, nbins_a + 1)
nbins_b = 10
Expand Down
5 changes: 0 additions & 5 deletions xhistogram/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,6 @@ def histogram(

da_out = xr.DataArray(h_data, dims=output_dims, coords=all_coords, name=output_name)

if density:
# correct for overcounting the bins which weren't histogrammed along
n_bins_bystander_dims = da_out.isel(**{bd: 0 for bd in new_dims}).size
da_out = da_out * n_bins_bystander_dims

return da_out

# we need weights to be passed through apply_func's alignment algorithm,
Expand Down

0 comments on commit cfefa63

Please sign in to comment.