Skip to content

Commit

Permalink
Merge pull request #44 from dougiesquire/refactor_bin_alignment_with_np
Browse files Browse the repository at this point in the history
Refactor approach for making final bin right-edge inclusive
  • Loading branch information
dougiesquire committed Jun 22, 2021
2 parents 3e7e1c1 + 4fdd716 commit 5c12e17
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 19 deletions.
40 changes: 21 additions & 19 deletions xhistogram/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from functools import reduce
from collections.abc import Iterable
from numpy import (
digitize,
searchsorted,
bincount,
reshape,
ravel_multi_index,
Expand Down Expand Up @@ -154,24 +154,26 @@ def _bincount_2d_vectorized(
nbins = [len(b) for b in bins]
hist_shapes = [nb + 1 for nb in nbins]

# a marginally faster implementation would be to use searchsorted,
# like numpy histogram itself does
# https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882
# for now we stick with `digitize` because it's easy to understand how it works

# Add small increment to the last bin edge to make the final bin right-edge inclusive
# Note, this is the approach taken by sklearn, e.g.
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
# but a better approach would be to use something like _search_sorted_inclusive() in
# numpy histogram. This is an additional motivation for moving to searchsorted
bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]

# the maximum possible value of of digitize is nbins
# for right=False:
# The maximum possible value of searchsorted is nbins
# For _searchsorted_inclusive:
# - 0 corresponds to a < b[0]
# - i corresponds to bins[i-1] <= a < b[i]
# - nbins corresponds to a a >= b[1]
each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
# - i corresponds to b[i-1] <= a < b[i]
# - nbins-1 corresponds to b[-2] <= a <= b[-1]
# - nbins corresponds to a >= b[-1]
def _searchsorted_inclusive(a, b):
"""
Like `searchsorted`, but where the last bin is also right-edge inclusive.
"""
# Similar to implementation in np.histogramdd
# see https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L1056
# This assumes the bins (b) are sorted
bin_indices = searchsorted(b, a, side="right")
on_edge = a == b[-1]
# Shift these points one bin to the left.
bin_indices[on_edge] -= 1
return bin_indices

each_bin_indices = [_searchsorted_inclusive(a, b) for a, b in zip(args, bins)]
# product of the bins gives the joint distribution
if N_inputs > 1:
bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
Expand Down Expand Up @@ -327,7 +329,7 @@ def histogram(
See Also
--------
numpy.histogram, numpy.bincount, numpy.digitize
numpy.histogram, numpy.bincount, numpy.searchsorted
"""

a0 = args[0]
Expand Down
21 changes: 21 additions & 0 deletions xhistogram/test/test_core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd

from itertools import combinations
import dask.array as dsa
Expand Down Expand Up @@ -328,3 +329,23 @@ def test_ensure_correctly_formatted_range(in_out):
else:
with pytest.raises(ValueError):
_ensure_correctly_formatted_range(range_in, n)


@pytest.mark.parametrize("block_size", [None, 1, 2])
@pytest.mark.parametrize("use_dask", [False, True])
def test_histogram_results_datetime(use_dask, block_size):
"""Test computing histogram of datetime objects"""
data = pd.date_range(start="2000-06-01", periods=5)
if use_dask:
data = dsa.asarray(data, chunks=(5,))
# everything should be in the second bin (index 1)
bins = np.array(
[
np.datetime64("1999-01-01"),
np.datetime64("2000-01-01"),
np.datetime64("2001-01-01"),
]
)
h = histogram(data, bins=bins, block_size=block_size)[0]
expected = np.histogram(data, bins=bins)[0]
np.testing.assert_allclose(h, expected)

0 comments on commit 5c12e17

Please sign in to comment.