Skip to content

Commit

Permalink
PERF: improve get_loc on unsorted, non-unique indexes
Browse files: browse the repository at this point in the history
  • Loading branch information
toobaz committed Feb 5, 2018
1 parent d5a7e7c commit cf36911
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 47 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Expand Up @@ -465,6 +465,7 @@ Performance Improvements
- :func:`Series` / :func:`DataFrame` tab completion is now limited to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
- Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`)
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)

.. _whatsnew_0230.docs:
Expand Down
35 changes: 10 additions & 25 deletions pandas/_libs/index.pyx
Expand Up @@ -183,32 +183,20 @@ cdef class IndexEngine:

cdef _maybe_get_bool_indexer(self, object val):
cdef:
ndarray[uint8_t] indexer
ndarray[object] values
int count = 0
Py_ssize_t i, n
int last_true
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
ndarray[int64_t, ndim=1] found
int count

values = np.array(self._get_index_values(), copy=False)
n = len(values)

result = np.empty(n, dtype=bool)
indexer = result.view(np.uint8)
indexer = self._get_index_values() == val
found = np.where(indexer)[0]
count = len(found)

for i in range(n):
if values[i] == val:
count += 1
indexer[i] = 1
last_true = i
else:
indexer[i] = 0

if count == 0:
raise KeyError(val)
if count > 1:
return indexer
if count == 1:
return last_true
return found[0]

return result
raise KeyError(val)

def sizeof(self, deep=False):
""" return the sizeof our mapping """
Expand Down Expand Up @@ -542,9 +530,6 @@ cdef class PeriodEngine(Int64Engine):

return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values().view('i8')


cpdef convert_scalar(ndarray arr, object value):
# we don't turn integers
Expand Down
33 changes: 11 additions & 22 deletions pandas/_libs/index_class_helper.pxi.in
Expand Up @@ -55,40 +55,29 @@ cdef class {{name}}Engine(IndexEngine):

cdef _maybe_get_bool_indexer(self, object val):
cdef:
ndarray[uint8_t, cast=True] indexer
ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
ndarray[int64_t, ndim=1] found
ndarray[{{ctype}}] values
int count = 0
Py_ssize_t i, n
int last_true

{{if name != 'Float64'}}
if not util.is_integer_object(val):
raise KeyError(val)
{{endif}}

values = self._get_index_values_for_bool_indexer()
n = len(values)
# A view is needed for some subclasses, such as PeriodEngine:
values = self._get_index_values().view('{{dtype}}')
indexer = values == val
found = np.where(indexer)[0]
count = len(found)

result = np.empty(n, dtype=bool)
indexer = result.view(np.uint8)

for i in range(n):
if values[i] == val:
count += 1
indexer[i] = 1
last_true = i
else:
indexer[i] = 0

if count == 0:
raise KeyError(val)
if count > 1:
return indexer
if count == 1:
return last_true
return found[0]

return result
raise KeyError(val)

cdef _get_index_values_for_bool_indexer(self):
return self._get_index_values()
{{endif}}

{{endfor}}

0 comments on commit cf36911

Please sign in to comment.