PERF: improve get_loc on unsorted, non-unique indexes

closes pandas-dev#19478
toobaz · Feb 5, 2018 · cf36911
1 parent d5a7e7c
commit cf36911
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 47 deletions.
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -465,6 +465,7 @@ Performance Improvements
 - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
 - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
 - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
+- Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`)
 - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
 
 .. _whatsnew_0230.docs:

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -183,32 +183,20 @@ cdef class IndexEngine:
 
     cdef _maybe_get_bool_indexer(self, object val):
         cdef:
-            ndarray[uint8_t] indexer
-            ndarray[object] values
-            int count = 0
-            Py_ssize_t i, n
-            int last_true
+            ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
+            ndarray[int64_t, ndim=1] found
+            int count
 
-        values = np.array(self._get_index_values(), copy=False)
-        n = len(values)
-
-        result = np.empty(n, dtype=bool)
-        indexer = result.view(np.uint8)
+        indexer = self._get_index_values() == val
+        found = np.where(indexer)[0]
+        count = len(found)
 
-        for i in range(n):
-            if values[i] == val:
-                count += 1
-                indexer[i] = 1
-                last_true = i
-            else:
-                indexer[i] = 0
-
-        if count == 0:
-            raise KeyError(val)
+        if count > 1:
+            return indexer
         if count == 1:
-            return last_true
+            return found[0]
 
-        return result
+        raise KeyError(val)
 
     def sizeof(self, deep=False):
         """ return the sizeof our mapping """
@@ -542,9 +530,6 @@ cdef class PeriodEngine(Int64Engine):
 
         return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
 
-    cdef _get_index_values_for_bool_indexer(self):
-        return self._get_index_values().view('i8')
-
 
 cpdef convert_scalar(ndarray arr, object value):
     # we don't turn integers

diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in
@@ -55,40 +55,29 @@ cdef class {{name}}Engine(IndexEngine):
 
     cdef _maybe_get_bool_indexer(self, object val):
         cdef:
-            ndarray[uint8_t, cast=True] indexer
+            ndarray[cnp.uint8_t, ndim=1, cast=True] indexer
+            ndarray[int64_t, ndim=1] found
             ndarray[{{ctype}}] values
             int count = 0
-            Py_ssize_t i, n
-            int last_true
 
         {{if name != 'Float64'}}
         if not util.is_integer_object(val):
             raise KeyError(val)
         {{endif}}
 
-        values = self._get_index_values_for_bool_indexer()
-        n = len(values)
+        # A view is needed for some subclasses, such as PeriodEngine:
+        values = self._get_index_values().view('{{dtype}}')
+        indexer = values == val
+        found = np.where(indexer)[0]
+        count = len(found)
 
-        result = np.empty(n, dtype=bool)
-        indexer = result.view(np.uint8)
-
-        for i in range(n):
-            if values[i] == val:
-                count += 1
-                indexer[i] = 1
-                last_true = i
-            else:
-                indexer[i] = 0
-
-        if count == 0:
-            raise KeyError(val)
+        if count > 1:
+            return indexer
         if count == 1:
-            return last_true
+            return found[0]
 
-        return result
+        raise KeyError(val)
 
-    cdef _get_index_values_for_bool_indexer(self):
-        return self._get_index_values()
     {{endif}}
 
 {{endfor}}