Skip to content

Commit

Permalink
BUG: implement new engine for codes-based MultiIndex indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
toobaz committed Jan 4, 2018
1 parent fc7afb7 commit 085a1fe
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 10 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,9 @@ Indexing
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in Python 3 (:issue:`18434`)
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing NaN (:issue:`18485`)
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex` which would fail when levels had different dtypes (:issue:`18520`)
- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`)
- Bug in :func:`Index.drop` when passing a list of both tuples and non-tuples (:issue:`18304`)
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
Expand Down
114 changes: 104 additions & 10 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,101 @@
target_klass='MultiIndex or list of tuples'))


class BaseMultiIndexCodesEngine(object):
    """
    Mixin implementing MultiIndex lookups on top of an integer-keyed engine.

    Every combination of per-level codes is packed into a single integer by
    ``self._codes_to_ints`` and the actual lookup is delegated to
    ``self._base``. Neither ``_base`` nor ``_codes_to_ints`` is defined
    here: concrete subclasses must provide both.
    """

    def __init__(self, levels, labels, offsets, **kwargs):
        """
        Parameters
        ----------
        levels : sequence
            One index-like object per level; each is used through its
            ``get_loc`` and ``get_indexer`` methods.
        labels : sequence of integer sequences
            Per-level codes of the indexed entries (one sequence per level).
        offsets : array-like
            Stored as ``self._offsets`` for use by ``_codes_to_ints``.
        **kwargs
            Forwarded unchanged to ``self._base.__init__``.
        """
        self._levels = levels
        self._offsets = offsets

        # Map each combination to an integer. Codes are shifted by one so
        # that they can be stored as unsigned integers (a code of -1
        # becomes 0, the same value get_loc uses for missing values).
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index
        self._base.__init__(self, lambda: lab_ints, len(lab_ints), **kwargs)

    def get_indexer(self, target, method=None, limit=None):
        """
        Compute the positions of the keys in ``target``.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up; each tuple holds one value per level.
        method : str, optional
            When given, the lookup is dispatched to the base engine method
            named ``get_<method>_indexer``; it is also passed on to the
            ``get_indexer`` call of every level.
        limit : optional
            Passed to the base engine when ``method`` is given.

        Returns
        -------
        The indexer produced by the base engine, in the order of ``target``.
        """
        level_codes = [self._levels[lev].get_indexer(codes, method=method) + 1
                       for lev, codes in enumerate(zip(*target))]

        keys_int = self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

        if method is None:
            return self._base.get_indexer(self, keys_int)

        # keys must be sorted - the engine already is
        order = np.argsort(keys_int)
        sup_meth = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = sup_meth(self, keys_int[order], limit=limit)

        # Undo the sort: entry i of the sorted result answers the key that
        # was at position order[i] of target, so it has to be scattered
        # back there (indexing with ``order`` again would apply the sort
        # permutation twice instead of inverting it).
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, key):
        """
        Locate a single, complete key.

        Parameters
        ----------
        key : tuple
            One value per level; values for which ``isna`` is true are
            mapped to code 0.

        Returns
        -------
        Whatever ``self._base.get_loc`` returns for the packed key.

        Raises
        ------
        TypeError
            If ``libindex.is_definitely_invalid_key`` rejects ``key``.
        KeyError
            If ``key`` is not a tuple, or one of its values is not found in
            the corresponding level.
        """
        if libindex.is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            idces = [0 if isna(val) else self._levels[lev].get_loc(val) + 1
                     for lev, val in enumerate(key)]
        except KeyError:
            # Report the full key rather than the single level value
            raise KeyError(key)
        # _codes_to_ints works on 2-dimensional input: wrap the key as a
        # single row and unwrap the single result
        idces = np.array(idces, ndmin=2, dtype='uint64')

        key_int = self._codes_to_ints(idces)[0]

        return self._base.get_loc(self, key_int)

    def get_indexer_non_unique(self, target):
        """
        Variant of ``get_indexer`` delegating to the base engine's
        ``get_indexer_non_unique``.

        Parameters
        ----------
        target : iterable of tuples
            Keys to look up; each tuple holds one value per level.

        Returns
        -------
        Whatever ``self._base.get_indexer_non_unique`` returns for the
        packed keys.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        level_codes = [self._levels[lev].get_indexer(codes) + 1
                       for lev, codes in enumerate(zip(*target))]
        codes = np.array(level_codes, dtype='uint64').T
        keys_int = self._codes_to_ints(codes)

        indexer = self._base.get_indexer_non_unique(self, keys_int)

        return indexer

    def __contains__(self, val):
        """
        Return True if ``get_loc(val)`` succeeds, False if it raises
        KeyError, TypeError or ValueError.
        """
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False


class MultiIndexUIntEngine(BaseMultiIndexCodesEngine, libindex.UInt64Engine):
    """
    MultiIndex engine backed by ``libindex.UInt64Engine``: every combination
    of labels is represented by one positive integer.
    """
    _base = libindex.UInt64Engine

    def _codes_to_ints(self, codes):
        """
        Pack each row of ``codes`` into a single integer.

        ``codes`` is shifted in place by ``self._offsets`` (so the array
        passed in is modified) and its columns are then OR-ed together
        along axis 1.
        """
        # Shift every column by its offset, writing back into the input:
        np.left_shift(codes, self._offsets, out=codes)
        # After the shift, OR-ing the columns is equivalent to summing them:
        packed = np.bitwise_or.reduce(codes, axis=1)
        return packed


class MultiIndexPyIntEngine(BaseMultiIndexCodesEngine, libindex.ObjectEngine):
    """
    Fallback engine for the (extreme) cases where the number of possible
    label combinations does not fit in a 64 bit integer: combinations are
    stored as Python integers in a ``libindex.ObjectEngine``.
    """
    _base = libindex.ObjectEngine

    def _codes_to_ints(self, codes):
        """
        Pack each row of ``codes`` into a single integer.

        The codes are first converted to an object array, so the input is
        left untouched; the columns are then shifted by ``self._offsets``
        and OR-ed together along axis 1.
        """
        # Work on an object-dtype copy before shifting:
        as_objects = codes.astype('object')
        shifted = np.left_shift(as_objects, self._offsets)
        # After the shift, OR-ing the columns is equivalent to summing them:
        return np.bitwise_or.reduce(shifted, axis=1)


class MultiIndex(Index):
"""
A multi-level, or hierarchical, index object for pandas objects
Expand Down Expand Up @@ -691,16 +786,15 @@ def _get_level_number(self, level):

@cache_readonly
def _engine(self):
    """
    Build the codes-based engine used for lookups on this MultiIndex.

    Returns
    -------
    MultiIndexUIntEngine or MultiIndexPyIntEngine
        The Python-integer engine is used only when representing all the
        levels takes more than 64 bits.
    """
    # Find powers of 2 which dominate level sizes - including -1 for NaN.
    # The sizes are accumulated starting from the last level, so that
    # lev_bits[i] is the number of bits taken by levels i, i + 1, ...
    level_sizes = [len(level) + 1 for level in self.levels[::-1]]
    lev_bits = np.cumsum(np.ceil(np.log2(level_sizes)))[::-1]

    # Each level is shifted by the bits taken by the levels that follow
    # it; the last level is not shifted.
    offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint')

    if lev_bits[0] > 64:
        # The levels would overflow a 64 bit integer - use Python integers:
        return MultiIndexPyIntEngine(self.levels, self.labels, offsets)
    return MultiIndexUIntEngine(self.levels, self.labels, offsets)

@property
def values(self):
Expand Down Expand Up @@ -1889,7 +1983,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
if tolerance is not None:
raise NotImplementedError("tolerance not implemented yet "
'for MultiIndex')
indexer = self._get_fill_indexer(target, method, limit)
indexer = self._engine.get_indexer(target, method, limit)
elif method == 'nearest':
raise NotImplementedError("method='nearest' not implemented yet "
'for MultiIndex; see GitHub issue 9365')
Expand Down

0 comments on commit 085a1fe

Please sign in to comment.