Skip to content

Commit

Permalink
BUG: MultiIndex not dropping nan level and invalid code value (pandas…
Browse files Browse the repository at this point in the history
  • Loading branch information
jiangyue12392 authored and vaibhavhrt committed Jun 6, 2019
1 parent a69d56f commit 210e2dc
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 12 deletions.
37 changes: 36 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
.. _whatsnew_0250.api_breaking.multi_indexing:


MultiIndex constructed from levels and codes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously.
Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes
would be reassigned as -1. (:issue:`19387`)

.. ipython:: python
mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
codes=[[0, -1, 1, 2, 3, 4]])
mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]])
*Previous Behavior*:

.. code-block:: ipython
In [1]: mi1
Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]],
codes=[[0, -1, 1, 2, 3, 4]])
In [2]: mi2
Out[2]: MultiIndex(levels=[[1, 2]],
codes=[[0, -2]])
*New Behavior*:

.. ipython:: python
mi1
mi2
.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:

GroupBy.apply on ``DataFrame`` evaluates first group only once
Expand Down Expand Up @@ -536,7 +572,6 @@ MultiIndex

- Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`)
-
-

I/O
^^^
Expand Down
62 changes: 52 additions & 10 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
result.sortorder = sortorder

if verify_integrity:
result._verify_integrity()
new_codes = result._verify_integrity()
result._codes = new_codes

if _set_identity:
result._reset_identity()

return result

def _validate_codes(self, level: list, code: list):
"""
Reassign code values as -1 if their corresponding levels are NaN.
Parameters
----------
code : list
Code to reassign.
level : list
Level to check for missing values (NaN, NaT, None).
Returns
-------
code : new code where code value = -1 if it corresponds
to a level with missing values (NaN, NaT, None).
"""
null_mask = isna(level)
if np.any(null_mask):
code = np.where(null_mask[code], -1, code)
return code

def _verify_integrity(self, codes=None, levels=None):
"""
Expand All @@ -263,6 +287,11 @@ def _verify_integrity(self, codes=None, levels=None):
ValueError
If length of levels and codes don't match, if the codes for any
level would exceed level bounds, or there are any duplicate levels.
Returns
-------
codes : new codes where code value = -1 if it corresponds to a
NaN level.
"""
# NOTE: Currently does not check, among other things, that cached
# nlevels matches nor that sortorder matches actually sortorder.
Expand All @@ -272,22 +301,33 @@ def _verify_integrity(self, codes=None, levels=None):
if len(levels) != len(codes):
raise ValueError("Length of levels and codes must match. NOTE:"
" this index is in an inconsistent state.")
codes_length = len(self.codes[0])
codes_length = len(codes[0])
for i, (level, level_codes) in enumerate(zip(levels, codes)):
if len(level_codes) != codes_length:
raise ValueError("Unequal code lengths: %s" %
([len(code_) for code_ in codes]))
if len(level_codes) and level_codes.max() >= len(level):
raise ValueError("On level %d, code max (%d) >= length of"
" level (%d). NOTE: this index is in an"
" inconsistent state" % (i, level_codes.max(),
len(level)))
msg = ("On level {level}, code max ({max_code}) >= length of "
"level ({level_len}). NOTE: this index is in an "
"inconsistent state".format(
level=i, max_code=level_codes.max(),
level_len=len(level)))
raise ValueError(msg)
if len(level_codes) and level_codes.min() < -1:
raise ValueError("On level {level}, code value ({code})"
" < -1".format(
level=i, code=level_codes.min()))
if not level.is_unique:
raise ValueError("Level values must be unique: {values} on "
"level {level}".format(
values=[value for value in level],
level=i))

codes = [self._validate_codes(level, code)
for level, code in zip(levels, codes)]
new_codes = FrozenList(codes)
return new_codes

@classmethod
def from_arrays(cls, arrays, sortorder=None, names=None):
"""
Expand Down Expand Up @@ -586,7 +626,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True,
new_levels = FrozenList(new_levels)

if verify_integrity:
self._verify_integrity(levels=new_levels)
new_codes = self._verify_integrity(levels=new_levels)
self._codes = new_codes

names = self.names
self._levels = new_levels
Expand Down Expand Up @@ -676,7 +717,6 @@ def labels(self):

def _set_codes(self, codes, level=None, copy=False, validate=True,
verify_integrity=False):

if validate and level is None and len(codes) != self.nlevels:
raise ValueError("Length of codes must match number of levels")
if validate and level is not None and len(codes) != len(level):
Expand All @@ -696,9 +736,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
new_codes = FrozenList(new_codes)

if verify_integrity:
self._verify_integrity(codes=new_codes)
new_codes = self._verify_integrity(codes=new_codes)

self._codes = new_codes

self._tuples = None
self._reset_cache()

Expand Down Expand Up @@ -1763,9 +1804,10 @@ def __setstate__(self, state):

self._set_levels([Index(x) for x in levels], validate=False)
self._set_codes(codes)
new_codes = self._verify_integrity()
self._set_codes(new_codes)
self._set_names(names)
self.sortorder = sortorder
self._verify_integrity()
self._reset_identity()

def __getitem__(self, key):
Expand Down
41 changes: 40 additions & 1 deletion pandas/tests/indexes/multi/test_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ def test_constructor_mismatched_codes_levels(idx):
with pytest.raises(ValueError, match=msg):
MultiIndex(levels=levels, codes=codes)

length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\."
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\."
" NOTE: this index is in an inconsistent state")
label_error = r"Unequal code lengths: \[4, 2\]"
code_value_error = r"On level 0, code value \(-2\) < -1"

# important to check that it's looking at the right thing.
with pytest.raises(ValueError, match=length_error):
Expand All @@ -82,6 +83,44 @@ def test_constructor_mismatched_codes_levels(idx):
with pytest.raises(ValueError, match=label_error):
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])

# test set_codes with verify_integrity=False
# the setting should not raise any value error
idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]],
verify_integrity=False)

# code value smaller than -1
with pytest.raises(ValueError, match=code_value_error):
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]])


def test_na_levels():
# GH26408
# test if codes are re-assigned value -1 for levels
# with mising values (NaN, NaT, None)
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
codes=[[0, -1, 1, 2, 3, 4]])
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
codes=[[-1, -1, -1, -1, 3, 4]])
tm.assert_index_equal(result, expected)

result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[0, -1, 1, 2, 3, 4]])
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[-1, -1, 1, -1, 3, -1]])
tm.assert_index_equal(result, expected)

# verify set_levels and set_codes
result = MultiIndex(
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels(
[[np.nan, 's', pd.NaT, 128, None]])
tm.assert_index_equal(result, expected)

result = MultiIndex(
levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[1, 2, 2, 2, 2, 2]]).set_codes(
[[0, -1, 1, 2, 3, 4]])
tm.assert_index_equal(result, expected)


def test_labels_deprecated(idx):
# GH23752
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/indexes/multi/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,21 @@ def test_dropna():
with pytest.raises(ValueError, match=msg):
idx.dropna(how='xxx')

# GH26408
# test if missing values are dropped for mutiindex constructed
# from codes and values
idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2],
[np.nan, None, pd.NaT, "128", 2]],
codes=[[0, -1, 1, 2, 3, 4],
[0, -1, 3, 3, 3, 4]])
expected = MultiIndex.from_arrays([["128", 2], ["128", 2]])
tm.assert_index_equal(idx.dropna(), expected)
tm.assert_index_equal(idx.dropna(how='any'), expected)

expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2],
["128", "128", "128", 2]])
tm.assert_index_equal(idx.dropna(how='all'), expected)


def test_nulls(idx):
# this is really a smoke test for the methods
Expand Down

0 comments on commit 210e2dc

Please sign in to comment.