From 210e2dcd43d2055ca1888d07e8f49961ef60ab5e Mon Sep 17 00:00:00 2001 From: Jiang Yue <35633013+jiangyue12392@users.noreply.github.com> Date: Sat, 1 Jun 2019 22:56:34 +0800 Subject: [PATCH] BUG: MultiIndex not dropping nan level and invalid code value (#26408) --- doc/source/whatsnew/v0.25.0.rst | 37 ++++++++++- pandas/core/indexes/multi.py | 62 ++++++++++++++++--- .../tests/indexes/multi/test_constructor.py | 41 +++++++++++- pandas/tests/indexes/multi/test_missing.py | 15 +++++ 4 files changed, 143 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ebca80025b9f7..3275223b159f8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -119,6 +119,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + +.. _whatsnew_0250.api_breaking.multi_indexing: + + +MultiIndex constructed from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. +Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes +would be reassigned as -1. (:issue:`19387`) + +.. ipython:: python + + mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) + +*Previous Behavior*: + +.. code-block:: ipython + + In [1]: mi1 + Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + In [2]: mi2 + Out[2]: MultiIndex(levels=[[1, 2]], + codes=[[0, -2]]) + +*New Behavior*: + +.. ipython:: python + + mi1 + mi2 + + .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: GroupBy.apply on ``DataFrame`` evaluates first group only once @@ -536,7 +572,6 @@ MultiIndex - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - -- I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ec2cc70d1a352..9217b388ce86b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -243,11 +243,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, result.sortorder = sortorder if verify_integrity: - result._verify_integrity() + new_codes = result._verify_integrity() + result._codes = new_codes + if _set_identity: result._reset_identity() + return result + def _validate_codes(self, level: list, code: list): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). + + Returns + ------- + code : new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). + """ + null_mask = isna(level) + if np.any(null_mask): + code = np.where(null_mask[code], -1, code) + return code + def _verify_integrity(self, codes=None, levels=None): """ @@ -263,6 +287,11 @@ def _verify_integrity(self, codes=None, levels=None): ValueError If length of levels and codes don't match, if the codes for any level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + codes : new codes where code value = -1 if it corresponds to a + NaN level. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -272,22 +301,33 @@ def _verify_integrity(self, codes=None, levels=None): if len(levels) != len(codes): raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - codes_length = len(self.codes[0]) + codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError("Unequal code lengths: %s" % ([len(code_) for code_ in codes])) if len(level_codes) and level_codes.max() >= len(level): - raise ValueError("On level %d, code max (%d) >= length of" - " level (%d). NOTE: this index is in an" - " inconsistent state" % (i, level_codes.max(), - len(level))) + msg = ("On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) + raise ValueError(msg) + if len(level_codes) and level_codes.min() < -1: + raise ValueError("On level {level}, code value ({code})" + " < -1".format( + level=i, code=level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( values=[value for value in level], level=i)) + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + new_codes = FrozenList(codes) + return new_codes + @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ @@ -586,7 +626,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, new_levels = FrozenList(new_levels) if verify_integrity: - self._verify_integrity(levels=new_levels) + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes names = self.names self._levels = new_levels @@ -676,7 +717,6 @@ def labels(self): def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): - if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): @@ -696,9 +736,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(codes=new_codes) + new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes + self._tuples = None self._reset_cache() @@ -1763,9 +1804,10 @@ def __setstate__(self, state): self._set_levels([Index(x) for x in levels], validate=False) self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) self._set_names(names) self.sortorder = sortorder - self._verify_integrity() self._reset_identity() def __getitem__(self, key): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 37290bc6eb1c0..7cab05660ac49 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -63,9 +63,10 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." + length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. with pytest.raises(ValueError, match=length_error): @@ -82,6 +83,44 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + # test set_codes with verify_integrity=False + # the setting should not raise any value error + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], + verify_integrity=False) + + # code value smaller than -1 + with pytest.raises(ValueError, match=code_value_error): + MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) + + +def test_na_levels(): + # GH26408 + # test if codes are re-assigned value -1 for levels + # with mising values (NaN, NaT, None) + result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[-1, -1, -1, -1, 3, 4]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[-1, -1, 1, -1, 3, -1]]) + tm.assert_index_equal(result, expected) + + # verify set_levels and set_codes + result = MultiIndex( + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( + [[np.nan, 's', pd.NaT, 128, None]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[1, 2, 2, 2, 2, 2]]).set_codes( + [[0, -1, 1, 2, 3, 4]]) + tm.assert_index_equal(result, expected) + def test_labels_deprecated(idx): # GH23752 diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index ed90f74d80989..518c12bb20e13 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -73,6 +73,21 @@ def test_dropna(): with pytest.raises(ValueError, match=msg): idx.dropna(how='xxx') + # GH26408 + # test if missing values are dropped for mutiindex constructed + # from codes and values + idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], + [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], + [0, -1, 3, 3, 3, 4]]) + expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) + tm.assert_index_equal(idx.dropna(), expected) + tm.assert_index_equal(idx.dropna(how='any'), expected) + + expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2], + ["128", "128", "128", 2]]) + tm.assert_index_equal(idx.dropna(how='all'), expected) + def test_nulls(idx): # this is really a smoke test for the methods