From 95348c1aeb084a4bd17bf3ec8927b28535fda20a Mon Sep 17 00:00:00 2001 From: thequackdaddy Date: Tue, 20 Jun 2017 15:14:33 -0500 Subject: [PATCH] BUG: Load data from a CategoricalIndex for dtype comparison, closes #16627 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/reshape/merge.py | 8 ++++++-- pandas/tests/test_join.py | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 046a6c885bd24b..c4429eef143274 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -100,6 +100,7 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). +- Fixed a bug that prevented joining on a categorical MultiIndex (:issue:`16627`).
I/O diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ffe0cac33ec8f5..99d9af28ac0191 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -11,7 +11,7 @@ import pandas.compat as compat from pandas import (Categorical, Series, DataFrame, - Index, MultiIndex, Timedelta) + Index, MultiIndex, Timedelta, CategoricalIndex) from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1441,9 +1441,13 @@ def _factorize_keys(lk, rk, sort=True): rk = rk.values # if we exactly match in categories, allow us to use codes + if isinstance(lk, CategoricalIndex): + ldata = lk._data + else: + ldata = lk if (is_categorical_dtype(lk) and is_categorical_dtype(rk) and - lk.is_dtype_equal(rk)): + ldata.is_dtype_equal(rk)): return lk.codes, rk.codes, len(lk.categories) if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 3fc13d23b53f7d..5d29c5355f8806 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -192,3 +192,24 @@ def test_inner_join_indexer2(): exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) assert_almost_equal(ridx, exp_ridx) + + +def test_merge_join_categorical_multiindex(): + # From issue 16627 + import pandas as pd + a = {'Cat1': pd.Categorical(['a', 'b', 'a', 'c', 'a', 'b'], + ['a', 'b', 'c']), + 'Int1': [0, 1, 0, 1, 0, 0]} + a = pd.DataFrame(a) + + b = {'Cat': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'c']), + 'Int': [0, 0, 0, 1, 1, 1], + 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} + b = pd.DataFrame(b).set_index(['Cat', 'Int'])['Factor'] + + c = pd.merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], + right_on=['Cat', 'Int'], how='left') + d = a.join(b, on=['Cat1', 'Int1']) + c = c.drop(['Cat', 'Int'], axis=1) + assert_almost_equal(c, d)