From 84056c52e3f20ab44921b86a8e0f05275bf8ddb4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Jan 2019 09:47:00 -0800 Subject: [PATCH] Backport PR #24916: BUG-24212 fix regression in #24897 (#24951) --- doc/source/whatsnew/v0.24.1.rst | 3 ++ pandas/core/reshape/merge.py | 45 ++++++++++++++++++++++-- pandas/tests/reshape/merge/test_merge.py | 31 ++++++++-------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index ee4b7ab62b31a..3ac2ed73ea53f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -63,6 +63,9 @@ Bug Fixes - - +**Reshaping** + +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) **Other** diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e11847d2b8ce2..1dd19a7c1514e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,13 +757,21 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self.left.index.take(left_indexer) + join_index = self._create_join_index(self.left.index, + self.right.index, + left_indexer, + right_indexer, + how='right') else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self.right.index.take(right_indexer) + join_index = self._create_join_index(self.right.index, + self.left.index, + right_indexer, + left_indexer, + how='left') else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -774,6 +782,39 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer + def _create_join_index(self, index, other_index, indexer, + other_indexer, how='left'): + """ + Create a join index by rearranging one index to match another + + Parameters + ---------- + index: Index being rearranged + other_index: Index used to supply values not found in index + indexer: how to rearrange index + how: replacement is only necessary if indexer based on other_index + + Returns + ------- + join_index + """ + join_index = index.take(indexer) + if (self.how in (how, 'outer') and + not isinstance(other_index, MultiIndex)): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index + mask = indexer == -1 + if np.any(mask): + # if values missing (-1) from target index, + # take from other_index instead + join_list = join_index.to_numpy() + other_list = other_index.take(other_indexer).to_numpy() + join_list[mask] = other_list[mask] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) + return join_index + def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0a3ddc8ce8a4..c17c301968269 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -939,25 +939,22 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) - @pytest.mark.parametrize('how', ['left', 'outer']) - @pytest.mark.xfail(reason="GH-24897") + @pytest.mark.parametrize('how', ['right', 'outer']) def test_merge_on_index_with_more_values(self, how): # GH 24212 - # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is - # interpreted as a missing value instead of the last element - df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], - columns=['a', 'b']) - df2 = pd.DataFrame([[3, 30], [4, 40]], - columns=['a', 'c']) - df1.set_index('a', drop=False, inplace=True) - df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, left_index=True, right_on='a', how=how) - expected = pd.DataFrame([[1, 2, np.nan], - [2, 4, np.nan], - [3, 6, 30.0], - [4, 8, 40.0]], - columns=['a', 'b', 'c']) - expected.set_index('a', drop=False, inplace=True) + # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that + # -1 is interpreted as a missing value instead of the last element + df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) + df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on='key', right_index=True, how=how) + expected = pd.DataFrame([[1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5]], + columns=['a', 'key', 'b']) + expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) assert_frame_equal(result, expected) def test_merge_right_index_right(self):