diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index e47c8c9b1714a7..df7ebd8d4844fc 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -32,6 +32,7 @@ Bug fixes - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled when a scalar is passed to it (:issue:`52450`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 17535c1710a1fa..156b1050322d4d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -266,7 +266,10 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # GH50430: let pyarrow infer type, then cast scalars = pa.array(scalars, from_pandas=True) if pa_dtype and scalars.type != pa_dtype: - scalars = scalars.cast(pa_dtype) + if pa.types.is_dictionary(pa_dtype): + scalars = scalars.dictionary_encode() + else: + scalars = scalars.cast(pa_dtype) arr = cls(scalars) if pa.types.is_duration(scalars.type) and scalars.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -878,7 +881,10 @@ def factorize( else: data = self._pa_array - encoded = data.dictionary_encode(null_encoding=null_encoding) + if pa.types.is_dictionary(data.type): + encoded = data + else: + encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 196f692a4716b0..72ef74c6387b99 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1845,6 +1845,20 @@ def test_searchsorted_with_na_raises(data_for_sorting, as_series): arr.searchsorted(b) +def test_sort_values_dictionary(): + df = pd.DataFrame( + { + "a": pd.Series( + ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string())) + ), + "b": [1, 2], + }, + ) + expected = df.copy() + result = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"]) def test_str_count(pat): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))