From e6ef75004358c25a6047c075c1884aaf2f7005b8 Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Wed, 2 Oct 2024 09:56:12 +0200 Subject: [PATCH 1/6] Handling the case where converting empty categorical to 'pyarrow' dtype_backend results in error. Since conversion of non-empty categorical returns categorical of 'numpy_nullable' dtype_backend, now, instead of raising an error, we ensure empty categorical is returned as well. --- pandas/core/dtypes/cast.py | 10 +++++++--- .../tests/frame/methods/test_convert_dtypes.py | 16 ++++++++++++++++ .../tests/series/methods/test_convert_dtypes.py | 13 +++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6ba07b1761557..dee3452136438 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1112,7 +1112,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - + if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1145,12 +1145,16 @@ def convert_dtypes( and isna(input_array).all() ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + if isna(input_array).all() and hasattr(input_array, 'categories'): + inferred_dtype = input_array.dtype + else: + inferred_dtype = ArrowDtype(pa_type) + elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index e7f6e5d625d3e..29a05652b38a2 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,6 +34,22 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + + def test_convert_empty_categorical_to_pyarrow(self): + df = pd.DataFrame( + { + "A": pd.Series(pd.Categorical([None] * 5)), + "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), + } + ) + converted = df.convert_dtypes(dtype_backend="pyarrow") + expected = df + tm.assert_frame_equal(converted, expected) + + assert df.A.dtype == "category", "Dtype in column A is not 'category'" + assert df.B.dtype == "category", "Dtype in column B is not 'category'" + assert df.A.cat.categories.empty, "Categories in column A are not empty" + assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 90c4056a39e84..cc2635c35e798 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -297,3 +297,16 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) + + def test_convert_empty_categorical_to_pyarrow(self): + ser = pd.Series(pd.Series(pd.Categorical([None] * 5))) + + converted = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser + tm.assert_series_equal(converted, expected) + + assert ser.dtype == "category", "Series dtype is not 'category'" + assert ser.cat.categories.empty, "Series categories are not empty" + + ser2 = pd.Series(pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"]))) + assert (ser2.cat.categories == ["S1", "S2"]).all(), "Series categories are not empty" From 94769a1cc1e5fc089864337dc56324d705b18376 Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Fri, 4 Oct 2024 11:48:40 +0200 Subject: [PATCH 2/6] additional revisions --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/dtypes/cast.py | 7 +++-- .../frame/methods/test_convert_dtypes.py | 23 +++++++++------ .../series/methods/test_convert_dtypes.py | 29 +++++++++++-------- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a5b4560a47bc4..d6dde366506c2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- +- Bug in :func:`convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) - Datetimelike diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index dee3452136438..c7272a517497c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1112,7 +1112,7 @@ def convert_dtypes( else: inferred_dtype = input_array.dtype - + if dtype_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.string_ import StringDtype @@ -1143,14 +1143,15 @@ def convert_dtypes( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 and isna(input_array).all() + and not isinstance(input_array.dtype, CategoricalDtype) ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, 'categories'): + if isna(input_array).all() and hasattr(input_array, "categories"): inferred_dtype = input_array.dtype else: inferred_dtype = ArrowDtype(pa_type) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 29a05652b38a2..5a0010d06a951 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,22 +34,27 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) - + def test_convert_empty_categorical_to_pyarrow(self): + # GH#59934 df = pd.DataFrame( { - "A": pd.Series(pd.Categorical([None] * 5)), - "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), - } + "A": pd.Categorical([None] * 5), + "B": pd.Categorical([None] * 5, categories=["B1", "B2"]), + } ) converted = df.convert_dtypes(dtype_backend="pyarrow") expected = df tm.assert_frame_equal(converted, expected) - - assert df.A.dtype == "category", "Dtype in column A is not 'category'" - assert df.B.dtype == "category", "Dtype in column B is not 'category'" - assert df.A.cat.categories.empty, "Categories in column A are not empty" - assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" + + assert converted.A.dtype == "category", "Dtype in column A is not 'category'" + assert converted.B.dtype == "category", "Dtype in column B is not 'category'" + assert converted.A.cat.categories.empty, "Categories in column A are not empty" + assert converted.B.cat.categories.__contains__( + "B1" + ) and converted.B.cat.categories.__contains__( + "B2" + ), "Categories in column B doesn't contain adequate categories" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index cc2635c35e798..22d4ec6d7906a 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -297,16 +297,21 @@ def test_convert_dtypes_pyarrow_null(self): result = ser.convert_dtypes(dtype_backend="pyarrow") expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) - + def test_convert_empty_categorical_to_pyarrow(self): - ser = pd.Series(pd.Series(pd.Categorical([None] * 5))) - - converted = ser.convert_dtypes(dtype_backend="pyarrow") - expected = ser - tm.assert_series_equal(converted, expected) - - assert ser.dtype == "category", "Series dtype is not 'category'" - assert ser.cat.categories.empty, "Series categories are not empty" - - ser2 = pd.Series(pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"]))) - assert (ser2.cat.categories == ["S1", "S2"]).all(), "Series categories are not empty" + # GH#59934 + ser1 = pd.Series(pd.Categorical([None] * 5)) + converted1 = ser1.convert_dtypes(dtype_backend="pyarrow") + expected = ser1 + + tm.assert_series_equal(converted1, expected) + assert converted1.dtype == "category", "Series dtype is not 'category'" + assert converted1.cat.categories.empty, "Series categories are not empty" + + ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"])) + converted2 = ser2.convert_dtypes(dtype_backend="pyarrow") + assert converted2.cat.categories.__contains__( + "S1" + ) and converted2.cat.categories.__contains__( + "S2" + ), "Categories in ser2 doesn't contain adequate categories" From 8e879da17e8f97322e945755586c081fb8521c28 Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Fri, 4 Oct 2024 11:53:11 +0200 Subject: [PATCH 3/6] removing the change for input_array... --- pandas/core/dtypes/cast.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c7272a517497c..597f76510618a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1151,10 +1151,7 @@ def convert_dtypes( else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, "categories"): - inferred_dtype = input_array.dtype - else: - inferred_dtype = ArrowDtype(pa_type) + inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 From a5b388263b4cc39fadd37c05c0e832d53a2857bc Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Mon, 7 Oct 2024 08:20:04 +0200 Subject: [PATCH 4/6] reverting newline in Series.convert_dtypes and precising respective docs in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/dtypes/cast.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d6dde366506c2..3c707d7931c97 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Bug in :func:`convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) +- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) - Datetimelike diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 597f76510618a..1255c5a557d27 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1152,7 +1152,6 @@ def convert_dtypes( pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) - elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] From c1ed1f0e5a1cb9e6e01409f27adfd6566e754391 Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Wed, 9 Oct 2024 12:10:29 +0200 Subject: [PATCH 5/6] resolved conflicts and updated main with latest changes --- .github/workflows/package-checks.yml | 2 +- .github/workflows/wheels.yml | 2 +- .pre-commit-config.yaml | 10 +- ci/code_checks.sh | 30 - doc/source/user_guide/style.ipynb | 689 ++++++++++-------- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/lib.pyx | 14 + pandas/_libs/tslibs/nattype.pyi | 45 +- pandas/_testing/asserters.py | 10 + pandas/core/accessor.py | 7 +- pandas/core/apply.py | 14 +- pandas/core/arrays/categorical.py | 14 +- pandas/core/arrays/datetimes.py | 8 + pandas/core/arrays/sparse/accessor.py | 20 +- pandas/core/arrays/string_.py | 2 +- pandas/core/frame.py | 10 +- pandas/core/groupby/generic.py | 28 +- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 39 +- pandas/core/indexes/period.py | 8 + pandas/core/reshape/encoding.py | 3 +- pandas/core/tools/numeric.py | 6 + pandas/errors/__init__.py | 5 + pandas/io/formats/html.py | 2 + pandas/io/formats/style.py | 3 +- pandas/io/formats/style_render.py | 24 +- pandas/io/stata.py | 20 + pandas/plotting/_misc.py | 17 +- pandas/tests/apply/test_frame_apply.py | 3 +- pandas/tests/apply/test_series_apply.py | 6 +- pandas/tests/arrays/sparse/test_accessor.py | 4 + .../tests/frame/methods/test_value_counts.py | 4 +- .../groupby/methods/test_value_counts.py | 86 ++- pandas/tests/indexes/test_old_base.py | 2 +- pandas/tests/io/formats/style/test_style.py | 13 +- pandas/tests/io/formats/test_format.py | 19 +- pandas/tests/tools/test_to_numeric.py | 15 + pandas/util/version/__init__.py | 8 + pyproject.toml | 9 +- 39 files changed, 746 insertions(+), 461 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6748832903e30..331af6e05b650 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: recipe-test create-args: >- diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 2aaec8c9b56b0..de59a454c827c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -165,7 +165,7 @@ jobs: CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: wheel-env # Use a fixed Python, since we might have an unreleased Python not diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6717dd503c9b..7c9ebf7d94173 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.0 + rev: v0.6.9 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.11' + rev: 'v2.13' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -52,7 +52,7 @@ repos: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-case-conflict - id: check-toml @@ -90,12 +90,12 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.9.1 + rev: v1.0.0 hooks: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.1 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4a1a0042405e3..6fb675069e81d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,30 +73,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.cat.add_categories PR01,PR02" \ - -i "pandas.Series.cat.as_ordered PR01" \ - -i "pandas.Series.cat.as_unordered PR01" \ - -i "pandas.Series.cat.remove_categories PR01,PR02" \ - -i "pandas.Series.cat.remove_unused_categories PR01" \ - -i "pandas.Series.cat.rename_categories PR01,PR02" \ - -i "pandas.Series.cat.reorder_categories PR01,PR02" \ - -i "pandas.Series.cat.set_categories PR01,PR02" \ - -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02" \ - -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.to_period PR01,PR02" \ - -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02" \ - -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ @@ -106,13 +85,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ @@ -163,7 +140,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ - -i "pandas.errors.InvalidVersion SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ @@ -172,24 +148,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.PossiblePrecisionLoss SA01" \ - -i "pandas.errors.SpecificationError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataReader.data_label SA01" \ - -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index daecfce6ecebc..abb7181fc8d72 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -38,19 +38,6 @@ "[concatfunc]: ../reference/api/pandas.io.formats.style.Styler.concat.rst" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot\n", - "# We have this here to trigger matplotlib's font cache stuff.\n", - "# This cell is hidden from the output" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -78,17 +65,13 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame({\n", - " \"strings\": [\"Adam\", \"Mike\"],\n", - " \"ints\": [1, 3],\n", - " \"floats\": [1.123, 1000.23]\n", - "})\n", - "df.style \\\n", - " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", - " .format_index(str.upper, axis=1) \\\n", - " .relabel_index([\"row 1\", \"row 2\"], axis=0)" + "df = pd.DataFrame(\n", + " {\"strings\": [\"Adam\", \"Mike\"], \"ints\": [1, 3], \"floats\": [1.123, 1000.23]}\n", + ")\n", + "df.style.format(precision=3, thousands=\".\", decimal=\",\").format_index(\n", + " str.upper, axis=1\n", + ").relabel_index([\"row 1\", \"row 2\"], axis=0)" ] }, { @@ -104,17 +87,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.default_rng().random((10, 2)) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", + "\n", "\n", - "def rain_condition(v): \n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -122,6 +109,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -157,10 +145,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(np.random.randn(5, 5))\n", - "df.style \\\n", - " .hide(subset=[0, 2, 4], axis=0) \\\n", - " .hide(subset=[0, 2, 4], axis=1)" + "df = pd.DataFrame(np.random.default_rng().standard_normal((5, 5)))\n", + "df.style.hide(subset=[0, 2, 4], axis=0).hide(subset=[0, 2, 4], axis=1)" ] }, { @@ -177,9 +163,9 @@ "outputs": [], "source": [ "show = [0, 2, 4]\n", - "df.style \\\n", - " .hide([row for row in df.index if row not in show], axis=0) \\\n", - " .hide([col for col in df.columns if col not in show], axis=1)" + "df.style.hide([row for row in df.index if row not in show], axis=0).hide(\n", + " [col for col in df.columns if col not in show], axis=1\n", + ")" ] }, { @@ -199,9 +185,9 @@ "metadata": {}, "outputs": [], "source": [ - "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", - " .format(precision=3) \\\n", - " .relabel_index([\"Sum\", \"Average\"])\n", + "summary_styler = (\n", + " df.agg([\"sum\", \"mean\"]).style.format(precision=3).relabel_index([\"Sum\", \"Average\"])\n", + ")\n", "df.style.format(precision=1).concat(summary_styler)" ] }, @@ -227,9 +213,16 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "idx = pd.Index([\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\")\n", + "cols = pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", \"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + ")\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=idx,\n", + " columns=cols,\n", + ")\n", "df.style" ] }, @@ -242,63 +235,68 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis='columns')\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\")\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", + " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; \"\n", + " \"position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -325,7 +323,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -337,8 +337,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -395,16 +395,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use <tr> instead of <td>\n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -417,8 +417,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -434,11 +434,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -449,8 +452,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -468,10 +471,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -482,8 +491,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -508,7 +517,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('<table'):][:109])" + "print(out[out.find(\"<table\") :][:109])" ] }, { @@ -531,14 +540,18 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([ # create internal CSS classes\n", - " {'selector': '.true', 'props': 'background-color: #e6ffe6;'},\n", - " {'selector': '.false', 'props': 'background-color: #ffe6e6;'},\n", - "], overwrite=False)\n", - "cell_color = pd.DataFrame([['true ', 'false ', 'true ', 'false '], \n", - " ['false ', 'true ', 'false ', 'true ']], \n", - " index=df.index, \n", - " columns=df.columns[:4])\n", + "s.set_table_styles(\n", + " [ # create internal CSS classes\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " ],\n", + " overwrite=False,\n", + ")\n", + "cell_color = pd.DataFrame(\n", + " [[\"true \", \"false \", \"true \", \"false \"], [\"false \", \"true \", \"false \", \"true \"]],\n", + " index=df.index,\n", + " columns=df.columns[:4],\n", + ")\n", "s.set_td_classes(cell_color)" ] }, @@ -550,8 +563,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_classes')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_classes\")" ] }, { @@ -579,8 +592,9 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(0)\n", - "df2 = pd.DataFrame(np.random.randn(10,4), columns=['A','B','C','D'])\n", + "df2 = pd.DataFrame(\n", + " np.random.default_rng(0).standard_normal((10, 4)), columns=[\"A\", \"B\", \"C\", \"D\"]\n", + ")\n", "df2.style" ] }, @@ -597,10 +611,13 @@ "metadata": {}, "outputs": [], "source": [ - "def style_negative(v, props=''):\n", + "def style_negative(v, props=\"\"):\n", " return props if v < 0 else None\n", - "s2 = df2.style.map(style_negative, props='color:red;')\\\n", - " .map(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\n", + "\n", + "\n", + "s2 = df2.style.map(style_negative, props=\"color:red;\").map(\n", + " lambda v: \"opacity: 20%;\" if (v < 0.3) and (v > -0.3) else None\n", + ")\n", "s2" ] }, @@ -612,8 +629,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s2.set_uuid('after_applymap')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s2.set_uuid(\"after_applymap\")" ] }, { @@ -629,9 +646,11 @@ "metadata": {}, "outputs": [], "source": [ - "def highlight_max(s, props=''):\n", - " return np.where(s == np.nanmax(s.values), props, '')\n", - "s2.apply(highlight_max, props='color:white;background-color:darkblue', axis=0)" + "def highlight_max(s, props=\"\"):\n", + " return np.where(s == np.nanmax(s.values), props, \"\")\n", + "\n", + "\n", + "s2.apply(highlight_max, props=\"color:white;background-color:darkblue\", axis=0)" ] }, { @@ -642,8 +661,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s2.set_uuid('after_apply')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s2.set_uuid(\"after_apply\")" ] }, { @@ -659,8 +678,9 @@ "metadata": {}, "outputs": [], "source": [ - "s2.apply(highlight_max, props='color:white;background-color:pink;', axis=1)\\\n", - " .apply(highlight_max, props='color:white;background-color:purple', axis=None)" + "s2.apply(highlight_max, props=\"color:white;background-color:pink;\", axis=1).apply(\n", + " highlight_max, props=\"color:white;background-color:purple\", axis=None\n", + ")" ] }, { @@ -671,8 +691,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s2.set_uuid('after_apply_again')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s2.set_uuid(\"after_apply_again\")" ] }, { @@ -713,8 +733,10 @@ "metadata": {}, "outputs": [], "source": [ - "s2.map_index(lambda v: \"color:pink;\" if v>4 else \"color:darkblue;\", axis=0)\n", - "s2.apply_index(lambda s: np.where(s.isin([\"A\", \"B\"]), \"color:pink;\", \"color:darkblue;\"), axis=1)" + "s2.map_index(lambda v: \"color:pink;\" if v > 4 else \"color:darkblue;\", axis=0)\n", + "s2.apply_index(\n", + " lambda s: np.where(s.isin([\"A\", \"B\"]), \"color:pink;\", \"color:darkblue;\"), axis=1\n", + ")" ] }, { @@ -734,11 +756,12 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_table_styles([{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.25em;'\n", - " }], overwrite=False)" + "s.set_caption(\n", + " \"Confusion matrix for multiple cancer prediction models.\"\n", + ").set_table_styles(\n", + " [{\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.25em;\"}],\n", + " overwrite=False,\n", + ")" ] }, { @@ -749,8 +772,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_caption')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_caption\")" ] }, { @@ -768,12 +791,24 @@ "metadata": {}, "outputs": [], "source": [ - "tt = pd.DataFrame([['This model has a very strong true positive rate', \n", - " \"This model's total number of false negatives is too high\"]], \n", - " index=['Tumour (Positive)'], columns=df.columns[[0,3]])\n", - "s.set_tooltips(tt, props='visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')" + "tt = pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"This model's total number of false negatives is too high\",\n", + " ]\n", + " ],\n", + " index=[\"Tumour (Positive)\"],\n", + " columns=df.columns[[0, 3]],\n", + ")\n", + "s.set_tooltips(\n", + " tt,\n", + " props=\"visibility: hidden; position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; \"\n", + " \"border-radius: 0.5em;\",\n", + ")" ] }, { @@ -784,8 +819,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tooltips')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tooltips\")" ] }, { @@ -801,14 +836,18 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([ # create internal CSS classes\n", - " {'selector': '.border-red', 'props': 'border: 2px dashed red;'},\n", - " {'selector': '.border-green', 'props': 'border: 2px dashed green;'},\n", - "], overwrite=False)\n", - "cell_border = pd.DataFrame([['border-green ', ' ', ' ', 'border-red '], \n", - " [' ', ' ', ' ', ' ']], \n", - " index=df.index, \n", - " columns=df.columns[:4])\n", + "s.set_table_styles(\n", + " [ # create internal CSS classes\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " ],\n", + " overwrite=False,\n", + ")\n", + "cell_border = pd.DataFrame(\n", + " [[\"border-green \", \" \", \" \", \"border-red \"], [\" \", \" \", \" \", \" \"]],\n", + " index=df.index,\n", + " columns=df.columns[:4],\n", + ")\n", "s.set_td_classes(cell_color + cell_border)" ] }, @@ -820,8 +859,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_borders')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_borders\")" ] }, { @@ -847,9 +886,11 @@ "metadata": {}, "outputs": [], "source": [ - "df3 = pd.DataFrame(np.random.randn(4,4), \n", - " pd.MultiIndex.from_product([['A', 'B'], ['r1', 'r2']]),\n", - " columns=['c1','c2','c3','c4'])\n", + "df3 = pd.DataFrame(\n", + " np.random.default_rng().standard_normal((4, 4)),\n", + " pd.MultiIndex.from_product([[\"A\", \"B\"], [\"r1\", \"r2\"]]),\n", + " columns=[\"c1\", \"c2\", \"c3\", \"c4\"],\n", + ")\n", "df3" ] }, @@ -866,9 +907,10 @@ "metadata": {}, "outputs": [], "source": [ - "slice_ = ['c3', 'c4']\n", - "df3.style.apply(highlight_max, props='color:red;', axis=0, subset=slice_)\\\n", - " .set_properties(**{'background-color': '#ffffb3'}, subset=slice_)" + "slice_ = [\"c3\", \"c4\"]\n", + "df3.style.apply(\n", + " highlight_max, props=\"color:red;\", axis=0, subset=slice_\n", + ").set_properties(**{\"background-color\": \"#ffffb3\"}, subset=slice_)" ] }, { @@ -885,9 +927,10 @@ "outputs": [], "source": [ "idx = pd.IndexSlice\n", - "slice_ = idx[idx[:,'r1'], idx['c2':'c4']]\n", - "df3.style.apply(highlight_max, props='color:red;', axis=0, subset=slice_)\\\n", - " .set_properties(**{'background-color': '#ffffb3'}, subset=slice_)" + "slice_ = idx[idx[:, \"r1\"], idx[\"c2\":\"c4\"]]\n", + "df3.style.apply(\n", + " highlight_max, props=\"color:red;\", axis=0, subset=slice_\n", + ").set_properties(**{\"background-color\": \"#ffffb3\"}, subset=slice_)" ] }, { @@ -903,9 +946,10 @@ "metadata": {}, "outputs": [], "source": [ - "slice_ = idx[idx[:,'r2'], :]\n", - "df3.style.apply(highlight_max, props='color:red;', axis=1, subset=slice_)\\\n", - " .set_properties(**{'background-color': '#ffffb3'}, subset=slice_)" + "slice_ = idx[idx[:, \"r2\"], :]\n", + "df3.style.apply(\n", + " highlight_max, props=\"color:red;\", axis=1, subset=slice_\n", + ").set_properties(**{\"background-color\": \"#ffffb3\"}, subset=slice_)" ] }, { @@ -923,9 +967,10 @@ "metadata": {}, "outputs": [], "source": [ - "slice_ = idx[idx[(df3['c1'] + df3['c3']) < -2.0], ['c2', 'c4']]\n", - "df3.style.apply(highlight_max, props='color:red;', axis=1, subset=slice_)\\\n", - " .set_properties(**{'background-color': '#ffffb3'}, subset=slice_)" + "slice_ = idx[idx[(df3[\"c1\"] + df3[\"c3\"]) < -2.0], [\"c2\", \"c4\"]]\n", + "df3.style.apply(\n", + " highlight_max, props=\"color:red;\", axis=1, subset=slice_\n", + ").set_properties(**{\"background-color\": \"#ffffb3\"}, subset=slice_)" ] }, { @@ -981,7 +1026,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4 = pd.DataFrame([[1,2],[3,4]])\n", + "df4 = pd.DataFrame([[1, 2], [3, 4]])\n", "s4 = df4.style" ] }, @@ -1003,6 +1048,7 @@ "outputs": [], "source": [ "from pandas.io.formats.style import Styler\n", + "\n", "s4 = Styler(df4, uuid_len=0, cell_ids=False)" ] }, @@ -1053,7 +1099,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_table_styles([{'selector': 'td.col1', 'props': props}])" + "df4.style.set_table_styles([{\"selector\": \"td.col1\", \"props\": props}])" ] }, { @@ -1082,9 +1128,11 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.apply(highlight_max, props='color:white;background-color:darkblue;', axis=0)\\\n", - " .apply(highlight_max, props='color:white;background-color:pink;', axis=1)\\\n", - " .apply(highlight_max, props='color:white;background-color:purple', axis=None)" + "df2.style.apply(\n", + " highlight_max, props=\"color:white;background-color:darkblue;\", axis=0\n", + ").apply(highlight_max, props=\"color:white;background-color:pink;\", axis=1).apply(\n", + " highlight_max, props=\"color:white;background-color:purple\", axis=None\n", + ")" ] }, { @@ -1105,14 +1153,18 @@ "outputs": [], "source": [ "build = lambda x: pd.DataFrame(x, index=df2.index, columns=df2.columns)\n", - "cls1 = build(df2.apply(highlight_max, props='cls-1 ', axis=0))\n", - "cls2 = build(df2.apply(highlight_max, props='cls-2 ', axis=1, result_type='expand').values)\n", - "cls3 = build(highlight_max(df2, props='cls-3 '))\n", - "df2.style.set_table_styles([\n", - " {'selector': '.cls-1', 'props': 'color:white;background-color:darkblue;'},\n", - " {'selector': '.cls-2', 'props': 'color:white;background-color:pink;'},\n", - " {'selector': '.cls-3', 'props': 'color:white;background-color:purple;'}\n", - "]).set_td_classes(cls1 + cls2 + cls3)" + "cls1 = build(df2.apply(highlight_max, props=\"cls-1 \", axis=0))\n", + "cls2 = build(\n", + " df2.apply(highlight_max, props=\"cls-2 \", axis=1, result_type=\"expand\").values\n", + ")\n", + "cls3 = build(highlight_max(df2, props=\"cls-3 \"))\n", + "df2.style.set_table_styles(\n", + " [\n", + " {\"selector\": \".cls-1\", \"props\": \"color:white;background-color:darkblue;\"},\n", + " {\"selector\": \".cls-2\", \"props\": \"color:white;background-color:pink;\"},\n", + " {\"selector\": \".cls-3\", \"props\": \"color:white;background-color:purple;\"},\n", + " ]\n", + ").set_td_classes(cls1 + cls2 + cls3)" ] }, { @@ -1152,10 +1204,14 @@ " \"blank\": \"\",\n", "}\n", "html = Styler(df4, uuid_len=0, cell_ids=False)\n", - "html.set_table_styles([{'selector': 'td', 'props': props},\n", - " {'selector': '.c1', 'props': 'color:green;'},\n", - " {'selector': '.l0', 'props': 'color:blue;'}],\n", - " css_class_names=my_css)\n", + "html.set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": props},\n", + " {\"selector\": \".c1\", \"props\": \"color:green;\"},\n", + " {\"selector\": \".l0\", \"props\": \"color:blue;\"},\n", + " ],\n", + " css_class_names=my_css,\n", + ")\n", "print(html.to_html())" ] }, @@ -1213,9 +1269,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2.iloc[0,2] = np.nan\n", - "df2.iloc[4,3] = np.nan\n", - "df2.loc[:4].style.highlight_null(color='yellow')" + "df2.iloc[0, 2] = np.nan\n", + "df2.iloc[4, 3] = np.nan\n", + "df2.loc[:4].style.highlight_null(color=\"yellow\")" ] }, { @@ -1231,7 +1287,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2.loc[:4].style.highlight_max(axis=1, props='color:white; font-weight:bold; background-color:darkblue;')" + "df2.loc[:4].style.highlight_max(\n", + " axis=1, props=(\"color:white; \" \"font-weight:bold; \" \"background-color:darkblue;\")\n", + ")" ] }, { @@ -1249,7 +1307,9 @@ "outputs": [], "source": [ "left = pd.Series([1.0, 0.0, 1.0], index=[\"A\", \"B\", \"D\"])\n", - "df2.loc[:4].style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:purple;')" + "df2.loc[:4].style.highlight_between(\n", + " left=left, right=1.5, axis=1, props=\"color:white; background-color:purple;\"\n", + ")" ] }, { @@ -1266,7 +1326,7 @@ "metadata": {}, "outputs": [], "source": [ - "df2.loc[:4].style.highlight_quantile(q_left=0.85, axis=None, color='yellow')" + "df2.loc[:4].style.highlight_quantile(q_left=0.85, axis=None, color=\"yellow\")" ] }, { @@ -1290,6 +1350,7 @@ "outputs": [], "source": [ "import seaborn as sns\n", + "\n", "cm = sns.light_palette(\"green\", as_cmap=True)\n", "\n", "df2.style.background_gradient(cmap=cm)" @@ -1329,9 +1390,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2.loc[:4].style.set_properties(**{'background-color': 'black',\n", - " 'color': 'lawngreen',\n", - " 'border-color': 'white'})" + "df2.loc[:4].style.set_properties(\n", + " **{\"background-color\": \"black\", \"color\": \"lawngreen\", \"border-color\": \"white\"}\n", + ")" ] }, { @@ -1354,7 +1415,7 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.bar(subset=['A', 'B'], color='#d65f5f')" + "df2.style.bar(subset=[\"A\", \"B\"], color=\"#d65f5f\")" ] }, { @@ -1372,10 +1433,15 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.format('{:.3f}', na_rep=\"\")\\\n", - " .bar(align=0, vmin=-2.5, vmax=2.5, cmap=\"bwr\", height=50,\n", - " width=60, props=\"width: 120px; border-right: 1px solid black;\")\\\n", - " .text_gradient(cmap=\"bwr\", vmin=-2.5, vmax=2.5)" + "df2.style.format(\"{:.3f}\", na_rep=\"\").bar(\n", + " align=0,\n", + " vmin=-2.5,\n", + " vmax=2.5,\n", + " cmap=\"bwr\",\n", + " height=50,\n", + " width=60,\n", + " props=\"width: 120px; border-right: 1px solid black;\",\n", + ").text_gradient(cmap=\"bwr\", vmin=-2.5, vmax=2.5)" ] }, { @@ -1398,10 +1464,10 @@ "from IPython.display import HTML\n", "\n", "# Test series\n", - "test1 = pd.Series([-100,-60,-30,-20], name='All Negative')\n", - "test2 = pd.Series([-10,-5,0,90], name='Both Pos and Neg')\n", - "test3 = pd.Series([10,20,50,100], name='All Positive')\n", - "test4 = pd.Series([100, 103, 101, 102], name='Large Positive')\n", + "test1 = pd.Series([-100, -60, -30, -20], name=\"All Negative\")\n", + "test2 = pd.Series([-10, -5, 0, 90], name=\"Both Pos and Neg\")\n", + "test3 = pd.Series([10, 20, 50, 100], name=\"All Positive\")\n", + "test4 = pd.Series([100, 103, 101, 102], name=\"Large Positive\")\n", "\n", "\n", "head = \"\"\"\n", @@ -1417,19 +1483,22 @@ "\n", "\"\"\"\n", "\n", - "aligns = ['left', 'right', 'zero', 'mid', 'mean', 99]\n", + "aligns = [\"left\", \"right\", \"zero\", \"mid\", \"mean\", 99]\n", "for align in aligns:\n", " row = \"<tr><th>{}</th>\".format(align)\n", - " for series in [test1,test2,test3, test4]:\n", + " for series in [test1, test2, test3, test4]:\n", " s = series.copy()\n", - " s.name=''\n", - " row += \"<td>{}</td>\".format(s.to_frame().style.hide(axis='index').bar(align=align, \n", - " color=['#d65f5f', '#5fba7d'], \n", - " width=100).to_html()) #testn['width']\n", - " row += '</tr>'\n", + " s.name = \"\"\n", + " row += \"<td>{}</td>\".format(\n", + " s.to_frame()\n", + " .style.hide(axis=\"index\")\n", + " .bar(align=align, color=[\"#d65f5f\", \"#5fba7d\"], width=100)\n", + " .to_html()\n", + " ) # testn['width']\n", + " row += \"</tr>\"\n", " head += row\n", - " \n", - "head+= \"\"\"\n", + "\n", + "head += \"\"\"\n", "</tbody>\n", "</table>\"\"\"" ] @@ -1463,11 +1532,12 @@ "metadata": {}, "outputs": [], "source": [ - "style1 = df2.style\\\n", - " .map(style_negative, props='color:red;')\\\n", - " .map(lambda v: 'opacity: 20%;' if (v < 0.3) and (v > -0.3) else None)\\\n", - " .set_table_styles([{\"selector\": \"th\", \"props\": \"color: blue;\"}])\\\n", - " .hide(axis=\"index\")\n", + "style1 = (\n", + " df2.style.map(style_negative, props=\"color:red;\")\n", + " .map(lambda v: \"opacity: 20%;\" if (v < 0.3) and (v > -0.3) else None)\n", + " .set_table_styles([{\"selector\": \"th\", \"props\": \"color: blue;\"}])\n", + " .hide(axis=\"index\")\n", + ")\n", "style1" ] }, @@ -1526,11 +1596,14 @@ "outputs": [], "source": [ "from ipywidgets import widgets\n", + "\n", + "\n", "@widgets.interact\n", - "def f(h_neg=(0, 359, 1), h_pos=(0, 359), s=(0., 99.9), l=(0., 99.9)):\n", + "def f(h_neg=(0, 359, 1), h_pos=(0, 359), s=(0.0, 99.9), l_post=(0.0, 99.9)):\n", " return df2.style.background_gradient(\n", - " cmap=sns.palettes.diverging_palette(h_neg=h_neg, h_pos=h_pos, s=s, l=l,\n", - " as_cmap=True)\n", + " cmap=sns.palettes.diverging_palette(\n", + " h_neg=h_neg, h_pos=h_pos, s=s, l=l_post, as_cmap=True\n", + " )\n", " )" ] }, @@ -1548,16 +1621,15 @@ "outputs": [], "source": [ "def magnify():\n", - " return [dict(selector=\"th\",\n", - " props=[(\"font-size\", \"4pt\")]),\n", - " dict(selector=\"td\",\n", - " props=[('padding', \"0em 0em\")]),\n", - " dict(selector=\"th:hover\",\n", - " props=[(\"font-size\", \"12pt\")]),\n", - " dict(selector=\"tr:hover td:hover\",\n", - " props=[('max-width', '200px'),\n", - " ('font-size', '12pt')])\n", - "]" + " return [\n", + " {\"selector\": \"th\", \"props\": [(\"font-size\", \"4pt\")]},\n", + " {\"selector\": \"td\", \"props\": [(\"padding\", \"0em 0em\")]},\n", + " {\"selector\": \"th:hover\", \"props\": [(\"font-size\", \"12pt\")]},\n", + " {\n", + " \"selector\": \"tr:hover td:hover\",\n", + " \"props\": [(\"max-width\", \"200px\"), (\"font-size\", \"12pt\")],\n", + " },\n", + " ]" ] }, { @@ -1566,15 +1638,12 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(25)\n", - "cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)\n", - "bigdf = pd.DataFrame(np.random.randn(20, 25)).cumsum()\n", + "cmap = sns.diverging_palette(5, 250, as_cmap=True)\n", + "bigdf = pd.DataFrame(np.random.default_rng(25).standard_normal((20, 25))).cumsum()\n", "\n", - "bigdf.style.background_gradient(cmap, axis=1)\\\n", - " .set_properties(**{'max-width': '80px', 'font-size': '1pt'})\\\n", - " .set_caption(\"Hover to magnify\")\\\n", - " .format(precision=2)\\\n", - " .set_table_styles(magnify())" + "bigdf.style.background_gradient(cmap, axis=1).set_properties(\n", + " **{\"max-width\": \"80px\", \"font-size\": \"1pt\"}\n", + ").set_caption(\"Hover to magnify\").format(precision=2).set_table_styles(magnify())" ] }, { @@ -1594,7 +1663,7 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf = pd.DataFrame(np.random.randn(16, 100))\n", + "bigdf = pd.DataFrame(np.random.default_rng().standard_normal((16, 100)))\n", "bigdf.style.set_sticky(axis=\"index\")" ] }, @@ -1611,8 +1680,8 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf.index = pd.MultiIndex.from_product([[\"A\",\"B\"],[0,1],[0,1,2,3]])\n", - "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1,2])" + "bigdf.index = pd.MultiIndex.from_product([[\"A\", \"B\"], [0, 1], [0, 1, 2, 3]])\n", + "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1, 2])" ] }, { @@ -1632,7 +1701,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4 = pd.DataFrame([['<div></div>', '\"&other\"', '<span></span>']])\n", + "df4 = pd.DataFrame([[\"<div></div>\", '\"&other\"', \"<span></span>\"]])\n", "df4.style" ] }, @@ -1651,7 +1720,9 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format('<a href=\"https://pandas.pydata.org\" target=\"_blank\">{}</a>', escape=\"html\")" + "df4.style.format(\n", + " '<a href=\"https://pandas.pydata.org\" target=\"_blank\">{}</a>', escape=\"html\"\n", + ")" ] }, { @@ -1693,10 +1764,9 @@ "metadata": {}, "outputs": [], "source": [ - "df2.style.\\\n", - " map(style_negative, props='color:red;').\\\n", - " highlight_max(axis=0).\\\n", - " to_excel('styled.xlsx', engine='openpyxl')" + "df2.style.map(style_negative, props=\"color:red;\").highlight_max(axis=0).to_excel(\n", + " \"styled.xlsx\", engine=\"openpyxl\"\n", + ")" ] }, { @@ -1765,7 +1835,11 @@ "metadata": {}, "outputs": [], "source": [ - "print(pd.DataFrame([[1,2],[3,4]], index=['i1', 'i2'], columns=['c1', 'c2']).style.to_html())" + "print(\n", + " pd.DataFrame(\n", + " [[1, 2], [3, 4]], index=[\"i1\", \"i2\"], columns=[\"c1\", \"c2\"]\n", + " ).style.to_html()\n", + ")" ] }, { @@ -1783,9 +1857,8 @@ "metadata": {}, "outputs": [], "source": [ - "df4 = pd.DataFrame([['text']])\n", - "df4.style.map(lambda x: 'color:green;')\\\n", - " .map(lambda x: 'color:red;')" + "df4 = pd.DataFrame([[\"text\"]])\n", + "df4.style.map(lambda x: \"color:green;\").map(lambda x: \"color:red;\")" ] }, { @@ -1794,8 +1867,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.map(lambda x: 'color:red;')\\\n", - " .map(lambda x: 'color:green;')" + "df4.style.map(lambda x: \"color:red;\").map(lambda x: \"color:green;\")" ] }, { @@ -1820,9 +1892,9 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('a_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'}])\\\n", - " .map(lambda x: 'color:green;')" + "df4.style.set_uuid(\"a_\").set_table_styles(\n", + " [{\"selector\": \"td\", \"props\": \"color:red;\"}]\n", + ").map(lambda x: \"color:green;\")" ] }, { @@ -1838,11 +1910,12 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('b_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'}])\\\n", - " .map(lambda x: 'color:green;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + "df4.style.set_uuid(\"b_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " ]\n", + ").map(lambda x: \"color:green;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1858,12 +1931,13 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('c_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'},\n", - " {'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .map(lambda x: 'color:green;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + "df4.style.set_uuid(\"c_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " {\"selector\": \"td.data\", \"props\": \"color:yellow;\"},\n", + " ]\n", + ").map(lambda x: \"color:green;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1881,12 +1955,13 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.set_uuid('d_')\\\n", - " .set_table_styles([{'selector': 'td', 'props': 'color:red;'},\n", - " {'selector': '.cls-1', 'props': 'color:blue;'},\n", - " {'selector': 'td.data', 'props': 'color:yellow;'}])\\\n", - " .map(lambda x: 'color:green !important;')\\\n", - " .set_td_classes(pd.DataFrame([['cls-1']]))" + "df4.style.set_uuid(\"d_\").set_table_styles(\n", + " [\n", + " {\"selector\": \"td\", \"props\": \"color:red;\"},\n", + " {\"selector\": \".cls-1\", \"props\": \"color:blue;\"},\n", + " {\"selector\": \"td.data\", \"props\": \"color:yellow;\"},\n", + " ]\n", + ").map(lambda x: \"color:green !important;\").set_td_classes(pd.DataFrame([[\"cls-1\"]]))" ] }, { @@ -1940,8 +2015,8 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"templates/myhtml.tpl\") as f:\n", - " print(f.read())" + "with open(\"templates/myhtml.tpl\") as f_html:\n", + " print(f_html.read())" ] }, { @@ -1960,10 +2035,12 @@ "source": [ "class MyStyler(Styler):\n", " env = Environment(\n", - " loader=ChoiceLoader([\n", - " FileSystemLoader(\"templates\"), # contains ours\n", - " Styler.loader, # the default\n", - " ])\n", + " loader=ChoiceLoader(\n", + " [\n", + " FileSystemLoader(\"templates\"), # contains ours\n", + " Styler.loader, # the default\n", + " ]\n", + " )\n", " )\n", " template_html_table = env.get_template(\"myhtml.tpl\")" ] @@ -2045,8 +2122,8 @@ }, "outputs": [], "source": [ - "with open(\"templates/html_style_structure.html\") as f:\n", - " style_structure = f.read()" + "with open(\"templates/html_style_structure.html\") as f_sty:\n", + " style_structure = f_sty.read()" ] }, { @@ -2073,8 +2150,8 @@ }, "outputs": [], "source": [ - "with open(\"templates/html_table_structure.html\") as f:\n", - " table_structure = f.read()" + "with open(\"templates/html_table_structure.html\") as f_table_struct:\n", + " table_structure = f_table_struct.read()" ] }, { @@ -2106,7 +2183,7 @@ "# from IPython.display import HTML\n", "# with open(\"themes/nature_with_gtoc/static/nature.css_t\") as f:\n", "# css = f.read()\n", - " \n", + "\n", "# HTML('<style>{}</style>'.format(css))" ] } diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3c707d7931c97..a5b4560a47bc4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend='pyarrow'`` parameter where empty categorical series raise error or get converted to null[pyarrow] (:issue:`59934`) +- - Datetimelike diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de7d9af731010..23e0f387466aa 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1089,9 +1089,23 @@ def is_float(obj: object) -> bool: """ Return True if given object is float. + This method checks whether the passed object is a float type. It + returns `True` if the object is a float, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for float type. + Returns ------- bool + `True` if the object is of float type, otherwise `False`. + + See Also + -------- + api.types.is_integer : Check if an object is of integer type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index f49e894a0bfec..d3b10fbe79cb9 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -9,6 +9,7 @@ from typing import ( Literal, NoReturn, TypeAlias, + overload, ) import numpy as np @@ -24,12 +25,8 @@ NaT: NaTType iNaT: int nat_strings: set[str] -_NaTComparisonTypes: TypeAlias = ( - datetime | timedelta | Period | np.datetime64 | np.timedelta64 -) - -class _NatComparison: - def __call__(self, other: _NaTComparisonTypes) -> bool: ... +_TimeLike: TypeAlias = datetime | timedelta | Period | np.datetime64 | np.timedelta64 +_TimeDelta: TypeAlias = timedelta | np.timedelta64 class NaTType: _value: np.int64 @@ -159,15 +156,31 @@ class NaTType: # inject Period properties @property def qyear(self) -> float: ... - def __eq__(self, other: object) -> bool: ... - def __ne__(self, other: object) -> bool: ... - __lt__: _NatComparison - __le__: _NatComparison - __gt__: _NatComparison - __ge__: _NatComparison - def __sub__(self, other: Self | timedelta | datetime) -> Self: ... - def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... - def __add__(self, other: Self | timedelta | datetime) -> Self: ... - def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + # comparisons + def __eq__(self, other: object, /) -> Literal[False]: ... + def __ne__(self, other: object, /) -> Literal[True]: ... + def __lt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __le__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __gt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __ge__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + # unary operators + def __pos__(self) -> Self: ... + def __neg__(self) -> Self: ... + # binary operators + def __sub__(self, other: Self | _TimeLike, /) -> Self: ... + def __rsub__(self, other: Self | _TimeLike, /) -> Self: ... + def __add__(self, other: Self | _TimeLike, /) -> Self: ... + def __radd__(self, other: Self | _TimeLike, /) -> Self: ... + def __mul__(self, other: float, /) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float, /) -> Self: ... + @overload # analogous to timedelta + def __truediv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] + @overload + def __truediv__(self, other: float, /) -> Self: ... + @overload # analogous to timedelta + def __floordiv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] + @overload + def __floordiv__(self, other: float, /) -> Self: ... + # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index bbd5e60a5a812..01c4dcd92ee40 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -701,6 +701,10 @@ def assert_extension_array_equal( """ Check that left and right ExtensionArrays are equal. + This method compares two ``ExtensionArray`` instances for equality, + including checks for missing values, the dtype of the arrays, and + the exactness of the comparison (or tolerance when comparing floats). + Parameters ---------- left, right : ExtensionArray @@ -726,6 +730,12 @@ def assert_extension_array_equal( .. versionadded:: 2.0.0 + See Also + -------- + testing.assert_series_equal : Check that left and right ``Series`` are equal. + testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal. + testing.assert_index_equal : Check that left and right ``Index`` are equal. + Notes ----- Missing values are checked separately from valid values. diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index d8463fda34caa..78684eacf2d66 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -7,6 +7,7 @@ from __future__ import annotations +import functools from typing import ( TYPE_CHECKING, final, @@ -117,12 +118,12 @@ def _setter(self, new_values): ) def _create_delegator_method(name: str): + method = getattr(delegate, accessor_mapping(name)) + + @functools.wraps(method) def f(self, *args, **kwargs): return self._delegate_method(name, *args, **kwargs) - f.__name__ = name - f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__ - return f for name in accessors: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7d50b466f5126..1f13459724d78 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,10 +38,7 @@ is_numeric_dtype, is_sequence, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCNDFrame, @@ -1465,14 +1462,7 @@ def curried(x): else: curried = func - - # row-wise access - # apply doesn't have a `na_action` keyword and for backward compat reasons - # we need to give `na_action="ignore"` for categorical data. - # TODO: remove the `na_action="ignore"` when that default has been changed in - # Categorical (GH51645). - action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None - mapped = obj._map_values(mapper=curried, na_action=action) + mapped = obj._map_values(mapper=curried) if len(mapped) and isinstance(mapped[0], ABCSeries): # GH#43986 Need to do list(mapped) in order to get treated as nested diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a69e197df851d..0484ef89f61c2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1155,6 +1155,12 @@ def rename_categories(self, new_categories) -> Self: """ Rename categories. + This method is commonly used to re-label or adjust the + category names in categorical data without changing the + underlying data. It is useful in situations where you want + to modify the labels used for clarity, consistency, + or readability. + Parameters ---------- new_categories : list-like, dict-like or callable @@ -1371,8 +1377,8 @@ def remove_categories(self, removals) -> Self: """ Remove the specified categories. - `removals` must be included in the old categories. Values which were in - the removed categories will be set to NaN + The ``removals`` argument must be a subset of the current categories. + Any values that were part of the removed categories will be set to NaN. Parameters ---------- @@ -1431,6 +1437,10 @@ def remove_unused_categories(self) -> Self: """ Remove categories which are not used. + This method is useful when working with datasets + that undergo dynamic changes where categories may no longer be + relevant, allowing to maintain a clean, efficient data structure. + Returns ------- Categorical diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43f4428118aa7..41128e52e31b3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -205,6 +205,14 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ------- None + See Also + -------- + DatetimeIndex : Immutable Index for datetime-like data. + Series : One-dimensional labeled array capable of holding datetime-like data. + Timestamp : Pandas replacement for python datetime.datetime object. + to_datetime : Convert argument to datetime. + period_range : Return a fixed frequency PeriodIndex. + Examples -------- >>> pd.arrays.DatetimeArray._from_sequence( diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index e610e018c5a74..0ed5f69fe4703 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -88,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: """ Create a Series with sparse values from a scipy.sparse.coo_matrix. + This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and + returns a pandas ``Series`` where the non-zero elements are represented as + sparse values. The index of the Series can either include only the coordinates + of non-zero elements (default behavior) or the full sorted set of coordinates + from the matrix if ``dense_index`` is set to `True`. + Parameters ---------- A : scipy.sparse.coo_matrix + The sparse matrix in coordinate format from which the sparse Series + will be created. dense_index : bool, default False If False (default), the index consists of only the coords of the non-null entries of the original coo_matrix. @@ -102,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: s : Series A Series with sparse values. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse + matrix. + scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format. + Examples -------- >>> from scipy import sparse @@ -369,10 +383,10 @@ def to_dense(self) -> DataFrame: 1 1 2 0 """ - from pandas import DataFrame - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + return self._parent._constructor( + data, index=self._parent.index, columns=self._parent.columns + ) def to_coo(self) -> spmatrix: """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index b3aa782341c77..e4daf9ed450fb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -190,7 +190,7 @@ def __eq__(self, other: object) -> bool: # cannot be checked with normal `==` if isinstance(other, str): # TODO should dtype == "string" work for the NaN variant? - if other == "string" or other == self.name: # noqa: PLR1714 + if other == "string" or other == self.name: return True try: other = self.construct_from_string(other) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f184aab4070d7..1b47002e72fc6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7266,7 +7266,11 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True - Sort by frequencies when True. Sort by DataFrame column values when False. + Sort by frequencies when True. Preserve the order of the data when False. + + .. versionchanged:: 3.0.0 + + Prior to 3.0.0, ``sort=False`` would sort by the columns values. ascending : bool, default False Sort in ascending order. dropna : bool, default True @@ -7372,7 +7376,9 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() + counts = self.groupby( + subset, sort=False, dropna=dropna, observed=False + )._grouper.size() counts.name = name if sort: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 110c0ea88a0a1..f076f8d79f104 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2621,7 +2621,13 @@ def value_counts( normalize : bool, default False Return proportions rather than frequencies. sort : bool, default True - Sort by frequencies. + Sort by frequencies when True. When False, non-grouping columns will appear + in the order they occur in within groups. + + .. versionchanged:: 3.0.0 + + In prior versions, ``sort=False`` would sort the non-grouping columns + by label. ascending : bool, default False Sort in ascending order. dropna : bool, default True @@ -2673,8 +2679,8 @@ def value_counts( >>> df.groupby("gender").value_counts() gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low FR 2 US 1 medium FR 1 @@ -2682,8 +2688,8 @@ def value_counts( >>> df.groupby("gender").value_counts(ascending=True) gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low US 1 medium FR 1 low FR 2 @@ -2691,8 +2697,8 @@ def value_counts( >>> df.groupby("gender").value_counts(normalize=True) gender education country - female high FR 0.50 - US 0.50 + female high US 0.50 + FR 0.50 male low FR 0.50 US 0.25 medium FR 0.25 @@ -2700,16 +2706,16 @@ def value_counts( >>> df.groupby("gender", as_index=False).value_counts() gender education country count - 0 female high FR 1 - 1 female high US 1 + 0 female high US 1 + 1 female high FR 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 >>> df.groupby("gender", as_index=False).value_counts(normalize=True) gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 + 0 female high US 0.50 + 1 female high FR 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e2410788ea95e..68314567d1b5e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2519,7 +2519,7 @@ def _value_counts( grouper, _, _ = get_grouper( df, key=key, - sort=self.sort, + sort=False, observed=False, dropna=dropna, ) @@ -2528,7 +2528,7 @@ def _value_counts( # Take the size of the overall columns gb = df.groupby( groupings, - sort=self.sort, + sort=False, observed=self.observed, dropna=self.dropna, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0e99178642715..b32119a2ddbde 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] + sorts = [ping._sort for ping in self.groupings] # When passed a categorical grouping, keep all categories for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) elif all(obs): - result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names) + result_index, ids = self._ob_index_and_ids( + levels, self.codes, self.names, sorts + ) elif not any(obs): result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) else: @@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels=[levels[idx] for idx in ob_indices], codes=[codes[idx] for idx in ob_indices], names=[names[idx] for idx in ob_indices], + sorts=[sorts[idx] for idx in ob_indices], ) unob_index, unob_ids = self._unob_index_and_ids( levels=[levels[idx] for idx in unob_indices], @@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: ).reorder_levels(index) ids = len(unob_index) * ob_ids + unob_ids - if self._sort: + if any(sorts): # Sort result_index and recode ids using the new order - sorter = result_index.argsort() + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = result_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = result_index.argsort() result_index = result_index.take(sorter) _, index = np.unique(sorter, return_index=True) ids = ensure_platform_int(ids) @@ -837,10 +850,13 @@ def _ob_index_and_ids( levels: list[Index], codes: list[npt.NDArray[np.intp]], names: list[Hashable], + sorts: list[bool], ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + consistent_sorting = all(sorts[0] == sort for sort in sorts[1:]) + sort_in_compress = sorts[0] if consistent_sorting else False shape = tuple(len(level) for level in levels) group_index = get_group_index(codes, shape, sort=True, xnull=True) - ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress) ob_ids = ensure_platform_int(ob_ids) ob_index_codes = decons_obs_group_ids( ob_ids, obs_group_ids, shape, codes, xnull=True @@ -851,6 +867,21 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) + if not consistent_sorting and len(ob_index) > 0: + # Sort by the levels where the corresponding sort argument is True + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = ob_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = ob_index.argsort() + ob_index = ob_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids)) ob_ids = ensure_platform_int(ob_ids) return ob_index, ob_ids diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b5f05ef0ab78f..377406e24b1d3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -563,6 +563,14 @@ def period_range( Returns ------- PeriodIndex + A PeriodIndex of fixed frequency periods. + + See Also + -------- + date_range : Returns a fixed frequency DatetimeIndex. + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating regular periods + in time. Notes ----- diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c397c1c2566a5..33ff182f5baee 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -68,7 +68,8 @@ def get_dummies( If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False - Add a column to indicate NaNs, if False NaNs are ignored. + If True, a NaN indicator column will be added even if no NaN values are present. + If False, NA values are encoded as all zero. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 982851d0557c3..f159babb7e018 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -11,6 +11,10 @@ lib, missing as libmissing, ) +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -189,6 +193,8 @@ def to_numeric( return float(arg) if is_number(arg): return arg + if isinstance(arg, (Timedelta, Timestamp)): + return arg._value is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 46e090cc3a589..cf2a9d3f4a238 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -444,6 +444,11 @@ class SpecificationError(Exception): The second way is calling ``agg`` on a Dataframe with duplicated functions names without assigning column name. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index adaeed017d7bf..fdea1831d5596 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -195,6 +195,8 @@ def _write_cell( esc = {} rs = pprint_thing(s, escape_chars=esc).strip() + # replace spaces betweens strings with non-breaking spaces + rs = rs.replace(" ", " ") if self.render_links and is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6e5ae09485951..eb6773310da69 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -222,6 +222,7 @@ class Styler(StylerRenderer): * ``level<k>`` where `k` is the level in a MultiIndex * Column label cells include + * ``col_heading`` * ``col<n>`` where `n` is the numeric position of the column * ``level<k>`` where `k` is the level in a MultiIndex @@ -231,7 +232,7 @@ class Styler(StylerRenderer): * Trimmed cells include ``col_trim`` or ``row_trim``. Any, or all, or these classes can be renamed by using the ``css_class_names`` - argument in ``Styler.set_table_classes``, giving a value such as + argument in ``Styler.set_table_styles``, giving a value such as *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*. Examples diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 8a6383f7e8f82..08d9fd938c873 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -906,9 +906,9 @@ def concatenated_visible_rows(obj): row_body_headers = [ { **col, - "display_value": col["display_value"] - if col["is_visible"] - else "", + "display_value": ( + col["display_value"] if col["is_visible"] else "" + ), "cellstyle": self.ctx_index[r, c], } for c, col in enumerate(row[:index_levels]) @@ -2069,18 +2069,18 @@ def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: ('border','1px solid red')] """ if isinstance(style, str): - s = style.split(";") - try: - return [ - (x.split(":")[0].strip(), x.split(":")[1].strip()) - for x in s - if x.strip() != "" - ] - except IndexError as err: + if style and ":" not in style: raise ValueError( "Styles supplied as string must follow CSS rule formats, " f"for example 'attr: val;'. '{style}' was given." - ) from err + ) + s = style.split(";") + return [ + (x.split(":")[0].strip(), ":".join(x.split(":")[1:]).strip()) + for x in s + if x.strip() != "" + ] + return style diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4be06f93689f2..f1d289726c9c8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2004,6 +2004,16 @@ def data_label(self) -> str: """ Return data label of Stata file. + The data label is a descriptive string associated with the dataset + stored in the Stata file. This property provides access to that + label, if one is present. + + See Also + -------- + io.stata.StataReader.variable_labels : Return a dict associating each variable + name with corresponding label. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame([(1,)], columns=["variable"]) @@ -2066,9 +2076,19 @@ def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. + This method retrieves the value labels from a Stata file. Value labels are + mappings between the coded values and their corresponding descriptive labels + in a Stata dataset. + Returns ------- dict + A python dictionary. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. Examples -------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d8455f44ef0d1..81940613dd2b0 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -39,7 +39,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: **kwargs Keyword arguments to be passed to matplotlib.table.table. If `rowLabels` or `colLabels` is not specified, data index or column - name will be used. + names will be used. Returns ------- @@ -59,11 +59,11 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: >>> import matplotlib.pyplot as plt >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - >>> fix, ax = plt.subplots() + >>> fig, ax = plt.subplots() >>> ax.axis("off") (0.0, 1.0, 0.0, 1.0) >>> table = pd.plotting.table( - ... ax, df, loc="center", cellLoc="center", colWidths=list([0.2, 0.2]) + ... ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] ... ) """ plot_backend = _get_plot_backend("matplotlib") @@ -549,6 +549,10 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax """ Lag plot for time series. + A lag plot is a scatter plot of a time series against a lag of itself. It helps + in visualizing the temporal dependence between observations by plotting the values + at time `t` on the x-axis and the values at time `t + lag` on the y-axis. + Parameters ---------- series : Series @@ -563,6 +567,13 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib Axes object containing the lag plot. + + See Also + -------- + plotting.autocorrelation_plot : Autocorrelation plot for time series. + matplotlib.pyplot.scatter : A scatter plot of y vs. x with varying marker size + and/or color in Matplotlib. Examples -------- diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index dee0efcd8fd15..f0ab01e9e960e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -741,8 +741,9 @@ def test_apply_category_equalness(val): result = df.a.apply(lambda x: x == val) expected = Series( - [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + [False if pd.isnull(x) else x == val for x in df_values], name="a" ) + # False since behavior of NaN for categorical dtype has been changed (GH 59966) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 76704de6f2d10..9541b0b7495c7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -236,10 +236,10 @@ def test_apply_categorical_with_nan_values(series, by_row): with pytest.raises(AttributeError, match=msg): s.apply(lambda x: x.split("-")[0], by_row=by_row) return - - result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + # NaN for cat dtype fixed in (GH 59966) + result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.nan], dtype="category") + expected = Series(["1", "1", False], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index bd3298940ae3a..08bfd5b69fdd9 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -252,3 +252,7 @@ def test_with_column_named_sparse(self): # https://github.com/pandas-dev/pandas/issues/30758 df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) + + def test_subclassing(self): + df = tm.SubclassedDataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse.to_dense(), tm.SubclassedDataFrame) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 7670b53f23173..de5029b9f18b2 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -128,7 +128,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + [("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"] ), name="count", ) @@ -156,7 +156,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): pd.Index(["Anne", "Beth", "John"]), pd.Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[2, 0, 2, 1], [1, 2, 2, 0]], names=["first_name", "middle_name"], ), name="count", diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8f8f7f64aba75..8ca6593a19f20 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -255,10 +255,10 @@ def test_basic(education_df, request): index=MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), - ("US", "female", "high"), + ("FR", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), ], names=["country", "gender", "education"], ), @@ -472,11 +472,11 @@ def test_data_frame_value_counts( ( False, False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0, 1, 3, 5, 6, 7, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) @@ -518,7 +518,7 @@ def test_dropna_combinations( True, [1, 1], MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + [(1, 1), ("John", "Beth"), ("Smith", "Louise")], names=["key", "first_name", "middle_name"], ), ), @@ -531,7 +531,7 @@ def test_dropna_combinations( Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]], names=["key", "first_name", "middle_name"], ), ), @@ -609,17 +609,17 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_index = MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ], names=["country", "gender", "education"], ) @@ -711,17 +711,17 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] assert_categorical_single_grouper( @@ -791,23 +791,23 @@ def test_categorical_single_grouper_observed_false( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), ("ASIA", "male", "low"), ("ASIA", "male", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "high"), ] assert_categorical_single_grouper( @@ -837,8 +837,8 @@ def test_categorical_single_grouper_observed_false( ("US", "high", "male"), ("US", "low", "male"), ("US", "low", "female"), - ("US", "medium", "female"), ("US", "medium", "male"), + ("US", "medium", "female"), ], ), ( @@ -949,17 +949,17 @@ def test_categorical_non_groupers( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] expected_series = Series( data=expected_data, @@ -1178,7 +1178,7 @@ def test_value_counts_sort(sort, vc_sort, normalize): if sort and vc_sort: taker = [0, 1, 2] elif sort and not vc_sort: - taker = [0, 1, 2] + taker = [1, 0, 2] elif not sort and vc_sort: taker = [0, 2, 1] else: @@ -1219,3 +1219,25 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize): expected = expected.take(taker) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("groupby_sort", [True, False]) +def test_value_counts_all_na(sort, dropna, groupby_sort): + # GH#59989 + df = DataFrame({"a": [2, 1, 1], "b": np.nan}) + gb = df.groupby("a", sort=groupby_sort) + result = gb.value_counts(sort=sort, dropna=dropna) + + kwargs = {"levels": [[1, 2], [np.nan]], "names": ["a", "b"]} + if dropna: + data = [] + index = MultiIndex(codes=[[], []], **kwargs) + elif not groupby_sort and not sort: + data = [1, 2] + index = MultiIndex(codes=[[1, 0], [0, 0]], **kwargs) + else: + data = [2, 1] + index = MultiIndex(codes=[[0, 1], [0, 0]], **kwargs) + expected = Series(data, index=index, dtype="int64", name="count") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index cd3d599abd30e..0199e21bfc980 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -455,7 +455,7 @@ def test_insert_out_of_bounds(self, index, using_infer_string): msg = "slice indices must be integers or None or have an __index__ method" if using_infer_string and ( - index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + index.dtype == "string" or index.dtype == "category" ): msg = "loc must be an integer between" diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 89addbbbc1ded..e9fc2b2d27afd 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -886,8 +886,19 @@ def test_maybe_convert_css_to_tuples(self): expected = [] assert maybe_convert_css_to_tuples("") == expected + # issue #59623 + expected = [("a", "b"), ("c", "url('data:123')")] + assert maybe_convert_css_to_tuples("a:b;c: url('data:123');") == expected + + # if no value, return attr and empty string + expected = [("a", ""), ("c", "")] + assert maybe_convert_css_to_tuples("a:;c: ") == expected + def test_maybe_convert_css_to_tuples_err(self): - msg = "Styles supplied as string must follow CSS rule formats" + msg = ( + "Styles supplied as string must follow CSS rule formats, " + "for example 'attr: val;'. 'err' was given." + ) with pytest.raises(ValueError, match=msg): maybe_convert_css_to_tuples("err") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 82cc3a838ca68..0dc16e1ebc723 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -375,12 +375,29 @@ def test_repr_min_rows(self): (None, "{:.3f}", "None"), ("", "{:.2f}", ""), (112345.6789, "{:6.3f}", "112345.679"), + ("foo foo", None, "foo foo"), + (" foo", None, "foo"), + ( + "foo foo foo", + None, + "foo foo foo", + ), # odd no.of spaces + ( + "foo foo foo", + None, + "foo foo foo", + ), # even no.of spaces ], ) def test_repr_float_formatting_html_output( self, data, format_option, expected_values ): - with option_context("display.float_format", format_option.format): + if format_option is not None: + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + else: df = DataFrame({"A": [data]}) html_output = df._repr_html_() assert expected_values in html_output diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 585b7ca94f730..f3645bf0649bd 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -384,6 +384,21 @@ def test_timedelta(transform_assert_equal): assert_equal(result, expected) +@pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(1, "D"), + pd.Timestamp("2017-01-01T12"), + pd.Timestamp("2017-01-01T12", tz="US/Pacific"), + ], +) +def test_timedelta_timestamp_scalar(scalar): + # GH#59944 + result = to_numeric(scalar) + expected = to_numeric(Series(scalar))[0] + assert result == expected + + def test_period(request, transform_assert_equal): transform, assert_equal = transform_assert_equal diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index b5d975a0db1d8..bd741140f6542 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -114,6 +114,14 @@ class InvalidVersion(ValueError): """ An invalid version was found, users should refer to PEP 440. + The ``InvalidVersion`` exception is raised when a version string is + improperly formatted. Pandas uses this exception to ensure that all + version strings are PEP 440 compliant. + + See Also + -------- + util.version.Version : Class for handling and parsing version strings. + Examples -------- >>> pd.util.version.Version("1.") diff --git a/pyproject.toml b/pyproject.toml index d0fcdc4b21b33..1386546996bf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -324,7 +324,8 @@ ignore = [ "PT019", # The following rules may cause conflicts when used with the formatter: "ISC001", - + # if-stmt-min-max + "PLR1730", ### TODO: Enable gradually # Useless statement @@ -341,8 +342,10 @@ ignore = [ "RUF012", # type-comparison "E721", - - # Additional pylint rules + # repeated-equality-comparison + "PLR1714", + # self-or-cls-assignment + "PLW0642", # literal-membership "PLR6201", # 847 errors # Method could be a function, class method, or static method From 902601f3e86501dcec8aaf0c28cbc59a7f7ee6d6 Mon Sep 17 00:00:00 2001 From: veljanin <veljkojovanovic1991@gmail.com> Date: Mon, 14 Oct 2024 12:34:05 +0200 Subject: [PATCH 6/6] revised testing to resolve CI errors v2 --- pandas/tests/frame/methods/test_convert_dtypes.py | 12 ++++++------ pandas/tests/series/methods/test_convert_dtypes.py | 11 +++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 8a7052eb81b98..ab4f849c741fd 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -35,7 +37,7 @@ def test_convert_empty(self): empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) - @tm.skip_if_no("pyarrow") + @td.skip_if_no("pyarrow") def test_convert_empty_categorical_to_pyarrow(self): # GH#59934 df = pd.DataFrame( @@ -51,11 +53,9 @@ def test_convert_empty_categorical_to_pyarrow(self): assert converted.A.dtype == "category", "Dtype in column A is not 'category'" assert converted.B.dtype == "category", "Dtype in column B is not 'category'" assert converted.A.cat.categories.empty, "Categories in column A are not empty" - assert converted.B.cat.categories.__contains__( - "B1" - ) and converted.B.cat.categories.__contains__( - "B2" - ), "Categories in column B doesn't contain adequate categories" + assert converted.B.cat.categories.isin( + ["B1", "B2"] + ).all(), "Categories in column B doesn't contain adequate categories" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index fa9035f325d71..cb1fa11c9f7c4 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -4,6 +4,7 @@ import pytest from pandas._libs import lib +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -298,7 +299,7 @@ def test_convert_dtypes_pyarrow_null(self): expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) - @tm.skip_if_no("pyarrow") + @td.skip_if_no("pyarrow") def test_convert_empty_categorical_to_pyarrow(self): # GH#59934 ser1 = pd.Series(pd.Categorical([None] * 5)) @@ -311,8 +312,6 @@ def test_convert_empty_categorical_to_pyarrow(self): ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"])) converted2 = ser2.convert_dtypes(dtype_backend="pyarrow") - assert converted2.cat.categories.__contains__( - "S1" - ) and converted2.cat.categories.__contains__( - "S2" - ), "Categories in ser2 doesn't contain adequate categories" + assert converted2.cat.categories.isin( + ["S1", "S2"] + ).all(), "Categories in ser2 doesn't contain adequate categories"