Support pandas 2 (#1175)
* [wip] support pandas 2

* fix tests

* fix tests

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* uncomment tests

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* remove python 3.11 from tests

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* update pylint, add 3.11 back to ci

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* add 3.11 to nox

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* fix ci

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* fix ci

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug ci

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug ci

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* update

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* update

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* revert __str__ and __repr__ calls

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* fix docs

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* skip pyspark docs tests with pandas >=2

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* cleanup

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* Update ci-tests.yml

* Update ci-tests.yml

* debugging 3.11 support

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

* debug

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>

---------

Signed-off-by: Niels Bantilan <niels.bantilan@gmail.com>
cosmicBboy committed May 4, 2023
1 parent cd9ced7 commit 547aff1
Showing 27 changed files with 179 additions and 95 deletions.
50 changes: 30 additions & 20 deletions .github/workflows/ci-tests.yml
@@ -31,7 +31,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11"]
defaults:
run:
shell: bash -l {0}
@@ -100,11 +100,17 @@ jobs:
fail-fast: true
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.7", "3.8", "3.9", "3.10"]
pandas-version: ["1.3.0", "latest"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
pandas-version: ["1.3.0", "1.5.2", "2.0.1"]
exclude:
- python-version: "3.7"
pandas-version: "2.0.1"
- python-version: "3.7"
pandas-version: "1.5.2"
- python-version: "3.10"
pandas-version: "1.3.0"
- python-version: "3.11"
pandas-version: "1.3.0"
include:
- os: ubuntu-latest
pip-cache: ~/.cache/pip
@@ -113,7 +119,6 @@
- os: windows-latest
pip-cache: ~/AppData/Local/pip/Cache


steps:
- uses: actions/checkout@v2

@@ -139,7 +144,6 @@
with:
auto-update-conda: true
python-version: ${{ matrix.python-version }}
# mamba-version: "*"
miniforge-version: latest
miniforge-variant: Mambaforge
use-mamba: true
@@ -148,22 +152,26 @@
channel-priority: true
use-only-tar-bz2: true

- name: Install Conda Deps [Latest]
if: ${{ matrix.pandas-version == 'latest' }}
# ray currently cannot be installed on python 3.11
- name: Remove Ray from Deps
if: ${{ matrix.python-version == '3.11' && matrix.os != 'macos-latest' }}
run: sed -i '/ray/d' environment.yml

- name: Remove Ray from Deps
if: ${{ matrix.python-version == '3.11' && matrix.os == 'macos-latest' }}
run: sed -i .bak '/ray/d' environment.yml

# need to install pandas via pip: conda installation is on the fritz
- name: Install Conda Deps [pandas 2]
if: ${{ matrix.pandas-version == '2.0.1' }}
run: |
mamba install -c conda-forge asv pandas geopandas bokeh
mamba env update -n pandera-dev -f environment.yml
pip install pandas==${{ matrix.pandas-version }}
pip install --user dask>=2023.3.2
- name: Install Conda Deps
if: ${{ matrix.pandas-version != 'latest' }}
run: mamba install -c conda-forge pandas==${{ matrix.pandas-version }} geopandas

# ray currently cannot be installed on python 3.10, windows
- name: Remove Ray from Deps
if: ${{ matrix.os == 'windows-latest' && matrix.python-version == '3.10' }}
run: sed -i 's/^ray//g' requirements-dev.txt

- name: Install Pip Deps
if: ${{ matrix.pandas-version != '2.0.1' }}
run: |
mamba install -c conda-forge asv pandas==${{ matrix.pandas-version }} geopandas bokeh
mamba env update -n pandera-dev -f environment.yml
@@ -198,13 +206,15 @@ jobs:
run: pytest tests/geopandas ${{ env.PYTEST_FLAGS }}

- name: Unit Tests - Dask
if: ${{ matrix.pandas-version != '2.0.1' }}
run: pytest tests/dask ${{ env.PYTEST_FLAGS }}

- name: Unit Tests - Pyspark
if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.pandas-version != '1.1.5' }}
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }}
run: pytest tests/pyspark ${{ env.PYTEST_FLAGS }}

- name: Unit Tests - Modin-Dask
if: ${{ !contains(fromJson('["3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }}
run: pytest tests/modin ${{ env.PYTEST_FLAGS }}
env:
CI_MODIN_ENGINES: dask
@@ -214,7 +224,7 @@
# - windows, python 3.10
# - mac, python 3.7
# Tracking issue: https://github.com/modin-project/modin/issues/5466
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }}
run: pytest tests/modin ${{ env.PYTEST_FLAGS }}
env:
CI_MODIN_ENGINES: ray
@@ -223,9 +233,9 @@
uses: codecov/codecov-action@v3

- name: Check Docstrings
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) }}
run: nox ${{ env.NOX_FLAGS }} --session doctests

- name: Check Docs
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) }}
run: nox ${{ env.NOX_FLAGS }} --session docs
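Note on the step conditions above: several steps gate on a GitHub Actions expression of the form !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version). A hypothetical Python rendering of the Pyspark step's condition (illustration only, not part of the workflow) would be:

    import json

    def should_run_pyspark(os_name: str, python_version: str, pandas_version: str) -> bool:
        # Mirrors the workflow expression: skip on windows, on the listed
        # Python versions, and on pandas 2.0.1.
        skipped_pythons = json.loads('["3.7", "3.10", "3.11"]')
        return (
            os_name != "windows-latest"
            and python_version not in skipped_pythons
            and pandas_version != "2.0.1"
        )

    print(should_run_pyspark("ubuntu-latest", "3.9", "1.5.2"))   # True
    print(should_run_pyspark("ubuntu-latest", "3.11", "2.0.1"))  # False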
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
args: ["--line-length=79"]

- repo: https://github.com/pycqa/pylint
rev: v2.12.2
rev: v2.17.3
hooks:
- id: pylint
args: ["--disable=import-error"]
13 changes: 9 additions & 4 deletions .pylintrc
@@ -1,5 +1,5 @@
[BASIC]
ignore=mypy.py
ignore=mypy.py,noxfile.py
good-names=
T,
F,
@@ -21,12 +21,16 @@ good-names=
ge,
lt,
le,
dt
dt,
tz,
TBaseModel,
TArraySchemaBase,
TDataFrameModel,
_DataType

[MESSAGES CONTROL]
disable=
# C0330 conflicts with black: https://github.com/psf/black/issues/48
C0330,
R0913,
duplicate-code,
too-many-instance-attributes,
@@ -40,4 +44,5 @@ disable=
ungrouped-imports,
function-redefined,
arguments-differ,
no-self-use
unnecessary-dunder-call,
use-dict-literal
5 changes: 4 additions & 1 deletion docs/source/conf.py
@@ -70,7 +70,10 @@
SKIP = sys.version_info < (3, 6)
PY36 = sys.version_info < (3, 7)
SKIP_PANDAS_LT_V1 = version.parse(pd.__version__).release < (1, 0) or PY36
PANDAS_LT_V2 = version.parse(pd.__version__).release < (1, 0)
PANDAS_GT_V2 = version.parse(pd.__version__).release >= (2, 0)
SKIP_PANDAS_LT_V1 = PANDAS_LT_V2 or PY36
SKIP_PANDAS_LT_V1_OR_GT_V2 = PANDAS_LT_V2 or PANDAS_GT_V2 or PY36
SKIP_SCALING = True
SKIP_SCHEMA_MODEL = SKIP_PANDAS_LT_V1
SKIP_MODIN = True
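The new skip flags rely on version.parse(pd.__version__).release, which exposes the installed pandas version as a tuple of integers, so the doctest directives in the docs can be skipped on pandas >= 2. A minimal sketch of how those comparisons evaluate (standalone illustration, not part of conf.py):

    from packaging import version

    # .release yields an int tuple, so plain tuple comparisons gate on the
    # pandas major version, mirroring the flags added in conf.py above.
    for ver in ("0.25.3", "1.5.2", "2.0.1"):
        release = version.parse(ver).release
        print(ver, release, release < (1, 0), release >= (2, 0))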
4 changes: 2 additions & 2 deletions docs/source/lazy_validation.rst
@@ -118,7 +118,7 @@ catch these errors and inspect the failure cases in a more granular form:


.. testcode:: lazy_validation
:skipif: SKIP_PANDAS_LT_V1
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

try:
schema.validate(df, lazy=True)
@@ -129,7 +129,7 @@
print(err.data)

.. testoutput:: lazy_validation
:skipif: SKIP_PANDAS_LT_V1
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

Schema errors and failure cases:
schema_context column check check_number \
6 changes: 6 additions & 0 deletions docs/source/pyspark.rst
@@ -24,6 +24,7 @@ below we'll use the :ref:`class-based API <dataframe_models>` to define a
:py:class:`~pandera.api.pandas.model.DataFrameModel` for validation.

.. testcode:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

import pyspark.pandas as ps
import pandas as pd
@@ -57,6 +58,7 @@


.. testoutput:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

state city price
0 FL Orlando 8
@@ -72,6 +74,7 @@ pyspark pandas dataframes at runtime:


.. testcode:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

@pa.check_types
def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
@@ -81,6 +84,7 @@


.. testoutput:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

state city price
3 CA San Francisco 16
@@ -92,6 +96,7 @@ And of course, you can use the object-based API to validate dask dataframes:


.. testcode:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

schema = pa.DataFrameSchema({
"state": pa.Column(str),
@@ -102,6 +107,7 @@ And of course, you can use the object-based API to validate dask dataframes:


.. testoutput:: scaling_pyspark
:skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

state city price
0 FL Orlando 8
2 changes: 1 addition & 1 deletion environment.yml
@@ -49,7 +49,7 @@ dependencies:
# testing
- isort >= 5.7.0
- mypy <= 0.982
- pylint = 2.12.2
- pylint <= 2.17.3
- pytest
- pytest-cov
- pytest-xdist
8 changes: 5 additions & 3 deletions noxfile.py
@@ -7,8 +7,10 @@
from typing import Dict, List

# setuptools must be imported before distutils !
import setuptools # pylint:disable=unused-import # noqa: F401
from distutils.core import run_setup # pylint:disable=wrong-import-order
import setuptools
from distutils.core import (
run_setup,
)

import nox
from nox import Session
@@ -24,7 +26,7 @@
)

DEFAULT_PYTHON = "3.8"
PYTHON_VERSIONS = ["3.8", "3.9", "3.10"]
PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
PANDAS_VERSIONS = ["1.2.0", "1.3.5", "latest"]

PACKAGE = "pandera"
2 changes: 0 additions & 2 deletions pandera/api/pandas/model_components.py
@@ -221,8 +221,6 @@ def _check_dispatch():
class CheckInfo(BaseCheckInfo): # pylint:disable=too-few-public-methods
"""Captures extra information about a Check."""

...


class FieldCheckInfo(CheckInfo): # pylint:disable=too-few-public-methods
"""Captures extra information about a Check assigned to a field."""
3 changes: 0 additions & 3 deletions pandera/backends/base/__init__.py
@@ -29,8 +29,6 @@ class CoreCheckResult(NamedTuple):
class CoreParserResult(NamedTuple):
"""Namedtuple for holding core parser results."""

...


class BaseSchemaBackend(ABC):
"""Abstract base class for a schema backend implementation."""
@@ -132,7 +130,6 @@ class BaseCheckBackend(ABC):

def __init__(self, check): # pylint: disable=unused-argument
"""Initializes a check backend object."""
...

def __call__(self, check_obj, key=None):
raise NotImplementedError
9 changes: 7 additions & 2 deletions pandera/backends/pandas/checks.py
@@ -56,8 +56,13 @@ def _format_groupby_input(
# NOTE: this behavior should be deprecated such that the user deals with
# pandas groupby objects instead of dicts.
if groups is None:
return dict(list(groupby_obj)) # type: ignore [call-overload]
group_keys = set(group_key for group_key, _ in groupby_obj) # type: ignore [union-attr]
return {
(k if isinstance(k, bool) else k[0] if len(k) == 1 else k): v
for k, v in groupby_obj # type: ignore [union-attr]
}
group_keys = set(
k[0] if len(k) == 1 else k for k, _ in groupby_obj # type: ignore [union-attr]
)
invalid_groups = [g for g in groups if g not in group_keys]
if invalid_groups:
raise KeyError(
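The _format_groupby_input change accounts for a pandas 2 behavior change: iterating over a groupby keyed on a single-element list now yields length-1 tuples as group keys, where older pandas yielded scalars, so the keys are unwrapped before comparing against user-supplied group names. A standalone sketch of the behavior difference (hypothetical data, not pandera code):

    import pandas as pd

    df = pd.DataFrame({"col": ["a", "a", "b"], "x": [1, 2, 3]})

    # Under pandas 2 this yields ("a",), ("b",); older pandas yields "a", "b".
    raw_keys = [key for key, _ in df.groupby(["col"])]

    # Same unwrapping idea as in the diff: collapse 1-tuples back to scalars.
    keys = [k[0] if isinstance(k, tuple) and len(k) == 1 else k for k in raw_keys]
    print(raw_keys, keys)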
6 changes: 3 additions & 3 deletions pandera/engines/engine.py
@@ -62,21 +62,21 @@ class Engine(ABCMeta):
_registered_dtypes: Set[Type[DataType]]
_base_pandera_dtypes: Tuple[Type[DataType]]

def __new__(cls, name, bases, namespace, **kwargs):
def __new__(mcs, name, bases, namespace, **kwargs):
base_pandera_dtypes = kwargs.pop("base_pandera_dtypes")
try:
namespace["_base_pandera_dtypes"] = tuple(base_pandera_dtypes)
except TypeError:
namespace["_base_pandera_dtypes"] = (base_pandera_dtypes,)

namespace["_registered_dtypes"] = set()
engine = super().__new__(cls, name, bases, namespace, **kwargs)
engine = super().__new__(mcs, name, bases, namespace, **kwargs)

@functools.singledispatch
def dtype(data_type: Any) -> DataType:
raise ValueError(f"Data type '{data_type}' not understood")

cls._registry[engine] = _DtypeRegistry(dispatch=dtype, equivalents={})
mcs._registry[engine] = _DtypeRegistry(dispatch=dtype, equivalents={})
return engine

def _check_source_dtype(cls, data_type: Any) -> None:
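Renaming the first argument of Engine.__new__ from cls to mcs follows the common naming convention for a metaclass's __new__ (likely prompted by the pylint upgrade elsewhere in this commit); behavior is unchanged. A minimal sketch of the convention with a toy registry, not pandera's actual Engine:

    from abc import ABCMeta

    class RegistryMeta(ABCMeta):
        _registry: dict = {}

        def __new__(mcs, name, bases, namespace, **kwargs):
            # "mcs" (the metaclass) is the conventional first-argument name
            # for a metaclass __new__, analogous to "cls" on a classmethod.
            new_cls = super().__new__(mcs, name, bases, namespace, **kwargs)
            mcs._registry[name] = new_cls
            return new_cls

    class Example(metaclass=RegistryMeta):
        pass

    print(RegistryMeta._registry["Example"] is Example)  # True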
14 changes: 8 additions & 6 deletions pandera/engines/pandas_engine.py
@@ -825,29 +825,31 @@ def _coerce(

def _to_datetime(col: PandasObject) -> PandasObject:
col = to_datetime_fn(col, **self.to_datetime_kwargs)
if hasattr(pandas_dtype, "tz") and pandas_dtype.tz is not None:
pdtype_tz = getattr(pandas_dtype, "tz", None)
coltype_tz = getattr(col.dtype, "tz", None)
if pdtype_tz is not None or coltype_tz is not None:
if hasattr(col, "dt"):
if col.dt.tz is None:
# localize datetime column so that it's timezone-aware
col = col.dt.tz_localize(
pandas_dtype.tz,
pdtype_tz,
**_tz_localize_kwargs,
)
else:
col = col.dt.tz_convert(pandas_dtype.tz)
col = col.dt.tz_convert(pdtype_tz)
elif (
hasattr(col, "tz")
and col.tz != pandas_dtype.tz
and col.tz != pdtype_tz
and hasattr(col, "tz_localize")
):
if col.tz is None:
# localize datetime index so that it's timezone-aware
col = col.tz_localize(
pandas_dtype.tz,
pdtype_tz,
**_tz_localize_kwargs,
)
else:
col = col.tz_convert(pandas_dtype.tz)
col = col.tz_convert(pdtype_tz)
return col.astype(pandas_dtype)

if isinstance(data_container, pd.DataFrame):
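The coercion logic now looks up a timezone on both the target dtype and the column's current dtype, so the localize/convert branch also runs when only the column is timezone-aware. A rough standalone sketch of the localize-versus-convert decision (hypothetical helper, not the pandera method):

    import pandas as pd

    def coerce_tz(col: pd.Series, target_tz) -> pd.Series:
        # Naive datetimes get localized; tz-aware ones get converted.
        if col.dt.tz is None:
            return col.dt.tz_localize(target_tz)
        return col.dt.tz_convert(target_tz)

    naive = pd.Series(pd.to_datetime(["2023-05-04 12:00:00"]))
    aware = naive.dt.tz_localize("UTC")
    print(coerce_tz(naive, "UTC").dtype)          # datetime64[ns, UTC]
    print(coerce_tz(aware, "US/Eastern").dtype)   # datetime64[ns, US/Eastern]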
7 changes: 3 additions & 4 deletions pandera/io/pandas_io.py
@@ -56,18 +56,17 @@ def _serialize_check_stats(check_stats, dtype=None):
"""Serialize check statistics into json/yaml-compatible format."""

def handle_stat_dtype(stat):

if pandas_engine.Engine.dtype(dtypes.DateTime).check(
dtype
) and hasattr(stat, "strftime"):
# try serializing stat as a string if it's datetime-like,
# otherwise return original value
return stat.strftime(DATETIME_FORMAT)
elif pandas_engine.Engine.dtype(dtypes.Timedelta).check(
dtype
) and hasattr(stat, "delta"):
elif pandas_engine.Engine.dtype(dtypes.Timedelta).check(dtype):
# try serializing stat into an int in nanoseconds if it's
# timedelta-like, otherwise return original value
return stat.delta
return getattr(stat, "value", stat)

return stat

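The serialization tweak replaces the Timedelta.delta attribute, which pandas 2 removed, with a getattr on .value (total nanoseconds), falling back to the original stat for non-Timedelta values. A small illustration, assuming a plain pd.Timedelta:

    import pandas as pd

    stat = pd.Timedelta("1 day")
    # .value reports total nanoseconds in both pandas 1.x and 2.x;
    # getattr falls back to the original object for non-Timedelta stats.
    print(getattr(stat, "value", stat))  # 86400000000000
    print(getattr(42, "value", 42))      # 42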
