From 547aff1672fe455741f380c8bec1ed648074effc Mon Sep 17 00:00:00 2001
From: Niels Bantilan
Date: Thu, 4 May 2023 17:45:51 -0400
Subject: [PATCH] Support pandas 2 (#1175)

* [wip] support pandas 2
* fix tests
* fix tests
Signed-off-by: Niels Bantilan
* uncomment tests
Signed-off-by: Niels Bantilan
* remove python 3.11 from tests
Signed-off-by: Niels Bantilan
* update pylint, add 3.11 back to ci
Signed-off-by: Niels Bantilan
* add 3.11 to nox
Signed-off-by: Niels Bantilan
* fix ci
Signed-off-by: Niels Bantilan
* fix ci
Signed-off-by: Niels Bantilan
* debug ci
Signed-off-by: Niels Bantilan
* debug ci
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* update
Signed-off-by: Niels Bantilan
* update
Signed-off-by: Niels Bantilan
* revert __str__ and __repr__ calls
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* fix docs
Signed-off-by: Niels Bantilan
* skip pyspark docs tests with pandas >=2
Signed-off-by: Niels Bantilan
* cleanup
Signed-off-by: Niels Bantilan
* Update ci-tests.yml
* Update ci-tests.yml
* debugging 3.11 support
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan
* debug
Signed-off-by: Niels Bantilan

---------

Signed-off-by: Niels Bantilan
---
 .github/workflows/ci-tests.yml          | 50 +++++++++++++----------
 .pre-commit-config.yaml                 |  2 +-
 .pylintrc                               | 13 ++++--
 docs/source/conf.py                     |  5 ++-
 docs/source/lazy_validation.rst         |  4 +-
 docs/source/pyspark.rst                 |  6 +++
 environment.yml                         |  2 +-
 noxfile.py                              |  8 ++--
 pandera/api/pandas/model_components.py  |  2 -
 pandera/backends/base/__init__.py       |  3 --
 pandera/backends/pandas/checks.py       |  9 ++++-
 pandera/engines/engine.py               |  6 +--
 pandera/engines/pandas_engine.py        | 14 ++++---
 pandera/io/pandas_io.py                 |  7 ++--
 pandera/strategies/pandas_strategies.py | 53 +++++++++++++++++++------
 pandera/typing/common.py                |  2 +-
 requirements-dev.txt                    |  2 +-
 setup.py                                |  3 +-
 tests/conftest.py                       | 10 +++++
 tests/core/test_checks.py               |  2 -
 tests/core/test_decorators.py           |  8 ++--
 tests/core/test_dtypes.py               | 17 +++++++-
 tests/core/test_engine.py               |  2 +-
 tests/core/test_pandas_engine.py        |  8 ++--
 tests/io/test_io.py                     |  2 -
 tests/modin/conftest.py                 |  1 -
 tests/strategies/test_strategies.py     | 33 +++++++++------
 27 files changed, 179 insertions(+), 95 deletions(-)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 324b445f6..ee8565027 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     defaults:
       run:
         shell: bash -l {0}
@@ -100,11 +100,17 @@ jobs:
       fail-fast: true
       matrix:
["ubuntu-latest", "macos-latest", "windows-latest"] - python-version: ["3.7", "3.8", "3.9", "3.10"] - pandas-version: ["1.3.0", "latest"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + pandas-version: ["1.3.0", "1.5.2", "2.0.1"] exclude: + - python-version: "3.7" + pandas-version: "2.0.1" + - python-version: "3.7" + pandas-version: "1.5.2" - python-version: "3.10" pandas-version: "1.3.0" + - python-version: "3.11" + pandas-version: "1.3.0" include: - os: ubuntu-latest pip-cache: ~/.cache/pip @@ -113,7 +119,6 @@ jobs: - os: windows-latest pip-cache: ~/AppData/Local/pip/Cache - steps: - uses: actions/checkout@v2 @@ -139,7 +144,6 @@ jobs: with: auto-update-conda: true python-version: ${{ matrix.python-version }} - # mamba-version: "*" miniforge-version: latest miniforge-variant: Mambaforge use-mamba: true @@ -148,22 +152,26 @@ jobs: channel-priority: true use-only-tar-bz2: true - - name: Install Conda Deps [Latest] - if: ${{ matrix.pandas-version == 'latest' }} + # ray currently cannot be installed on python 3.11 + - name: Remove Ray from Deps + if: ${{ matrix.python-version == '3.11' && matrix.os != 'macos-latest' }} + run: sed -i '/ray/d' environment.yml + + - name: Remove Ray from Deps + if: ${{ matrix.python-version == '3.11' && matrix.os == 'macos-latest' }} + run: sed -i .bak '/ray/d' environment.yml + + # need to install pandas via pip: conda installation is on the fritz + - name: Install Conda Deps [pandas 2] + if: ${{ matrix.pandas-version == '2.0.1' }} run: | mamba install -c conda-forge asv pandas geopandas bokeh mamba env update -n pandera-dev -f environment.yml + pip install pandas==${{ matrix.pandas-version }} + pip install --user dask>=2023.3.2 - name: Install Conda Deps - if: ${{ matrix.pandas-version != 'latest' }} - run: mamba install -c conda-forge pandas==${{ matrix.pandas-version }} geopandas - - # ray currently cannot be installed on python 3.10, windows - - name: Remove Ray from Deps - if: ${{ matrix.os == 'windows-latest' && matrix.python-version == '3.10' }} - run: sed -i 's/^ray//g' requirements-dev.txt - - - name: Install Pip Deps + if: ${{ matrix.pandas-version != '2.0.1' }} run: | mamba install -c conda-forge asv pandas==${{ matrix.pandas-version }} geopandas bokeh mamba env update -n pandera-dev -f environment.yml @@ -198,13 +206,15 @@ jobs: run: pytest tests/geopandas ${{ env.PYTEST_FLAGS }} - name: Unit Tests - Dask + if: ${{ matrix.pandas-version != '2.0.1' }} run: pytest tests/dask ${{ env.PYTEST_FLAGS }} - name: Unit Tests - Pyspark - if: ${{ matrix.os != 'windows-latest' && matrix.python-version != '3.10' && matrix.pandas-version != '1.1.5' }} + if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }} run: pytest tests/pyspark ${{ env.PYTEST_FLAGS }} - name: Unit Tests - Modin-Dask + if: ${{ !contains(fromJson('["3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }} run: pytest tests/modin ${{ env.PYTEST_FLAGS }} env: CI_MODIN_ENGINES: dask @@ -214,7 +224,7 @@ jobs: # - windows, python 3.10 # - mac, python 3.7 # Tracking issue: https://github.com/modin-project/modin/issues/5466 - if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }} + if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) && matrix.pandas-version != '2.0.1' }} run: pytest tests/modin ${{ env.PYTEST_FLAGS }} env: CI_MODIN_ENGINES: ray @@ -223,9 +233,9 @@ jobs: uses: 
       - uses: codecov/codecov-action@v3

       - name: Check Docstrings
-        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
+        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) }}
         run: nox ${{ env.NOX_FLAGS }} --session doctests

       - name: Check Docs
-        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10"]'), matrix.python-version) }}
+        if: ${{ matrix.os != 'windows-latest' && !contains(fromJson('["3.7", "3.10", "3.11"]'), matrix.python-version) }}
         run: nox ${{ env.NOX_FLAGS }} --session docs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3f446e842..580cacfb6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
       args: ["--line-length=79"]

   - repo: https://github.com/pycqa/pylint
-    rev: v2.12.2
+    rev: v2.17.3
     hooks:
       - id: pylint
         args: ["--disable=import-error"]
diff --git a/.pylintrc b/.pylintrc
index 94cf9acca..6bed87754 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
 [BASIC]
-ignore=mypy.py
+ignore=mypy.py,noxfile.py
 good-names=
     T,
     F,
@@ -21,12 +21,16 @@ good-names=
     ge,
     lt,
     le,
-    dt
+    dt,
+    tz,
+    TBaseModel,
+    TArraySchemaBase,
+    TDataFrameModel,
+    _DataType

 [MESSAGES CONTROL]
 disable=
     # C0330 conflicts with black: https://github.com/psf/black/issues/48
-    C0330,
     R0913,
     duplicate-code,
     too-many-instance-attributes,
@@ -40,4 +44,5 @@ disable=
     ungrouped-imports,
     function-redefined,
     arguments-differ,
-    no-self-use
+    unnecessary-dunder-call,
+    use-dict-literal
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 4c85c3c02..e546e9696 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -70,7 +70,10 @@
 SKIP = sys.version_info < (3, 6)
 PY36 = sys.version_info < (3, 7)
-SKIP_PANDAS_LT_V1 = version.parse(pd.__version__).release < (1, 0) or PY36
+PANDAS_LT_V2 = version.parse(pd.__version__).release < (1, 0)
+PANDAS_GT_V2 = version.parse(pd.__version__).release >= (2, 0)
+SKIP_PANDAS_LT_V1 = PANDAS_LT_V2 or PY36
+SKIP_PANDAS_LT_V1_OR_GT_V2 = PANDAS_LT_V2 or PANDAS_GT_V2 or PY36
 SKIP_SCALING = True
 SKIP_SCHEMA_MODEL = SKIP_PANDAS_LT_V1
 SKIP_MODIN = True
diff --git a/docs/source/lazy_validation.rst b/docs/source/lazy_validation.rst
index e8fb363c9..2554821c7 100644
--- a/docs/source/lazy_validation.rst
+++ b/docs/source/lazy_validation.rst
@@ -118,7 +118,7 @@ catch these errors and inspect the failure cases in a more granular form:

 .. testcode:: lazy_validation
-    :skipif: SKIP_PANDAS_LT_V1
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

     try:
         schema.validate(df, lazy=True)
     except pa.errors.SchemaErrors as err:
         ...
         print(err.data)

 .. testoutput:: lazy_validation
-    :skipif: SKIP_PANDAS_LT_V1
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

     Schema errors and failure cases:
        schema_context  column  check  check_number  \
diff --git a/docs/source/pyspark.rst b/docs/source/pyspark.rst
index a72c4bc04..f925bccd0 100644
--- a/docs/source/pyspark.rst
+++ b/docs/source/pyspark.rst
@@ -24,6 +24,7 @@ below we'll use the :ref:`class-based API ` to define a
 :py:class:`~pandera.api.pandas.model.DataFrameModel` for validation.

 .. testcode:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

     import pyspark.pandas as ps
     import pandas as pd
@@ -57,6 +58,7 @@ below we'll use the :ref:`class-based API ` to define a

 .. testoutput:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

       state     city  price
     0    FL  Orlando      8
@@ -72,6 +74,7 @@ pyspark pandas dataframes at runtime:

 .. testcode:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

     @pa.check_types
     def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
         return df
@@ -81,6 +84,7 @@ pyspark pandas dataframes at runtime:

 .. testoutput:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

       state           city  price
     3    CA  San Francisco     16
@@ -92,6 +96,7 @@ And of course, you can use the object-based API to validate dask dataframes:

 .. testcode:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

     schema = pa.DataFrameSchema({
         "state": pa.Column(str),
@@ -102,6 +107,7 @@ And of course, you can use the object-based API to validate dask dataframes:

 .. testoutput:: scaling_pyspark
+    :skipif: SKIP_PANDAS_LT_V1_OR_GT_V2

       state     city  price
     0    FL  Orlando      8
diff --git a/environment.yml b/environment.yml
index 4ddbfddc7..ea653c0bb 100644
--- a/environment.yml
+++ b/environment.yml
@@ -49,7 +49,7 @@ dependencies:
   # testing
   - isort >= 5.7.0
   - mypy <= 0.982
-  - pylint = 2.12.2
+  - pylint <= 2.17.3
   - pytest
   - pytest-cov
   - pytest-xdist
diff --git a/noxfile.py b/noxfile.py
index e09bf3cf1..0981f60dc 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -7,8 +7,10 @@ from typing import Dict, List

 # setuptools must be imported before distutils !
-import setuptools  # pylint:disable=unused-import  # noqa: F401
-from distutils.core import run_setup  # pylint:disable=wrong-import-order
+import setuptools
+from distutils.core import (
+    run_setup,
+)

 import nox
 from nox import Session
@@ -24,7 +26,7 @@
 )

 DEFAULT_PYTHON = "3.8"
-PYTHON_VERSIONS = ["3.8", "3.9", "3.10"]
+PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
 PANDAS_VERSIONS = ["1.2.0", "1.3.5", "latest"]

 PACKAGE = "pandera"
diff --git a/pandera/api/pandas/model_components.py b/pandera/api/pandas/model_components.py
index e86af0168..a996a4afe 100644
--- a/pandera/api/pandas/model_components.py
+++ b/pandera/api/pandas/model_components.py
@@ -221,8 +221,6 @@ def _check_dispatch():
 class CheckInfo(BaseCheckInfo):  # pylint:disable=too-few-public-methods
     """Captures extra information about a Check."""

-    ...
-

 class FieldCheckInfo(CheckInfo):  # pylint:disable=too-few-public-methods
     """Captures extra information about a Check assigned to a field."""
diff --git a/pandera/backends/base/__init__.py b/pandera/backends/base/__init__.py
index e92b1147d..d18e73acb 100644
--- a/pandera/backends/base/__init__.py
+++ b/pandera/backends/base/__init__.py
@@ -29,8 +29,6 @@ class CoreCheckResult(NamedTuple):
 class CoreParserResult(NamedTuple):
     """Namedtuple for holding core parser results."""

-    ...
-

 class BaseSchemaBackend(ABC):
     """Abstract base class for a schema backend implementation."""
@@ -132,7 +130,6 @@ class BaseCheckBackend(ABC):

     def __init__(self, check):  # pylint: disable=unused-argument
         """Initializes a check backend object."""
-        ...

     def __call__(self, check_obj, key=None):
         raise NotImplementedError
diff --git a/pandera/backends/pandas/checks.py b/pandera/backends/pandas/checks.py
index 682a0e893..886b4e7c2 100644
--- a/pandera/backends/pandas/checks.py
+++ b/pandera/backends/pandas/checks.py
@@ -56,8 +56,13 @@ def _format_groupby_input(
     # NOTE: this behavior should be deprecated such that the user deals with
     # pandas groupby objects instead of dicts.
     if groups is None:
-        return dict(list(groupby_obj))  # type: ignore [call-overload]
-    group_keys = set(group_key for group_key, _ in groupby_obj)  # type: ignore [union-attr]
+        return {
+            (k if isinstance(k, bool) else k[0] if len(k) == 1 else k): v
+            for k, v in groupby_obj  # type: ignore [union-attr]
+        }
+    group_keys = set(
+        k[0] if len(k) == 1 else k for k, _ in groupby_obj  # type: ignore [union-attr]
+    )
     invalid_groups = [g for g in groups if g not in group_keys]
     if invalid_groups:
         raise KeyError(
diff --git a/pandera/engines/engine.py b/pandera/engines/engine.py
index 32981da80..44193e802 100644
--- a/pandera/engines/engine.py
+++ b/pandera/engines/engine.py
@@ -62,7 +62,7 @@ class Engine(ABCMeta):
     _registered_dtypes: Set[Type[DataType]]
     _base_pandera_dtypes: Tuple[Type[DataType]]

-    def __new__(cls, name, bases, namespace, **kwargs):
+    def __new__(mcs, name, bases, namespace, **kwargs):
         base_pandera_dtypes = kwargs.pop("base_pandera_dtypes")
         try:
             namespace["_base_pandera_dtypes"] = tuple(base_pandera_dtypes)
@@ -70,13 +70,13 @@ class Engine(ABCMeta):
             namespace["_base_pandera_dtypes"] = (base_pandera_dtypes,)

         namespace["_registered_dtypes"] = set()
-        engine = super().__new__(cls, name, bases, namespace, **kwargs)
+        engine = super().__new__(mcs, name, bases, namespace, **kwargs)

         @functools.singledispatch
         def dtype(data_type: Any) -> DataType:
             raise ValueError(f"Data type '{data_type}' not understood")

-        cls._registry[engine] = _DtypeRegistry(dispatch=dtype, equivalents={})
+        mcs._registry[engine] = _DtypeRegistry(dispatch=dtype, equivalents={})
         return engine

     def _check_source_dtype(cls, data_type: Any) -> None:
diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py
index 59db0e818..01ec0ee4d 100644
--- a/pandera/engines/pandas_engine.py
+++ b/pandera/engines/pandas_engine.py
@@ -825,29 +825,31 @@ def _coerce(
         def _to_datetime(col: PandasObject) -> PandasObject:
             col = to_datetime_fn(col, **self.to_datetime_kwargs)
-            if hasattr(pandas_dtype, "tz") and pandas_dtype.tz is not None:
+            pdtype_tz = getattr(pandas_dtype, "tz", None)
+            coltype_tz = getattr(col.dtype, "tz", None)
+            if pdtype_tz is not None or coltype_tz is not None:
                 if hasattr(col, "dt"):
                     if col.dt.tz is None:
                         # localize datetime column so that it's timezone-aware
                         col = col.dt.tz_localize(
-                            pandas_dtype.tz,
+                            pdtype_tz,
                             **_tz_localize_kwargs,
                         )
                     else:
-                        col = col.dt.tz_convert(pandas_dtype.tz)
+                        col = col.dt.tz_convert(pdtype_tz)
                 elif (
                     hasattr(col, "tz")
-                    and col.tz != pandas_dtype.tz
+                    and col.tz != pdtype_tz
                     and hasattr(col, "tz_localize")
                 ):
                     if col.tz is None:
                         # localize datetime index so that it's timezone-aware
                         col = col.tz_localize(
-                            pandas_dtype.tz,
+                            pdtype_tz,
                             **_tz_localize_kwargs,
                         )
                     else:
-                        col = col.tz_convert(pandas_dtype.tz)
+                        col = col.tz_convert(pdtype_tz)
             return col.astype(pandas_dtype)

         if isinstance(data_container, pd.DataFrame):
diff --git a/pandera/io/pandas_io.py b/pandera/io/pandas_io.py
index 7ad274bdf..8bf16aa83 100644
--- a/pandera/io/pandas_io.py
+++ b/pandera/io/pandas_io.py
@@ -56,18 +56,17 @@ def _serialize_check_stats(check_stats, dtype=None):
     """Serialize check statistics into json/yaml-compatible format."""

     def handle_stat_dtype(stat):
+
         if pandas_engine.Engine.dtype(dtypes.DateTime).check(
             dtype
         ) and hasattr(stat, "strftime"):
             # try serializing stat as a string if it's datetime-like,
             # otherwise return original value
             return stat.strftime(DATETIME_FORMAT)
-        elif pandas_engine.Engine.dtype(dtypes.Timedelta).check(
-            dtype
hasattr(stat, "delta"): + elif pandas_engine.Engine.dtype(dtypes.Timedelta).check(dtype): # try serializing stat into an int in nanoseconds if it's # timedelta-like, otherwise return original value - return stat.delta + return getattr(stat, "value", stat) return stat diff --git a/pandera/strategies/pandas_strategies.py b/pandera/strategies/pandas_strategies.py index 682e34cda..6c63bf0ad 100644 --- a/pandera/strategies/pandas_strategies.py +++ b/pandera/strategies/pandas_strategies.py @@ -245,6 +245,34 @@ def _to_datetime(value) -> pd.DatetimeTZDtype: return st.builds(dtype.type, strategy, res) +def convert_dtype(array: Union[pd.Series, pd.Index], col_dtype: Any): + """Convert datatypes of an array (series or index).""" + if str(col_dtype).startswith("datetime64"): + try: + return array.astype(col_dtype) + except TypeError: + tz = getattr(col_dtype, "tz", None) + if tz is None: + tz_match = re.match(r"datetime64\[ns, (.+)\]", str(col_dtype)) + tz = None if not tz_match else tz_match.group(1) + + if isinstance(array, pd.Index): + return array.tz_localize(tz) # type: ignore [attr-defined] + return array.dt.tz_localize(tz) # type: ignore [union-attr] + return array.astype(col_dtype) + + +def convert_dtypes(df: pd.DataFrame, col_dtypes: Dict[str, Any]): + """Convert datatypes of a dataframe.""" + if df.empty: + return df + + for col_name, col_dtype in col_dtypes.items(): + df[col_name] = convert_dtype(df[col_name], col_dtype) + + return df + + def numpy_time_dtypes( dtype: Union[np.dtype, pd.DatetimeTZDtype], min_value=None, max_value=None ): @@ -788,12 +816,6 @@ def undefined_check_strategy(elements, check): if elements is None: elements = pandas_dtype_strategy(pandera_dtype) - # Hypothesis only supports pure numpy datetime64 (i.e. timezone naive). - # We cast to datetime64 after applying the check strategy so that checks - # can see timezone-aware values. - if _is_datetime_tz(pandera_dtype): - elements = _timestamp_to_datetime64_strategy(elements) - return elements @@ -821,10 +843,19 @@ def series_strategy( :returns: ``hypothesis`` strategy. """ elements = field_element_strategy(pandera_dtype, strategy, checks=checks) + + dtype = ( + None + # let hypothesis use the elements strategy to build datatime-aware + # series + if _is_datetime_tz(pandera_dtype) + else to_numpy_dtype(pandera_dtype) + ) + strategy = ( pdst.series( elements=elements, - dtype=to_numpy_dtype(pandera_dtype), + dtype=dtype, index=pdst.range_indexes( min_size=0 if size is None else size, max_size=size ), @@ -832,7 +863,7 @@ def series_strategy( ) .filter(lambda x: x.shape[0] > 0) .map(lambda x: x.rename(name)) - .map(lambda x: x.astype(pandera_dtype.type)) + .map(partial(convert_dtype, col_dtype=pandera_dtype.type)) ) if nullable: strategy = null_field_masks(strategy) @@ -920,7 +951,7 @@ def index_strategy( min_size=0 if size is None else size, max_size=size, unique=bool(unique), - ).map(lambda x: x.astype(pandera_dtype.type)) + ).map(partial(convert_dtype, col_dtype=pandera_dtype.type)) # this is a hack to convert np.str_ data values into native python str. 
     col_dtype = str(pandera_dtype)
@@ -1135,9 +1166,7 @@ def _dataframe_strategy(draw):
             )
         )

-        strategy = strategy.map(
-            lambda df: df if df.empty else df.astype(col_dtypes)
-        )
+        strategy = strategy.map(partial(convert_dtypes, col_dtypes=col_dtypes))

         if size is not None and size > 0 and any(nullable_columns.values()):
             strategy = null_dataframe_masks(strategy, nullable_columns)
diff --git a/pandera/typing/common.py b/pandera/typing/common.py
index cd98e6c95..1145fefc2 100644
--- a/pandera/typing/common.py
+++ b/pandera/typing/common.py
@@ -134,7 +134,7 @@
     ],
 )

-DataFrameModel = TypeVar("Schema", bound="DataFrameModel")  # type: ignore
+DataFrameModel = TypeVar("DataFrameModel", bound="DataFrameModel")  # type: ignore
 # pylint:disable=invalid-name
diff --git a/requirements-dev.txt b/requirements-dev.txt
index ef5f8aeac..499470ad7 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -28,7 +28,7 @@ fastapi
 black >= 22.1.0
 isort >= 5.7.0
 mypy <= 0.982
-pylint == 2.12.2
+pylint <= 2.17.3
 pytest
 pytest-cov
 pytest-xdist
diff --git a/setup.py b/setup.py
index 22a4a4ed2..df9ffa483 100644
--- a/setup.py
+++ b/setup.py
@@ -56,7 +56,7 @@
         "wrapt",
     ],
     extras_require=extras_require,
-    python_requires=">=3.7",
+    python_requires=">=3.7,<=3.11",
     platforms="any",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
@@ -69,6 +69,7 @@
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Scientific/Engineering",
     ],
 )
diff --git a/tests/conftest.py b/tests/conftest.py
index 07e327a63..d359f2776 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,8 @@

 import os

+from pandera.engines.utils import pandas_version
+
 try:
     # pylint: disable=unused-import
     import hypothesis  # noqa F401
@@ -13,6 +15,14 @@

 # ignore test files associated with hypothesis strategies
 collect_ignore = []
+collect_ignore_glob = []
+
+# ignore pyspark, modin and dask tests until these libraries support pandas 2
+if pandas_version().release >= (2, 0, 0):
+    collect_ignore_glob.append("pyspark/**")
+    collect_ignore_glob.append("modin/**")
+    collect_ignore_glob.append("dask/**")
+
 if not HAS_HYPOTHESIS:
     collect_ignore.append("test_strategies.py")
 else:
diff --git a/tests/core/test_checks.py b/tests/core/test_checks.py
index d8e4f9c05..a7a17edcf 100644
--- a/tests/core/test_checks.py
+++ b/tests/core/test_checks.py
@@ -495,8 +495,6 @@ def test_check_backend_not_found():
     class CustomDataObject:
         """Custom data object."""

-        ...
-
     dummy_check = Check(lambda _: True)
     with pytest.raises(KeyError, match="Backend not found for class"):
diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py
index e1001276d..3e98a6c0c 100644
--- a/tests/core/test_decorators.py
+++ b/tests/core/test_decorators.py
@@ -257,7 +257,7 @@ def test_check_instance_method_decorator_error() -> None:
     class TestClass:
         @check_input(DataFrameSchema({"column1": Column(Int)}))
         def test_method(self, df):
-            # pylint: disable=missing-function-docstring,no-self-use
+            # pylint: disable=missing-function-docstring
             return df

     with pytest.raises(
@@ -822,7 +822,7 @@ class Config:

     class SomeClass:
         @check_types
-        def regular_method(  # pylint: disable=no-self-use
+        def regular_method(
             self,
             df1: DataFrame[SchemaIn1],
             df2: DataFrame[SchemaIn2],
@@ -976,7 +976,7 @@ class Meta(type):
         @check_output(Schema.to_schema())
         @check_input(Schema.to_schema(), "df1")
         @check_io(df1=Schema.to_schema(), out=Schema.to_schema())
-        async def regular_meta_coroutine(  # pylint: disable=no-self-use
+        async def regular_meta_coroutine(
             cls,
             df1: DataFrame[Schema],
         ) -> DataFrame[Schema]:
@@ -1007,7 +1007,7 @@ class SomeClass(metaclass=Meta):
         @check_output(Schema.to_schema())
         @check_input(Schema.to_schema(), "df1")
         @check_io(df1=Schema.to_schema(), out=Schema.to_schema())
-        async def regular_coroutine(  # pylint: disable=no-self-use
+        async def regular_coroutine(
             self,
             df1: DataFrame[Schema],
         ) -> DataFrame[Schema]:
diff --git a/tests/core/test_dtypes.py b/tests/core/test_dtypes.py
index 9f92c26b9..ab3037c82 100644
--- a/tests/core/test_dtypes.py
+++ b/tests/core/test_dtypes.py
@@ -5,6 +5,7 @@
 import dataclasses
 import datetime
 import inspect
+import re
 from decimal import Decimal
 from typing import Any, Dict, List, Tuple

@@ -19,6 +20,7 @@

 import pandera as pa
 from pandera.engines import pandas_engine
+from pandera.engines.utils import pandas_version
 from pandera.system import FLOAT_128_AVAILABLE

 # List dtype classes and associated pandas alias,
@@ -315,7 +317,18 @@ def test_check_not_equivalent(dtype: Any):
 def test_coerce_no_cast(dtype: Any, pd_dtype: Any, data: List[Any]):
     """Test that dtypes can be coerced without casting."""
     expected_dtype = pandas_engine.Engine.dtype(dtype)
-    series = pd.Series(data, dtype=pd_dtype)
+
+    if isinstance(pd_dtype, str) and "datetime64" in pd_dtype:
+        # handle dtype case
+        tz_match = re.match(r"datetime64\[ns, (.+)\]", pd_dtype)
+        tz = None if not tz_match else tz_match.group(1)
+        if pandas_version().release >= (2, 0, 0):
+            series = pd.Series(data, dtype=pd_dtype).dt.tz_localize(tz)
+        else:
+            series = pd.Series(data, dtype=pd_dtype)  # type: ignore[assignment]
+    else:
+        series = pd.Series(data, dtype=pd_dtype)  # type: ignore[assignment]
+
     coerced_series = expected_dtype.coerce(series)

     assert series.equals(coerced_series)
@@ -323,7 +336,7 @@ def test_coerce_no_cast(dtype: Any, pd_dtype: Any, data: List[Any]):
         pandas_engine.Engine.dtype(coerced_series.dtype)
     )

-    df = pd.DataFrame({"col": data}, dtype=pd_dtype)
+    df = pd.DataFrame({"col": series})
     coerced_df = expected_dtype.coerce(df)

     assert df.equals(coerced_df)
diff --git a/tests/core/test_engine.py b/tests/core/test_engine.py
index ddc450cab..0124ab746 100644
--- a/tests/core/test_engine.py
+++ b/tests/core/test_engine.py
@@ -97,7 +97,7 @@ def test_register_notclassmethod_from_parametrized_dtype(engine: Engine):

     @engine.register_dtype
     class _InvalidDtype(BaseDataType):
-        def from_parametrized_dtype(  # pylint:disable=no-self-argument,no-self-use
+        def from_parametrized_dtype(  # pylint:disable=no-self-argument
             cls, x: int
         ):
             return x
diff --git a/tests/core/test_pandas_engine.py b/tests/core/test_pandas_engine.py
index 7dffb33c0..125b79995 100644
--- a/tests/core/test_pandas_engine.py
+++ b/tests/core/test_pandas_engine.py
@@ -69,7 +69,7 @@ def test_pandas_category_dtype(data):
     coerced_data = dtype.coerce(data)
     assert dtype.check(coerced_data.dtype)

-    for _, value in data.iteritems():
+    for _, value in data.items():
         coerced_value = dtype.coerce_value(value)
         assert coerced_value in CATEGORIES
@@ -83,7 +83,7 @@ def test_pandas_category_dtype_error(data):
     with pytest.raises(TypeError):
         dtype.coerce(data)

-    for _, value in data.iteritems():
+    for _, value in data.items():
         with pytest.raises(TypeError):
             dtype.coerce_value(value)
@@ -102,7 +102,7 @@ def test_pandas_boolean_native_type(data):
     coerced_data = dtype.coerce(data)
     assert dtype.check(coerced_data.dtype)

-    for _, value in data.iteritems():
+    for _, value in data.items():
         dtype.coerce_value(value)
@@ -115,7 +115,7 @@ def test_pandas_boolean_native_type_error(data):
     with pytest.raises(TypeError):
         dtype.coerce(data)

-    for _, value in data.iteritems():
+    for _, value in data.items():
         with pytest.raises(TypeError):
             dtype.coerce_value(value)
diff --git a/tests/io/test_io.py b/tests/io/test_io.py
index 8b9eeb788..b6c078fc0 100644
--- a/tests/io/test_io.py
+++ b/tests/io/test_io.py
@@ -956,7 +956,6 @@ def test_to_yaml_custom_dataframe_check():
 def test_to_yaml_bugfix_warn_unregistered_global_checks():
     """Ensure that unregistered global checks raises a warning."""

-    # pylint: disable=no-self-use
     class CheckedDataFrameModel(pandera.DataFrameModel):
         """Schema with a global check"""

@@ -967,7 +966,6 @@ class CheckedDataFrameModel(pandera.DataFrameModel):
         @pandera.dataframe_check()
         def unregistered_check(self, _):
             """sample unregistered check"""
-            ...
     with pytest.warns(UserWarning, match=".*registered checks.*"):
         CheckedDataFrameModel.to_yaml()
diff --git a/tests/modin/conftest.py b/tests/modin/conftest.py
index 63e6a12e2..391843299 100644
--- a/tests/modin/conftest.py
+++ b/tests/modin/conftest.py
@@ -9,7 +9,6 @@

 ENGINES = os.getenv("CI_MODIN_ENGINES", "").split(",")
 if ENGINES == [""]:
-    # ENGINES = ["ray", "dask"]
     ENGINES = ["dask"]
diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py
index 286953895..57d6500e9 100644
--- a/tests/strategies/test_strategies.py
+++ b/tests/strategies/test_strategies.py
@@ -642,7 +642,11 @@ def test_field_element_strategy(data_type, data):
     element = data.draw(strategy)

     expected_type = strategies.to_numpy_dtype(data_type).type
-    assert element.dtype.type == expected_type
+    if strategies.pandas_strategies._is_datetime_tz(data_type):
+        assert isinstance(element, pd.Timestamp)
+        assert element.tz == data_type.tz
+    else:
+        assert element.dtype.type == expected_type

     with pytest.raises(pa.errors.BaseStrategyOnlyError):
         strategies.field_element_strategy(
@@ -662,6 +666,13 @@ def test_check_nullable_field_strategy(
 ):
     """Test strategies for generating nullable column/index data."""
     size = 5
+
+    if (
+        str(data_type) == "float16"
+        and field_strategy.__name__ == "index_strategy"
+    ):
+        pytest.xfail("float16 is not supported for indexes")
+
     strat = field_strategy(data_type, nullable=nullable, size=size)
     example = data.draw(strat)
@@ -827,7 +838,6 @@ def test_schema_model_strategy_df_check(data) -> None:
     class SchemaWithDFCheck(Schema):
         """Schema with a custom dataframe-level check with no strategy."""

-        # pylint:disable=no-self-use
         @pa.dataframe_check
         @classmethod
         def non_empty(cls, df: pd.DataFrame) -> bool:
@@ -879,17 +889,16 @@ def test_datetime_example(check_arg, data) -> None:

 @pytest.mark.parametrize(
-    "dtype",
-    (
-        pd.DatetimeTZDtype(tz="UTC"),
-        pd.DatetimeTZDtype(tz="dateutil/US/Central"),
-    ),
-)
-@pytest.mark.parametrize(
-    "check_arg",
+    "dtype, check_arg",
     [
-        pd.Timestamp("2006-01-01", tz="CET"),
-        pd.Timestamp("2006-01-01", tz="UTC"),
+        [
+            pd.DatetimeTZDtype(tz="UTC"),
+            pd.Timestamp("2006-01-01", tz="UTC"),
+        ],
+        [
+            pd.DatetimeTZDtype(tz="CET"),
+            pd.Timestamp("2006-01-01", tz="CET"),
+        ],
     ],
 )
 @hypothesis.given(st.data())
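
Aside on the `_format_groupby_input` change in pandera/backends/pandas/checks.py above: pandas 2.0 changed groupby iteration to yield 1-tuples as group keys when grouping by a list containing a single key, which is what the `k[0] if len(k) == 1 else k` normalization handles. A minimal sketch of that behavior difference, not part of the patch itself (the dataframe and column names are illustrative, and pandas >= 2.0 is assumed for the tuple-key behavior):

import pandas as pd

df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": [1, 2, 3]})

# pandas < 2.0 yields scalar keys ("a", "b") here; pandas >= 2.0 yields
# 1-tuples (("a",), ("b",)) because the grouper is a single-element list.
for key, _group in df.groupby(["col1"]):
    print(key)

# Normalizing keys back to scalars, in the spirit of _format_groupby_input:
normalized = {
    k[0] if isinstance(k, tuple) and len(k) == 1 else k: v
    for k, v in df.groupby(["col1"])["col2"]
}
print(list(normalized))  # ["a", "b"] on both pandas 1.x and 2.x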