Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix mypy extra unit tests, pin pandas-stubs for dev env #1056

Merged
merged 8 commits on Dec 15, 2022 (source and target branch names not captured in this page scrape)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ env:
# Increase this value to reset cache if environment.yml has not changed
CACHE_VERSION: 6

concurrency:
group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
cancel-in-progress: true

jobs:

lint:
Expand Down Expand Up @@ -93,7 +97,7 @@ jobs:
PYTEST_FLAGS: --cov=pandera --cov-report=term-missing --cov-report=xml --cov-append
HYPOTHESIS_FLAGS: -n=auto -q --hypothesis-profile=ci
strategy:
fail-fast: true
fail-fast: false
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.7", "3.8", "3.9", "3.10"]
Expand Down
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ repos:
entry: mypy
language: python
types: [python]
files: (^pandera/|^tests/|^scripts/)
pass_filenames: false
exclude: (^docs/|^tests/mypy/modules/)
require_serial: true
args: ["pandera", "tests", "scripts"]
verbose: true
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dependencies:
- pydantic

# mypy extra
- pandas-stubs
- pandas-stubs <= 1.5.2.221213

# pyspark extra
- pyspark >= 3.2.0
Expand Down
8 changes: 8 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[mypy]
ignore_missing_imports = True
follow_imports = skip
allow_redefinition = True
warn_return_any = False
warn_unused_configs = True
show_error_codes = True
exclude = tests/mypy/modules
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def requirements(session: Session) -> None: # pylint:disable=unused-argument
print(f"{REQUIREMENT_PATH} has been re-generated ✨ 🍰 ✨")
raise err

ignored_pkgs = {"black", "pandas"}
ignored_pkgs = {"black", "pandas", "pandas-stubs"}
mismatched = []
# only compare package versions, not python version markers.
str_dev_reqs = [str(x) for x in DEV_REQUIREMENTS]
Expand Down
2 changes: 1 addition & 1 deletion pandera/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def _prepare_series_input(
if check_utils.is_field(df_or_series):
return df_or_series # type: ignore[return-value]
elif self.groupby is None:
return df_or_series[column] # type: ignore[index]
return df_or_series[column] # type: ignore
elif isinstance(self.groupby, list):
return self._format_groupby_input( # type: ignore[return-value]
df_or_series.groupby(self.groupby)[column], # type: ignore[index]
Expand Down
14 changes: 7 additions & 7 deletions pandera/engines/pandas_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ def coerce_value(self, value: Any) -> decimal.Decimal:
return dec.quantize(self._exp, context=self._ctx)

def coerce(self, data_container: PandasObject) -> PandasObject:
return data_container.apply(self.coerce_value)
return data_container.apply(self.coerce_value) # type: ignore

def check( # type: ignore
self,
Expand Down Expand Up @@ -577,7 +577,7 @@ def __init__( # pylint:disable=super-init-not-called
object.__setattr__(
self,
"type",
pd.CategoricalDtype(self.categories, self.ordered),
pd.CategoricalDtype(self.categories, self.ordered), # type: ignore
)

def coerce(self, data_container: PandasObject) -> PandasObject:
Expand Down Expand Up @@ -639,13 +639,13 @@ def __str__(self) -> str:
else:

@Engine.register_dtype(
equivalents=["string", pd.StringDtype, pd.StringDtype()]
) # type: ignore
equivalents=["string", pd.StringDtype, pd.StringDtype()] # type: ignore
)
@immutable
class STRING(DataType, dtypes.String): # type: ignore
"""Semantic representation of a :class:`pandas.StringDtype`."""

type = pd.StringDtype()
type = pd.StringDtype() # type: ignore


@Engine.register_dtype(
Expand Down Expand Up @@ -984,8 +984,8 @@ def __post_init__(self):
def from_parametrized_dtype(cls, pd_dtype: pd.SparseDtype):
"""Convert a :class:`pandas.SparseDtype` to
a Pandera :class:`pandera.engines.pandas_engine.Sparse`."""
return cls( # type: ignore
dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value
return cls(
dtype=pd_dtype.subtype, fill_value=pd_dtype.fill_value # type: ignore
)


Expand Down
6 changes: 3 additions & 3 deletions pandera/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def get_dtypes(self, dataframe: pd.DataFrame) -> Dict[str, DataType]:
)
return {
**{n: c.dtype for n, c in self.columns.items() if not c.regex},
**regex_dtype,
**regex_dtype, # type: ignore
}

@property
Expand Down Expand Up @@ -595,7 +595,7 @@ def _validate(
is_schema_col = column in expanded_column_names
if (self.strict is True) and not is_schema_col:
msg = (
f"column '{column}' not in {self.__class__.__name__}"
f"column {column!r} not in {self.__class__.__name__}"
f" {self.columns}"
)
error_handler.collect_error(
Expand All @@ -621,7 +621,7 @@ def _validate(
errors.SchemaError(
self,
check_obj,
message=f"column '{column}' out-of-order",
message=f"column {column!r} out-of-order",
failure_cases=scalar_failure_case(column),
check="column_ordered",
),
Expand Down
2 changes: 1 addition & 1 deletion pandera/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _mask(
) -> Union[pd.Series, pd.Index]:
if pd.api.types.is_timedelta64_dtype(val): # type: ignore [arg-type]
return val.mask(null_mask, pd.NaT) # type: ignore [union-attr,arg-type]
elif val.dtype == pd.StringDtype():
elif val.dtype == pd.StringDtype(): # type: ignore [call-arg]
return val.mask(null_mask, pd.NA) # type: ignore [union-attr,arg-type]
return val.mask(null_mask) # type: ignore [union-attr]

Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ typing_extensions >= 3.7.4.3
frictionless
pyarrow
pydantic
pandas-stubs
pandas-stubs <= 1.5.2.221213
pyspark >= 3.2.0
modin
protobuf <= 3.20.3
Expand Down
8 changes: 0 additions & 8 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
[isort]
float_to_top = true
profile = black

[mypy]
ignore_missing_imports = True
allow_redefinition = True
warn_return_any = False
warn_unused_configs = True
show_error_codes = True
exclude = tests/mypy/modules
14 changes: 7 additions & 7 deletions tests/core/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,13 +900,13 @@ def validate_union(
) -> typing.Union[DataFrame[OnlyZeroesSchema], DataFrame[OnlyOnesSchema]]:
return df

validate_union(pd.DataFrame({"a": [0, 0]}))
validate_union(pd.DataFrame({"a": [1, 1]}))
validate_union(pd.DataFrame({"a": [0, 0]})) # type: ignore [arg-type]
validate_union(pd.DataFrame({"a": [1, 1]})) # type: ignore [arg-type]

with pytest.raises(errors.SchemaErrors):
validate_union(pd.DataFrame({"a": [0, 1]}))
validate_union(pd.DataFrame({"a": [0, 1]})) # type: ignore [arg-type]
with pytest.raises(errors.SchemaErrors):
validate_union(pd.DataFrame({"a": [2, 2]}))
validate_union(pd.DataFrame({"a": [2, 2]})) # type: ignore [arg-type]

@check_types
def validate_union_wrong_outputs(
Expand All @@ -916,10 +916,10 @@ def validate_union_wrong_outputs(
) -> typing.Union[DataFrame[OnlyZeroesSchema], DataFrame[OnlyOnesSchema]]:
new_df = df.copy()
new_df["a"] = [0, 1]
return new_df
return new_df # type: ignore [return-value]

with pytest.raises(errors.SchemaErrors):
validate_union_wrong_outputs(pd.DataFrame({"a": [0, 0]}))
validate_union_wrong_outputs(pd.DataFrame({"a": [0, 0]})) # type: ignore [arg-type]


def test_check_types_non_dataframes() -> None:
Expand Down Expand Up @@ -947,7 +947,7 @@ def union_df_int_types_pydantic_check(
) -> typing.Union[DataFrame[OnlyZeroesSchema], int]:
return val

union_df_int_types_pydantic_check(pd.DataFrame({"a": [0, 0]}))
union_df_int_types_pydantic_check(pd.DataFrame({"a": [0, 0]})) # type: ignore [arg-type]
int_val_pydantic = union_df_int_types_pydantic_check(5)
str_val_pydantic = union_df_int_types_pydantic_check("5") # type: ignore[arg-type]
assert isinstance(int_val_pydantic, int)
Expand Down
13 changes: 7 additions & 6 deletions tests/core/test_logical_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,12 @@ def test_logical_datatype_check(
"data, expected_datatype, failure_cases",
[
(
[Decimal("1.2"), Decimal("12.3")],
[Decimal("1.2"), Decimal("12.3")] * 100,
pandas_engine.Decimal(2, 1),
[Decimal("12.3")],
[Decimal("12.3")] * 100,
),
(
[Decimal("1.2"), None, pd.NA, np.nan],
[Decimal("1.2"), None, pd.NA, np.nan] * 100,
pandas_engine.Decimal(19, 5),
[],
),
Expand All @@ -129,14 +129,15 @@ def test_logical_datatype_check(
pd.NA,
np.nan,
pd.NaT,
],
]
* 100,
pandas_engine.Date(),
[],
),
(
["2022-01-01", "01/01/2022"],
["2022-01-01", "01/01/2022"] * 100,
pandas_engine.Date(to_datetime_kwargs={"format": "%Y-%m-%d"}),
["01/01/2022"],
["01/01/2022"] * 100,
),
],
)
Expand Down
12 changes: 6 additions & 6 deletions tests/modin/test_schemas_on_modin.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,16 +335,16 @@ class Schema(pa.SchemaModel):

valid_df = mpd.DataFrame(
{
"int_field": [1, 2, 3],
"float_field": [-1.1, -2.1, -3.1],
"str_field": ["a", "b", "c"],
"int_field": [1, 2, 3] * 10,
"float_field": [-1.1, -2.1, -3.1] * 10,
"str_field": ["a", "b", "c"] * 10,
}
)
invalid_df = mpd.DataFrame(
{
"int_field": [-1],
"field_field": [1],
"str_field": ["d"],
"int_field": [-1] * 100,
"field_field": [1] * 100,
"str_field": ["d"] * 100,
}
)

Expand Down
2 changes: 1 addition & 1 deletion tests/mypy/modules/pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def fn_mutate_inplace(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:

@pa.check_types
def fn_assign_and_get_index(df: DataFrame[Schema]) -> DataFrame[SchemaOut]:
return df.assign(foo=30).iloc[:3]
return df.assign(foo=30).iloc[:3] # mypy error
# error: Incompatible return value type (got "pandas.core.frame.DataFrame",
# expected "pandera.typing.pandas.DataFrame[SchemaOut]") [return-value]

Expand Down
10 changes: 5 additions & 5 deletions tests/mypy/modules/pandas_time.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# pylint: skip-file
import pandas as pd

pd.Timestamp.now() + pd.tseries.offsets.YearEnd(1) # false positive
pd.Timestamp.now() + pd.tseries.offsets.YearEnd(1)

pd.Timedelta(minutes=2) # false positive
pd.Timedelta(2, unit="minutes") # false positive
pd.Timedelta(minutes=2)
pd.Timedelta(2, unit="minutes")

pd.Timedelta(minutes=2, seconds=30) # false positive
pd.Timedelta(2.5, unit="minutes") # false positive
pd.Timedelta(minutes=2, seconds=30)
pd.Timedelta(2.5, unit="minutes") # mypy error
pd.Timedelta(2, unit="minutes") + pd.Timedelta(30, unit="seconds")
36 changes: 26 additions & 10 deletions tests/mypy/test_static_type_checking.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,19 @@
test_module_dir = Path(os.path.dirname(__file__))


def _get_mypy_errors(stdout) -> typing.List[typing.Dict[str, str]]:
def _get_mypy_errors(
module_name: str,
stdout,
) -> typing.List[typing.Dict[str, str]]:
"""Parse line number and error message."""
errors: typing.List[typing.Dict[str, str]] = []
# last line is summary of errors
for error in [x for x in stdout.split("\n") if x != ""][:-1]:
matches = re.match(
r".+\.py:(?P<lineno>\d+): error: (?P<msg>.+) \[(?P<errcode>.+)\]",
error,
regex = (
r".+{}:".format(module_name.replace(".", r"\."))
+ r"(?P<lineno>\d+): error: (?P<msg>.+) \[(?P<errcode>.+)\]"
)
matches = re.match(regex, error)
if matches is not None:
match_dict = matches.groupdict()
errors.append(
Expand All @@ -53,16 +57,21 @@ def _get_mypy_errors(stdout) -> typing.List[typing.Dict[str, str]]:
def test_mypy_pandas_dataframe(capfd) -> None:
"""Test that mypy raises expected errors on pandera-decorated functions."""
# pylint: disable=subprocess-run-check
cache_dir = str(test_module_dir / ".mypy_cache" / "test-mypy-default")
subprocess.run(
[
sys.executable,
"-m",
"mypy",
str(test_module_dir / "modules" / "pandas_dataframe.py"),
"--cache-dir",
cache_dir,
"--config-file",
str(test_module_dir / "config" / "no_plugin.ini"),
],
text=True,
)
errors = _get_mypy_errors(capfd.readouterr().out)
errors = _get_mypy_errors("pandas_dataframe.py", capfd.readouterr().out)
assert len(PANDAS_DATAFRAME_ERRORS) == len(errors)
for expected, error in zip(PANDAS_DATAFRAME_ERRORS, errors):
assert error["errcode"] == expected["errcode"]
Expand Down Expand Up @@ -97,6 +106,13 @@ def test_pandera_runtime_errors(fn) -> None:
{"msg": 'Argument 1 to "fn" has incompatible type', "errcode": "arg-type"},
] * 2

PANDAS_TIME_ERRORS = [
{
"msg": 'Argument 1 to "Timedelta" has incompatible type "float"',
"errcode": "arg-type",
},
]

PYTHON_SLICE_ERRORS = [
{"msg": "Slice index must be an integer or None", "errcode": "misc"},
]
Expand Down Expand Up @@ -129,12 +145,12 @@ def test_pandera_runtime_errors(fn) -> None:
["pandera_types.py", "plugin_mypy.ini", PANDERA_TYPES_ERRORS],
["pandas_concat.py", "no_plugin.ini", []],
["pandas_concat.py", "plugin_mypy.ini", []],
["pandas_time.py", "no_plugin.ini", []],
["pandas_time.py", "plugin_mypy.ini", []],
["pandas_time.py", "no_plugin.ini", PANDAS_TIME_ERRORS],
["pandas_time.py", "plugin_mypy.ini", PANDAS_TIME_ERRORS],
["python_slice.py", "no_plugin.ini", PYTHON_SLICE_ERRORS],
["python_slice.py", "plugin_mypy.ini", PYTHON_SLICE_ERRORS],
["pandas_index.py", "no_plugin.ini", PANDAS_INDEX_ERRORS],
["pandas_index.py", "plugin_mypy.ini", PANDAS_INDEX_ERRORS],
["pandas_index.py", "no_plugin.ini", []],
["pandas_index.py", "plugin_mypy.ini", []],
["pandas_series.py", "no_plugin.ini", PANDAS_SERIES_ERRORS],
["pandas_series.py", "plugin_mypy.ini", PANDAS_SERIES_ERRORS],
],
Expand All @@ -160,7 +176,7 @@ def test_pandas_stubs_false_positives(
]
# pylint: disable=subprocess-run-check
subprocess.run(commands, text=True)
resulting_errors = _get_mypy_errors(capfd.readouterr().out)
resulting_errors = _get_mypy_errors(module, capfd.readouterr().out)
assert len(expected_errors) == len(resulting_errors)
for expected, error in zip(expected_errors, resulting_errors):
assert error["errcode"] == expected["errcode"]
Expand Down