From 2354f94de9c6a7b0a6a8a9633ca938101010d7ab Mon Sep 17 00:00:00 2001 From: Neeraj Malhotra <52220398+NeerajMalhotra-QB@users.noreply.github.com> Date: Thu, 22 Jun 2023 08:01:54 -0700 Subject: [PATCH] Refactors for dead code (#1229) * updating builtin checks for pyspark * registration * Implementation of checks import and spark columns information check * enhancing __call__, checks classes and builtin_checks * delete junk files * Changes to fix the implementation of checks. Changed Apply function to send list with dataframe and column name, builtin function registers functions with lists which includes the dataframe * extending pyspark checks * Fixed builtin check bug and added test for supported builtin checks for pyspark * add todos * by default validate all checks * fixing issue with sqlctx * add dtypes pytests * setting up schema * add negative and positive tests * add fixtures and refactor tests * generalize spark_df func * refactor to use conftest * use conftest * add support for decimal dtype and fixing other types * Added new Datatypes support for pyspark, test cases for dtypes pyspark, created test file for error * refactor ArraySchema * rename array to column.py * 1) Changes in test cases to look for summarised error raise instead of fast fail, since default behaviour is changed to summarised. 2) Added functionality to accept and check the precision and scale in Decimal Datatypes. * add neg test * add custom ErrorHandler * Added functionality to DayTimeIntervalType datatype to accept parameters * Added functionality to DayTimeIntervalType datatype to accept parameters * return summarized error report * replace dataframe to dict for return obj * Changed checks input datatype to custom named tuple from the existing list. 
Also started changing the pyspark checks to include more datatypes * refactor * introduce error categories * rename error categories * fixing bug in schema.dtype.check * fixing error category to be dynamic * Added checks for each datatype in test cases. Reduced code redundancy in the test file. Refactored the name of custom datatype object for checks. * error_handler pass through * add ErrorHandler to column api * removed SchemaErrors since we now aggregate in errorHandler * fixing dict keys * Added Decorator to raise TypeError in case of unexpected input type for the check function. * replace validator with report_errors * cleaning debugs * Support DataModels and Field * Added Decorator to raise TypeError in case of unexpected input type for the check function. Merged with Develop * Fix to run using the class schema type * use alias types * clean up * add new typing for pyspark.sql * Added Decorator to raise TypeError in case of unexpected input type for the check function. Merged with Develop * Added changes to support raising error for use of datatype not supported by the check and support for map and array type. 
* support bare dtypes for DataFrameModel * remove resolved TODOs and breakpoints * change to bare types * use spark types instead of bare types * using SchemaErrorReason instead of hardcode in container * fixing an issue with error reason codes * minor fix * fixing checks and errors in pyspark * Changes include the following: 1) Updated dtypes test functionality to make it more readable 2) Changes in accessor tests to support the new functionality 3) Changes in engine class to conform to check class everywhere else * enhancing dataframeschema and model classes * Changes to remove the pandas dependency * Refactoring of the checks test functions * Fixing the test case breaking * Isort and Black formatting * Container Test function failure * Isort and black linting * Changes to remove the pandas dependency * Refactoring of the checks test functions * Isort and black linting * Added Changes to refactor the checks class. Fixes to some test cases failures. * Removing breakpoint * fixing raise error * adding metadata dict * Removing the reference of pandas from docstrings * Removing redundant code block in utils * Changes to return dataframe with errors property * add accessor for errorHandler * support errors access on pyspark.sql * updating pyspark error tcs * fixing model test cases * adjusting errors to use pandera.errors * use accessor instead of dict * revert to develop * Removal of imports which are not needed and improved test case. * setting independent pyspark import * pyspark imports * revert comments * store and retrieve metadata at schema levels * adding metadata support * Added changes to support parameter based run. 
1) Added parameters.yaml file to hold the configurations 2) Added code in utility to read the config 3) Updated the test cases to support the parameter based run 4) Moved pyspark decorators to a new file decorators.py in backend 5) Type fix in get_matadata property in container.py file * Changing the default value in config * change to consistent interface * Changes to remove config yaml and introduce environment variables for parameterized runs * cleaning api/pyspark * backend and tests * adding setter on errors accessors for pyspark * reformatting error dict * Changes to remove config yaml and introduce environment variables for parameterized runs * Changes to rename the config object and call only in utils.py * Fixing merge conflict issue * Updating the test cases to support new checks types * Added individualized test for each configuration type. * Removing unnecessary prints * The changes include the following: 1) Fixed test case for validating the environment variable 2) Improved docstrings for test cases and few test cases asserts * Fix reference to the wrong key in test_pyspark_schema_data_checks * minor change * Added Support for docstring substitution method. * Removing an extra indent * Removing commented docstring substitution from __new__ method * remove union * cleaning * Feature to add metadata dictionary for pandas schema * Added test to check the docstring substitution decorator * Added test to check the docstring substitution decorator * Feature to add metadata dictionary for pandas schema * Changes to ensure only pandas run does not import pyspark dependencies * Fix of imports for pandas and pyspark for separation * Rename the function from pyspark to pandas * black lint and isort * black lint and isort * Fixes of pylint issue and suppression wherever necessary * Fixes of mypy failures and redone black linting post changes. * Added new test cases, removed redundant codes and black lint. 
* Fixed the doc strings, added functionality and test for custom checks * add rst for pyspark.sql * removing rst * Renamed check name and Fixed pylint and mypy issues * add rst for pyspark.sql * Fixed the doc strings, added functionality and test for custom checks * removing rst * Renamed check name and Fixed pylint and mypy issues * add rst for pyspark.sql * Rename for environment variable key name * removing rst * Black lint * Removed daytime interval type * refactor * override pyspark patching of __class_getitem__ Signed-off-by: Niels Bantilan * fixing mypy error * lint fixes * lint fixes * fixing more lint and type issues * fixing mypy issues * fixing doctest * doctest Signed-off-by: Neeraj Malhotra * fixing doctest Signed-off-by: Neeraj Malhotra * adding doctest:metadata for pandas container classes Signed-off-by: Neeraj Malhotra * doctest * Fix to support pyspark 3.2 and 3.3 both. The string representation of datatype changed in 3.2 and 3.3. Fix ensures both versions are supported. 
* Black Lint * fixing doctest Signed-off-by: Neeraj Malhotra * fixing rst Signed-off-by: Neeraj Malhotra * black formatting Signed-off-by: Neeraj Malhotra * fixing str repr for DataFrameSchema across rst * add ps.DataFrame * fixing tests * fix lint Signed-off-by: Niels Bantilan * use full class name in pandas accessor Signed-off-by: Niels Bantilan * use os.environ instead of parameters.yaml Signed-off-by: Neeraj Malhotra * simplify config Signed-off-by: Niels Bantilan * merge with develop * Black Lint * refactor Signed-off-by: Neeraj Malhotra * lint fix Signed-off-by: Neeraj Malhotra * refactor * remove Column class due to redundancy Signed-off-by: Neeraj Malhotra * linting Signed-off-by: Neeraj Malhotra --------- Signed-off-by: Niels Bantilan Signed-off-by: Neeraj Malhotra Co-authored-by: jaskaransinghsidana Co-authored-by: jaskaransinghsidana <112083212+jaskaransinghsidana@users.noreply.github.com> Co-authored-by: Niels Bantilan --- pandera/typing/__init__.py | 6 +----- pandera/typing/pyspark_sql.py | 22 ++-------------------- 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/pandera/typing/__init__.py b/pandera/typing/__init__.py index 9164f7be4..08185c6d3 100644 --- a/pandera/typing/__init__.py +++ b/pandera/typing/__init__.py @@ -60,9 +60,6 @@ ) from pandera.typing.pandas import DataFrame, Index, Series -if pyspark_sql.PYSPARK_SQL_INSTALLED: - from pandera.typing.pyspark_sql import Column - DATAFRAME_TYPES: Set[Type] = {DataFrame} SERIES_TYPES: Set[Type] = {Series} @@ -85,11 +82,10 @@ if pyspark_sql.PYSPARK_SQL_INSTALLED: DATAFRAME_TYPES.update({pyspark_sql.DataFrame}) - COLUMN_TYPES: Set[Type] = {Column} if geopandas.GEOPANDAS_INSTALLED: DATAFRAME_TYPES.update({geopandas.GeoDataFrame}) SERIES_TYPES.update({geopandas.GeoSeries}) -__all__ = ["DataFrame", "Series", "Index", "Column"] +__all__ = ["DataFrame", "Series", "Index"] diff --git a/pandera/typing/pyspark_sql.py b/pandera/typing/pyspark_sql.py index d1f8e2893..b20603580 100644 --- 
a/pandera/typing/pyspark_sql.py +++ b/pandera/typing/pyspark_sql.py @@ -1,6 +1,6 @@ """Pandera type annotations for Pyspark.""" -from typing import Union, Optional, Type, TypeVar -from pandera.typing.common import DataFrameBase, GenericDtype +from typing import Union, TypeVar +from pandera.typing.common import DataFrameBase from pandera.typing.pandas import DataFrameModel, _GenericAlias try: @@ -51,21 +51,6 @@ if PYSPARK_SQL_INSTALLED: # pylint: disable=too-few-public-methods,arguments-renamed - class ColumnBase(Generic[PysparkDType]): - """Representation of pandas.Index, only used for type annotation. - - *new in 0.5.0* - """ - - default_dtype: Optional[Type] = None - - def __get__( - self, instance: object, owner: Type - ) -> str: # pragma: no cover - raise AttributeError( - "column should resolve to pyspark.sql.Column-s" - ) - class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): """ Representation of dask.dataframe.DataFrame, only used for type @@ -77,6 +62,3 @@ class DataFrame(DataFrameBase, ps.DataFrame, Generic[T]): def __class_getitem__(cls, item): """Define this to override's pyspark.pandas generic type.""" return _GenericAlias(cls, item) # pragma: no cover - - class Column(ColumnBase, ps.Column, Generic[GenericDtype]): # type: ignore [misc] # noqa - """Representation of pyspark.sql.Column, only used for type annotation."""