bugfix: string and object coercion work correctly (#166)

unionai-oss · Jan 18, 2020 · 7f572fc · 7f572fc
1 parent ee21cdb
commit 7f572fc
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 7 deletions.
diff --git a/pandera/dtypes.py b/pandera/dtypes.py
@@ -24,7 +24,10 @@ class PandasDtype(Enum):
     UInt32 = "uint32"
     UInt64 = "uint64"
     Object = "object"
-    String = "object"
+    # the string datatype doesn't map to a unique string representation and is
+    # represented as a numpy object array. This will change after pandas 1.0,
+    # but for now it needs to be handled as a special case.
+    String = "string"
     Timedelta = "timedelta64[ns]"
 
 

diff --git a/pandera/schemas.py b/pandera/schemas.py
@@ -394,9 +394,14 @@ def name(self) -> str:
     @property
     def dtype(self) -> str:
         """String representation of the dtype."""
-        return self._pandas_dtype if (
-            isinstance(self._pandas_dtype, str) or self._pandas_dtype is None
-        ) else self._pandas_dtype.value
+        if isinstance(self._pandas_dtype, str) or self._pandas_dtype is None:
+            dtype = self._pandas_dtype
+        elif self._pandas_dtype is dtypes.PandasDtype.String:
+            # handle special case of string.
+            dtype = dtypes.PandasDtype.Object.value
+        else:
+            dtype = self._pandas_dtype.value
+        return dtype
 
     def coerce_dtype(
             self, series_or_index: Union[pd.Series, pd.Index]) -> pd.Series:
@@ -453,6 +458,7 @@ def validate(
                         "series '%s' to be int, found: %s" %
                         (series.name, set(series)))
                 series = _series
+
         nulls = series.isnull()
         if nulls.sum() > 0:
             if series.dtype != _dtype:

diff --git a/tests/test_schema_components.py b/tests/test_schema_components.py
@@ -1,14 +1,15 @@
 """Testing the components of the Schema objects."""
 
 import copy
+import numpy as np
 import pandas as pd
 import pytest
 
 
 from pandera import errors
 from pandera import (
     Column, DataFrameSchema, Index, MultiIndex, Check, DateTime, Float, Int,
-    String)
+    Object, String)
 from tests.test_dtypes import TESTABLE_DTYPES
 
 
@@ -33,6 +34,25 @@ def test_column():
         Column(Int)(data)
 
 
+def test_coerce_nullable_object_column():
+    """Test that Object dtype coercing preserves object types."""
+    df_objects_with_na = pd.DataFrame({
+        "col": [1, 2.0, [1, 2, 3], {"a": 1}, np.nan, None]
+    })
+
+    column_schema = Column(Object, name="col", coerce=True, nullable=True)
+
+    validated_df = column_schema.validate(df_objects_with_na)
+    assert isinstance(validated_df, pd.DataFrame)
+    # the trailing np.nan and None entries must still be null after coercion
+    assert pd.isna(validated_df["col"].iloc[-1])
+    assert pd.isna(validated_df["col"].iloc[-2])
+    for i in range(4):
+        # the first four elements must keep their original Python types
+        expected_type = type(df_objects_with_na["col"].iloc[i])
+        assert isinstance(validated_df["col"].iloc[i], expected_type)
+
+
 def test_column_in_dataframe_schema():
     """Test that a Column check returns a dataframe."""
     schema = DataFrameSchema({
@@ -171,8 +191,9 @@ def tests_multi_index_subindex_coerce():
             assert validated_df.index.get_level_values(level_i).dtype == \
                 indexes[level_i].dtype
         else:
+            # string levels are stored as object arrays, so dtype is "object"
             assert validated_df.index.get_level_values(level_i).dtype == \
-                String.value
+                "object"
 
     # coerce=True in MultiIndex should override subindex coerce setting
     schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
@@ -187,6 +208,7 @@ def test_column_dtype_property(pandas_dtype, expected):
     """Tests that the dtypes provided by Column match pandas dtypes"""
     assert Column(pandas_dtype).dtype == expected
 
+
 def test_schema_component_equality_operators():
     """Test the usage of == for Column, Index and MultiIndex."""
     column = Column(Int, Check(lambda s: s >= 0))

diff --git a/tests/test_schemas.py b/tests/test_schemas.py
@@ -261,6 +261,7 @@ def test_coerce_dtype_in_dataframe():
 
 def test_coerce_dtype_nullable_str():
     """Tests how null values are handled in string dtypes."""
+    # dataframes with columns where the last two values are null
     df_nans = pd.DataFrame({
         "col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
     })
@@ -279,7 +280,12 @@ def test_coerce_dtype_nullable_str():
     })
 
     for df in [df_nans, df_nones]:
-        assert isinstance(schema.validate(df), pd.DataFrame)
+        validated_df = schema.validate(df)
+        assert isinstance(validated_df, pd.DataFrame)
+        assert pd.isna(validated_df["col"].iloc[-1])
+        assert pd.isna(validated_df["col"].iloc[-2])
+        for i in range(4):
+            assert isinstance(validated_df["col"].iloc[i], str)
 
 
 def test_no_dtype_dataframe():