Skip to content

Commit

Permalink
bugfix: string and object coercion work correctly (#166)
Browse files Browse the repository at this point in the history
  • Loading branch information
cosmicBboy committed Jan 18, 2020
1 parent ee21cdb commit 7f572fc
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 7 deletions.
5 changes: 4 additions & 1 deletion pandera/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ class PandasDtype(Enum):
UInt32 = "uint32"
UInt64 = "uint64"
Object = "object"
String = "object"
# the string datatype doesn't map to a unique string representation and is
# represented as a numpy object array. This will change after pandas 1.0,
# but for now will need to handle this as a special case.
String = "string"
Timedelta = "timedelta64[ns]"


Expand Down
12 changes: 9 additions & 3 deletions pandera/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,9 +394,14 @@ def name(self) -> str:
@property
def dtype(self) -> str:
"""String representation of the dtype."""
return self._pandas_dtype if (
isinstance(self._pandas_dtype, str) or self._pandas_dtype is None
) else self._pandas_dtype.value
if isinstance(self._pandas_dtype, str) or self._pandas_dtype is None:
dtype = self._pandas_dtype
elif self._pandas_dtype is dtypes.PandasDtype.String:
# handle special case of string.
dtype = dtypes.PandasDtype.Object.value
else:
dtype = self._pandas_dtype.value
return dtype

def coerce_dtype(
self, series_or_index: Union[pd.Series, pd.Index]) -> pd.Series:
Expand Down Expand Up @@ -453,6 +458,7 @@ def validate(
"series '%s' to be int, found: %s" %
(series.name, set(series)))
series = _series

nulls = series.isnull()
if nulls.sum() > 0:
if series.dtype != _dtype:
Expand Down
26 changes: 24 additions & 2 deletions tests/test_schema_components.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""Testing the components of the Schema objects."""

import copy
import numpy as np
import pandas as pd
import pytest


from pandera import errors
from pandera import (
Column, DataFrameSchema, Index, MultiIndex, Check, DateTime, Float, Int,
String)
Object, String)
from tests.test_dtypes import TESTABLE_DTYPES


Expand All @@ -33,6 +34,25 @@ def test_column():
Column(Int)(data)


def test_coerce_nullable_object_column():
    """Test that Object dtype coercing preserves object types.

    Builds a single-column dataframe of heterogeneous Python objects whose
    last two entries are null (``np.nan`` and ``None``), validates it with a
    nullable, coercing ``Object`` column, and checks that:

    * validation returns a dataframe,
    * the two trailing entries are still null,
    * each of the four non-null entries keeps its original Python type.
    """
    df_objects_with_na = pd.DataFrame({
        "col": [1, 2.0, [1, 2, 3], {"a": 1}, np.nan, None]
    })

    column_schema = Column(Object, name="col", coerce=True, nullable=True)

    validated_df = column_schema.validate(df_objects_with_na)
    assert isinstance(validated_df, pd.DataFrame)
    assert pd.isna(validated_df["col"].iloc[-1])
    assert pd.isna(validated_df["col"].iloc[-2])
    # Only the first four entries are non-null; the check must be asserted,
    # otherwise the isinstance result is silently discarded and the loop
    # verifies nothing.
    for i in range(4):
        assert isinstance(
            validated_df["col"].iloc[i],
            type(df_objects_with_na["col"].iloc[i])
        )


def test_column_in_dataframe_schema():
"""Test that a Column check returns a dataframe."""
schema = DataFrameSchema({
Expand Down Expand Up @@ -171,8 +191,9 @@ def tests_multi_index_subindex_coerce():
assert validated_df.index.get_level_values(level_i).dtype == \
indexes[level_i].dtype
else:
# dtype should be string representation of pandas strings
assert validated_df.index.get_level_values(level_i).dtype == \
String.value
"object"

# coerce=True in MultiIndex should override subindex coerce setting
schema_override = DataFrameSchema(index=MultiIndex(indexes), coerce=True)
Expand All @@ -187,6 +208,7 @@ def test_column_dtype_property(pandas_dtype, expected):
"""Tests that the dtypes provided by Column match pandas dtypes"""
assert Column(pandas_dtype).dtype == expected


def test_schema_component_equality_operators():
"""Test the usage of == for Column, Index and MultiIndex."""
column = Column(Int, Check(lambda s: s >= 0))
Expand Down
8 changes: 7 additions & 1 deletion tests/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def test_coerce_dtype_in_dataframe():

def test_coerce_dtype_nullable_str():
"""Tests how null values are handled in string dtypes."""
# dataframes with columns where the last two values are null
df_nans = pd.DataFrame({
"col": ["foobar", "foo", "bar", "baz", np.nan, np.nan],
})
Expand All @@ -279,7 +280,12 @@ def test_coerce_dtype_nullable_str():
})

for df in [df_nans, df_nones]:
assert isinstance(schema.validate(df), pd.DataFrame)
validated_df = schema.validate(df)
assert isinstance(validated_df, pd.DataFrame)
assert pd.isna(validated_df["col"].iloc[-1])
assert pd.isna(validated_df["col"].iloc[-2])
for i in range(4):
assert isinstance(validated_df["col"].iloc[i], str)


def test_no_dtype_dataframe():
Expand Down

0 comments on commit 7f572fc

Please sign in to comment.