add support for dataframe-level checks (#82)

* add support for dataframe-level checks * add support for wide Hypothesis * add test for element-wise dataframe checks * fix CI issues
unionai-oss · Aug 11, 2019 · dd97eb4 · dd97eb4
1 parent 3d3ef7c
commit dd97eb4
Show file tree

Hide file tree

Showing 6 changed files with 533 additions and 130 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,4 +1,5 @@
 language: python
+dist: xenial
 python:
   - "2.7"
   - "3.5"
@@ -22,6 +23,7 @@ install:
   - conda info -a
   - conda create -q -n ci-env python=$TRAVIS_PYTHON_VERSION
   - source activate ci-env
+  - conda install -y -c conda-forge numpy scipy pandas codecov
   - pip install -r requirements.txt
   - python setup.py install
 script:

diff --git a/docs/source/checks.rst b/docs/source/checks.rst
@@ -64,7 +64,7 @@ checks.
 Column Check Groups
 -------------------
 
-``Column`` ``Check``\ s support grouping by a different column so that
+``Column`` ``Check``\s support grouping by a different column so that
 you can make assertions about subsets of the ``Column`` of interest.
 This changes the function signature of the ``Check`` function so that
 its input is a dict where keys are the group names and values are subsets
@@ -111,3 +111,61 @@ In the above example we define a ``DataFrameSchema`` with column checks
 for ``height_in_feet`` using a single column, multiple columns, and a
 more complex groupby function that creates a new column
 ``age_less_than_15`` on the fly.
+
+
+Wide Checks
+-----------
+
+``pandera`` is primarily designed to operate on long-form data (commonly known
+as `tidy data <https://vita.had.co.nz/papers/tidy-data.pdf>`_), where each row
+is an observation and columns are attributes associated with the observation.
+
+However, ``pandera`` also supports checks on wide-form data to operate across
+columns in a ``DataFrame``.
+
+For example, if you want to make assertions about ``height`` across two groups,
+the tidy dataset and schema might look like this:
+
+.. code:: python
+
+    import pandas as pd
+    from pandera import DataFrameSchema, Column, Check, Float, String
+
+    df = pd.DataFrame({
+        "height": [5.6, 6.4, 4.0, 7.1],
+        "group": ["A", "B", "A", "B"],
+    })
+
+    schema = DataFrameSchema({
+        "height": Column(
+            Float,
+            Check(lambda g: g["A"].mean() < g["B"].mean(), groupby="group")
+        ),
+        "group": Column(String)
+    })
+
+    schema.validate(df)
+
+
+The equivalent wide-form schema would look like this:
+
+.. code:: python
+
+    import pandas as pd
+    from pandera import DataFrameSchema, Column, Check, Float
+
+    df = pd.DataFrame({
+        "height_A": [5.6, 4.0],
+        "height_B": [6.4, 7.1],
+    })
+
+    schema = DataFrameSchema(
+        columns={
+            "height_A": Column(Float),
+            "height_B": Column(Float),
+        },
+        # define checks at the DataFrameSchema-level
+        checks=Check(lambda df: df["height_A"].mean() < df["height_B"].mean())
+    )
+
+    schema.validate(df)
diff --git a/docs/source/hypothesis.rst b/docs/source/hypothesis.rst
@@ -28,9 +28,9 @@ can be made about the relationships between ``Column``\s.
     schema = DataFrameSchema({
         "height_in_feet": Column(Float, [
             Hypothesis.two_sample_ttest(
+                sample1="M",
+                sample2="F",
                 groupby="sex",
-                group1="M",
-                group2="F",
                 relationship="greater_than",
                 alpha=0.05,
                 equal_var=True),
@@ -43,19 +43,100 @@ can be made about the relationships between ``Column``\s.
     #] SchemaError: <Schema Column: 'height_in_feet' type=float64> failed series validator 0: _check_fn
 
 
-``Hypothesis`` also supports passing custom ``test``'s and ``relationship``'s. This
-enables the user to use non-built in functions as follows:
+``Hypothesis`` also supports passing custom ``test`` and ``relationship``
+functions. The ``test`` function takes as input one or more array-like objects
+and should return a ``stat``, which is the test statistic, and a ``pvalue`` for
+assessing statistical significance. It also takes keyword arguments supplied
+by the ``test_kwargs`` dict when initializing a ``Hypothesis`` object.
+
+The ``relationship`` function should take all of the outputs of ``test`` as
+positional arguments, in addition to keyword arguments supplied by the
+``relationship_kwargs`` dict.
+
+This enables the user to use non-built-in functions. Here is an implementation
+of the two-sample t-test that uses the
+`scipy implementation <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html>`_:
 
 .. code:: python
 
     schema = DataFrameSchema({
         "height_in_feet": Column(Float, [
             Hypothesis(
                 test=stats.ttest_ind,
+                samples=["M", "F"],
                 groupby="sex",
-                groups=["M", "F"],
-                relationship="greater_than",
-                relationship_kwargs={"alpha":0.5, "equal_var": True}),
+                relationship=lambda stat, pvalue, alpha=0.01: (
+                    stat > 0 and pvalue / 2 < alpha
+                ),
+                relationship_kwargs={"alpha": 0.5}
+            )
         ]),
         "sex": Column(String)
     })
+
+
+Wide Hypotheses
+---------------
+
+``pandera`` is primarily designed to operate on long-form data (commonly known
+as `tidy data <https://vita.had.co.nz/papers/tidy-data.pdf>`_), where each row
+is an observation and columns are attributes associated with the observation.
+
+However, ``pandera`` also supports hypothesis testing on wide-form data to
+operate across columns in a ``DataFrame``.
+
+For example, if you want to make assertions about ``height`` across two groups,
+the tidy dataset and schema might look like this:
+
+.. code:: python
+
+    import pandas as pd
+    from pandera import DataFrameSchema, Column, Check, Hypothesis, Float, String
+
+    df = pd.DataFrame({
+        "height": [5.6, 6.4, 4.0, 7.1],
+        "group": ["A", "B", "A", "B"],
+    })
+
+    schema = DataFrameSchema({
+        "height": Column(
+            Float, Hypothesis.two_sample_ttest(
+                "A", "B",
+                groupby="group",
+                relationship="less_than",
+                alpha=0.5
+            )
+        ),
+        "group": Column(String, Check(lambda s: s.isin(["A", "B"])))
+    })
+
+    schema.validate(df)
+
+
+The equivalent wide-form schema would look like this:
+
+.. code:: python
+
+    import pandas as pd
+    from pandera import DataFrameSchema, Column, Hypothesis, Float
+
+    df = pd.DataFrame({
+        "height_A": [5.6, 4.0],
+        "height_B": [6.4, 7.1],
+    })
+
+    schema = DataFrameSchema(
+        columns={
+            "height_A": Column(Float),
+            "height_B": Column(Float),
+        },
+        # define checks at the DataFrameSchema-level
+        checks=Hypothesis.two_sample_ttest(
+            "height_A", "height_B",
+            relationship="less_than",
+            alpha=0.5
+        )
+    )
+
+    schema.validate(df)
+
diff --git a/pandera/__init__.py b/pandera/__init__.py
@@ -1,7 +1,26 @@
-from .pandera import DataFrameSchema, Column, Index, MultiIndex, PandasDtype, \
-    SeriesSchema, SchemaError, SchemaInitError, Check, check_input, \
-    check_output, Bool, DateTime, Category, Float, Int, Object, String, \
-    Timedelta, Hypothesis
+from .pandera import (
+    DataFrameSchema,
+    Column,
+    Index,
+    MultiIndex,
+    PandasDtype,
+    SeriesSchema,
+    SchemaError,
+    SchemaInitError,
+    SchemaDefinitionError,
+    Check,
+    check_input,
+    check_output,
+    Bool,
+    DateTime,
+    Category,
+    Float,
+    Int,
+    Object,
+    String,
+    Timedelta,
+    Hypothesis,
+    )
 
 
 __version__ = "0.1.2"