In [4]:
import pandas as pd
import whylogs as why

# Tiny toy dataset: one string column, one integer column, one float column.
data = {
    "animal": ["cat", "hawk", "snake", "cat"],
    "legs": [4, 2, 0, 4],
    "weight": [4.3, 1.8, 1.3, 4.1],
}
df = pd.DataFrame(data)

# Log the frame with whylogs and keep the resulting profile view.
profile_view = why.log(df).profile().view()

# Show which metric columns the freshly logged profile exposes.
print("Profile view #1\n", profile_view.to_pandas().columns, sep="\n")

# Merge the view with itself and list the metric columns again, so the two
# column listings can be compared side by side.
merged_view = profile_view.merge(profile_view)
print("\nProfile view #2\n", merged_view.to_pandas().columns, sep="\n")

Profile view #1

Index(['counts/n', 'counts/null', 'types/integral', 'types/fractional',
       'types/boolean', 'types/string', 'types/object', 'cardinality/est',
       'cardinality/upper_1', 'cardinality/lower_1',
       'frequent_items/frequent_strings', 'type', 'distribution/mean',
       'distribution/stddev', 'distribution/n', 'distribution/max',
       'distribution/min', 'distribution/q_01', 'distribution/q_05',
       'distribution/q_10', 'distribution/q_25', 'distribution/median',
       'distribution/q_75', 'distribution/q_90', 'distribution/q_95',
       'distribution/q_99', 'ints/max', 'ints/min'],
      dtype='object')

Profile view #2

Index(['types/integral', 'types/fractional', 'types/boolean', 'types/string',
       'types/object', 'cardinality/est', 'cardinality/upper_1',
       'cardinality/lower_1', 'counts/n', 'counts/null',
       'frequent_items/frequent_strings', 'type', 'distribution/mean',
       'distribution/stddev', 'distribution/n', 'distribution/max',
 

In [None]:
from whylogs.core.constraints import ConstraintsBuilder, MetricConstraint, MetricsSelector

## Distribution Metrics Constraints

In [None]:
def greater_than_number(column_name, number):
    """Build a constraint requiring the column minimum to exceed ``number``.

    The comparison is strict: a minimum equal to ``number`` does not pass.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    number : float
        Reference value the column minimum is compared against

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "distribution" metric
    """

    def min_above_reference(metric):
        # Passes only when the smallest observed value is above the reference.
        return metric.min > number

    selector = MetricsSelector(column_name=column_name, metric_name="distribution")
    return MetricConstraint(
        name=f"{column_name} greater than number {number}",
        condition=min_above_reference,
        metric_selector=selector,
    )

def mean_between_range(column_name, lower, upper):
    """Build a constraint requiring the column mean to lie in ``[lower, upper]``.

    Both bounds are inclusive.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    lower : int
        Lower bound of the accepted range
    upper : int
        Upper bound of the accepted range

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "distribution" metric
    """

    def mean_within_bounds(metric):
        # ``avg`` is read from the selected distribution metric.
        return lower <= metric.avg <= upper

    selector = MetricsSelector(column_name=column_name, metric_name="distribution")
    return MetricConstraint(
        name=f"{column_name} mean between {lower} and {upper} (inclusive)",
        condition=mean_within_bounds,
        metric_selector=selector,
    )


def lower_than_number(column_name, number):
    """Build a constraint requiring the column maximum to stay below ``number``.

    The comparison is strict: a maximum equal to ``number`` does not pass.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    number : float
        Reference value the column maximum is compared against

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "distribution" metric
    """

    def max_below_reference(metric):
        # TODO (carried over from the original): check for nans, Nones
        return metric.max < number

    selector = MetricsSelector(column_name=column_name, metric_name="distribution")
    return MetricConstraint(
        name=f"{column_name} lower than number {number}",
        condition=max_below_reference,
        metric_selector=selector,
    )

def stddev_between_range(column_name,lower,upper):
    """Build a constraint requiring the column's standard deviation to lie in ``[lower, upper]``.

    Both bounds are inclusive.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    lower : int
        Lower bound of the accepted range
    upper : int
        Upper bound of the accepted range

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "distribution" metric
    """

    def stddev_within_bounds(metric):
        # ``stddev`` is read from the selected distribution metric.
        return lower <= metric.stddev <= upper

    selector = MetricsSelector(column_name=column_name, metric_name="distribution")
    return MetricConstraint(
        name=f"{column_name} standard deviation between {lower} and {upper} (inclusive)",
        condition=stddev_within_bounds,
        metric_selector=selector,
    )


def quantile_between_range(column_name:str, quantile_value: float, lower: float,upper: float):
    """Q-th quantile value must be within the range defined by lower and upper boundaries.

    Both bounds are inclusive.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    quantile_value : float
        Quantile expressed as a fraction in [0, 1]. E.g. median is equal to
        quantile_value=0.5
    lower : float
        Lower bound of defined range
    upper : float
        Upper bound of the value range

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "distribution" metric

    Raises
    ------
    ValueError
        If ``quantile_value`` is outside [0, 1]
    """
    # Fail early on a nonsensical fraction instead of at validation time.
    if not 0.0 <= quantile_value <= 1.0:
        raise ValueError(
            "quantile_value must be between 0 and 1, got {}".format(quantile_value)
        )

    def quantile_within_bounds(metric):
        # NOTE(review): assumes the distribution metric exposes its KLL sketch
        # as ``metric.kll.value`` with a ``get_quantile(fraction)`` method --
        # confirm against the installed whylogs / datasketches version.
        observed = metric.kll.value.get_quantile(quantile_value)
        return lower <= observed <= upper

    return MetricConstraint(
        name=f"{column_name} {quantile_value}-th quantile value between {lower} and {upper} (inclusive)",
        condition=quantile_within_bounds,
        metric_selector=MetricsSelector(column_name=column_name, metric_name="distribution"),
    )

### Examples - Distribution Metrics Constraints

## Frequent Items/Frequent Strings Constraints

In [None]:
def frequent_strings_in_reference_set(column_name:str,reference_set:dict):
    """Build a constraint requiring every frequent string to belong to ``reference_set``.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to.
    reference_set : dict
        Collection of accepted values; membership is tested with ``in``

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "frequent_items" metric
    """
    from whylogs.core.configs import SummaryConfig

    def every_label_allowed(metric):
        # Summarise the metric and inspect each reported frequent string.
        summary = metric.to_summary_dict(SummaryConfig())
        for item in summary['frequent_strings']:
            if item.value not in reference_set:
                return False
        return True

    selector = MetricsSelector(metric_name='frequent_items', column_name=column_name)
    return MetricConstraint(
        name=f"{column_name} values in set {reference_set}",
        condition=every_label_allowed,
        metric_selector=selector,
    )

def n_most_common_items_in_set(column_name:str, n:int, reference_set: dict):
    """ n most common items must be in defined reference_set

    Only the first ``n`` entries of the metric's frequent-strings summary are
    checked; any further entries are ignored.

    Parameters
    ----------
    column_name : str
        Columns the constraint is applied to.
    n : int
        n most common items or strings. Must be at least 1.
    reference_set : dict
        Reference set for applying the constraint; membership is tested
        with ``in``

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "frequent_items" metric

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1
    """
    from whylogs.core.configs import SummaryConfig

    # A non-positive n would either check nothing or, when negative, slice
    # from the wrong end of the list.
    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    def top_n_labels_allowed(metric):
        frequent = metric.to_summary_dict(SummaryConfig())['frequent_strings']
        # NOTE(review): assumes the summary lists items most-frequent first --
        # confirm against the installed whylogs version.
        return all(item.value in reference_set for item in frequent[:n])

    return MetricConstraint(
        name=f"{column_name} {n}-most common items in set {reference_set}",
        condition=top_n_labels_allowed,
        metric_selector=MetricsSelector(metric_name='frequent_items', column_name=column_name),
    )



### Examples - Frequent Items/Frequent Strings Constraints

## Counters Constraints

In [4]:
def total_value_below_number(column_name:str, number:int):
    """Total number of values in the column must be below given number.

    The check reads the ``n`` component of the column's "counts" metric and
    is strict: a count equal to ``number`` does not pass.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    number : int
        reference value for applying the constraint

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "counts" metric
    """

    constraint = MetricConstraint(
        # The name previously said "null values", copied from the null-count
        # constraint, which made reports for this check misleading.
        name="total values of {} lower than number {}".format(column_name,number),
        condition=lambda x: x.n.value < number,
        metric_selector=MetricsSelector(column_name=column_name, metric_name="counts"),
    )
    return constraint


### Null values

In [None]:

def null_values_below_number(column_name:str, number:int):
    """Build a constraint requiring the column's null count to stay below ``number``.

    The comparison is strict: a null count equal to ``number`` does not pass.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    number : float
        Reference value the null count is compared against

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "counts" metric
    """

    def null_count_below_reference(metric):
        # ``null`` is the null-count component of the selected counts metric.
        return metric.null.value < number

    selector = MetricsSelector(column_name=column_name, metric_name="counts")
    return MetricConstraint(
        name=f"null values of {column_name} lower than number {number}",
        condition=null_count_below_reference,
        metric_selector=selector,
    )

    
def null_pct_below_number(column_name:str, number:float):
    """Fraction of null values must be below given number.

    The value compared against ``number`` is ``null count / total count``,
    i.e. a fraction (0.25 means 25%), not a value scaled to 0-100. The
    comparison is strict. A column with zero records is treated as having a
    null fraction of 0 instead of raising ``ZeroDivisionError``.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    number : float
        reference value for applying the constraint, expressed as a fraction

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "counts" metric
    """

    def null_fraction_below_reference(x):
        total = x.n.value
        # Guard the division: an empty column has no nulls to report.
        null_fraction = x.null.value / total if total else 0.0
        return null_fraction < number

    constraint = MetricConstraint(
        name="null percentage of {} lower than number {}".format(column_name,number),
        condition=null_fraction_below_reference,
        metric_selector=MetricsSelector(column_name=column_name, metric_name="counts"),
    )
    return constraint

### Examples - Counters Constraints

## Cardinality Constraints

In [None]:


def distinct_number_in_range(column_name:str, lower: int, upper:int):
    """Number of distinct categories must be between lower and upper values (inclusive).

    The distinct count is taken from the column's "cardinality" metric.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    lower : int
        Lower bound of defined range
    upper : int
        Upper bound of the value range

    Returns
    -------
    MetricConstraint
        Constraint evaluated on the column's "cardinality" metric
    """

    def distinct_count_within_bounds(metric):
        # NOTE(review): assumes the cardinality metric exposes its HLL sketch
        # as ``metric.hll.value`` with ``get_estimate()``, and that the result
        # is an estimate rather than an exact count -- confirm against the
        # installed whylogs version.
        estimate = metric.hll.value.get_estimate()
        return lower <= estimate <= upper

    return MetricConstraint(
        name=f"{column_name} distinct values estimate between {lower} and {upper} (inclusive)",
        condition=distinct_count_within_bounds,
        metric_selector=MetricsSelector(column_name=column_name, metric_name="cardinality"),
    )

def percentage_distinct_in_range(column_name:str, lower: float, upper:float):
    """Placeholder: share of distinct values relative to the total count must lie in ``[lower, upper]``.

    Not implemented yet; calling it currently returns ``None`` and builds no
    constraint.

    Parameters
    ----------
    column_name : str
        Column the constraint is applied to
    lower : float
        Lower bound of the accepted range (inclusive)
    upper : float
        Upper bound of the accepted range (inclusive)
    """
    # TODO: implement.
    # NOTE(review): this presumably needs both the distinct-count and the
    # total-count metrics of the column -- confirm how to select both.
    return None

### Examples - Cardinality Constraints