# Simple constraint examples and usage

#### Dedicated helper functions exist for the most common constraints. Use ValueConstraint and SummaryConstraint directly only when you need a custom constraint that none of the helpers provides.

In [1]:
from whylogs import get_or_create_session
from whylogs.util.protobuf import message_to_json

# create session
session = get_or_create_session()

WARN: Missing config


In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
from tabulate import tabulate

def indent(txt, spaces=4):
    """Return ``txt`` with every line prefixed by ``spaces`` blank characters.

    The text is split with ``str.splitlines``, so a trailing newline is not
    preserved and an empty string yields an empty string.
    """
    padding = " " * spaces
    padded_lines = [padding + line for line in txt.splitlines()]
    return "\n".join(padded_lines)

def _print_constraint_rows(rows):
    """Print the ``(test_name, total_run, failed)`` rows of one report section.

    When the first test name is longer than 80 characters the rows are printed
    one field per line, because a table would become unreadably wide;
    otherwise they are rendered as an indented plain-text table.
    """
    if rows and len(rows[0][0]) > 80:
        # print every row, not only the first one, so no constraint result is lost
        for row in rows:
            print(f"\ntest_name:\t{row[0]}\n")
            print(f"total_run:\t{row[1]}\n")
            print(f"failed:\t\t{row[2]}\n")
    else:
        # an empty row list also ends up here instead of raising IndexError
        print(indent(tabulate(rows, tablefmt="plain", headers=['test_name', 'total_run', 'failed'])))


def format_report(r):
    """Print a constraints report in a readable, mostly tabular form.

    ``r`` is a list of report entries, e.g. the value passed on from
    ``profile.apply_summary_constraints()`` or ``dc.report()``:

    * two-element entries ``(feature_name, rows)`` hold the single column
      constraints of one feature;
    * any other entry is a result row whose first element is the test name;
      names starting with ``"table"`` are listed as table shape constraints,
      names starting with ``"multi column"`` as multi column constraints.

    Nothing is returned; the report is written to standard output.
    """
    # all the single column constraints
    by_feature = [entry for entry in r if len(entry) == 2]
    # table shape constraints
    table_shape = [entry for entry in r if len(entry) != 2 and entry[0].startswith("table")]
    # multi column constraints
    multi_column = [entry for entry in r if len(entry) != 2 and entry[0].startswith("multi column")]

    if by_feature:
        print("Constraint failures by feature - ")
    for feature, rows in by_feature:
        print(f"{feature}:")
        _print_constraint_rows(rows)

    if table_shape:
        print()
        print("Table shape constraint failures -")
        print(indent(tabulate(table_shape, tablefmt="plain", headers=['test_name', 'total_run', 'failed'])))

    if multi_column:
        print()
        print("Multi column constraint failures -")
        _print_constraint_rows(multi_column)
        

## Between summary constraints on summary fields like: stddev, min, max, mean...

In [4]:
from whylogs.core.statistics.constraints import (
    maxBetweenConstraint,
    maxLessThanEqualConstraint,
    meanBetweenConstraint,
    minBetweenConstraint,
    minGreaterThanEqualConstraint,
    stddevBetweenConstraint,
    stringLengthBetweenConstraint,
    stringLengthEqualConstraint,
    quantileBetweenConstraint,
    DatasetConstraints,
    SummaryConstraints,
)

In [5]:
# define the specific types of constraints
# the ranges of the between constraints include the bounding values

# check if the maximum value of the column is in the range [5, 10.8]
max_between_values = maxBetweenConstraint(lower_value=5, upper_value=10.8) 
# check if the maximum value of the column is less than or equal to 100
max_less_than_equal_value = maxLessThanEqualConstraint(value=100)
# check if the mean of the column is in the range [1.2, 1.6] 
mean_between_values = meanBetweenConstraint(lower_value=1.2, upper_value=1.6)
# check if the minimum value of the column is in the range [0.1, 0.5]
min_between_values = minBetweenConstraint(lower_value=0.1, upper_value=0.5)
# check if the minimum value of the column is greater than or equal to 1
min_greater_than_equal_value = minGreaterThanEqualConstraint(value=1)
# check if the standard deviation of the column is in the range [2.3, 5.4]
stddev_between_values = stddevBetweenConstraint(lower_value=2.3, upper_value=5.4)
# check if the 0.15 quantile value is in the range [2, 4.3]
quantile_between_values = quantileBetweenConstraint(quantile_value = 0.15, lower_value=2, upper_value=4.3) 

# example data frame with columns "col1","col2", "col3"
# you can also read an existing data set using pandas, or as a numpy array
df = pd.DataFrame({
    "col1": [4, 5, 6, 7],
    "col2": [0, 1, 2, 3],
    "col3": [50, 60, 80, 110]
})

# bind the max between, standard deviation between and min greater-than-or-equal constraints to the dataframe column named "col1"
# bind the mean between, min between and quantile between constraints to "col2", and the max less-than-or-equal constraint to "col3"
# you can add multiple summary constraints for each column
dc = DatasetConstraints(None, summary_constraints={
    "col1": [max_between_values, stddev_between_values, min_greater_than_equal_value], 
    "col2": [mean_between_values, min_between_values, quantile_between_values],
    "col3": [max_less_than_equal_value]
})  

# logging the dataframe creates a profile with summary statistics for the data set
# the data set profile contains column profiles with summary statistics for each column present in the data set
profile = session.log_dataframe(df, "test.data", constraints=dc)

# serialize the DatasetConstraints to JSON
dc_json = json.loads(dc.to_json())
col1_constraints = json.dumps(dc_json['summaryConstraints']['col1']['constraints'], indent=4)
col2_constraints = json.dumps(dc_json['summaryConstraints']['col2']['constraints'], indent=4)
col3_constraints = json.dumps(dc_json['summaryConstraints']['col3']['constraints'], indent=4)

print(f"Constraints for column 'col1': \n{col1_constraints}\n")
print(f"Constraints for column 'col2': \n{col2_constraints}\n")
print(f"Constraints for column 'col3': \n{col3_constraints}\n")

Constraints for column 'col1': 
[
    {
        "name": "summary max BTWN 5 and 10.8",
        "firstField": "max",
        "op": "BTWN",
        "between": {
            "lowerValue": 5.0,
            "upperValue": 10.8
        },
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "standard deviation is between 2.3 and 5.4",
        "firstField": "stddev",
        "op": "BTWN",
        "between": {
            "lowerValue": 2.3,
            "upperValue": 5.4
        },
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "minimum is greater than or equal to 1",
        "firstField": "min",
        "value": 1.0,
        "op": "GE",
        "verbose": false,
        "quantileValue": 0.0
    }
]

Constraints for column 'col2': 
[
    {
        "name": "mean is between 1.2 and 1.6",
        "firstField": "mean",
        "op": "BTWN",
        "between": {
            "lowerValue": 1.2,
            "upperValue": 1.6
        },


#### Summary constraints are applied with apply_summary_constraints on the DatasetProfile.

In [6]:
# summary constraints must be applied on the dataset profile, only after some data has been logged
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
col1:
    test_name                                    total_run    failed
    summary max BTWN 5 and 10.8                          1         0
    standard deviation is between 2.3 and 5.4            1         1
    minimum is greater than or equal to 1                1         0
col2:
    test_name                                      total_run    failed
    mean is between 1.2 and 1.6                            1         0
    minimum is between 0.1 and 0.5                         1         1
    0.15-th quantile value is between 2 and 4.3            1         1
col3:
    test_name                               total_run    failed
    maximum is less than or equal to 100            1         1


As we can see, the **mean BTWN** constraint on `col2` passes and the **stddev BTWN** constraint on `col1` fails, as they should.

## Summary constraints for distinct, unique and most common values in a column

### Distinct values in a column

In [7]:
from whylogs.core.statistics.constraints import (
    distinctValuesInSetConstraint, distinctValuesEqualSetConstraint, distinctValuesContainSetConstraint )

In [8]:
in_set = distinctValuesInSetConstraint(reference_set=set(range(1, 10)))
eq_set = distinctValuesEqualSetConstraint(reference_set={'a', 'a', 'a'})
contain_set = distinctValuesContainSetConstraint(reference_set={0, 1})

#### Applying summary constraints sent as an argument to apply_summary_constraints function on the same profile as before!

In [9]:
report = profile.apply_summary_constraints({'col1': SummaryConstraints([in_set, eq_set]), 
                                           'col2': SummaryConstraints([contain_set])})
format_report(report)

Constraint failures by feature - 
col1:
    test_name                                             total_run    failed
    distinct values are in {1, 2, 3, 4, 5, 6, 7, 8, 9}            1         0
    distinct values are equal to the set {'a'}                    1         1
col2:
    test_name                                 total_run    failed
    distinct values contain the set {0, 1}            1         0


### Unique column value count and proportion constraints

In [10]:
from whylogs.core.statistics.constraints import (
    columnUniqueValueCountBetweenConstraint,
    columnUniqueValueProportionBetweenConstraint,
)

In [11]:
# create a data set with customers, the country they live in, and their spending
customer_data = pd.DataFrame({
    "customer": ["c1", "c2", "c3", "c4", "c5", "c6"],
    "country": ["Germany", "Italy", "Germany", "USA", "Germany", "UK"],
    "spending": [1200, 500, 700, 1500, 300, None]
})

In [12]:
# check if there are between 1 and 5 unique values in the specific column
unique_value_count_between = columnUniqueValueCountBetweenConstraint(lower_value=1, upper_value=5)
# check if the proportion of unique values in the set is between 0.3 and 0.45 inclusive
unique_value_proportion_between = columnUniqueValueProportionBetweenConstraint(lower_fraction=0.3, upper_fraction=0.45)
# bind both constraints to the "country" column
dc = DatasetConstraints(None, summary_constraints={"country": [unique_value_count_between, unique_value_proportion_between]})

# log the customer_data dataframe to obtain the profile
profile = session.log_dataframe(customer_data, 'test2.data', constraints=dc)
# summary constraints must be applied on the profile after the data set has been logged
# "country" holds 4 distinct values in 6 rows, so the count constraint passes;
# presumably the proportion is 4/6 (about 0.67), which is why the proportion constraint is reported as failed
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
country:
    test_name                                              total_run    failed
    number of unique values is between 1 and 5                     1         0
    proportion of unique values is between 0.3 and 0.45            1         1


### Column most common value in set constraint

In [13]:
from whylogs.core.statistics.constraints import columnMostCommonValueInSetConstraint

In [14]:
# check if the most common value in the column is in the set {"Germany", "Italy"}
most_common_value_in_set = columnMostCommonValueInSetConstraint(value_set={"Germany", "Italy"})
# bind the constraint to the column named "country"
summary_constraint = {"country": [most_common_value_in_set]}
# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(summary_constraint)
format_report(report)

Constraint failures by feature - 
country:
    test_name                                       total_run    failed
    most common value is in {'Germany', 'Italy'}            1         0


### Column values not null

In [15]:
from whylogs.core.statistics.constraints import columnValuesNotNullConstraint

In [16]:
# check if all values in the column are non-null
customer_value_not_null = columnValuesNotNullConstraint()
spending_value_not_null = columnValuesNotNullConstraint()
# bind the constraint to the column, there are no null values in the customer column, but there is one in the spending column
summary_constraint = {"customer": [customer_value_not_null], "spending": [spending_value_not_null]}
# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(summary_constraint)

format_report(report)

Constraint failures by feature - 
customer:
    test_name                          total_run    failed
    does not contain missing values            1         0
spending:
    test_name                          total_run    failed
    does not contain missing values            1         1


### Missing values proportion constraint

In [17]:
from whylogs.core.statistics.constraints import missingValuesProportionBetweenConstraint

In [18]:
# check if the proportion of the missing values is between 0.0 % and 1.0 %
customer_mvpbc = missingValuesProportionBetweenConstraint(lower_fraction=0.0, upper_fraction=0.01) # no missing values in the "customer" column so this constraint passes

# check if the proportion of the missing values is between 0.0 % and 17.0 %
spending_mvpbc = missingValuesProportionBetweenConstraint(lower_fraction=0.0, upper_fraction=0.17) # 1 of 6 missing values in the "spending" column, passes as well

# check if the proportion of the missing values is between 10.0 % and 15.0 %
spending_mvpbc2 = missingValuesProportionBetweenConstraint(lower_fraction=0.1, upper_fraction=0.15) # 1 of 6 is missing, resulting in a missing proportion of 16.6667 %, this one fails

# bind the constraints to the columns
summary_constraint = {"customer": [customer_mvpbc], "spending": [spending_mvpbc, spending_mvpbc2]}
# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(summary_constraint)

format_report(report)

Constraint failures by feature - 
customer:
    test_name                                             total_run    failed
    missing values proportion is between 0.0% and 1.0%            1         0
spending:
    test_name                                               total_run    failed
    missing values proportion is between 0.0% and 17.0%             1         0
    missing values proportion is between 10.0% and 15.0%            1         1


No missing values in the **"customer"** column so the constraint requiring missing proportion between 0% and 1% passes.

Only 1 of 6 missing values in the **"spending"** column, the constraint requiring missing proportion between 0% and 17% passes as well

This time 1 of 6 missing for the **"spending"** column is too much, resulting in a missing proportion of 16.6667%, but the constraint requires missing proportion of 10% to 15%, resulting in a fail.

### Column value type equals or is in set constraint

In [19]:
from whylogs.core.statistics.constraints import (
    columnValuesTypeEqualsConstraint,
    columnValuesTypeInSetConstraint
)
from whylogs.proto import InferredType

In [20]:
# check if the values of the specified column are of type string
column_values_type_equals_string = columnValuesTypeEqualsConstraint(expected_type=InferredType.Type.STRING)
# check if the values of the specified column are either fractional or integral numbers
type_set = {InferredType.Type.FRACTIONAL, InferredType.Type.INTEGRAL}
column_value_types_in_set = columnValuesTypeInSetConstraint(type_set=type_set, verbose=True)

column_type_summary_constraint = {
    "country": [column_values_type_equals_string],
    "spending": [column_value_types_in_set]
}

# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(column_type_summary_constraint)
# should not have failures since the country column type is string, and the spending column contains numbers
format_report(report)

Constraint failures by feature - 
country:
    test_name                              total_run    failed
    type of the column values is STRING            1         0
spending:
    test_name                                                     total_run    failed
    type of the column values is in {'FRACTIONAL', 'INTEGRAL'}            1         0


# Column values in set

In [21]:
from whylogs.core.statistics.constraints import columnValuesInSetConstraint

In [22]:
student_grades = pd.DataFrame({
    'student_id': [1, 5, 15, 16, 22],
    'grade': ['C', 'C', 'A', '/', 'B']
})

val_set = {'A', 'B', 'C', 'E', 'F'}  # valid grades
column_values_in_set = columnValuesInSetConstraint(value_set=val_set)

dc = DatasetConstraints(None, value_constraints={
    "grade": [column_values_in_set], 
})

# the value constraints are applied at the time of logging the dataframe
profile = session.log_dataframe(student_grades, "test.data", constraints=dc)

# out of the five students' grades we expect to see one failure for the '/' unknown grade
# the total number of runs of the constraint should equal the number of values in the column
format_report(dc.report())

Constraint failures by feature - 
grade:
    test_name                                  total_run    failed
    values are in {'A', 'F', 'B', 'C', 'E'}            5         1


# Regex matching constraints

### String length value constraints using regex

In [23]:
from whylogs.core.statistics.constraints import stringLengthEqualConstraint, stringLengthBetweenConstraint
df = pd.DataFrame(
    [
        {"str1": "length7"},
        {"str1": "length_8"},
        {"str1": "length__9"},
        {"str1": "a       10"},
        {"str1": "11        b"},
        {"str1": '(*&^%^&*(24!@_+>:|}?><"\\'},
        {"str1": "1b34567"},
    ]
)
length_constraint7 = stringLengthEqualConstraint(length=7)
length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10)
length_constraints = [length_constraint7, length_constraint7to10]
dc = DatasetConstraints(None, value_constraints={"str1": length_constraints})

profile = session.log_dataframe(df, 'test2.data', constraints=dc)
format_report(dc.report())

Constraint failures by feature - 
str1:
    test_name                                          total_run    failed
    length of the string values is equal to 7                  7         5
    length of the string values is between 7 and 10            7         2


### Email matching constraint

In [24]:
from whylogs.core.statistics.constraints import containsEmailConstraint

In [25]:
customer_emails = pd.DataFrame([
    {"email": r"abc's@gmail.com"},  # valid
    {"email": r'"aVrrR Test \@"@gmail.com'},  # valid (if wrapped in quotes, emails can contain special characters)
    {"email": r"abc..q12@example.us"},  # invalid (two consecutive dots)
    {"email": r'"sdsss\d"@gmail.com'},  # valid
    {"email": r"customer/department=shipping?@example-another.some-other.us"},  # valid
    {"email": r".should_fail@yahoo.com"},  # invalid (must not start with dot)
    {"email": r"some.@a.com"},  # invalid (must not contain a dot directly before the @ symbol)
    {"email": r"abs@yahoo."},  # invalid (must not end with a dot)
])

# use the predefined email regex from whylogs
default_contains_email_constraint = containsEmailConstraint()

dc = DatasetConstraints(None, value_constraints={"email": [default_contains_email_constraint]})

profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)
# we expect 4 of the 8 runs to be failures
format_report(dc.report())

Constraint failures by feature - 
email:
    test_name                                      total_run    failed
    column values match the email regex pattern            8         4


In [26]:
# you can provide your own email regex and check the values against it
custom_contains_email_constraint = containsEmailConstraint(regex_pattern = r"\S+@\S+")
dc = DatasetConstraints(None, value_constraints={"email": [custom_contains_email_constraint]})

profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)
# now we expect 1 of the 8 runs to be failures, the email that contains white spaces
format_report(dc.report())
# running the containsEmailConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
email:
    test_name                                      total_run    failed
    column values match the email regex pattern            8         1


### Credit Card matching constraint

In [27]:
from whylogs.core.statistics.constraints import containsCreditCardConstraint

In [28]:
credit_cards = pd.DataFrame(
    [
        {"credit_card": "3714-496353-98431"},  # amex
        {"credit_card": "3787 344936 71000"},  # amex
        {"credit_card": "3056 930902 5904"},  # diners club
        {"credit_card": "3065 133242 2899"},  # invalid
        {"credit_card": "3852-000002-3237"},  # diners club
        {"credit_card": "6011 1111 1111 1117"},  # discover
        {"credit_card": "6011-0009-9013-9424"},  # discover
        {"credit_card": "3530 1113 3330 0000"},  # jcb
        {"credit_card": "3566-0020-2036-0505"},  # jcb
        {"credit_card": "5555 5555 5555 4444"},  # master card
        {"credit_card": "5105 1051 0510 5100"},  # master card
        {"credit_card": "4111 1111 1111 1111"},  # visa
        {"credit_card": "4012 8888 8888 1881"},  # visa
        {"credit_card": "4222-2222-2222-2222"},  # visa
        {"credit_card": "1111-1111-1111-1111"},  # invalid
        {"credit_card": "a4111 1111 1111 1111b"},  # invalid
        {"credit_card": "4111111111111111"},  # visa
        {"credit_card": 12345},  # invalid
        {"credit_card": "absfcvs"},  # invalid
    ]
)

default_credit_card_constraint = containsCreditCardConstraint()
dc = DatasetConstraints(None, value_constraints={"credit_card": [default_credit_card_constraint]})

profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)
# now we expect 5 of the 19 runs to be failures, the invalid credit cards
format_report(dc.report())

Constraint failures by feature - 
credit_card:
    test_name                                            total_run    failed
    column values match the credit card regex pattern           19         5


In [29]:
# you can provide your own credit card regex and check the values against it
custom_credit_card_constraint = containsCreditCardConstraint(regex_pattern = r"^(?:[0-9]{4}[\s-]?){3,4}$")
dc = DatasetConstraints(None, value_constraints={"credit_card": [custom_credit_card_constraint]})

profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)
# now more valid credit cards are being reported as failures
format_report(dc.report())
# running the containsCreditCardConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
credit_card:
    test_name                                            total_run    failed
    column values match the credit card regex pattern           19         8


### SSN regex matching constraint

In [30]:
from whylogs.core.statistics.constraints import containsSSNConstraint

In [31]:
ssn_data = pd.DataFrame([
    {"ssn": "123-01-2335"},  # valid
    {"ssn": "039780012"},  # valid
    {"ssn": "000231324"},  # invalid
    {"ssn": "666781132"},  # invalid
    {"ssn": "926-89-1234"},  # invalid
    {"ssn": "001-01-0001"},  # valid
    {"ssn": "122 23 0001"},  # valid
    {"ssn": "1234-12-123"},  # invalid
])

default_ssn_constraint = containsSSNConstraint()

dc = DatasetConstraints(None, value_constraints={"ssn": [default_ssn_constraint]})

profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)
# now we expect 4 of the 8 runs to be failures, the invalid ssn numbers
format_report(dc.report())

Constraint failures by feature - 
ssn:
    test_name                                    total_run    failed
    column values match the SSN regex pattern            8         4


In [32]:
# you can provide your own ssn regex and check the values against it
custom_ssn_constraint = containsSSNConstraint(regex_pattern = r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$")
dc = DatasetConstraints(None, value_constraints={"ssn": [custom_ssn_constraint]})

profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)
# now more valid ssn numbers are being reported as failures
format_report(dc.report())
# running the containsSSNConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
ssn:
    test_name                                    total_run    failed
    column values match the SSN regex pattern            8         5


### URL regex matching constraint

In [33]:
from whylogs.core.statistics.constraints import containsURLConstraint

In [34]:
web_urls = pd.DataFrame([
    {"url": "http://www.example.com"},  # valid
    {"url": "abc.test.com"},  # valid (without protocol)
    {"url": "abc.w23w.asb#abc?a=2"},  # valid (without protocol)
    {"url": "https://ab.abc.bc"},  # valid
    {"url": "a.b.c"},  # valid
    {"url": "abcd"},  # invalid
    {"url": "123.w23.235"},  # valid
    {"url": "asf://saf.we.12"},  # invalid
    {"url": "12345"},  # invalid
    {"url": "1.2"},  # invalid
        
])

default_url_constraint = containsURLConstraint()
dc = DatasetConstraints(None, value_constraints={"url": [default_url_constraint]})

profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)
# now we expect the 4 invalid urls, out of the 10 in total, to be reported as failures
format_report(dc.report())

Constraint failures by feature - 
url:
    test_name                                    total_run    failed
    column values match the URL regex pattern           10         4


In [35]:
# you can provide your own url regex and check the values against it
custom_url_constraint = containsURLConstraint(regex_pattern = r"^http(s)?:\/\/(www\.)?.+\..+$")
dc = DatasetConstraints(None, value_constraints={"url": [custom_url_constraint]})

profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)
# with the new regex more valid urls are being reported as failures
format_report(dc.report())
# running the containsURLConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
url:
    test_name                                    total_run    failed
    column values match the URL regex pattern           10         8


# Datetime/json constraints

In [36]:
from whylogs.core.statistics.constraints import (
    dateUtilParseableConstraint, jsonParseableConstraint, matchesJsonSchemaConstraint, strftimeFormatConstraint )
df = pd.DataFrame(
        [
            {"str1": "1990-12-1"},  # dateutil valid; strftime valid
            {"str1": "1990/12/1"},
            {"str1": "today is 2019-03-27"},  # dateutil invalid
            {"str1": "Monday at 12:01am"},
            {"str1": "xyz_not_a_date"},  # dateutil invalid
            {"str1": "yesterday"},  # dateutil invalid
            {"str1": {"name": "s", "w2w2": "dgsg", "years": 232, "abc": 1}},  # schema valid
            {"str1": {"name": "s", "w2w2": "dgsg", "years": 232}},  # schema invalid
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": 232, "abc": 1})},  # json valid, schema valid
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": "232", "abc": 1})},  # json valid
            {"str1": "random str : fail everything"},
            {"str1": "2003-12-23"},  # strftime valid, dateutil valid
            {"str1": "2003-15-23"},  # strftime invalid, dateutil invalid
            {"str1": "10-12-32"},  # strftime invalid, dateutil valid
        ]
    )

dateutil_parseable = dateUtilParseableConstraint()
json_parseable = jsonParseableConstraint()

json_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "years": {"type": "integer"},
        },
        "required": ["name", "abc"],
    }
matches_json_schema = matchesJsonSchemaConstraint(json_schema=json_schema)

is_strftime = strftimeFormatConstraint(format="%Y-%m-%d")

apply_func_constraints = [dateutil_parseable, json_parseable, matches_json_schema, is_strftime]


dc = DatasetConstraints(None, value_constraints={"str1": apply_func_constraints})
profile = session.log_dataframe(df, 'test3.data', constraints=dc)

format_report(dc.report())

Constraint failures by feature - 
str1:
    test_name                                                                                                                                                                 total_run    failed
    column values are dateutil parseable                                                                                                                                             14         9
    column values are JSON parseable                                                                                                                                                 14        12
    column values match the provided JSON schema {'type': 'object', 'properties': {'name': {'type': 'string'}, 'years': {'type': 'integer'}}, 'required': ['name', 'abc']}           14        12
    column values are strftime parseable                                                                                                                                             14 

Looking at the comments next to each value in the data set above, we can tell which values pass or fail each constraint. The dateutil constraint has 5 passing values, while the other 3 constraints each have only 2 passing values out of a total of 14.

# Entropy and Distributional Measures

### Entropy

Check if the column entropy is in some interval [a, b]. Works both for discrete and continuous valued columns.

In [37]:
from whylogs.core.statistics.constraints import approximateEntropyBetweenConstraint

#### Entropy on categorical data

In [38]:
pets = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.3, 0.1, 0.2, 0.4])
pet_df = pd.DataFrame({
    "pet": pets
})

In [39]:
# check if the entropy of the pet_df 'pet' column is between 0.7 and 2.1 (the actual value is around 1.85)
entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=2.1)

dc = DatasetConstraints(None, summary_constraints={"pet": [entropy_between_values_constraint]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to complete without failures
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                     total_run    failed
    approximate entropy is between 0.7 and 2.1            1         0


#### Entropy on continuous data

In [40]:
# sample 100 data points from normal distribution with mean 30000 and standard deviation 15000 to represent sales values
sales = np.random.normal(loc=30000, scale=15000, size=100)

sales_df = pd.DataFrame({
    "sales": sales
})

In [41]:
# check if the entropy of the sales_df 'sales' column is between 2.3 and 3.5 (the actual value is around 3.8 to 3.9)
entropy_between_values_constraint_cont = approximateEntropyBetweenConstraint(lower_value=2.3, upper_value=3.5)

# bind the entropy constraint to the "sales" column
dc = DatasetConstraints(None, summary_constraints={"sales": [entropy_between_values_constraint_cont]})

profile = session.log_dataframe(sales_df, 'test.data', constraints=dc)
# now we expect the constraint to fail since entropy is between 3.8 and 3.9
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                                     total_run    failed
    approximate entropy is between 2.3 and 3.5            1         1


### KS Test

The KS Test can only be executed on continuous data.

In [42]:
from whylogs.core.statistics.constraints import parametrizedKSTestPValueGreaterThanConstraint

In [43]:
# this would be the reference distribution, sales 2020
sales_2020 = np.random.normal(loc=30000, scale=15000, size=100)
# this would be the target distribution, sales 2021
sales_2021 = np.random.normal(loc=45000, scale=10000, size=100)
# we want to check if the sales in 2020 have the same distribution as the sales in 2021

In [44]:
# wrap the target sample (sales 2021) in a data frame so it can be logged
sales_2021_df = pd.DataFrame({
    "sales": sales_2021
})

# check if the p-value of the ks test for reference distribution sales_2020 is greater than 0.05 
# if so, we do not reject the null hypothesis
ks_test_p_value_greater_than = parametrizedKSTestPValueGreaterThanConstraint(reference_distribution=sales_2020, p_value=0.05)

dc = DatasetConstraints(None, summary_constraints={"sales": [ks_test_p_value_greater_than]})

profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)
# now we expect the constraint to fail: the two samples were drawn from normal distributions
# with different mean and standard deviation, so the p-value should fall below 0.05
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                                            total_run    failed
    parametrized KS test p-value is greater than 0.05            1         1


The p-value is less than 0.05, which means we can reject the null hypothesis with this confidence level.

### KL Divergence

The KL Divergence constraint is supported for both discrete and continuous variables.

In [45]:
from whylogs.core.statistics.constraints import columnKLDivergenceLessThanConstraint

#### KL Divergence for continuous case

In [46]:
# check if the KL divergence between the logged 'sales' column and the
# reference distribution (sales_2020) is less than the threshold 0.6
kl_divergence_less_than = columnKLDivergenceLessThanConstraint(reference_distribution=sales_2020, threshold=0.6)

dc = DatasetConstraints(None, summary_constraints={"sales": [kl_divergence_less_than]})

profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)
# now we expect the constraint to fail
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                         total_run    failed
    KL Divergence is less than 0.6            1         1


  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))


The KL divergence is not less than the 0.6 threshold, so the constraint fails: the distribution of sales in 2021 diverges too much from the reference distribution of sales in 2020.

#### KL Divergence for discrete case

In [47]:
# create a new reference distribution over the same pet categories, sampled with different probabilities
pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.5, 0.1, 0.2, 0.2])

# check if the KL divergence between the logged 'pet' column and the
# reference distribution (pets_reference) is less than the threshold 0.6
kl_divergence_less_than = columnKLDivergenceLessThanConstraint(reference_distribution=pets_reference, threshold=0.6)

dc = DatasetConstraints(None, summary_constraints={"pet": [kl_divergence_less_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to pass
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                         total_run    failed
    KL Divergence is less than 0.6            1         0


### Chi-Squared Test

The Chi-Squared test constraint is only supported for **categorical** values.

In [48]:
from whylogs.core.statistics.constraints import columnChiSquaredTestPValueGreaterThanConstraint

In [49]:
# create a new reference distribution over the same pet categories, sampled with very skewed
# probabilities (almost every sample is 'rabbit')
pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.01, 0.01, 0.97, 0.01])

# check if the p-value of the Chi-Squared test against pets_reference is greater than 0.05
chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=pets_reference, p_value=0.05)

dc = DatasetConstraints(None, summary_constraints={"pet": [chi_squared_p_value_greater_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to fail since the distributions are different
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                        total_run    failed
    Chi-Squared test p-value is greater than 0.05            1         1


The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal at the 0.05 significance level.

If you don't have a reference distribution for calculating the Chi-Squared test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the **reference_distribution** parameter of the constraint.

In [50]:
# instead of a sampled reference, describe the reference distribution as a mapping
# from each item to its (approximate) frequency, given as a count
reference_dict_pets = {
    'cat': 1,
    'dog': 1,
    'rabbit': 48, 
    'hamster': 1,
}

# check if the p-value of the Chi-Squared test against the reference counts is greater than 0.05
chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=reference_dict_pets, p_value=0.05)

dc = DatasetConstraints(None, summary_constraints={"pet": [chi_squared_p_value_greater_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to fail, since the reference counts are again heavily skewed towards 'rabbit'
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                        total_run    failed
    Chi-Squared test p-value is greater than 0.05            1         1


The p-value is not greater than 0.05, which means that we can reject the null hypothesis that the distributions are equal at the 0.05 significance level.

## Table shape constraints

In [51]:
from whylogs.core.statistics.constraints import (
    numberOfRowsConstraint, columnExistsConstraint, columnsMatchSetConstraint )

# build a single-column dataframe with 14 rows of mixed values (strings, an int, a dict, JSON strings)
df = pd.DataFrame(
        [
            {"str1": "random1"},
            {"str1": "random2"},
            {"str1": "random 4-1"},
            {"str1": "4 random"},
            {"str1": "whylogs rocks!"},
            {"str1": "   "},
            {"str1": 12},
            {"str1": {"name": "s", "w2w2": "dgsg", "years": 232}},
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": 232, "abc": 1})},
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": "232", "abc": 1})},
            {"str1": "random str : fail everything"},
            {"str1": "2003-12-23"},
            {"str1": "2003-15-23"},
            {"str1": "10-12-32"},
        ]
    )

# add a second column holding the values 0 .. len(df) - 1
df['col2'] = range(len(df))

# row-count constraints: one with a deliberately wrong count, one with the actual count
rows = numberOfRowsConstraint(n_rows=len(df)+1) # fail
rows_2 = numberOfRowsConstraint(n_rows=len(df)) # pass

# column-existence constraints: one for a missing column, one for an existing column
column_exist = columnExistsConstraint("this_column_does_not_exist") # fail
column_exist2 = columnExistsConstraint("col2") # pass

# column-set constraints: an unrelated set of names vs. the dataframe's actual set of columns
set1 = {'this', 'is', 'a', 'wrong', 'columns', 'set'}
columns_set = set(df.columns)
columns_match = columnsMatchSetConstraint(set1) # fail
columns_match2 = columnsMatchSetConstraint(columns_set) # pass

table_shape_constraints = [rows, rows_2, column_exist, column_exist2, columns_match, columns_match2]

# table shape constraints are supplied through the table_shape_constraints argument
dc = DatasetConstraints(None, table_shape_constraints=table_shape_constraints)

profile = session.log_dataframe(df, "test.data", constraints=dc)

# apply the table shape constraints to the logged profile and print the report
report = profile.apply_table_shape_constraints()
format_report(report)

### Table shape example 2

In [52]:
# create a logger for a second dataset name, reusing the DatasetConstraints (dc) defined above,
# and log the same dataframe through it
logger = session.logger(dataset_name="test2.data", constraints=dc)
logger.log_dataframe(df)

In [53]:
# apply the table shape constraints to the logger's current profile and print the report
report = logger.profile.apply_table_shape_constraints()
format_report(report)

Logging another dataframe with different DatasetProfile but the same DatasetConstraints

In [54]:
logger.log({"this_column_does_not_exist": 1})  # log a single value for a column that is not present in df

In [55]:
# re-apply the same table shape constraints now that the extra column has been logged
report2 = logger.profile.apply_table_shape_constraints()
format_report(report2)

After logging the column **'this_column_does_not_exist'**, the total row number stays the same, 
so the numberOfRowsConstraint still passes.

The **'table columns CONTAIN this_column_does_not_exist'** constraint now passes, since the column now exists, but

**'table columns EQ {'str1', 'col2'}'** now fails, because a new column was logged.


In [56]:
# copy the original column set (so columns_set itself is left unchanged) and add the new column name
set2 = set(columns_set)
set2.add("this_column_does_not_exist")

columns_match3 = columnsMatchSetConstraint(set2) # new constraint containing the new column

# passing constraints explicitly applies only those, instead of the ones the logger was created with
report3 = logger.profile.apply_table_shape_constraints(SummaryConstraints([columns_match3])) # applying only the new constraint
format_report(report3)

After adding the new column to **'set2'** and creating a **columnsMatchSetConstraint** with it, now it doesn't fail

In [57]:
# log one more row: for every column of df, reuse the value stored at index label 10
log_dict = {column: df[column][10] for column in df.columns}
logger.log(log_dict)

In [58]:
# re-apply the original table shape constraints after the extra row has been logged
report4 = logger.profile.apply_table_shape_constraints()
format_report(report4)

**'table total_row_number EQ 14'** now fails, since a new row was logged.

In [59]:
# new numberOfRowsConstraint: the expected count is the number of rows in df plus one
rows_3 = numberOfRowsConstraint(n_rows=len(df.index) + 1)  # new numberOfRowsConstraint
# apply only the new row-count constraint to the logger's current profile
report5 = logger.profile.apply_table_shape_constraints(SummaryConstraints([rows_3]))
format_report(report5)

Creating a new **numberOfRowsConstraint** with n_rows = previous_n_rows + 1 and applying it, now does not fail.

In [60]:
profile = logger.close()  # closing the logger and getting the DatasetProfile
# print the total row number recorded in the returned profile
print (profile.total_row_number)

15


## Multi column constraints
### Logical operations between values of the specified columns

In [61]:
from whylogs.core.statistics.constraints import columnValuesAGreaterThanBConstraint, columnValuesAEqualBConstraint

# two numeric columns with 4 rows each; the constraints below compare them row by row
df = pd.DataFrame({"col1": [4, 5, 6, 7], "col2": [0, 1, 6, 15]})

# col1 > col2 holds for the first two rows only; col1 == col2 holds for the third row only
a_gt_b = columnValuesAGreaterThanBConstraint(column_A="col1", column_B="col2")
a_eq_b = columnValuesAEqualBConstraint(column_A="col1", column_B="col2")

# multi column constraints are supplied through the multi_column_value_constraints argument
dc = DatasetConstraints(None, multi_column_value_constraints=[a_gt_b, a_eq_b])

profile = session.log_dataframe(df, "test4.data", constraints=dc)

# the report is read from the DatasetConstraints object itself, not from the profile
format_report(dc.report())

The comparison is made value by value. For col1 values > col2 values, only 2 of the 4 rows pass, and for col1 values == col2 values only 1 row passes (the third elements of the two columns are equal).

### Sum of row values of multiple columns equals some value, or some column value

In [62]:
from whylogs.core.statistics.constraints import sumOfRowValuesOfMultipleColumnsEqualsConstraint

In [63]:
# expense breakdown in percent, one row per record;
# the first two rows sum to 100 across the four columns, the last two rows do not
total_expences = pd.DataFrame({
    "employees %": [25, 45, 15, 3],
    "equipment %": [10, 12, 4, 9],
    "materials %": [40, 35, 45, 55],
    "other %": [25, 8, 4, 6]
})

In [64]:
# check that, for each row, the expense percentages of all four parts sum to 100 %
sum_of_row_values_eq_100 = sumOfRowValuesOfMultipleColumnsEqualsConstraint(
    columns=["employees %", "equipment %", "materials %", "other %"],
    value=100
)

dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_100])

# the multi column value constraints do not need to be applied explicitly after logging;
# they are applied at the time of logging
profile = session.log_dataframe(total_expences, "test.data", constraints=dc)

# we expect 2 of the 4 rows to be failures since the last two rows do not sum to 100
format_report(dc.report())

In [65]:
# check if the sum of the row values (percentages) for 'equipment %' and 'materials %' equals the value of 'other %'
# (here `value` is a column name rather than a number)
sum_of_row_values_eq_other = sumOfRowValuesOfMultipleColumnsEqualsConstraint(
    columns=["equipment %", "materials %"],
    value='other %'
)

dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_other])
profile = session.log_dataframe(total_expences, "test.data", constraints=dc)

# we expect all rows to be failures since the sum of 'equipment %' and 'materials %' is not equal to the value of the column 'other %'
format_report(dc.report())

### Column Pair Values in Set

Check if the values of a pair of columns are in a predefined set of pair values.

In [66]:
from whylogs.core.statistics.constraints import columnPairValuesInSetConstraint

In [67]:
# five products, each with a grade and a subgrade; the pair in the second row is ("A", "A3")
product_grades = pd.DataFrame({
    "product": ["ProductA", "ProductB", "ProductC", "ProductD", "ProductE"],
    "grade": ["A", "A", "B", "C", "C"],
    "subgrade": ["A1", "A3", "B2", "C2", "C2"]
})

In [68]:
# we want to check if each (grade, subgrade) pair is in the set of allowed pairs;
# value_set holds (column_A value, column_B value) tuples
grade_subgrade_pairs_in_set = columnPairValuesInSetConstraint(
    column_A="grade", 
    column_B="subgrade",
    value_set = {("A", "A1"), ("A", "A2"), ("B", "B1"), ("B", "B2"), ("C", "C1"), ("C", "C2")}
)

dc = DatasetConstraints(None, multi_column_value_constraints=[grade_subgrade_pairs_in_set])
profile = session.log_dataframe(product_grades, "test.data", constraints=dc)

# we expect 1 out of 5 pairs to be a failure, specifically ("A", "A3"), which is not in value_set
format_report(dc.report())

### Column Values Unique within Row

Check if the value of the specified column is unique within each row.

In [69]:
from whylogs.core.statistics.constraints import columnValuesUniqueWithinRow

In [70]:
# four users with string and numeric fields; note that the second user's username is identical to their email
users = pd.DataFrame({
    "first_name": ["John", "Jane", "Bob", "Anna"],
    "last_name": ["Doe", "Doe", "Smith", "Jones"],
    "username": ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"],
    "email": ["john.doe@example.com", "jane.doe@example.com", "bob.smith@example.com", "anna_jones@example.com"],
    "followers": [1525, 12268, 51343, 867],
    "points": [23.4, 123.2, 432.22, 32.1],
})

In [71]:
# check that, within each row, the email value does not also appear in any other field of that row;
# suppose we do not want to accept a username which is the same as the user's email
email_values_unique_within_row = columnValuesUniqueWithinRow(column_A="email")

dc = DatasetConstraints(None, multi_column_value_constraints=[email_values_unique_within_row])
profile = session.log_dataframe(users, "test.data", constraints=dc)

# we expect 1 out of 4 evaluations of the constraint to be a failure, since Jane Doe's email is the same as their username
format_report(dc.report())

# Generate default constraints for data set

Let's log the users data frame from the previous example, without any constraints. We will use WhyLogs' **generate_constraints** method to generate default constraints using the dataset profile.

In [72]:
# log the users data frame without passing any constraints
profile = session.log_dataframe(users, "test.data")

In [73]:
# generate default constraints from the logged profile
auto_constraints = profile.generate_constraints()
# convert the generated constraints to their protobuf message and print it as JSON
print(message_to_json(auto_constraints.to_protobuf()))

{
  "properties": {
    "schemaMajorVersion": 1,
    "schemaMinorVersion": 2,
    "sessionId": "6c9a7d7a-d50c-4e20-9be7-0757caf192cb",
    "sessionTimestamp": "1645457691410",
    "dataTimestamp": "1645454092967",
    "tags": {
      "name": "test.data"
    },
    "metadata": {}
  },
  "summaryConstraints": {
    "email": {
      "constraints": [
        {
          "name": "type of the column values is STRING",
          "firstField": "column_values_type",
          "value": 5.0,
          "op": "EQ",
          "verbose": false,
          "quantileValue": 0.0
        },
        {
          "name": "number of unique values is between 3 and 5",
          "firstField": "unique_count",
          "op": "BTWN",
          "between": {
            "lowerValue": 3.0,
            "upperValue": 5.0
          },
          "verbose": false,
          "quantileValue": 0.0
        },
        {
          "name": "most common value is in {'john.doe@example.com', 'bob.smith@example.com', 'jane.doe@exam

For the columns with inferred type STRING, the **generate_constraints** method generates 3 types of constraints: **columnValuesTypeEqualsConstraint**, where the type is STRING; **columnUniqueValueCountBetweenConstraint**, which requires the number of unique values in the column to be between unique_count - 1 and unique_count + 1, where unique_count is the number of unique values in the current data frame; and finally **columnMostCommonValueInSetConstraint**, which takes a set of the 5 most common values and defines a constraint that the most common value in this column should be in that set.

The columns which have inferred type FRACTIONAL or INTEGRAL, such as 'points' and 'followers' respectively, have numeric constraints generated such as minimum value greater than 0, maximum value less than 0, mean in range [mean - stddev, mean + stddev], if these constraints apply to the current column. Apart from these constraints, **columnValuesTypeEqualsConstraint** and **columnMostCommonValueInSetConstraint** are generated for both types. **columnUniqueValueCountBetweenConstraint** is generated only for the INTEGRAL valued columns.

No constraints are generated for columns which have an inferred type of NULL.