### There are dedicated functions for the most common constraints. Use ValueConstraint and SummaryConstraint directly only when you need a custom constraint that none of these functions provides.

In [1]:
from whylogs import get_or_create_session
from whylogs.util.protobuf import message_to_json

# create session
session = get_or_create_session()

WARN: Missing config


In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
from tabulate import tabulate

def indent(txt, spaces=4):
    """Return ``txt`` with each of its lines prefixed by ``spaces`` blanks.

    Lines are split with ``str.splitlines`` and re-joined with ``"\\n"``, so a
    trailing newline in ``txt`` is not preserved.
    """
    padding = " " * spaces
    shifted = [padding + line for line in txt.splitlines()]
    return "\n".join(shifted)

def format_report(r):
    """Print a constraint report, grouped by constraint kind.

    ``r`` is a list of report entries. Three shapes are recognised:

    * entries of length 2, unpacked as ``(feature_name, rows)``, are printed
      under "Constraint failures by feature";
    * other entries whose first item starts with ``"table"`` are printed
      under "Table shape constraint failures";
    * other entries whose first item starts with ``"multi column"`` are
      printed under "Multi column constraint failures".

    Every row is read as ``(test_name, total_run, failed)``.

    When a group contains a test name longer than 80 characters, every row of
    that group is printed vertically (one field per line) instead of as a
    table. Previously only the first row was inspected and printed in that
    case, so the remaining rows were silently dropped.
    """
    headers = ['test_name', 'total_run', 'failed']

    def _print_rows(rows):
        # Very long test names (e.g. regex patterns) make a table unreadable,
        # so fall back to a vertical layout for the whole group.
        if any(len(row[0]) > 80 for row in rows):
            for row in rows:
                print(f"\ntest_name:\t{row[0]}\n")
                print(f"total_run:\t{row[1]}\n")
                print(f"failed:\t\t{row[2]}\n")
        else:
            print(indent(tabulate(rows, tablefmt="plain", headers=headers)))

    # single column constraints: (feature_name, rows)
    by_feature = [entry for entry in r if len(entry) == 2]
    # table shape and multi column constraints are flat rows
    table_shape = [entry for entry in r if len(entry) != 2 and entry[0].startswith("table")]
    multi_column = [entry for entry in r if len(entry) != 2 and entry[0].startswith("multi column")]

    if by_feature:
        print("Constraint failures by feature - ")
    for feature, rows in by_feature:
        print(f"{feature}:")
        _print_rows(rows)

    if table_shape:
        print()
        print("Table shape constraint failures -")
        print(indent(tabulate(table_shape, tablefmt="plain", headers=headers)))

    if multi_column:
        print()
        print("Multi column constraint failures -")
        _print_rows(multi_column)
        

## Between summary constraints on summary fields like: stddev, min, max, mean...

In [4]:
from whylogs.core.statistics.constraints import (
    maxBetweenConstraint,
    maxLessThanEqualConstraint,
    meanBetweenConstraint,
    minBetweenConstraint,
    minGreaterThanEqualConstraint,
    stddevBetweenConstraint,
    stringLengthBetweenConstraint,
    stringLengthEqualConstraint,
    quantileBetweenConstraint,
    DatasetConstraints,
    SummaryConstraints,
)

In [133]:
# define the specific types of constraints
# the ranges of the between constraints include the bounding values

# check if the maximum value of the column is in the range [5, 10.8]
max_between_values = maxBetweenConstraint(lower_value=5, upper_value=10.8) 
# check if the maximum value of the column is less than or equal to 100
max_less_than_equal_value = maxLessThanEqualConstraint(value=100)
# check if the mean of the column is in the range [1.2, 1.6]
mean_between_values = meanBetweenConstraint(lower_value=1.2, upper_value=1.6)
# check if the minimum value of the column is in the range [0.1, 0.5]
min_between_values = minBetweenConstraint(lower_value=0.1, upper_value=0.5)
# check if the minimum value of the column is greater than or equal to 1
min_greater_than_equal_value = minGreaterThanEqualConstraint(value=1)
# check if the standard deviation of the column is in the range [2.3, 5.4]
stddev_between_values = stddevBetweenConstraint(lower_value=2.3, upper_value=5.4)
# check if the 0.15 quantile value is in the range [2, 4.3]
quantile_between_values = quantileBetweenConstraint(quantile_value = 0.15, lower_value=2, upper_value=4.3) 

# example data frame with columns "col1", "col2", "col3"
# you can also read an existing data set using pandas, or as a numpy array
df = pd.DataFrame({
    "col1": [4, 5, 6, 7],
    "col2": [0, 1, 2, 3],
    "col3": [50, 60, 80, 110]
})

# bind each list of summary constraints to the dataframe column it should check:
#   "col1": max between, stddev between, min greater-than-or-equal
#   "col2": mean between, min between, quantile between
#   "col3": max less-than-or-equal
# you can add multiple summary constraints for each column
dc = DatasetConstraints(None, summary_constraints={
    "col1": [max_between_values, stddev_between_values, min_greater_than_equal_value], 
    "col2": [mean_between_values, min_between_values, quantile_between_values],
    "col3": [max_less_than_equal_value]
})  

# logging the dataframe creates a profile with summary statistics for the data set
# the data set profile contains column profiles with summary statistics for each column present in the data set
profile = session.log_dataframe(df, "test.data", constraints=dc)

# serialize the DatasetConstraints to JSON and pretty-print the constraints of each column
dc_json = json.loads(dc.to_json())
col1_constraints = json.dumps(dc_json['summaryConstraints']['col1']['constraints'], indent=4)
col2_constraints = json.dumps(dc_json['summaryConstraints']['col2']['constraints'], indent=4)
col3_constraints = json.dumps(dc_json['summaryConstraints']['col3']['constraints'], indent=4)

print(f"Constraints for column 'col1': \n{col1_constraints}\n")
print(f"Constraints for column 'col2': \n{col2_constraints}\n")
print(f"Constraints for column 'col3': \n{col3_constraints}\n")

Constraints for column 'col1': 
[
    {
        "name": "summary max BTWN 5 and 10.8",
        "firstField": "max",
        "op": "BTWN",
        "between": {
            "lowerValue": 5.0,
            "upperValue": 10.8
        },
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "summary stddev BTWN 2.3 and 5.4",
        "firstField": "stddev",
        "op": "BTWN",
        "between": {
            "lowerValue": 2.3,
            "upperValue": 5.4
        },
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "summary min GE 1/None",
        "firstField": "min",
        "value": 1.0,
        "op": "GE",
        "verbose": false,
        "quantileValue": 0.0
    }
]

Constraints for column 'col2': 
[
    {
        "name": "summary mean BTWN 1.2 and 1.6",
        "firstField": "mean",
        "op": "BTWN",
        "between": {
            "lowerValue": 1.2,
            "upperValue": 1.6
        },
        "verbose": false

#### Summary constraints are applied with apply_summary_constraints on the DatasetProfile.

In [136]:
# summary constraints must be applied on the dataset profile, after logging the dataframe
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
col1:
    test_name                          total_run    failed
    summary max BTWN 5 and 10.8                1         0
    summary stddev BTWN 2.3 and 5.4            1         1
    summary min GE 1/None                      1         0
col2:
    test_name                               total_run    failed
    summary mean BTWN 1.2 and 1.6                   1         0
    summary min BTWN 0.1 and 0.5                    1         1
    summary quantile 0.15 BTWN 2 and 4.3            1         1
col3:
    test_name                  total_run    failed
    summary max LE 100/None            1         1


As we can see **mean BTWN** passes and the **stddev BTWN** fails as they should.

## Summary constraints for distinct, unique and most common values in a column

### Distinct values in a column

In [6]:
from whylogs.core.statistics.constraints import (
    distinctValuesInSetConstraint, distinctValuesEqualSetConstraint, distinctValuesContainSetConstraint )

In [137]:
# NOTE(review): semantics below are inferred from the constraint names and the recorded report — confirm against the whylogs docs
# the distinct values of the column should all be members of {1, ..., 9}
in_set = distinctValuesInSetConstraint(reference_set=set(range(1, 10)))
# the distinct values of the column should be exactly the reference set
# (the set literal {'a', 'a', 'a'} collapses to {'a'})
eq_set = distinctValuesEqualSetConstraint(reference_set={'a', 'a', 'a'})
# the distinct values of the column should contain both 0 and 1
contain_set = distinctValuesContainSetConstraint(reference_set={0, 1})

#### Applying summary constraints sent as an argument to apply_summary_constraints function on the same profile as before!

In [138]:
report = profile.apply_summary_constraints({'col1': SummaryConstraints([in_set, eq_set]), 
                                           'col2': SummaryConstraints([contain_set])})
format_report(report)

Constraint failures by feature - 
col1:
    test_name                                                            total_run    failed
    summary distinct_column_values IN_SET {1, 2, 3, 4, 5, 6, 7, 8, 9}            1         0
    summary distinct_column_values EQ_SET {'a'}                                  1         1
col2:
    test_name                                            total_run    failed
    summary distinct_column_values CONTAIN_SET {0, 1}            1         0


### Unique column value count and proportion constraints

In [4]:
from whylogs.core.statistics.constraints import (
    columnUniqueValueCountBetweenConstraint,
    columnUniqueValueProportionBetweenConstraint,
)

In [11]:
# create a data set with customers, the country they live in, and their spending
customer_data = pd.DataFrame({
    "customer": ["c1", "c2", "c3", "c4", "c5", "c6"],
    "country": ["Germany", "Italy", "Germany", "USA", "Germany", "UK"],
    "spending": [1200, 500, 700, 1500, 300, None]
})

In [12]:
# check if there are between 1 and 5 unique values in the specific column
unique_value_count_between = columnUniqueValueCountBetweenConstraint(lower_value=1, upper_value=5)
# check if the proportion of unique values int he set is between 0.3 and 0.4 inclusive
unique_value_proportion_between = columnUniqueValueProportionBetweenConstraint(lower_fraction=0.3, upper_fraction=0.45)
dc = DatasetConstraints(None, summary_constraints={"country": [unique_value_count_between, unique_value_proportion_between]})

# log the customer_data dataframe to obrain the profile
profile = session.log_dataframe(customer_data, 'test2.data', constraints=dc)
# summary constraints must be applied on the profile after the data set has been logged
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
country:
    test_name                                      total_run    failed
    summary unique_count BTWN 1 and 5                      1         0
    summary unique_proportion BTWN 0.3 and 0.45            1         1


### Column most common value in set constraint

In [7]:
from whylogs.core.statistics.constraints import columnMostCommonValueInSetConstraint

In [8]:
# check if the most common value in the column is in the set {"Germany", "Italy"}
most_common_value_in_set = columnMostCommonValueInSetConstraint(value_set={"Germany", "Italy"})
# bind the constraint to the column named "country"
summary_constraint = {"country": [most_common_value_in_set]}
# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(summary_constraint)
format_report(report)

Constraint failures by feature - 
country:
    test_name                                            total_run    failed
    summary most_common_value IN {'Germany', 'Italy'}            1         0


### Column values not null

In [9]:
from whylogs.core.statistics.constraints import columnValuesNotNullConstraint

In [12]:
# check if all values in the column are non-null
customer_value_not_null = columnValuesNotNullConstraint()
spending_value_not_null = columnValuesNotNullConstraint()
# bind the constraint to the column, there are no null values in the customer column, but there is one in the spending column
summary_constraint = {"customer": [customer_value_not_null], "spending": [spending_value_not_null]}
# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(summary_constraint)

format_report(report)

Constraint failures by feature - 
customer:
    test_name                       total_run    failed
    summary null_count EQ 0/None            1         0
spending:
    test_name                       total_run    failed
    summary null_count EQ 0/None            1         1


### Column value type equals or is in set constraint

In [8]:
from whylogs.core.statistics.constraints import (
    columnValuesTypeEqualsConstraint,
    columnValuesTypeInSetConstraint
)
from whylogs.proto import InferredType

In [10]:
# check if the values of the specified column are of type string
column_values_type_equals_string = columnValuesTypeEqualsConstraint(expected_type=InferredType.Type.STRING)
# check if the values of the specified column are either fractional or integral numbers
type_set = {InferredType.Type.FRACTIONAL, InferredType.Type.INTEGRAL}
column_value_types_in_set = columnValuesTypeInSetConstraint(type_set=type_set, verbose=True)

column_type_summary_constraint = {
    "country": [column_values_type_equals_string],
    "spending": [column_value_types_in_set]
}

# apply the summary constraints on the same profile for the customer_data data set
report = profile.apply_summary_constraints(column_type_summary_constraint)
# should not have failures since the country column type is string, and the spending column contains numbers
format_report(report)

Constraint failures by feature - 
country:
    test_name                               total_run    failed
    summary column_values_type EQ STRING            1         0
spending:
    test_name                                                   total_run    failed
    summary column_values_type IN {'FRACTIONAL', 'INTEGRAL'}            1         0


# Column values in set

In [22]:
from whylogs.core.statistics.constraints import columnValuesInSetConstraint

In [29]:
# example data set with one grade per student; '/' stands for an unknown grade
student_grades = pd.DataFrame({
    'student_id': [1, 5, 15, 16, 22],
    'grade': ['C', 'C', 'A', '/', 'B']
})

val_set = {'A', 'B', 'C', 'E', 'F'}  # valid grades
column_values_in_set = columnValuesInSetConstraint(value_set=val_set)

# this is a value constraint (checked per value), so it goes in value_constraints
dc = DatasetConstraints(None, value_constraints={
    "grade": [column_values_in_set], 
})

# the value constraints are applied at the time of logging the dataframe
profile = session.log_dataframe(student_grades, "test.data", constraints=dc)

# out of the five students' grades we expect to see one failure for the '/' unknown grade
# the total number of runs of the constraint should equal the number of values in the column
format_report(dc.report())

Constraint failures by feature - 
grade:
    test_name                             total_run    failed
    value IN {'A', 'B', 'E', 'C', 'F'}            5         1


# Regex matching constraints

### String length value constraints using regex

In [90]:
from whylogs.core.statistics.constraints import stringLengthEqualConstraint, stringLengthBetweenConstraint
# example strings of different lengths, including whitespace and special characters
df = pd.DataFrame(
    [
        {"str1": "length7"},
        {"str1": "length_8"},
        {"str1": "length__9"},
        {"str1": "a       10"},
        {"str1": "11        b"},
        {"str1": '(*&^%^&*(24!@_+>:|}?><"\\'},
        {"str1": "1b34567"},
    ]
)
# check if the string value is exactly 7 characters long
length_constraint7 = stringLengthEqualConstraint(length=7)
# check if the string value is between 7 and 10 characters long
length_constraint7to10 = stringLengthBetweenConstraint(lower_value=7, upper_value=10)
length_constraints = [length_constraint7, length_constraint7to10]
# both are value constraints, checked against every value of "str1" while logging
dc = DatasetConstraints(None, value_constraints={"str1": length_constraints})

profile = session.log_dataframe(df, 'test2.data', constraints=dc)
# the report names these constraints by the regex they match against (see the output below)
format_report(dc.report())

Constraint failures by feature - 
str1:
    test_name                total_run    failed
    value MATCH ^.{7}$               7         5
    value MATCH ^.{7,10}$            7         2


### Email matching constraint

In [60]:
from whylogs.core.statistics.constraints import containsEmailConstraint

In [108]:
# example email addresses, annotated with whether the default email regex should accept them
customer_emails = pd.DataFrame([
    {"email": r"abc's@gmail.com"},  # valid
    {"email": r'"aVrrR Test \@"@gmail.com'},  # valid (if wrapped in quotes, emails can contain special characters)
    {"email": r"abc..q12@example.us"},  # invalid (two consecutive dots)
    {"email": r'"sdsss\d"@gmail.com'},  # valid
    {"email": r"customer/department=shipping?@example-another.some-other.us"},  # valid
    {"email": r".should_fail@yahoo.com"},  # invalid (must not start with a dot)
    {"email": r"some.@a.com"},  # invalid (must not contain a dot directly before the @ symbol)
    {"email": r"abs@yahoo."},  # invalid (must not end with a dot)
])

# use the predefined email regex from whylogs
default_contains_email_constraint = containsEmailConstraint()

dc = DatasetConstraints(None, value_constraints={"email": [default_contains_email_constraint]})

profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)
# we expect 4 of the 8 runs to be failures (the four addresses marked invalid above)
format_report(dc.report())

Constraint failures by feature - 
email:

test_name:	value MATCH ^(?i)(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)$

total_run:	8

failed:		4



In [111]:
# you can provide your own email regex and check the values against it
custom_contains_email_constraint = containsEmailConstraint(regex_pattern = r"\S+@\S+")
dc = DatasetConstraints(None, value_constraints={"email": [custom_contains_email_constraint]})

profile = session.log_dataframe(customer_emails, 'test.data', constraints=dc)
# now we expect 1 of the 8 runs to be failures, the email that contains white spaces
format_report(dc.report())
# running the containsEmailConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
email:
    test_name              total_run    failed
    value MATCH \S+@\S+            8         1


### Credit Card matching constraint

In [112]:
from whylogs.core.statistics.constraints import containsCreditCardConstraint

In [118]:
credit_cards = pd.DataFrame(
    [
        {"credit_card": "3714-496353-98431"},  # amex
        {"credit_card": "3787 344936 71000"},  # amex
        {"credit_card": "3056 930902 5904"},  # diners club
        {"credit_card": "3065 133242 2899"},  # invalid
        {"credit_card": "3852-000002-3237"},  # diners club
        {"credit_card": "6011 1111 1111 1117"},  # discover
        {"credit_card": "6011-0009-9013-9424"},  # discover
        {"credit_card": "3530 1113 3330 0000"},  # jcb
        {"credit_card": "3566-0020-2036-0505"},  # jcb
        {"credit_card": "5555 5555 5555 4444"},  # master card
        {"credit_card": "5105 1051 0510 5100"},  # master card
        {"credit_card": "4111 1111 1111 1111"},  # visa
        {"credit_card": "4012 8888 8888 1881"},  # visa
        {"credit_card": "4222-2222-2222-2222"},  # visa
        {"credit_card": "1111-1111-1111-1111"},  # invalid
        {"credit_card": "a4111 1111 1111 1111b"},  # invalid
        {"credit_card": "4111111111111111"},  # visa
        {"credit_card": 12345},  # invalid
        {"credit_card": "absfcvs"},  # invalid
    ]
)

default_credit_card_constraint = containsCreditCardConstraint()
dc = DatasetConstraints(None, value_constraints={"credit_card": [default_credit_card_constraint]})

profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)
# now we expect 5 of the 19 runs to be failures, the invalid credit cards
format_report(dc.report())

Constraint failures by feature - 
credit_card:

test_name:	value MATCH ^(?:(4[0-9]{3}([\s-]?[0-9]{4}){2}[\s-]?[0-9]{1,4})|(?:(5[1-5][0-9]{2}([\s-]?[0-9]{4}){3}))|(?:(6(?:011|5[0-9]{2})([\s-]?[0-9]{4}){3}))|(?:(3[47][0-9]{2}[\s-]?[0-9]{6}[\s-]?[0-9]{5}))|(?:(3(?:0[0-5]|[68][0-9])[0-9][\s-]?[0-9]{6}[\s-]?[0-9]{4}))|(?:2131|1800|35[0-9]{2,3}([\s-]?[0-9]{4}){3}))$

total_run:	19

failed:		5



In [120]:
# you can provide your own credit card regex and check the values against it
custom_credit_card_constraint = containsCreditCardConstraint(regex_pattern = r"^(?:[0-9]{4}[\s-]?){3,4}$")
dc = DatasetConstraints(None, value_constraints={"credit_card": [custom_credit_card_constraint]})

profile = session.log_dataframe(credit_cards, 'test.data', constraints=dc)
# now more valid credit cards are being reported as failures
format_report(dc.report())
# running the containsCreditCardConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
credit_card:
    test_name                                total_run    failed
    value MATCH ^(?:[0-9]{4}[\s-]?){3,4}$           19         8


### SSN regex matching constraint

In [121]:
from whylogs.core.statistics.constraints import containsSSNConstraint

In [123]:
ssn_data = pd.DataFrame([
    {"ssn": "123-01-2335"},  # valid
    {"ssn": "039780012"},  # valid
    {"ssn": "000231324"},  # invalid
    {"ssn": "666781132"},  # invalid
    {"ssn": "926-89-1234"},  # invalid
    {"ssn": "001-01-0001"},  # valid
    {"ssn": "122 23 0001"},  # valid
    {"ssn": "1234-12-123"},  # invalid
])

default_ssn_constraint = containsSSNConstraint()

dc = DatasetConstraints(None, value_constraints={"ssn": [default_ssn_constraint]})

profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)
# now we expect 4 of the 8 runs to be failures, the invalid ssn numbers
format_report(dc.report())

Constraint failures by feature - 
ssn:

test_name:	value MATCH ^(?!000|666|9[0-9]{2})[0-9]{3}[\s-]?(?!00)[0-9]{2}[\s-]?(?!0000)[0-9]{4}$

total_run:	8

failed:		4



In [125]:
# you can provide your own ssn regex and check the values against it
custom_ssn_constraint = containsSSNConstraint(regex_pattern = r"^[0-9]{3}-[0-9]{2}-[0-9]{4}$")
dc = DatasetConstraints(None, value_constraints={"ssn": [custom_ssn_constraint]})

profile = session.log_dataframe(ssn_data, 'test.data', constraints=dc)
# now more valid ssn numbers are being reported as failures
format_report(dc.report())
# running the containsSSNConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
ssn:
    test_name                                   total_run    failed
    value MATCH ^[0-9]{3}-[0-9]{2}-[0-9]{4}$            8         5


### URL regex matching constraint

In [127]:
from whylogs.core.statistics.constraints import containsURLConstraint

In [129]:
web_urls = pd.DataFrame([
    {"url": "http://www.example.com"},  # valid
    {"url": "abc.test.com"},  # valid (without protocol)
    {"url": "abc.w23w.asb#abc?a=2"},  # valid (without protocol)
    {"url": "https://ab.abc.bc"},  # valid
    {"url": "a.b.c"},  # valid
    {"url": "abcd"},  # invalid
    {"url": "123.w23.235"},  # valid
    {"url": "asf://saf.we.12"},  # invalid
    {"url": "12345"},  # invalid
    {"url": "1.2"},  # invalid
        
])

default_url_constraint = containsURLConstraint()
dc = DatasetConstraints(None, value_constraints={"url": [default_url_constraint]})

profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)
# now we expect the 4 invalid urls, out of the 10 in total, to be reported as failures
format_report(dc.report())

Constraint failures by feature - 
url:

test_name:	value MATCH ^(?:http(s)?:\/\/)?((www)|(?:[a-zA-z0-9-]+)\.)(?:[-a-zA-Z0-9@:%._\+~#=]{1,256}\.(?:[a-zA-Z0-9]{1,6})\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*))$

total_run:	10

failed:		4



In [131]:
# you can provide your own URL regex and check the values against it
# this pattern requires an explicit http:// or https:// prefix
custom_url_constraint = containsURLConstraint(regex_pattern = r"^http(s)?:\/\/(www\.)?.+\..+$")
dc = DatasetConstraints(None, value_constraints={"url": [custom_url_constraint]})

profile = session.log_dataframe(web_urls, 'test.data', constraints=dc)
# with the new regex more valid urls are being reported as failures
format_report(dc.report())
# running the containsURLConstraint with your own regex pattern may cause slow evaluation



Constraint failures by feature - 
url:
    test_name                                    total_run    failed
    value MATCH ^http(s)?:\/\/(www\.)?.+\..+$           10         8


# Datetime/json constraints

In [10]:
from whylogs.core.statistics.constraints import (
    dateUtilParseableConstraint, jsonParseableConstraint, matchesJsonSchemaConstraint, strftimeFormatConstraint )
df = pd.DataFrame(
        [
            {"str1": "1990-12-1"},  # dateutil valid; strftime valid
            {"str1": "1990/12/1"},
            {"str1": "today is 2019-03-27"},  # dateutil invalid
            {"str1": "Monday at 12:01am"},
            {"str1": "xyz_not_a_date"},  # dateutil invalid
            {"str1": "yesterday"},  # dateutil invalid
            {"str1": {"name": "s", "w2w2": "dgsg", "years": 232, "abc": 1}},  # schema valid
            {"str1": {"name": "s", "w2w2": "dgsg", "years": 232}},  # schema invalid
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": 232, "abc": 1})},  # json valid, schema valid
            {"str1": json.dumps({"name": "s", "w2w2": "dgsg", "years": "232", "abc": 1})},  # json valid
            {"str1": "random str : fail everything"},
            {"str1": "2003-12-23"},  # strftime valid, dateutil valid
            {"str1": "2003-15-23"},  # strftime invalid, dateutil invalid
            {"str1": "10-12-32"},  # strftime invalid, dateutil valid
        ]
    )

dateutil_parseable = dateUtilParseableConstraint()
json_parseable = jsonParseableConstraint()

json_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "years": {"type": "integer"},
        },
        "required": ["name", "abc"],
    }
matches_json_schema = matchesJsonSchemaConstraint(json_schema=json_schema)

is_strftime = strftimeFormatConstraint(format="%Y-%m-%d")

apply_func_constraints = [dateutil_parseable, json_parseable, matches_json_schema, is_strftime]


dc = DatasetConstraints(None, value_constraints={"str1": apply_func_constraints})
profile = session.log_dataframe(df, 'test3.data', constraints=dc)

format_report(dc.report())

Constraint failures by feature - 
str1:
    test_name                                      total_run    failed
    value APPLY_FUNC _try_parse_dateutil                  14         9
    value APPLY_FUNC _try_parse_json                      14        12
    value APPLY_FUNC _matches_json_schema                 14        12
    value APPLY_FUNC _try_parse_strftime_format           14        12


Seeing the comments above, when creating the dataset, we can realize which values fail or pass, for which constraint. The dateutil constraint has 5 passing values in the dataset, and the other 3 constraints have only 2 values that pass from total of 14.

# Entropy and Distributional Measures

### Entropy

Check if the column entropy is in some interval [a, b]. Works for both discrete and continuous valued columns.

In [5]:
from whylogs.core.statistics.constraints import approximateEntropyBetweenConstraint

#### Entropy on categorical data

In [6]:
# draw 50 pets from a skewed categorical distribution
# the generator is seeded so that Restart & Run All reproduces the same sample
rng = np.random.default_rng(42)
pets = rng.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.3, 0.1, 0.2, 0.4])
pet_df = pd.DataFrame({
    "pet": pets
})

In [19]:
# check if the entropy of the pet_df 'pet' column is between 0.7 and 1.9 (the actual value is 1.85)
entropy_between_values_constraint = approximateEntropyBetweenConstraint(lower_value=0.7, upper_value=1.9)

dc = DatasetConstraints(None, summary_constraints={"pet": [entropy_between_values_constraint]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to complete without failures
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                           total_run    failed
    summary entropy BTWN 0.7 and 1.9            1         0


#### Entropy on continuous data

In [26]:
# sample 100 data points from a normal distribution with mean 30000 and standard deviation 15000 to represent sales values
# the generator is seeded so that Restart & Run All reproduces the same sample
rng = np.random.default_rng(2019)
sales = rng.normal(loc=30000, scale=15000, size=100)

sales_df = pd.DataFrame({
    "sales": sales
})

In [50]:
# check if the entropy of the sales_df 'sales' column is between 2.3 and 3.5
entropy_between_values_constraint_cont = approximateEntropyBetweenConstraint(lower_value=2.3, upper_value=3.5)

dc = DatasetConstraints(None, summary_constraints={"sales": [entropy_between_values_constraint_cont]})

profile = session.log_dataframe(sales_df, 'test.data', constraints=dc)
# now we expect the constraint to fail, since the entropy was between 3.8 and 3.9 in the recorded run
# NOTE(review): the exact entropy depends on the random sample in sales_df — confirm when re-running
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                           total_run    failed
    summary entropy BTWN 2.3 and 3.5            1         1


### KS Test

The KS Test can only be executed on continuous data.

In [8]:
from whylogs.core.statistics.constraints import parametrizedKSTestPValueGreaterThanConstraint

In [11]:
# the generator is seeded so that Restart & Run All reproduces the same samples
rng = np.random.default_rng(2021)
# this would be the reference distribution, sales 2020
sales_2020 = rng.normal(loc=30000, scale=15000, size=100)
# this would be the target distribution, sales 2021
sales_2021 = rng.normal(loc=45000, scale=10000, size=100)
# we want to check if the sales in 2020 have the same distribution as the sales in 2021

In [12]:
# the target data set: the 2021 sales as a single "sales" column
sales_2021_df = pd.DataFrame({
    "sales": sales_2021
})

# check if the p-value of the ks test for the reference distribution sales_2020 is greater than 0.05
# if so, we do not reject the null hypothesis
ks_test_p_value_greater_than = parametrizedKSTestPValueGreaterThanConstraint(reference_distribution=sales_2020, p_value=0.05)

dc = DatasetConstraints(None, summary_constraints={"sales": [ks_test_p_value_greater_than]})

profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)
# now we expect the constraint to fail, since sales_2021 was drawn with a different mean and
# standard deviation than the sales_2020 reference
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                          total_run    failed
    summary ks_test p-value GT 0.05            1         1


The p-value is less than 0.05, which means we can reject the null hypothesis that both samples come from the same distribution at the 0.05 significance level.

### KL Divergence

The KL Divergence constraint is supported for both discrete and continuous variables.

In [13]:
from whylogs.core.statistics.constraints import columnKLDivergenceLessThanConstraint

#### KL Divergence for continuous case

In [14]:
# check if the kl divergence with respect to the sales_2020 reference is less than the threshold 0.6
# NOTE(review): the variable is named "greater_than", but the constraint is a less-than check
kl_divergence_greater_than = columnKLDivergenceLessThanConstraint(reference_distribution=sales_2020, threshold=0.6)

dc = DatasetConstraints(None, summary_constraints={"sales": [kl_divergence_greater_than]})

profile = session.log_dataframe(sales_2021_df, 'test.data', constraints=dc)
# now we expect the constraint to fail
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
sales:
    test_name                                 total_run    failed
    summary kl_divergence threshold LT 0.6            1         1


  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))
  kl_divergence = np.sum(np.where(pmf_target != 0, pmf_target * np.log(pmf_target / pmf_reference), 0))


The distribution of sales in 2020 cannot be encoded with the distribution of sales in 2021.

#### KL Divergence for discrete case

In [18]:
# create a new distribution from the pets sample with different probabilities
# the generator is seeded so that re-running the cell yields the same reference sample
rng = np.random.default_rng(7)
pets_reference = rng.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.5, 0.1, 0.2, 0.2])

# check if the kl divergence with respect to pets_reference is less than the threshold 0.6
# NOTE(review): the variable is named "greater_than", but the constraint is a less-than check
kl_divergence_greater_than = columnKLDivergenceLessThanConstraint(reference_distribution=pets_reference, threshold=0.6)

dc = DatasetConstraints(None, summary_constraints={"pet": [kl_divergence_greater_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# the recorded run reported no failure for this constraint
# NOTE(review): confirm the outcome for the seeded reference sample when re-running
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                 total_run    failed
    summary kl_divergence threshold LT 0.6            1         0


### Chi-Squared Test

The Chi-Squared test constraint is only supported for categorical values.

In [5]:
from whylogs.core.statistics.constraints import columnChiSquaredTestPValueGreaterThanConstraint

In [21]:
# Draw a categorical reference sample of 50 pets with explicit class probabilities.
# NOTE: no seed is set in this cell, so the sample (and the report below) may differ
# between runs unless the global NumPy seed was fixed earlier in the notebook.
pets_reference = np.random.choice(['cat', 'dog', 'rabbit', 'hamster'], size=50, replace=True, p=[0.6, 0.2, 0.1, 0.1])

# The constraint passes when the Chi-Squared test p-value against the reference is greater than 0.05.
chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=pets_reference, p_value=0.05)

# Attach the summary constraint to the "pet" column.
dc = DatasetConstraints(None, summary_constraints={"pet": [chi_squared_p_value_greater_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# now we expect the constraint to not fail
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                   total_run    failed
    summary chi_squared_test p-value GT 0.05            1         0


The constraint did not fail, so the p-value is greater than 0.05, which means that we cannot reject the null hypothesis that the two distributions are equal at the 0.05 significance level.

If you don't have a reference distribution for calculating the Chi-Squared Test, but you know the approximate frequencies of each of the items, you can use this constraint by supplying a mapping of items and frequencies as counts, in the reference distribution parameter of the constraint.

In [10]:
# Describe the reference distribution directly as a mapping of item -> expected count
# (no reference sample is needed in this form).
reference_dict_pets = {
    'cat': 30,
    'dog': 10,
    'rabbit': 5, 
    'hamster': 5,
}

# The constraint passes when the Chi-Squared test p-value against the reference counts is greater than 0.05.
chi_squared_p_value_greater_than = columnChiSquaredTestPValueGreaterThanConstraint(reference_distribution=reference_dict_pets, p_value=0.05)

# Attach the summary constraint to the "pet" column.
dc = DatasetConstraints(None, summary_constraints={"pet": [chi_squared_p_value_greater_than]})

profile = session.log_dataframe(pet_df, 'test.data', constraints=dc)
# We expect the constraint to not fail: the counts 30/10/5/5 are proportional to the
# probabilities 0.6/0.2/0.1/0.1 used for the reference sample in the previous example.
report = profile.apply_summary_constraints()
format_report(report)

Constraint failures by feature - 
pet:
    test_name                                   total_run    failed
    summary chi_squared_test p-value GT 0.05            1         0


The constraint did not fail, so the p-value is greater than 0.05, which means that we cannot reject the null hypothesis that the two distributions are equal at the 0.05 significance level.

## Table shape constraints

In [6]:
from whylogs.core.statistics.constraints import (
    numberOfRowsConstraint, columnExistsConstraint, columnsMatchSetConstraint )

# Reuse the dataframe with the string values defined earlier and add a second column.
# NOTE: this modifies the shared `df` in place, so later cells see the extra 'col2' column.
df['col2'] = range(len(df))

# Row-count constraints: one deliberately off by one, one matching the dataframe.
rows = numberOfRowsConstraint(n_rows=len(df)+1) # fail
rows_2 = numberOfRowsConstraint(n_rows=len(df)) # pass

# Column-existence constraints: a missing column name and the column added above.
column_exist = columnExistsConstraint("this_column_does_not_exist") # fail
column_exist2 = columnExistsConstraint("col2") # pass

# Column-set constraints: an unrelated set of names and the dataframe's actual column set.
set1 = {'this', 'is', 'a', 'wrong', 'columns', 'set'}
columns_set = set(df.columns)
columns_match = columnsMatchSetConstraint(set1) # fail
columns_match2 = columnsMatchSetConstraint(columns_set) # pass

table_shape_constraints = [rows, rows_2, column_exist, column_exist2, columns_match, columns_match2]

dc = DatasetConstraints(None, table_shape_constraints=table_shape_constraints)

profile = session.log_dataframe(df, "test.data", constraints=dc)

# Table shape constraints are evaluated on the finished profile.
report = profile.apply_table_shape_constraints()
format_report(report)


Table shape constraint failures -
    test_name                                                          total_run    failed
    table total_row_number EQ 15                                               1         1
    table total_row_number EQ 14                                               1         0
    table columns CONTAIN this_column_does_not_exist                           1         1
    table columns CONTAIN col2                                                 1         0
    table columns EQ {'this', 'is', 'set', 'wrong', 'columns', 'a'}            1         1
    table columns EQ {'str1', 'col2'}                                          1         0


## Multi column constraints
### Logical operations between values of the specified columns

In [36]:
from whylogs.core.statistics.constraints import columnValuesAGreaterThanBConstraint, columnValuesAEqualBConstraint

# Small example frame: col1 > col2 holds for the first two rows only,
# and col1 == col2 holds for the third row only (6 == 6).
df = pd.DataFrame({"col1": [4, 5, 6, 7], "col2": [0, 1, 6, 15]})

# Row-wise comparisons between the two columns.
a_gt_b = columnValuesAGreaterThanBConstraint(column_A="col1", column_B="col2")
a_eq_b = columnValuesAEqualBConstraint(column_A="col1", column_B="col2")

dc = DatasetConstraints(None, multi_column_value_constraints=[a_gt_b, a_eq_b])

profile = session.log_dataframe(df, "test4.data", constraints=dc)

# The report is read from the constraints object itself; expected failures: 2 for GT, 3 for EQ.
format_report(dc.report())


Multi column constraint failures -
    test_name                          total_run    failed
    multi column value col1 GT col2            4         2
    multi column value col1 EQ col2            4         3


The comparison is done value by value. For col1 > col2, only 2 of the 4 rows pass (so 2 fail); for col1 == col2, only 1 row passes (the third element of both columns is 6), so 3 fail.

### Sum of row values of multiple columns equals some value, or some column value

In [6]:
from whylogs.core.statistics.constraints import sumOfRowValuesOfMultipleColumnsEqualsConstraint

In [37]:
# Expense breakdown in percent, one row per record.
# Row sums are 100, 100, 68 and 73, so only the first two rows add up to 100 %.
total_expences = pd.DataFrame({
    "employees %": [25, 45, 15, 3],
    "equipment %": [10, 12, 4, 9],
    "materials %": [40, 35, 45, 55],
    "other %": [25, 8, 4, 6]
})

In [38]:
# check that the expense percentages in each row sum to 100 %
sum_of_row_values_eq_100 = sumOfRowValuesOfMultipleColumnsEqualsConstraint(
    columns=["employees %", "equipment %", "materials %", "other %"],
    value=100
)

dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_100])

# multi-column value constraints do not need a separate apply step:
# they are applied at the time of logging
profile = session.log_dataframe(total_expences, "test.data", constraints=dc)

# we expect 2 of the 4 rows to be failures since the last two rows do not sum to 100
format_report(dc.report())


Multi column constraint failures -

test_name:	multi column value SUM ['employees %', 'equipment %', 'materials %', 'other %'] EQ 100

total_run:	4

failed:		2



In [39]:
# Check if the sum of the row values (percentages) for 'equipment %' and 'materials %'
# equals the value of the 'other %' column in the same row.
# Passing a column name as `value` compares against that column instead of a constant.
sum_of_row_values_eq_other = sumOfRowValuesOfMultipleColumnsEqualsConstraint(
    columns=["equipment %", "materials %"],
    value='other %'
)

dc = DatasetConstraints(None, multi_column_value_constraints=[sum_of_row_values_eq_other])
profile = session.log_dataframe(total_expences, "test.data", constraints=dc)

# We expect all 4 rows to be failures: the sums are 50, 47, 49 and 64,
# while 'other %' holds 25, 8, 4 and 6.
format_report(dc.report())


Multi column constraint failures -
    test_name                                                               total_run    failed
    multi column value SUM ['equipment %', 'materials %'] EQ ['other %']            4         4


### Column Pair Values in Set

Check if the values of a pair of columns are in a predefined set of pair values.

In [40]:
from whylogs.core.statistics.constraints import columnPairValuesInSetConstraint

In [41]:
# One row per product with its grade and subgrade.
# ProductB carries the pair ("A", "A3").
product_grades = pd.DataFrame({
    "product": ["ProductA", "ProductB", "ProductC", "ProductD", "ProductE"],
    "grade": ["A", "A", "B", "C", "C"],
    "subgrade": ["A1", "A3", "B2", "C2", "C2"]
})

In [42]:
# we want to check that each (grade, subgrade) pair is one of the allowed pairs
grade_subgrade_pairs_in_set = columnPairValuesInSetConstraint(
    column_A="grade", 
    column_B="subgrade",
    value_set = {("A", "A1"), ("A", "A2"), ("B", "B1"), ("B", "B2"), ("C", "C1"), ("C", "C2")}
)

dc = DatasetConstraints(None, multi_column_value_constraints=[grade_subgrade_pairs_in_set])
profile = session.log_dataframe(product_grades, "test.data", constraints=dc)

# we expect 1 out of 5 pairs to be a failure, specifically ("A", "A3"), which is not in the allowed set
format_report(dc.report())


Multi column constraint failures -

test_name:	multi column value ['grade', 'subgrade'] IN {('C', 'C2'), ('B', 'B2'), ('B', 'B1'), ('A', 'A1'), ('A', 'A2'), ('C', 'C1')}

total_run:	5

failed:		1



### Column Values Unique within Row

Check if the value of the specified column is unique within each row.

In [43]:
from whylogs.core.statistics.constraints import columnValuesUniqueWithinRow

In [45]:
# Example user records. In the second row the username is identical to the email
# ("jane.doe@example.com"); in every other row the email differs from the other fields.
users = pd.DataFrame({
    "first_name": ["John", "Jane", "Bob", "Anna"],
    "last_name": ["Doe", "Doe", "Smith", "Jones"],
    "username": ["jd123", "jane.doe@example.com", "bobsmith", "_anna_"],
    "email": ["john.doe@example.com", "jane.doe@example.com", "bob.smith@example.com", "anna_jones@example.com"],
})

In [47]:
# check that each user's email differs from the other fields in the same row
# suppose we do not want to accept a username which is the same as the user's email
email_values_unique_within_row = columnValuesUniqueWithinRow(column_A="email")

dc = DatasetConstraints(None, multi_column_value_constraints=[email_values_unique_within_row])
profile = session.log_dataframe(users, "test.data", constraints=dc)

# we expect 1 out of 4 evaluations of the constraint to be a failure, since Jane Doe's email is the same as their username
format_report(dc.report())


Multi column constraint failures -
    test_name                              total_run    failed
    multi column value email NOT_IN all            4         1
