In [1]:
from whylogs import get_or_create_session
from whylogs.util.protobuf import message_to_json

# Obtain the whylogs session; it is used further down to log the dataframe
# (session.log_dataframe).
session = get_or_create_session()

WARN: Missing config


In [2]:
import numpy as np
import pandas as pd
import json
import os

In [3]:
from tabulate import tabulate

def indent(txt, spaces=4):
    """Return ``txt`` with every line prefixed by ``spaces`` space characters.

    The text is split with ``str.splitlines()`` and re-joined with ``"\\n"``,
    so a trailing newline in ``txt`` is not kept and empty lines still receive
    the padding.
    """
    padding = " " * spaces
    padded_lines = [padding + line for line in txt.splitlines()]
    return "\n".join(padded_lines)

def _print_constraint_rows(rows):
    """Print ``(test_name, total_run, failed)`` rows as a table or, for long names, vertically.

    A test name longer than 80 characters makes the plain table too wide to
    read, so in that case each row is printed one field per line instead.
    The vertical layout covers *every* row (previously only the first row was
    printed and the rest were silently dropped), and an empty ``rows`` list no
    longer raises ``IndexError``.
    """
    if any(len(row[0]) > 80 for row in rows):
        for row in rows:
            print(f"\ntest_name:\t{row[0]}\n")
            print(f"total_run:\t{row[1]}\n")
            print(f"failed:\t\t{row[2]}\n")
    else:
        print(indent(tabulate(rows, tablefmt="plain", headers=['test_name', 'total_run', 'failed'])))


def format_report(r):
    """Print a constraints report to stdout, grouped by kind of constraint.

    Parameters
    ----------
    r : list
        Report entries. An entry of length 2 is treated as a
        ``(feature_name, rows)`` pair holding the single-column constraint rows
        for that feature. Any other entry is treated as one
        ``(test_name, total_run, failed)`` row and is classified by the prefix
        of its test name: ``"table"`` for table shape constraints and
        ``"multi column"`` for multi column constraints. Entries matching
        neither prefix are not printed.

    Returns
    -------
    None
    """
    by_feature = [entry for entry in r if len(entry) == 2]  # all the single column constraints
    table_shape_rows = [entry for entry in r if len(entry) != 2 and entry[0].startswith("table")]
    multi_column_rows = [entry for entry in r if len(entry) != 2 and entry[0].startswith("multi column")]

    if by_feature:
        print("Constraint failures by feature - ")
    # Distinct loop names: the original loop re-bound the parameter `r` here.
    for feature_name, rows in by_feature:
        print(f"{feature_name}:")
        _print_constraint_rows(rows)

    if table_shape_rows:
        print()
        print("Table shape constraint failures -")
        # Table shape rows are always tabulated, as in the original code.
        print(indent(tabulate(table_shape_rows, tablefmt="plain", headers=['test_name', 'total_run', 'failed'])))

    if multi_column_rows:
        print()
        print("Multi column constraint failures -")
        _print_constraint_rows(multi_column_rows)
        

In [4]:
# Load the sample dataset from data/fake-person-data.csv; the path is relative,
# so it is resolved against the notebook's current working directory.
df = pd.read_csv(os.path.join('data', 'fake-person-data.csv'))

First, log the data together with a value constraint; in this example the constraint checks the length of the **password** values.

In [5]:
from whylogs.core.statistics.constraints import stringLengthBetweenConstraint, DatasetConstraints
# Value constraint on string length with bounds 8 and 20 (reported below as
# "length of the string values is between 8 and 20").
str_len_between = stringLengthBetweenConstraint(8, 20)
# Attach the constraint to the 'password' column only.
# NOTE(review): the first positional argument is left as None -- presumably the
# dataset properties; confirm against the whylogs DatasetConstraints signature.
dc = DatasetConstraints(None, value_constraints = {'password': [str_len_between]})

# Log the dataframe with the constraints attached; `dc` is queried afterwards
# via dc.report(), and `profile` is reused by the later cells.
profile = session.log_dataframe(df, "person.data", constraints = dc)


Display the evaluated value constraints

In [6]:
# `dc` was passed to log_dataframe above; print its report using the
# format_report helper defined earlier in this notebook.
format_report(dc.report())

Constraint failures by feature - 
password:
    test_name                                          total_run    failed
    length of the string values is between 8 and 20          500         0


Then create and apply the summary constraints

Check that the email (e.g. username) and password columns do not have any missing values

In [7]:
from whylogs.core.statistics.constraints import columnValuesNotNullConstraint
# One constraint instance per column.
# NOTE(review): presumably each instance keeps its own run/failure counts, which
# is why the same object is not shared between columns -- confirm.
nnc1 = columnValuesNotNullConstraint()
nnc2 = columnValuesNotNullConstraint()
# Map each column name to the list of summary constraints to apply to it.
summary_constraints = {"email": [nnc1], "password": [nnc2]}

# Apply the summary constraints to the profile logged earlier; the result is
# passed straight to format_report below.
report = profile.apply_summary_constraints(summary_constraints)

format_report(report)

Constraint failures by feature - 
email:
    test_name                          total_run    failed
    does not contain missing values            1         0
password:
    test_name                          total_run    failed
    does not contain missing values            1         0


As we can see, no failures are present so the columns **'email'** and **'password'** do not have any missing values.

Next, we can let **whylogs** generate constraints automatically from our logged data.

In [8]:
# Have whylogs derive constraints from the logged profile; the result is
# inspected (to_json) and applied (summary_constraint_map) in the following cells.
generated_constraints = profile.generate_constraints()

Displaying info for the generated constraints

In [9]:
# Round-trip the generated constraints through JSON to get a plain dict that
# can be indexed and pretty-printed.
dc_json = json.loads(generated_constraints.to_json())

# Walk the profile's columns and print the constraints generated for each one.
for column_name in profile.columns:
    if column_name in dc_json['summaryConstraints']: # check whether we have any constraints for the column
        # Re-serialize just this column's constraint list with indentation for readability.
        constraints = json.dumps(dc_json['summaryConstraints'][column_name]['constraints'], indent=4)
        print(f"Constraints for column \'{column_name}\': \n{constraints}\n")


Constraints for column 'name': 
[
    {
        "name": "The values of the feature 'name' are of type STRING",
        "firstField": "column_values_type",
        "value": 5.0,
        "op": "EQ",
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "The cardinality of unique values of the feature 'name' is between 488 and 503",
        "firstField": "unique_count",
        "op": "BTWN",
        "between": {
            "lowerValue": 488.0,
            "upperValue": 503.0
        },
        "verbose": false,
        "quantileValue": 0.0
    }
]

Constraints for column 'phone': 
[
    {
        "name": "The values of the feature 'phone' are of type STRING",
        "firstField": "column_values_type",
        "value": 5.0,
        "op": "EQ",
        "verbose": false,
        "quantileValue": 0.0
    },
    {
        "name": "The cardinality of unique values of the feature 'phone' is between 489 and 504",
        "firstField": "unique_count",
        "op": 

Applying the generated constraints

In [10]:
# Take the per-column summary constraints out of the generated constraints and
# apply them to the same profile they were generated from.
generated_summary_constraints = generated_constraints.summary_constraint_map # Getting the summary constraints
# NOTE: this rebinds `report`, replacing the result of the earlier not-null check.
report = profile.apply_summary_constraints(generated_summary_constraints)

format_report(report)

Constraint failures by feature - 
name:
    test_name                                                                        total_run    failed
    The values of the feature 'name' are of type STRING                                      1         0
    The cardinality of unique values of the feature 'name' is between 488 and 503            1         0
phone:
    test_name                                                                         total_run    failed
    The values of the feature 'phone' are of type STRING                                      1         0
    The cardinality of unique values of the feature 'phone' is between 489 and 504            1         0
email:
    test_name                                                                         total_run    failed
    The values of the feature 'email' are of type STRING                                      1         0
    The cardinality of unique values of the feature 'email' is between 488 and 503            1      

As expected, all of these constraints pass, since the values and thresholds used to create them are inferred from the actual data.
In some cases, the data insights created from the generated constraints can help in better understanding the logged data.

In [11]:
# Ask the profile for its data insights; the result is iterated below as a
# mapping from column name to an iterable of printable insights.
data_insights = profile.generate_data_insights()

# Print one block per feature: a header line, one line per insight, then a
# blank line as a separator.
for column_name, insights in data_insights.items():
    print (f"Feature: \'{column_name}\'")
    for insight in insights:
        print (insight)
    print ()

Feature: 'name'
The values of the feature 'name' are of type STRING
The cardinality of unique values of the feature 'name' is between 488 and 503

Feature: 'phone'
The values of the feature 'phone' are of type STRING
The cardinality of unique values of the feature 'phone' is between 489 and 504

Feature: 'email'
The feature 'email' contains some values identified as e-mail addresses
The values of the feature 'email' are of type STRING
The cardinality of unique values of the feature 'email' is between 488 and 503

Feature: 'date'
The values of the feature 'date' are of type STRING
The cardinality of unique values of the feature 'date' is between 456 and 470

Feature: 'country'
The values of the feature 'country' are of type STRING
The cardinality of unique values of the feature 'country' is between 30 and 32
The most common value of the feature 'country' is in the set {'Russian Federation', 'Netherlands', 'China', 'New Zealand', 'Mexico'}

Feature: 'password'
The values of the feature '