In [1]:
import logging
import numpy as np
import pandas as pd

from validata.evaluator import Evaluator

%load_ext autoreload
%autoreload 2

In [2]:
# Emit DEBUG-level records so the validata log messages show up under each cell.
logging.basicConfig(level=logging.DEBUG)

In [3]:
# Synthetic household table used by the examples in this notebook.
# The draws come from a seeded generator so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same rows.
n = 10
SEED = 42
rng = np.random.default_rng(SEED)

households = pd.DataFrame({
    "id": np.arange(n),
    # Exponential draw shifted by 1 and truncated to int, so size is always >= 1.
    "size": (rng.standard_exponential(n) + 1).astype(int),
    # Exponential draws scaled by 30e3 / 20e3 and truncated to whole numbers.
    "income_1": (rng.standard_exponential(n) * 30e3).astype(int),
    "income_2": (rng.standard_exponential(n) * 20e3).astype(int),
})

In [4]:
# Show the generated table in full; with n = 10 rows it is small enough to read.
households

Unnamed: 0,id,size,income_1,income_2
0,0,1,27946,16837
1,1,1,42641,5252
2,2,1,39854,19907
3,3,1,5071,55618
4,4,3,11348,21612
5,5,1,20581,8838
6,6,1,61744,33025
7,7,1,43626,18045
8,8,2,43903,79122
9,9,1,31217,10105


In [48]:
# Single-expression check. Per the debug log, the "income_*" wildcard selects
# income_1 and income_2, and a null comparison is combined with an "any" operator.
income_missing_expr = "any income_* missing"
ev = Evaluator(income_missing_expr)
ev.evaluate(households)

DEBUG:validata.evaluator:Selected columns: income_1, income_2.
DEBUG:validata.evaluator:Using comparator: NullComparator.
DEBUG:validata.evaluator:Using operator: AnyOperator.


Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


## Validator: Perform many checks

In [56]:
from validata.validator import Validator

In [57]:
# Named checks for the Validator: each entry maps a check name to the
# expression that will be evaluated for it.
check_expressions = {
    "large_size": "size > 2",
    "income_missing": "any income_* missing",
    "high_collective_income": "sum income_* > 100000",
}
checks_df = pd.DataFrame(
    list(check_expressions.items()),
    columns=["name", "expression"],
)
checks_df

Unnamed: 0,name,expression
0,large_size,size > 2
1,income_missing,any income_* missing
2,high_collective_income,sum income_* > 100000


In [64]:
# Build a validator from the table of named check expressions (checks_df).
vd = Validator(checks_df)

In [65]:
# Run every check in checks_df against the households table.
# Judging by the displayed output, the result holds one boolean column per
# check name and one row per household.
results = vd.validate(households)

DEBUG:validata.validator:Performing validation: large_size.
DEBUG:validata.boolean_parser:Processing token: size > 2
DEBUG:validata.boolean_parser:Evaluating expression: size > 2
DEBUG:validata.evaluator:Selected columns: size.
DEBUG:validata.evaluator:Using comparator: GtComparator.
DEBUG:validata.validator:Validated 10 rows - 10% evaluated to True.
DEBUG:validata.validator:Finished validation: large_size.
DEBUG:validata.validator:Performing validation: income_missing.
DEBUG:validata.boolean_parser:Processing token: any income_* missing
DEBUG:validata.boolean_parser:Evaluating expression: any income_* missing
DEBUG:validata.evaluator:Selected columns: income_1, income_2.
DEBUG:validata.evaluator:Using comparator: NullComparator.
DEBUG:validata.evaluator:Using operator: AnyOperator.
DEBUG:validata.validator:Validated 10 rows - 0% evaluated to True.
DEBUG:validata.validator:Finished validation: income_missing.
DEBUG:validata.validator:Performing validation: high_collective_income.
DEBUG

In [63]:
# Display the per-row outcome of each check (True means the check fired for that row).
results

Unnamed: 0,large_size,income_missing,high_collective_income
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,True,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,True
9,False,False,False
