In [1]:
import logging
import numpy as np
import pandas as pd

from validata.parser import Parser
from validata.validator import Validator

%load_ext autoreload
%autoreload 2

In [2]:
# Show validata's DEBUG messages so each parsing/evaluation step is visible
# in the cell outputs below.
logging.basicConfig(level=logging.DEBUG)

## Create some data

In [23]:
# Fixed seed: without it every kernel restart draws different households, so
# the saved outputs of the validation cells below drift out of sync with the
# data shown here.
SEED = 42
rng = np.random.default_rng(SEED)

n = 10  # number of synthetic households

households = pd.DataFrame({
    "id": np.arange(n),
    # Shifted by 1 before truncating to int, so every household has size >= 1.
    "size": (rng.standard_exponential(n) + 1).astype(int),
    # Exponentially distributed incomes, scaled and truncated to whole numbers.
    "income_1": (rng.standard_exponential(n) * 30e3).astype(int),
    "income_2": (rng.standard_exponential(n) * 20e3).astype(int),
})

households

Unnamed: 0,id,size,income_1,income_2
0,0,1,28320,44507
1,1,1,31291,3239
2,2,2,46790,14441
3,3,2,31633,79580
4,4,2,23824,6012
5,5,1,3960,638
6,6,2,18914,16074
7,7,1,541,502
8,8,1,24157,39122
9,9,2,24091,24664


## Parser: Perform a single validation

In [24]:
# Flag households where at least one column matching income_* exceeds 30000
# and size is greater than 1. The debug log below shows the wildcard resolving
# to income_1 and income_2; the result holds one boolean per row.
ps = Parser("any income_* > 30000 and size > 1")
ps.evaluate(households)

DEBUG:validata.parser:Processing token: any income_* > 30000 [BARE WORD]
DEBUG:validata.parser:Evaluating expression: any income_* > 30000
DEBUG:validata.evaluator:Selected columns: income_2, income_1.
DEBUG:validata.evaluator:Using comparator: GtComparator.
DEBUG:validata.evaluator:Using operator: AnyOperator.
DEBUG:validata.parser:Processing token: and [AND]
DEBUG:validata.parser:Processing and expression.
DEBUG:validata.parser:Processing token: size > 1
DEBUG:validata.parser:Evaluating right hand side expression.
DEBUG:validata.evaluator:Selected columns: size.
DEBUG:validata.evaluator:Using comparator: GtComparator.


Unnamed: 0,0
0,False
1,False
2,True
3,True
4,False
5,False
6,False
7,False
8,False
9,False


## Validator: Perform many checks

### Define a data frame with validations

In [6]:
# One row per check: a short name plus the expression to evaluate for it.
# The two lists are aligned by position.
checks_df = pd.DataFrame({
    "name": [
        "large_size",
        "income_missing",
        "high_collective_income",
    ],
    "expression": [
        "size > 2",
        "any income_* missing",
        "sum income_* > 100000",
    ],
})
checks_df

Unnamed: 0,name,expression
0,large_size,size > 2
1,income_missing,any income_* missing
2,high_collective_income,sum income_* > 100000


### Run all validations

In [7]:
# Build a validator from the table of named checks, then run every check
# against the households frame. The log below shows the checks being
# performed one at a time, by name.
vd = Validator(checks_df)
results = vd.validate(households)

DEBUG:validata.validator:Performing validation: large_size.
DEBUG:validata.parser:Processing token: size > 2
DEBUG:validata.parser:Evaluating expression: size > 2
DEBUG:validata.evaluator:Selected columns: size.
DEBUG:validata.evaluator:Using comparator: GtComparator.
DEBUG:validata.validator:Validated 10 rows - 20% evaluated to True.
DEBUG:validata.validator:Finished validation: large_size.
DEBUG:validata.validator:Performing validation: income_missing.
DEBUG:validata.parser:Processing token: any income_* missing
DEBUG:validata.parser:Evaluating expression: any income_* missing
DEBUG:validata.evaluator:Selected columns: income_1, income_2.
DEBUG:validata.evaluator:Using comparator: NullComparator.
DEBUG:validata.evaluator:Using operator: AnyOperator.
DEBUG:validata.validator:Validated 10 rows - 0% evaluated to True.
DEBUG:validata.validator:Finished validation: income_missing.
DEBUG:validata.validator:Performing validation: high_collective_income.
DEBUG:validata.parser:Processing toke

In [8]:
# One boolean column per check and one row per household, as displayed below.
results

Unnamed: 0,large_size,income_missing,high_collective_income
0,False,False,False
1,True,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,True,False,False
8,False,False,False
9,False,False,False


In [18]:
# Per-check summary; the "True %" column presumably gives the share of rows
# for which the check evaluated to True -- confirm against Validator.
vd.get_summary()

Unnamed: 0,True %
large_size,20.0
income_missing,0.0
high_collective_income,10.0
