In [17]:
import pandas as pd
import numpy as np
import pandera as pa
import warnings
from pandera import Column, DataFrameSchema, Index, MultiIndex, Check, check_input 

# Load the demo dataset from the path relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='./data/data_demo.csv')

# Preview the first five rows as a quick sanity check of the loaded frame.
data.head(5)

Unnamed: 0,rooms,garages,useful_area,value,interior_quality,time_on_market,has_outlier
0,3.0,1.0,105.0,1038640.0,2,31.787079,0
1,3.0,2.0,76.0,606405.0,4,209.468842,0
2,3.0,1.0,123.0,1534500.0,1,38.0,0
3,3.0,2.0,180.0,1131950.0,5,154.0,0
4,3.0,1.0,67.0,452672.0,4,15.0,0


In [18]:
def _range_column(lower, upper, error):
    """Build a nullable float column that warns when a value leaves [lower, upper].

    Parameters
    ----------
    lower, upper : float
        Inclusive bounds applied to each element of the column.
    error : str
        Message attached to the check; it appears in the emitted warning.

    Returns
    -------
    pandera.Column
        A ``pa.Float`` column carrying a single element-wise range check.
    """
    return Column(
        pa.Float,
        Check(
            lambda x: lower <= x <= upper,
            element_wise=True,
            error=error,
            # Report violations as warnings so validation does not stop at
            # the first failing column.
            raise_warning=True,
        ),
        nullable=True,
    )


schema = pa.DataFrameSchema(
    columns={
        # Numerical columns: warn when a value falls outside a plausible range.
        "useful_area": _range_column(20, 850.0, "range checker [20, 850]"),
        # The message states the bound the check actually enforces (100 000 000).
        # NOTE(review): an earlier message quoted 10 000 000 -- confirm which
        # upper limit is the intended one.
        "value": _range_column(
            50000, 100000000.0, "range checker [50 000, 100 000 000]"
        ),
        "rooms": _range_column(1.0, 7.0, "range checker [1.0, 7.0]"),
        "garages": _range_column(1.0, 8.0, "range checker [1, 8]"),
        "time_on_market": _range_column(0, 400.0, "range checker [0, 400]"),
        # Categorical column: every value must be one of the expected classes.
        # This check receives the whole Series (not element-wise).
        "interior_quality": Column(
            pa.Int,
            Check(lambda s: s.isin([1, 2, 3, 4, 5]), raise_warning=True),
        ),
    },
    # The row index is expected to be integer-typed.
    index=Index(pa.Int),
)
    
# Validate the frame while recording every warning the schema checks emit,
# then print the recorded messages once validation has finished.
with warnings.catch_warnings(record=True) as caught_warnings:
    # "always" disables de-duplication so each failing check is reported.
    warnings.simplefilter(action="always")
    validated_df = schema(data)

for warning in caught_warnings:
    print(warning.message)

<Schema Column: 'useful_area' type=float> failed element-wise validator 0:
<Check <lambda>: range checker [20, 850]>
failure cases:
                     index  count
failure_case                     
2262.97       [6510, 6618]      2
0.63                [1761]      1
2091.83             [8430]      1
2250.19             [2359]      1
2245.23             [2340]      1
2241.78             [1085]      1
2241.08             [5746]      1
2215.90              [393]      1
2205.72             [5255]      1
2191.36              [671]      1
<Schema Column: 'value' type=float> failed element-wise validator 0:
<Check <lambda>: range checker [50 000, 10 000 000]>
failure cases:
               index  count
failure_case               
1.622000e+01  [3305]      1
1.176059e+08  [1635]      1
1.603524e+08  [2067]      1
1.537917e+08  [5909]      1
1.521972e+08  [2147]      1
1.505942e+08  [2476]      1
1.378424e+08  [2516]      1
1.358611e+08  [2636]      1
1.274164e+08  [6021]      1
1.262466e+08  [