# Ejemplo de Data Drift

In [1]:
import pandas as pd
import numpy as np
from wrangler import Wrangler
from wrangler.data import PandasDataset, CSVDataset
import wrangler.transformers as tr
import wrangler.transformers.text as text_tr
import wrangler.transformers.ml as ml_tr
import wrangler.transformers.numeric as n_tr
from wrangler import logger as wrangler_logger
from sklearn.ensemble import RandomForestClassifier

# Uncomment for console logging 
wrangler_logger.enable()
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split


In [2]:
n_features = 5
n_centers = 3
X, _ = make_blobs(n_samples=1000, centers=n_centers, n_features=n_features, random_state=0)
X = pd.DataFrame(X, columns = [f'Var{var}' for var in range(X.shape[1])])

X[f'Var{n_features}'] = X['Var0'].apply(lambda x: "Cat1" if np.random.rand()<0.5 else "Cat2")
X[f'Var{n_features+1}'] = X['Var0'].apply(lambda x: np.random.choice([f"Cat{n}" for n in range(10)]))

feature_names = X.columns

n_ref = 400
X_ref = X.sample(n_ref).copy()
X_comp = X.iloc[~X.index.isin(X_ref.index)].copy()
X_comp.loc[:,'Var0'] = X_comp.loc[:,'Var0'] + 2*np.random.normal()

X_comp1 = X_comp.sample(X_comp.shape[0]//2).copy()
X_comp2 = X_comp.iloc[~X_comp.index.isin(X_comp1.index)].copy()

X_comp1['Var5'] = X_comp1['Var5'].apply(lambda x: "Cat2" if np.random.rand()<0.5 else x)

In [3]:
wrangler = Wrangler()
ref_data = PandasDataset('ref_data', X_ref)
comp_data = PandasDataset('comp_data', X_comp1)
wrangler.add_dataset(ref_data)
wrangler.add_dataset(comp_data)

2022-02-08 at 10:43:36 | INFO | catalog | Adding dataset: intermediate
2022-02-08 at 10:43:36 | INFO | catalog | Adding dataset: ref_data
2022-02-08 at 10:43:36 | INFO | catalog | Adding dataset: comp_data


In [4]:

wrangler.add_node(
    name = 'data drift evaluation',
    transformer = ml_tr.DataDriftTransformer(),
    inputs=['ref_data','comp_data'],
    outputs=['data_drift_results'],
)


2022-02-08 at 10:43:37 | INFO | pipeline | Node data drift evaluation added to Pipeline 


In [5]:
wrangler.fit_transform()

2022-02-08 at 10:43:37 | INFO | node | Running Node: data drift evaluation
2022-02-08 at 10:43:37 | INFO | catalog | Loading dataset: ref_data
2022-02-08 at 10:43:37 | DEBUG | base | Loading PandasDataset(name='ref_data', data=DataFrame)
2022-02-08 at 10:43:37 | INFO | catalog | Loading dataset: comp_data
2022-02-08 at 10:43:37 | DEBUG | base | Loading PandasDataset(name='comp_data', data=DataFrame)
2022-02-08 at 10:43:37 | DEBUG | base | Fitting DataDriftTransformer(p_val=0.05)
2022-02-08 at 10:43:37 | DEBUG | base | Transforming DataDriftTransformer(p_val=0.05, feature_names=Index, n_features=7, x_ref_categories=dict, cat_vars=[5, 6], ref_stats=DataFrame)
2022-02-08 at 10:43:37 | INFO | catalog | Saving dataset: data_drift_results
2022-02-08 at 10:43:37 | INFO | catalog | Adding dataset: data_drift_results


In [6]:
wrangler.data_catalog.load('data_drift_results')

2022-02-08 at 10:44:16 | INFO | catalog | Loading dataset: data_drift_results
2022-02-08 at 10:44:16 | DEBUG | base | Loading PandasDataset(name='data_drift_results', data=DataFrame)


Unnamed: 0_level_0,is_drift,test_type,test_val,p_val,ref_mean,ref_std,ref_min,ref_25%,ref_50%,ref_75%,ref_max,ref_nunique,comp_mean,comp_std,comp_min,comp_25%,comp_50%,comp_75%,comp_max,comp_nunique
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Var0,1,K-S,0.1425,0.001712694,3.1881,2.166526,-1.638327,1.554622,2.901457,5.018718,8.247443,,2.704297,2.286105,-2.436641,0.897258,2.402957,4.670732,7.464351,
Var1,0,K-S,0.045,0.8635039,1.048466,2.547756,-3.717537,-1.116642,0.593163,3.347448,7.405318,,1.12839,2.490902,-3.186679,-0.867204,0.667717,3.486271,6.327178,
Var2,0,K-S,0.071667,0.3278094,3.982527,3.110864,-1.072525,1.44587,2.700455,7.433553,10.095313,,3.702377,3.059303,-1.756062,1.404128,2.440495,7.102173,10.104646,
Var3,0,K-S,0.083333,0.175744,6.35925,3.976152,-1.675046,1.326337,8.295876,9.398024,11.903332,,6.128851,3.897231,-2.063987,1.589201,8.139329,9.07236,11.996943,
Var4,1,K-S,0.106667,0.03784803,-4.121783,3.226321,-11.361727,-7.774429,-2.574788,-1.670084,0.448488,,-3.981257,3.34472,-10.986445,-7.893505,-2.360092,-1.431902,0.583396,
Var5,1,Chi2,39.450584,3.364668e-10,,,,,,,,2.0,,,,,,,,2.0
Var6,1,Chi2,17.016176,0.04846326,,,,,,,,10.0,,,,,,,,10.0
