# Anomaly detection with TFDV

In [None]:
!pip install --user -r requirements.txt

This notebooks shows an approach to use the drift-skew tfdv validator for non categorical variables

## 00. Notebook setup

In [1]:
import os
import logging
import site
from pathlib import Path
import sys

In [2]:
home = str(Path.home())
local_py_path = os.path.join(home, ".local/lib/python3.6/site-packages")
if local_py_path not in sys.path:
    logging.info("Adding %s to python path", local_py_path)
    sys.path.insert(0, local_py_path)
site.getsitepackages()

['/usr/local/lib/python3.6/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/lib/python3.6/dist-packages']

## 01. Simple model generation

Generates a simple classifier model, in this case input data consists of 8 numerical features + 1 label binary label

In [13]:
import numpy as np
import pandas as pd
import csv
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

input_dataset_path='test-data.csv'
with open(input_dataset_path, newline='') as f:
  reader = csv.reader(f)
  header_row = next(reader) 
dataset = loadtxt(input_dataset_path, delimiter=",",skiprows=1)
x = dataset[:,0:8] #input features
y = dataset[:,8] #label
seed = 5
test_size = 0.20
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.52%


Generates syntethic data for reproducing the serving

In [14]:
#Lets generate syntethic with the same schema data and predict
x_serving_feature0 = np.random.randint(low=0, high=15, size=(700,1))
x_serving_feature1 = np.random.randint(low=70, high=200, size=(700,1))
x_serving_feature2 = np.random.randint(low=70, high=120, size=(700,1))
x_serving_feature3 = np.random.randint(low=0, high=50, size=(700,1))
x_serving_feature4 = np.random.randint(low=40, high=150, size=(700,1))
x_serving_feature5 = np.random.randint(low=20, high=50, size=(700,1)) 
x_serving_feature6 = np.random.uniform(low=0, high=1, size=(700,1))
x_serving_feature7 = np.random.randint(low=20, high=60, size=(700,1))
x_serving = np.concatenate((x_serving_feature0,x_serving_feature1,
                            x_serving_feature2,x_serving_feature3,
                            x_serving_feature4,x_serving_feature5,
                            x_serving_feature6,x_serving_feature7),
                           axis=1)
y_serving = model.predict(x_serving)

Generates pandas DataFrames

In [15]:
df_train = pd.DataFrame(data=x_train,columns=header_row[:-1])
df_test = pd.DataFrame(data=x_test,columns=header_row[:-1])
df_serving_synthetic = pd.DataFrame(data=x_serving,columns=header_row[:-1])

In [16]:
#Train data set
df_train.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
0,3.0,128.0,78.0,0.0,0.0,21.1,0.268,55.0
1,12.0,88.0,74.0,40.0,54.0,35.3,0.378,48.0
2,10.0,108.0,66.0,0.0,0.0,32.4,0.272,42.0
3,8.0,91.0,82.0,0.0,0.0,35.6,0.587,68.0
4,6.0,111.0,64.0,39.0,0.0,34.2,0.26,24.0


In [17]:
#Test data set used on training for acc calculation
df_test.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
0,6.0,92.0,62.0,32.0,126.0,32.0,0.085,46.0
1,5.0,132.0,80.0,0.0,0.0,26.8,0.186,69.0
2,3.0,106.0,72.0,0.0,0.0,25.8,0.207,27.0
3,4.0,99.0,68.0,38.0,0.0,32.8,0.145,33.0
4,4.0,96.0,56.0,17.0,49.0,20.8,0.34,26.0


In [18]:
#Serving dataset
df_serving_synthetic.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
0,13.0,70.0,88.0,21.0,133.0,38.0,0.082724,28.0
1,3.0,102.0,81.0,19.0,60.0,45.0,0.138002,36.0
2,12.0,70.0,84.0,29.0,83.0,32.0,0.456866,47.0
3,0.0,114.0,105.0,16.0,55.0,44.0,0.48943,57.0
4,8.0,146.0,88.0,27.0,70.0,37.0,0.521129,26.0


In [19]:
#Additional data set with different_schema dataset, drop last column
df_anomaly_schema = df_serving_synthetic.drop(labels='feature7', axis=1)
df_anomaly_schema.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6
0,13.0,70.0,88.0,21.0,133.0,38.0,0.082724
1,3.0,102.0,81.0,19.0,60.0,45.0,0.138002
2,12.0,70.0,84.0,29.0,83.0,32.0,0.456866
3,0.0,114.0,105.0,16.0,55.0,44.0,0.48943
4,8.0,146.0,88.0,27.0,70.0,37.0,0.521129


## 02. Visual exploration and simple anomalies

Use TFDV to calculate stats for the above datasets.
Visualize the stats 

In [20]:
import tensorflow_data_validation as tfdv
train_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_train)
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_test)
serving_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_serving_synthetic)
anomaly_schema_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_anomaly_schema)
#Compare train with test
tfdv.visualize_statistics(lhs_statistics=test_stats, rhs_statistics=train_stats,
                          lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')
#Compare train with serving
tfdv.visualize_statistics(lhs_statistics=serving_stats, rhs_statistics=train_stats,
                          lhs_name='SERVING_DATASET', rhs_name='TRAIN_DATASET')
#Compare train with wrong schema
tfdv.visualize_statistics(lhs_statistics=anomaly_schema_stats, rhs_statistics=train_stats,
                          lhs_name='ANOMALY_SCHEMA', rhs_name='TRAIN_DATASET')

  types.FeaturePath([column_name]), column.data.chunk(0), weights):


In order to evaluate anomalies TFDV compares an TF schema with a set of stats.
Schemas are protobuf serializations like this:
```
feature {
  name: "feature0Quant"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
```
Full schema: https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto
Stats are also protobuf serializations like this (extract for one feature):
```
...
num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 61.4
          }
         ...
          }
          buckets {
            low_value: 1.0

            sample_count: 61.4
          }
          type: QUANTILES
        }
        tot_num_values: 614
      }
      mean: 2.8485342019543975
      std_dev: 2.886821026051386
      num_zeros: 198
      median: 2.0
      max: 9.0
```
tfdv.validate_statistics then compares these two protos and check for the following scenarios triggering an alert https://www.tensorflow.org/tfx/data_validation/anomalies

Start by infering the schema, we can adjust the inferred schema using https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/utils/schema_util.py

In [21]:
train_schema = tfdv.infer_schema(train_stats)
test_schema = tfdv.infer_schema(test_stats)
serving_schema = tfdv.infer_schema(serving_stats)
anomaly_schema_schema = tfdv.infer_schema(anomaly_schema_stats)
tfdv.display_schema(schema=train_schema)
tfdv.display_schema(schema=test_schema)
tfdv.display_schema(schema=serving_schema)
tfdv.display_schema(schema=anomaly_schema_schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0',FLOAT,required,,-
'feature1',FLOAT,required,,-
'feature2',FLOAT,required,,-
'feature3',FLOAT,required,,-
'feature4',FLOAT,required,,-
'feature5',FLOAT,required,,-
'feature6',FLOAT,required,,-
'feature7',FLOAT,required,,-


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0',FLOAT,required,,-
'feature1',FLOAT,required,,-
'feature2',FLOAT,required,,-
'feature3',FLOAT,required,,-
'feature4',FLOAT,required,,-
'feature5',FLOAT,required,,-
'feature6',FLOAT,required,,-
'feature7',FLOAT,required,,-


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0',FLOAT,required,,-
'feature1',FLOAT,required,,-
'feature2',FLOAT,required,,-
'feature3',FLOAT,required,,-
'feature4',FLOAT,required,,-
'feature5',FLOAT,required,,-
'feature6',FLOAT,required,,-
'feature7',FLOAT,required,,-


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0',FLOAT,required,,-
'feature1',FLOAT,required,,-
'feature2',FLOAT,required,,-
'feature3',FLOAT,required,,-
'feature4',FLOAT,required,,-
'feature5',FLOAT,required,,-
'feature6',FLOAT,required,,-


Look for anomalies

In [22]:
anomalies_train_test = tfdv.validate_statistics(statistics=test_stats, schema=train_schema)
tfdv.display_anomalies(anomalies_train_test)
anomalies_train_serving = tfdv.validate_statistics(statistics=serving_stats, schema=train_schema)
tfdv.display_anomalies(anomalies_train_test)
anomalies_train_anomaly_schema = tfdv.validate_statistics(statistics=anomaly_schema_stats, schema=train_schema)
tfdv.display_anomalies(anomalies_train_anomaly_schema)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'feature7',Column dropped,Column is completely missing


Lets force another anomaly. Change the minimum observations of feature0

In [32]:
tfdv.get_feature(train_schema, 'feature0').presence.min_count = 500
tfdv.get_feature(train_schema, 'feature0').presence.min_fraction = 0
#Set to null all values but one
df_test['feature0'] = np.nan
df_test['feature0'][0] = 1
test_stats = tfdv.generate_statistics_from_dataframe(dataframe=df_test)
anomalies_train_test = tfdv.validate_statistics(statistics=test_stats, schema=train_schema)
tfdv.display_anomalies(anomalies_train_test)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'feature0',Column dropped,The feature was present in fewer examples than expected.


## 03. Drift and skew for continuos variables

Now, we will compute something more interestring, data drift and skew for continous variables. We need to generate categorical data, so lets use a few quantile-based discretization function (e.g. X-tiles based on FD)

In [67]:
#Reload the initial data
df_train = pd.DataFrame(data=x_train,columns=header_row[:-1])
df_test = pd.DataFrame(data=x_test,columns=header_row[:-1])
df_serving_synthetic = pd.DataFrame(data=x_serving,columns=header_row[:-1])

In order to use the built in drift and skew validators we discretize (bin) the data.
The train data set will be the control data set. We calculate the optimal number of bins (https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width), and then quantice the data. We generates a new dataframe where each data is labeled with its quant.
The test and serving sets will be the treatment set, so we use the exact number of bins calculated before.

In [35]:
df_train_quant = pd.DataFrame()
num_bins_control = []
for column in df_train:
    num_bins = (len(np.histogram_bin_edges(df_train[column], bins='auto'))+1)
    num_bins_control = num_bins_control + [num_bins]
    df_train_quant[column+'Quant'] = pd.qcut(df_train[column], q=num_bins,labels=False, duplicates='drop')
    df_train_quant[column+'Quant'] = 'Qt' + df_train_quant[column+'Quant'].astype(str)
df_train_quant.head()

Unnamed: 0,feature0Quant,feature1Quant,feature2Quant,feature3Quant,feature4Quant,feature5Quant,feature6Quant,feature7Quant
0,Qt2,Qt15,Qt13,Qt0,Qt0,Qt1,Qt9,Qt12
1,Qt9,Qt2,Qt10,Qt10,Qt1,Qt23,Qt14,Qt11
2,Qt8,Qt8,Qt6,Qt0,Qt0,Qt17,Qt10,Qt10
3,Qt7,Qt3,Qt15,Qt0,Qt0,Qt24,Qt20,Qt13
4,Qt5,Qt9,Qt5,Qt9,Qt0,Qt21,Qt8,Qt1


In [36]:
df_test_quant = pd.DataFrame()
# assert df size equal num bin control size - should have been caught in schema diffs
i=0
for column in df_test:
    num_bins = num_bins_control[i]
    df_test_quant[column+'Quant'] = pd.qcut(df_test[column], q=num_bins, labels=False,duplicates='drop')
    df_test_quant[column+'Quant'] = 'Qt' + df_test_quant[column+'Quant'].astype(str)
    i=i+1
df_test_quant.head()

Unnamed: 0,feature0Quant,feature1Quant,feature2Quant,feature3Quant,feature4Quant,feature5Quant,feature6Quant,feature7Quant
0,Qtnan,Qt4,Qt4,Qt6,Qt8,Qt17,Qt0,Qt12
1,Qtnan,Qt15,Qt14,Qt0,Qt0,Qt9,Qt2,Qt14
2,Qtnan,Qt9,Qt10,Qt0,Qt0,Qt7,Qt4,Qt4
3,Qtnan,Qt6,Qt8,Qt8,Qt0,Qt19,Qt1,Qt7
4,Qtnan,Qt5,Qt1,Qt1,Qt1,Qt0,Qt11,Qt3


In [37]:
df_serving_synthetic_quant = pd.DataFrame()
# assert df size equal num bin control size - should have been caught in schema diffs
i=0
for column in df_serving_synthetic:
    num_bins = num_bins_control[i]
    df_serving_synthetic_quant[column+'Quant'] = pd.qcut(df_serving_synthetic[column], q=num_bins, labels=False, duplicates='drop')
    df_serving_synthetic_quant[column+'Quant'] = 'Qt' + df_serving_synthetic_quant[column+'Quant'].astype(str)
    i=i+1
df_serving_synthetic_quant.head()

Unnamed: 0,feature0Quant,feature1Quant,feature2Quant,feature3Quant,feature4Quant,feature5Quant,feature6Quant,feature7Quant
0,Qt11,Qt0,Qt11,Qt6,Qt25,Qt16,Qt2,Qt3
1,Qt2,Qt6,Qt6,Qt6,Qt5,Qt24,Qt4,Qt7
2,Qt10,Qt0,Qt8,Qt9,Qt12,Qt10,Qt12,Qt11
3,Qt0,Qt8,Qt22,Qt5,Qt4,Qt24,Qt13,Qt15
4,Qt6,Qt14,Qt11,Qt9,Qt8,Qt15,Qt14,Qt2


Generates stats and new schemas

In [38]:
#Stats
train_stats_quant = tfdv.generate_statistics_from_dataframe(dataframe=df_train_quant)
test_stats_quant = tfdv.generate_statistics_from_dataframe(dataframe=df_test_quant)
serving_stats_quant = tfdv.generate_statistics_from_dataframe(dataframe=df_serving_synthetic_quant)
#Schemas
train_schema_quant = tfdv.infer_schema(train_stats_quant)
test_schema_quant = tfdv.infer_schema(test_stats_quant)
serving_schema_quant = tfdv.infer_schema(serving_stats_quant)
#Viz
tfdv.display_schema(schema=train_schema_quant)
tfdv.display_schema(schema=test_schema_quant)
tfdv.display_schema(schema=serving_schema_quant)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0Quant',STRING,required,,'feature0Quant'
'feature1Quant',STRING,required,,'feature1Quant'
'feature2Quant',STRING,required,,'feature2Quant'
'feature3Quant',STRING,required,,'feature3Quant'
'feature4Quant',STRING,required,,'feature4Quant'
'feature5Quant',STRING,required,,'feature5Quant'
'feature6Quant',STRING,required,,'feature6Quant'
'feature7Quant',STRING,required,,'feature7Quant'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'feature0Quant',"'Qt0', 'Qt1', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature1Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature2Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature3Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature4Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature5Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt30', 'Qt31', 'Qt32', 'Qt33', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature6Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature7Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0Quant',STRING,required,,'feature0Quant'
'feature1Quant',STRING,required,,'feature1Quant'
'feature2Quant',STRING,required,,'feature2Quant'
'feature3Quant',STRING,required,,'feature3Quant'
'feature4Quant',STRING,required,,'feature4Quant'
'feature5Quant',STRING,required,,'feature5Quant'
'feature6Quant',STRING,required,,'feature6Quant'
'feature7Quant',STRING,required,,'feature7Quant'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'feature0Quant','Qtnan'
'feature1Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature2Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature3Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature4Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature5Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt30', 'Qt31', 'Qt32', 'Qt33', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature6Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature7Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0Quant',STRING,required,,'feature0Quant'
'feature1Quant',STRING,required,,'feature1Quant'
'feature2Quant',STRING,required,,'feature2Quant'
'feature3Quant',STRING,required,,'feature3Quant'
'feature4Quant',STRING,required,,'feature4Quant'
'feature5Quant',STRING,required,,'feature5Quant'
'feature6Quant',STRING,required,,'feature6Quant'
'feature7Quant',STRING,required,,'feature7Quant'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'feature0Quant',"'Qt0', 'Qt10', 'Qt11', 'Qt12', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature1Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature2Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt30', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature3Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature4Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature5Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature6Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature7Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"


**Skew and drift detection:**

The COMPARATOR_L_INFTY_HIGH is triggered as follows:

Used Schema Fields:
* feature.skew_comparator.infinity_norm.threshold.
* feature.drift_comparator.infinity_norm.threshold

Statistics Fields:

* feature.string_stats.rank_histogram

Detection Condition:

L-infinity norm of the vector that represents the difference between the normalized counts from the feature.string_stats.rank_histogram in the control statistics (i.e., serving statistics for skew or previous statistics for drift) and the treatment statistics (i.e., training statistics for skew or current statistics for drift) > feature.skew_comparator.infinity_norm.threshold or feature.drift_comparator.infinity_norm.threshold

The L-infinity form is basically abs(max([x1,....,xn]) In this case x1 = count(values bucket1)/total values in control set - count(values bucket1)/total values in treatment set. Once we have the L-inf we check > (feature.skew_comparator.infinity_norm.threshold or feature.drift_comparator.infinity_norm.threshold) And if so, COMPARATOR_L_INFTY_HIGH is triggered. The actual value(0.01) needs to be fine-tuned based on your particular case and data stats.

In [52]:
tfdv.get_feature(train_schema_quant, 'feature0Quant').skew_comparator.infinity_norm.threshold = 0.1
train_serving_skew_anomalies = tfdv.validate_statistics(
        statistics=train_stats_quant, schema=train_schema_quant, serving_statistics=serving_stats_quant)
tfdv.display_anomalies(train_serving_skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'feature0Quant',High Linfty distance between training and serving,"The Linfty distance between training and serving is 0.135179 (up to six significant digits), above the threshold 0.1. The feature value with maximum difference is: Qt1"


---
To simulate **data drift**, lets perturb the initial trainig data, we need to generate a different data distribution (e.g. multiply each value by an scalar will not generate a drift alert)

In [125]:
df_train = pd.DataFrame(data=x_train,columns=header_row[:-1])
perturbation_matrix = pd.DataFrame(np.random.randint(0,10,size=df_train.shape),columns=df_train.columns)
perturbation_matrix.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
0,7,4,6,3,7,6,6,0
1,7,4,8,7,4,0,2,6
2,1,4,9,4,4,4,5,2
3,5,0,6,4,2,5,3,3
4,6,0,7,8,8,8,1,3


In [126]:
df_train_perturbed = df_train.mul(perturbation_matrix)
df_train_perturbed.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7
0,21.0,512.0,468.0,0.0,0.0,126.6,1.608,0.0
1,84.0,352.0,592.0,280.0,216.0,0.0,0.756,288.0
2,10.0,432.0,594.0,0.0,0.0,129.6,1.36,84.0
3,40.0,0.0,492.0,0.0,0.0,178.0,1.761,204.0
4,36.0,0.0,448.0,312.0,0.0,273.6,0.26,72.0


In [127]:
df_train_quant = pd.DataFrame()
num_bins_control = []
for column in df_train:
    num_bins = (len(np.histogram_bin_edges(df_train[column], bins='auto'))+1)
    num_bins_control = num_bins_control + [num_bins]
    df_train_quant[column+'Quant'] = pd.qcut(df_train[column], q=num_bins,labels=False, duplicates='drop')
    df_train_quant[column+'Quant'] = 'Qt' + df_train_quant[column+'Quant'].astype(str)
df_train_quant.head()

Unnamed: 0,feature0Quant,feature1Quant,feature2Quant,feature3Quant,feature4Quant,feature5Quant,feature6Quant,feature7Quant
0,Qt2,Qt15,Qt13,Qt0,Qt0,Qt1,Qt9,Qt12
1,Qt9,Qt2,Qt10,Qt10,Qt1,Qt23,Qt14,Qt11
2,Qt8,Qt8,Qt6,Qt0,Qt0,Qt17,Qt10,Qt10
3,Qt7,Qt3,Qt15,Qt0,Qt0,Qt24,Qt20,Qt13
4,Qt5,Qt9,Qt5,Qt9,Qt0,Qt21,Qt8,Qt1


In [128]:
df_train_perturbed_quant = pd.DataFrame()
# assert df size equal num bin control size - should have been caught in schema diffs
i=0
for column in df_train_perturbed:
    num_bins = num_bins_control[i]
    df_train_perturbed_quant[column+'Quant'] = pd.qcut(df_train_perturbed[column], q=num_bins, labels=False,duplicates='drop')
    df_train_perturbed_quant[column+'Quant'] = 'Qt' + df_train_perturbed_quant[column+'Quant'].astype(str)
    i=i+1
df_train_perturbed_quant.head()

Unnamed: 0,feature0Quant,feature1Quant,feature2Quant,feature3Quant,feature4Quant,feature5Quant,feature6Quant,feature7Quant
0,Qt7,Qt9,Qt16,Qt0,Qt0,Qt11,Qt12,Qt0
1,Qt11,Qt6,Qt21,Qt9,Qt3,Qt0,Qt5,Qt13
2,Qt4,Qt8,Qt22,Qt0,Qt0,Qt11,Qt10,Qt4
3,Qt9,Qt0,Qt18,Qt0,Qt0,Qt17,Qt13,Qt11
4,Qt9,Qt0,Qt16,Qt10,Qt0,Qt26,Qt1,Qt3


In [129]:
#Stats
train_stats_quant = tfdv.generate_statistics_from_dataframe(dataframe=df_train_quant)
train_perturbed_stats_quant = tfdv.generate_statistics_from_dataframe(dataframe=df_train_perturbed_quant)
#Schemas
train_schema_quant = tfdv.infer_schema(train_stats_quant)
train_perturbed_schema_quant = tfdv.infer_schema(train_perturbed_stats_quant)
#Viz
tfdv.display_schema(schema=train_schema_quant)
tfdv.display_schema(schema=train_perturbed_schema_quant)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0Quant',STRING,required,,'feature0Quant'
'feature1Quant',STRING,required,,'feature1Quant'
'feature2Quant',STRING,required,,'feature2Quant'
'feature3Quant',STRING,required,,'feature3Quant'
'feature4Quant',STRING,required,,'feature4Quant'
'feature5Quant',STRING,required,,'feature5Quant'
'feature6Quant',STRING,required,,'feature6Quant'
'feature7Quant',STRING,required,,'feature7Quant'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'feature0Quant',"'Qt0', 'Qt1', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature1Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature2Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature3Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature4Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature5Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt30', 'Qt31', 'Qt32', 'Qt33', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature6Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature7Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'feature0Quant',STRING,required,,'feature0Quant'
'feature1Quant',STRING,required,,'feature1Quant'
'feature2Quant',STRING,required,,'feature2Quant'
'feature3Quant',STRING,required,,'feature3Quant'
'feature4Quant',STRING,required,,'feature4Quant'
'feature5Quant',STRING,required,,'feature5Quant'
'feature6Quant',STRING,required,,'feature6Quant'
'feature7Quant',STRING,required,,'feature7Quant'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'feature0Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature1Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature2Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature3Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature4Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature5Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt25', 'Qt26', 'Qt27', 'Qt28', 'Qt29', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature6Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt16', 'Qt17', 'Qt18', 'Qt19', 'Qt2', 'Qt20', 'Qt21', 'Qt22', 'Qt23', 'Qt24', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"
'feature7Quant',"'Qt0', 'Qt1', 'Qt10', 'Qt11', 'Qt12', 'Qt13', 'Qt14', 'Qt15', 'Qt2', 'Qt3', 'Qt4', 'Qt5', 'Qt6', 'Qt7', 'Qt8', 'Qt9'"


In [130]:
tfdv.get_feature(train_schema_quant, 'feature2Quant').drift_comparator.infinity_norm.threshold = 0.1

train_serving_drift_anomalies = tfdv.validate_statistics(
        statistics=train_stats_quant, schema=train_schema_quant, previous_statistics=train_perturbed_stats_quant)
tfdv.display_anomalies(train_serving_drift_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'feature2Quant',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.130293 (up to six significant digits), above the threshold 0.1. The feature value with maximum difference is: Qt0"
