In [1]:
# dataset download:

In [3]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow_metadata.proto.v0 import schema_pb2

In [4]:
df = pd.read_csv('data/mall_customers.csv')

# Split the dataset into train and eval
train_df, eval_df = train_test_split(df, test_size=0.2, shuffle=False)

In [3]:
train_df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [5]:
eval_df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
160,161,Female,56,79,35
161,162,Female,29,79,83
162,163,Male,19,81,5
163,164,Female,31,81,93
164,165,Male,50,85,26


In [6]:
# add extra rows to the eval set to include anomaly data
def add_extra_rows(df):
    rows = [
        {
            'CustomerID': 0000,
            'Gender': '?',
            'Age': 24,
            'Annual Income (k$)': 10,
            'Spending Score (1-100)': 50
        },
        {
            'CustomerID': 00000,
            'Gender': 'NonBinary',
            'Age': 30,
            'Annual Income (k$)': 0,
            'Spending Score (1-100)': 90
        },
    ]

    df = df.append(rows, ignore_index=True)

    return df

In [7]:
eval_df = add_extra_rows(eval_df)
eval_df.tail(5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
37,198,Male,32,126,74
38,199,Male,32,137,18
39,200,Male,30,137,83
40,0,?,24,10,50
41,0,NonBinary,30,0,90


In [9]:
# Generate training dataset statistics
train_stats = tfdv.generate_statistics_from_dataframe(train_df)

tfdv.visualize_statistics(train_stats)

In [10]:
# Infer schema from the computed statistics.
schema = tfdv.infer_schema(statistics=train_stats)

# Display the inferred schema
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CustomerID',INT,required,,-
'Gender',STRING,required,,'Gender'
'Age',INT,required,,-
'Annual Income (k$)',INT,required,,-
'Spending Score (1-100)',INT,required,,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Gender',"'Female', 'Male'"


In [11]:
# Generate evaluation dataset statistics
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)

In [12]:
# Compare training with evaluation
tfdv.visualize_statistics(
    lhs_statistics=eval_stats, 
    rhs_statistics=train_stats, 
    lhs_name='EVAL_DATASET', 
    rhs_name='TRAIN_DATASET'
)

In [11]:
# Check evaluation data for errors by validating the evaluation dataset statistics using the reference schema
anomalies =  tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Gender',Unexpected string values,"Examples contain values missing from the schema: ? (~2%), NonBinary (~2%)."


In [14]:
# Add new value to the domain of the feature `Gender`
gender_domain = tfdv.get_domain(schema, 'Gender')
gender_domain.value.append('NonBinary')

In [15]:
# Check evaluation data for errors by validating the evaluation dataset statistics using the reference schema
anomalies =  tfdv.validate_statistics(statistics=eval_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Gender',Unexpected string values,Examples contain values missing from the schema: ? (~2%).


In [20]:
# Restrict the range of the `age` feature
tfdv.set_domain(schema, 'Age', schema_pb2.IntDomain(name='age', min=30, max=50))

# Display the modified schema. Notice the `Domain` column of `age`.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CustomerID',INT,required,,-
'Gender',STRING,required,,'Gender'
'Age',INT,required,,min: 30; max: 50
'Annual Income (k$)',INT,required,,-
'Spending Score (1-100)',INT,required,,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Gender',"'Female', 'Male', 'NonBinary', 'NonBinary'"


In [21]:
# Validate eval stats after updating the schema 
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Age',Multiple errors,Unexpectedly small value: 19. Unexpectedly large value: 59.
'Gender',Unexpected string values,Examples contain values missing from the schema: ? (~2%).
