In [28]:
!pip install sagemaker pandas -U

[0m

# Overview
Amazon SageMaker Clarify helps improve your machine learning models by detecting potential bias and helping explain how these models make predictions. The fairness and explainability functionality provided by SageMaker Clarify takes a step towards enabling AWS customers to build trustworthy and understandable machine learning models. The product comes with the tools to help you with the following tasks.

Measure biases that can occur during each stage of the ML lifecycle (data collection, model training and tuning, and monitoring of ML models deployed for inference).
Generate model governance reports targeting risk and compliance teams and external regulators.
Provide explanations of the data, models, and monitoring used to assess predictions.
This sample notebook walks you through:

Key terms and concepts needed to understand SageMaker Clarify
Measuring the pre-training bias of a dataset and post-training bias of a model
Explaining the importance of the various input features on the model's decision
Accessing the reports through SageMaker Studio if you have an instance set up.
In doing so, the notebook first trains a SageMaker XGBoost model using training dataset, then use SageMaker Clarify to analyze a testing dataset in CSV format. SageMaker Clarify also supports analyzing dataset in SageMaker JSON Lines dense format, which is illustrated in another notebook.

In [29]:
from sagemaker import Session
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
import matplotlib
import sklearn
import pandas as pd
import numpy as np
import os
import boto3
from datetime import datetime
import fsspec
from sagemaker.s3 import S3Uploader
from sagemaker.experiments.run import Run
from sagemaker.experiments import load_run

In [30]:
fsspec.filesystem('s3').invalidate_cache()

In [31]:
session = Session()
bucket = session.default_bucket()
prefix = "data/kkbox-customer-churn-model"
experiment_name = "kkbox-customer-churn-model-experiment-2"
region = session.boto_region_name
# Define IAM role
role = get_execution_role()
s3_client = boto3.client("s3")

In [32]:
X_test = pd.read_csv(f"s3://{bucket}/{prefix}/validation/X_test.csv")
y_test = pd.read_csv(f"s3://{bucket}/{prefix}/validation/y_test.csv")
X_train = pd.read_csv(f"s3://{bucket}/{prefix}/train/X_train_over.csv")
y_train = pd.read_csv(f"s3://{bucket}/{prefix}/train/y_train_over.csv")

In [33]:
def rename(df, dummy_cols):
    new_col_dict = {}
    for col in dummy_cols:
        val = col.split("_")[-1]
        new_col = "_".join(col.split("_")[:-1]) + "__" + val
        new_col_dict[col] = new_col
    return df.rename(columns=new_col_dict)

In [34]:
def undummies(df, dummy_col_name):
    dummy_cols = [ x for x in list(df.columns) if x.startswith(dummy_col_name) ]
    df = rename(df, dummy_cols)
    dummy_cols = [ x for x in list(df.columns) if x.startswith(dummy_col_name) ]
    df[dummy_col_name] = pd.from_dummies(df[dummy_cols], sep='__')
    return df.drop(columns=dummy_cols)         

In [35]:
for dummy_col in ["gender", "city", "registered_via", "qtr_trans", "mst_frq_pay_met", "is_auto_renew"]:
    X_test = undummies(X_test, dummy_col)
    X_train = undummies(X_train, dummy_col)

In [36]:
train_df = pd.concat([y_train, X_train], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

In [37]:
train_df.to_csv("data/clarify_train.csv", index=False)
test_df.to_csv("data/clarify_test.csv", index=False)

In [38]:
train_df.head()

Unnamed: 0,is_churn,regist_trans,mst_frq_plan_days,revenue,regist_cancels,bd,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,tenure,gender,city,registered_via,qtr_trans,mst_frq_pay_met,is_auto_renew
0,0,-1.109113,0.580911,-1.258245,-0.524495,0.957116,6.529412,2.117647,1.176471,0.686275,13.176471,20.509804,7.607474,-1.0,female,1.0,7.0,3,41,1
1,0,0.206593,0.580911,-0.539414,1.377221,-0.807531,13.970803,1.770073,1.058394,1.605839,21.70073,27.930657,7.891622,-1.0,other,1.0,7.0,4,41,1
2,0,-1.707161,-0.792032,-1.526899,-0.524495,0.713716,1.394737,0.447368,0.342105,0.394737,25.684211,22.421053,8.479371,-1.0,female,18.0,9.0,1,36,0
3,0,-0.511065,0.580911,-0.858894,-0.524495,-0.807531,2.0,0.266667,0.3,0.266667,11.466667,7.6,7.192282,-1.0,other,1.0,7.0,1,41,1
4,0,0.326202,0.580911,-0.299803,-0.524495,-0.807531,1.94898,0.673469,0.72449,0.602041,8.0,10.204082,6.485676,-1.0,other,1.0,7.0,4,41,1


In [39]:
train_uri = S3Uploader.upload("data/clarify_train.csv", "s3://{}/{}/clarify/clarify_train.csv".format(bucket, prefix))
test_uri = S3Uploader.upload("data/clarify_test.csv", "s3://{}/{}/clarify/clarify_test.csv".format(bucket, prefix))

In [40]:
from sagemaker import clarify

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role, instance_count=1, instance_type="ml.m5.4xlarge", sagemaker_session=session
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [15]:
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix)
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label="is_churn",
    headers=train_df.columns.to_list(),
    dataset_type="text/csv",
)

In [41]:
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1], facet_name="gender", facet_values_or_threshold=["female"], group_name="bd")

In [21]:
import time
job_name = f"clarify-pretrain-bias-{int(time.time())}"
run_name = "default"
with load_run(experiment_name=experiment_name, 
              run_name=run_name,
              sagemaker_session=session) as run:    
    clarify_processor.run_pre_training_bias(data_config=bias_data_config, 
                          data_bias_config=bias_config, 
                          methods='all', 
                          wait=True, 
                          logs=True, 
                          job_name=job_name)

INFO:sagemaker.experiments.run:The run (default) under experiment (kkbox-customer-churn-model-experiment-2) already exists. Loading it. Note: sagemaker.experiments.load_run is recommended to use when the desired run already exists.
INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['is_churn', 'regist_trans', 'mst_frq_plan_days', 'revenue', 'regist_cancels', 'bd', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs', 'tenure', 'gender', 'city', 'registered_via', 'qtr_trans', 'mst_frq_pay_met', 'is_auto_renew'], 'label': 'is_churn', 'label_values_or_threshold': [1], 'facet': [{'name_or_index': 'gender', 'value_or_threshold': ['female']}], 'group_variable': 'bd', 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'pre_training_bias': {'methods': 'all'}}}
INFO:sagemaker:Creating processing-job with name clarify-pretrain-bias-1674431453



Job Name:  clarify-pretrain-bias-1674431453
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify/clarify_train.csv/clarify_train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-bias/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'En

In [42]:
model_name = "sagemaker-xgboost-2023-01-23-02-50-10-345"
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    accept_type="text/csv",
    content_type="text/csv",
)

In [43]:
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)

In [44]:
X_train = pd.read_csv(f"s3://{bucket}/{prefix}/train/X_train_over.csv")
y_train = pd.read_csv(f"s3://{bucket}/{prefix}/train/y_train_over.csv")

In [45]:
post_train_bias_df = pd.concat([y_train, X_train], axis=1)
post_train_bias_df.to_csv("data/clarify_post_train.csv", index=False)
post_train_bias_df_uri = S3Uploader.upload("data/clarify_post_train.csv", "s3://{}/{}/clarify/clarify_post_train.csv".format(bucket, prefix))

In [46]:
post_train_bias_df.columns

Index(['is_churn', 'regist_trans', 'mst_frq_plan_days', 'revenue',
       'regist_cancels', 'bd', 'num_25', 'num_50', 'num_75', 'num_985',
       'num_100', 'num_unq', 'total_secs', 'tenure', 'mst_frq_pay_met_29',
       'mst_frq_pay_met_30', 'mst_frq_pay_met_31', 'mst_frq_pay_met_32',
       'mst_frq_pay_met_33', 'mst_frq_pay_met_34', 'mst_frq_pay_met_36',
       'mst_frq_pay_met_37', 'mst_frq_pay_met_38', 'mst_frq_pay_met_39',
       'mst_frq_pay_met_40', 'mst_frq_pay_met_41', 'mst_frq_pay_met_other',
       'is_auto_renew_0', 'is_auto_renew_1', 'qtr_trans_1', 'qtr_trans_2',
       'qtr_trans_3', 'qtr_trans_4', 'city_1.0', 'city_3.0', 'city_4.0',
       'city_5.0', 'city_6.0', 'city_8.0', 'city_9.0', 'city_10.0',
       'city_11.0', 'city_12.0', 'city_13.0', 'city_14.0', 'city_15.0',
       'city_17.0', 'city_18.0', 'city_21.0', 'city_22.0', 'city_other',
       'gender_female', 'gender_male', 'gender_other', 'registered_via_3.0',
       'registered_via_4.0', 'registered_via_7.0', 'r

In [47]:
post_train_bias_data_config = clarify.DataConfig(
    s3_data_input_path=post_train_bias_df_uri,
    s3_output_path=bias_report_output_path,
    headers=post_train_bias_df.columns.to_list(),
    label="is_churn",
    dataset_type="text/csv",
)

In [48]:
post_train_bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1], facet_name="gender_female", facet_values_or_threshold=[1], group_name="bd")

In [None]:
job_name = f"clarify-postrain-bias-{int(time.time())}"

clarify_processor.run_post_training_bias(
        data_config=post_train_bias_data_config,
        data_bias_config=post_train_bias_config,
        model_config=model_config,
        model_predicted_label_config=predictions_config,
        job_name=job_name)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['is_churn', 'regist_trans', 'mst_frq_plan_days', 'revenue', 'regist_cancels', 'bd', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs', 'tenure', 'mst_frq_pay_met_29', 'mst_frq_pay_met_30', 'mst_frq_pay_met_31', 'mst_frq_pay_met_32', 'mst_frq_pay_met_33', 'mst_frq_pay_met_34', 'mst_frq_pay_met_36', 'mst_frq_pay_met_37', 'mst_frq_pay_met_38', 'mst_frq_pay_met_39', 'mst_frq_pay_met_40', 'mst_frq_pay_met_41', 'mst_frq_pay_met_other', 'is_auto_renew_0', 'is_auto_renew_1', 'qtr_trans_1', 'qtr_trans_2', 'qtr_trans_3', 'qtr_trans_4', 'city_1.0', 'city_3.0', 'city_4.0', 'city_5.0', 'city_6.0', 'city_8.0', 'city_9.0', 'city_10.0', 'city_11.0', 'city_12.0', 'city_13.0', 'city_14.0', 'city_15.0', 'city_17.0', 'city_18.0', 'city_21.0', 'city_22.0', 'city_other', 'gender_female', 'gender_male', 'gender_other', 'registered_via_3.0', 'registered_via_4.0', 'registered_via_7.0', 'registered_via_9.0


Job Name:  clarify-postrain-bias-1674446837
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify/clarify_post_train.csv/clarify_post_train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-bias/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3Upload

In [99]:
X_test = pd.read_csv(f"s3://{bucket}/{prefix}/validation/X_test.csv")
y_test = pd.read_csv(f"s3://{bucket}/{prefix}/validation/y_test.csv")
explainability_test_df = pd.concat([y_test, X_test], axis=1)
explainability_test_df.to_csv("s3://{}/{}/clarify/clarify_explainability_test.csv".format(bucket, prefix), index=False)

In [101]:
baseline = X_test.iloc[0,:].values.tolist()

In [102]:
shap_config = clarify.SHAPConfig(
    baseline=[baseline],
    agg_method="mean_abs",
    num_samples=59,
    save_local_shap_values=True,
)

In [103]:
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)
explainability_data_config = clarify.DataConfig(
    s3_data_input_path="s3://{}/{}/clarify/clarify_explainability_test.csv".format(bucket, prefix),
    s3_output_path=explainability_output_path,
    label="is_churn",
    headers=explainability_test_df.columns.to_list(),
    dataset_type="text/csv",
)

In [None]:
run_name = "default"
with load_run(experiment_name=experiment_name, 
              run_name=run_name,
              sagemaker_session=session) as run:    

    clarify_processor.run_explainability(
        data_config=explainability_data_config,
        model_config=model_config,
        explainability_config=shap_config,
)

INFO:sagemaker.experiments.run:The run (default) under experiment (kkbox-customer-churn-model-experiment-2) already exists. Loading it. Note: sagemaker.experiments.load_run is recommended to use when the desired run already exists.
INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['is_churn', 'regist_trans', 'mst_frq_plan_days', 'revenue', 'regist_cancels', 'bd', 'num_25', 'num_50', 'num_75', 'num_985', 'num_100', 'num_unq', 'total_secs', 'tenure', 'mst_frq_pay_met_29', 'mst_frq_pay_met_30', 'mst_frq_pay_met_31', 'mst_frq_pay_met_32', 'mst_frq_pay_met_33', 'mst_frq_pay_met_34', 'mst_frq_pay_met_36', 'mst_frq_pay_met_37', 'mst_frq_pay_met_38', 'mst_frq_pay_met_39', 'mst_frq_pay_met_40', 'mst_frq_pay_met_41', 'mst_frq_pay_met_other', 'is_auto_renew_0', 'is_auto_renew_1', 'qtr_trans_1', 'qtr_trans_2', 'qtr_trans_3', 'qtr_trans_4', 'city_1.0', 'city_3.0', 'city_4.0', 'city_5.0', 'city_6.0', 'city_8.0', 'city_9.0', 'city_10.0', 'city_11.0', 'city_12.0', 'city_


Job Name:  Clarify-Explainability-2023-01-25-03-47-28-740
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify/clarify_explainability_test.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-explainability/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-869530972998/data/kkbox-customer-churn-model/clarify-explainability', 'LocalPath': '/opt/ml/processin