# Fairness and Explainability with SageMaker Clarify

This notebook is a cleaned-up version of https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb

1. Download data
2. Inspect data
3. Encode dataset columns
4. Train a classification model with XGBoost
5. Analyze bias with SageMaker Clarify
6. Explain feature importance with SageMaker Clarify
7. Clean up

In [None]:
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd

from sagemaker import Session
from sagemaker import get_execution_role

# Shared SageMaker context reused by the cells below.
session = Session()
# Bucket that receives the uploaded CSV files and the Clarify reports.
bucket  = session.default_bucket()
# Key prefix under which this notebook stores its S3 objects.
prefix  = 'sagemaker/DEMO-sagemaker-clarify'
# Region used to look up the XGBoost container image.
region  = session.boto_region_name
# IAM role passed to the estimator, the model and the Clarify processor.
role    = get_execution_role()

## 1 - Download data
Data Source: https://archive.ics.uci.edu/ml/datasets/adult

Dua Dheeru, and Efi Karra Taniskidou. "[UCI Machine Learning Repository](http://archive.ics.uci.edu/ml)". Irvine, CA: University of California, School of Information and Computer Science (2017).

In [None]:
# Download the UCI Adult 'adult.data' file into the working directory; it is read as the training split below.
urllib.request.urlretrieve(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    'adult.data')

In [None]:
# Download the UCI Adult 'adult.test' file into the working directory; it is read as the testing split below.
urllib.request.urlretrieve(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    'adult.test')

In [None]:
# Column names applied to the header-less Adult files; the label ('Target') is last.
adult_columns = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Ethnic group",
    "Sex", "Capital Gain", "Capital Loss", "Hours per week", "Country",
    "Target",
]

# Parser options shared by both splits: the regex separator swallows the
# whitespace around each comma, and "?" entries are read as missing values.
csv_options = dict(
    names=adult_columns,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

# Rows with any missing value are discarded from both splits.
training_data = pd.read_csv("adult.data", **csv_options).dropna()

# The first line of adult.test is skipped (presumably not a data row -- confirm against the file).
testing_data = pd.read_csv("adult.test", skiprows=1, **csv_options).dropna()

training_data.head()

## 2 - Inspect data

In [None]:
# Number of records per 'Sex' value, smallest group first.
sex_counts = training_data['Sex'].value_counts().sort_values()
sex_counts.plot(kind='bar', title='Counts of Sex', rot=0, figsize=(6, 3))

In [None]:
# Keep 'Sex' only for rows labelled '>50K'; other rows become NaN and are ignored by value_counts().
earns_over_50k = training_data['Target'] == '>50K'
sex_of_high_earners = training_data['Sex'].where(earns_over_50k)
sex_of_high_earners.value_counts().sort_values().plot(kind='bar', title='Counts of Sex earning >$50K', rot=0, figsize=(6, 3))

## 3 - Encode data

In [None]:
from sklearn import preprocessing
def number_encode_features(df):
    """Label-encode every text column of ``df`` as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``df`` in which each object/string column has been replaced
        by the integer codes produced by a ``LabelEncoder``.
    encoders : dict
        Maps the name of each encoded column to its fitted ``LabelEncoder``.
    """
    result = df.copy()
    encoders = {}
    for column in result.columns:
        # The `np.object` alias was removed in NumPy 1.24, so the dtype is
        # checked with pandas' predicates instead; these also recognise the
        # dedicated pandas string dtype, not only plain `object` columns.
        is_text_column = (pd.api.types.is_object_dtype(result[column])
                          or pd.api.types.is_string_dtype(result[column]))
        if is_text_column:
            encoders[column] = preprocessing.LabelEncoder()
            # Missing entries are replaced by the literal string 'None' and
            # encoded as a category of their own.
            result[column] = encoders[column].fit_transform(result[column].fillna('None'))
    return result, encoders

# Move the label to the first column of each split; the CSV files are written without a header row.
training_data = pd.concat([training_data['Target'], training_data.drop(['Target'], axis=1)], axis=1)
testing_data = pd.concat([testing_data['Target'], testing_data.drop(['Target'], axis=1)], axis=1)

# Labels in adult.test carry a trailing period ('>50K.' instead of '>50K').
# Normalise them so both splits share one label vocabulary. The dtype guard
# keeps this cell re-runnable after the frames have already been encoded.
if not pd.api.types.is_numeric_dtype(testing_data['Target']):
    testing_data['Target'] = testing_data['Target'].str.rstrip('.')

# Encode both splits together. Fitting a separate encoder per split assigns
# different integers to the same category whenever a value is missing from one
# split, which silently corrupts the validation data and the SHAP baseline row.
combined = pd.concat([training_data, testing_data], keys=['train', 'test'])
combined, _ = number_encode_features(combined)
training_data = combined.loc['train'].copy()
testing_data = combined.loc['test'].copy()

training_data.to_csv('train_data.csv', index=False, header=False)
testing_data.to_csv('test_features.csv', index=False, header=False)

A quick note about our encoding: the "Female" Sex value has been encoded as 0 and "Male" as 1.

In [None]:
# Preview the encoded training frame; 'Target' is now the first column.
training_data.head()

## 4 - Train a classification model with XGBoost on Amazon SageMaker

In [None]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

# Both CSV files are uploaded under the same S3 prefix.
s3_data_prefix = 's3://{}/{}'.format(bucket, prefix)

train_uri = S3Uploader.upload('train_data.csv', s3_data_prefix)
test_uri = S3Uploader.upload('test_features.csv', s3_data_prefix)

# Channel definitions handed to the estimator's fit() call.
train_input = TrainingInput(train_uri, content_type='csv')
test_input = TrainingInput(test_uri, content_type='csv')

In [None]:
from sagemaker.image_uris import retrieve
from sagemaker.estimator import Estimator

# XGBoost container image for the session's region.
container = retrieve('xgboost', region, version='1.2-1')

xgb = Estimator(container, role, instance_count=1, instance_type='ml.m4.xlarge')

# Hyperparameters are collected in one place and applied in a single call.
hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    num_round=1000,
    early_stopping_rounds=10,
)
xgb.set_hyperparameters(**hyperparameters)

# The encoded test split is supplied as the validation channel.
data_channels = {'train': train_input, 'validation': test_input}
xgb.fit(data_channels)

In [None]:
from time import gmtime, strftime

# A UTC day/time suffix keeps the model name distinct between runs.
timestamp = strftime('%d-%H-%M-%S', gmtime())
model_name = 'DEMO-clarify-model-' + timestamp

# Register the trained estimator as a named SageMaker model.
model = xgb.create_model(name=model_name)
container_definition = model.prepare_container_def()
session.create_model(model_name, role, container_definition)

## 5 - Analyze bias with Amazon SageMaker Clarify

### Define a SageMaker Processing processor

In [None]:
from sagemaker import clarify

# Processor used by the run_bias and run_explainability calls below;
# configured here with a single ml.c4.xlarge instance and the shared session.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    sagemaker_session=session)

### Configuring bias detection

In [None]:
# S3 destination of the bias report.
bias_report_output_path = 's3://{}/{}/clarify-bias'.format(bucket, prefix)

# The uploaded CSV has no header row, so the column names are supplied explicitly.
dataset_headers = training_data.columns.to_list()

# The dataset to analyze (pre-training metrics)
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label='Target',
    headers=dataset_headers,
    dataset_type='text/csv',
)

# The configuration of the temporary endpoint (post-training metrics)
endpoint_settings = dict(
    model_name=model_name,
    instance_type='ml.c5.xlarge',
    instance_count=1,
    accept_type='text/csv',
)
model_config = clarify.ModelConfig(**endpoint_settings)

In [None]:
# Bias is measured on the 'Sex' column for the group encoded as 0
# ('Female', per the encoding note in section 3).
# Label value 1 is presumably the encoded '>50K' class -- confirm against the
# LabelEncoder ordering of 'Target'.
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1],
    facet_name='Sex',
    facet_values_or_threshold=[0])

### Compute pre-training and post-training bias metrics

In [None]:
# Launch the Clarify bias job, requesting every pre-training and post-training method.
# Passing model_config lets the job query the trained model for the post-training metrics.
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    pre_training_methods='all',
    post_training_methods='all')

If you're not a Studio user yet, you can access the bias report in PDF, HTML and ipynb formats at the following S3 location:

In [None]:
# Display the S3 URI that was given to Clarify as the bias report output path.
bias_report_output_path

In [None]:
%%sh -s $bias_report_output_path
# $1 is the S3 report path passed in through -s; copy everything under it into the current directory.
aws s3 cp --recursive $1/ .

## 6 - Explain feature importance with Amazon SageMaker Clarify

In [None]:
# Drop the label column
testing_features = testing_data.drop(['Target'], axis=1)

shap_config = clarify.SHAPConfig(baseline=[testing_features.iloc[0].values.tolist()],
                                 num_samples=15,
                                 agg_method='mean_abs')

explainability_output_path = 's3://{}/{}/clarify-explainability'.format(bucket, prefix)

explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=explainability_output_path,
    label='Target',
    headers=training_data.columns.to_list(),
    dataset_type='text/csv')

In [None]:
# Launch the Clarify explainability job with the SHAP configuration defined above,
# using the same model configuration as the bias analysis.
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config)

In [None]:
# Display the S3 URI that was given to Clarify as the explainability output path.
explainability_output_path

In [None]:
# Read the SHAP output CSV directly from the explainability output location in S3.
shap_values_uri = explainability_output_path + '/explanations_shap/out.csv'
explanations = pd.read_csv(shap_values_uri)
explanations.head()

## 7 - Clean up

In [None]:
# Left disabled on purpose: uncomment the next line to delete the SageMaker model
# registered in section 4 under `model_name`.
#session.delete_model(model_name)