# SageMaker Clarify Demo: Analyze Bias
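This notebook demonstrates how SageMaker Clarify can be used to detect bias in a dataset before training, measure bias in a trained model's predictions, and explain those predictions with SHAP values.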

In [None]:
# Step 1: Setup
import sagemaker
from sagemaker import get_execution_role
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split 

# --- S3 locations shared by the rest of the notebook ---
bucket = 'sagemaker-ml-28573'
prefix = 'clarify-data'
output_path = f's3://{bucket}/{prefix}/output'

# --- SageMaker / AWS handles ---
sagemaker_session = sagemaker.Session()
role = get_execution_role()  # IAM role later handed to the Clarify processor
boto_session = boto3.Session()
region = boto_session.region_name

# --- Raw data ---
# Local path to the CSV; point this at your own copy of the file if needed.
file_path = 'Employee.csv'
employee_df = pd.read_csv(file_path)

# Last expression of the cell, so the notebook renders the first rows.
employee_df.head()

In [None]:
# Step 2: Data Preparation
# Turns the raw employee table into an all-numeric frame with the label in the
# first column, then splits it into train and test partitions.

# Define features and target
feature_columns = [
    'Education', 'JoiningYear', 'City', 'PaymentTier', 'Age',
    'Gender', 'EverBenched', 'ExperienceInCurrentDomain'
]
target_column = 'LeaveOrNot'

# Convert categorical columns to numeric.
# NOTE: `.cat.codes` assigns integers in sorted category order and encodes a
# missing value as -1 (not NaN), so the dropna() calls below will not remove
# rows whose Education/City/Gender was missing.
employee_df['Education'] = employee_df['Education'].astype('category').cat.codes
employee_df['City'] = employee_df['City'].astype('category').cat.codes
employee_df['Gender'] = employee_df['Gender'].astype('category').cat.codes
# Values other than 'Yes'/'No' map to NaN and are dropped further down.
employee_df['EverBenched'] = employee_df['EverBenched'].map({'Yes': 1, 'No': 0})

# Drop rows with a missing target, then cast the target to int.
# DataFrame.dropna returns a new frame rather than modifying in place, so the
# result must be assigned; otherwise the int cast fails on any NaN label.
employee_df = (
    employee_df
    .dropna(subset=[target_column])
    .astype({target_column: int})
)

# Ensure no missing values in the remaining columns
employee_df = employee_df.dropna()

# Verify all columns are numeric
print(employee_df.dtypes)

# Keep only the modelling columns, with the target first.
employee_df = employee_df[[target_column] + feature_columns]

# Fixed seed so the 80/20 split is reproducible between runs.
train_df, test_df = train_test_split(employee_df, test_size=0.2, random_state=42)

# Display the transformed dataset
employee_df.head()

## Upload Training and Validation Data to S3

In [None]:
# Write the train/validation splits to local CSV files and upload them to S3.

# Initialize S3 client
s3 = boto3.client('s3')

# Define your S3 bucket and prefix
bucket = 'sagemaker-ml-28573'
prefix = 'clarify-data'

# Save the data locally first.
# The files are written WITHOUT a header row: the built-in XGBoost algorithm
# expects headerless CSV with the label in the first column, and the Clarify
# DataConfig objects in this notebook pass the column names explicitly via
# `headers=`, so a header row in the file would be read as a data record.
train_file = 'train.csv'
validation_file = 'validation.csv'
train_df.to_csv(train_file, index=False, header=False)
test_df.to_csv(validation_file, index=False, header=False)

# Upload the data to S3
train_key = f'{prefix}/train/{train_file}'
validation_key = f'{prefix}/validation/{validation_file}'
s3.upload_file(train_file, bucket, train_key)
s3.upload_file(validation_file, bucket, validation_key)

print(f"Training data uploaded to s3://{bucket}/{prefix}/train/{train_file}")
print(f"Validation data uploaded to s3://{bucket}/{prefix}/validation/{validation_file}")


## Configure Bias Analysis

In [None]:
from sagemaker import clarify

# Column roles for the bias analysis.
label_column = 'LeaveOrNot'      # outcome column
sensitive_attribute = 'Gender'   # facet (sensitive attribute) to analyse

# S3 locations: where the report is written and where the training CSV lives.
prefix = 'clarify-data'
output_path = f's3://{bucket}/{prefix}/pre-training-analysis'
train_data_uri = f's3://{bucket}/{prefix}/train/{train_file}'

# Bias configuration: label value 1 is the outcome of interest and facet
# value 0 is the group under analysis.
# NOTE(review): Gender was integer-encoded with `.cat.codes`, so which gender
# code 0 stands for depends on the sorted category order -- confirm.
bias_config = clarify.BiasConfig(
    facet_name=sensitive_attribute,
    facet_values_or_threshold=[0],
    label_values_or_threshold=[1],
    group_name=None,
)

# Dataset description handed to the Clarify job.
data_config = clarify.DataConfig(
    s3_data_input_path=train_data_uri,
    s3_output_path=output_path,
    headers=employee_df.columns.to_list(),
    label=label_column,
    dataset_type='text/csv',
)

## Run Pre-Training Bias Analysis

In [None]:
# Build the Clarify processor (one ml.t3.medium instance) and launch the
# pre-training bias job against the dataset described by `data_config`.
processor_settings = {
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.t3.medium',
    'sagemaker_session': sagemaker_session,
}
clarify_processor = clarify.SageMakerClarifyProcessor(**processor_settings)

clarify_processor.run_pre_training_bias(
    data_bias_config=bias_config,
    data_config=data_config,
)


## Train Model

In [None]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput

# Train a built-in XGBoost model on the CSVs uploaded under
# s3://{bucket}/{prefix}/train/ and s3://{bucket}/{prefix}/validation/.

# Initialize hyperparameters (the built-in algorithm takes them as strings).
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    # LeaveOrNot is a 0/1 label and the post-training Clarify run thresholds
    # the model output at probability 0.5, so train a binary classifier that
    # emits probabilities rather than a squared-error regressor.
    "objective": "binary:logistic",
    "num_round": "50",
}

# Set an output path where the trained model will be saved
output_path = f's3://{bucket}/{prefix}/training-output'

# Retrieve the XGBoost image URI
region = boto3.Session().region_name  # Automatically get the region
xgboost_container = image_uris.retrieve("xgboost", region, "1.7-1")

# Construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
)

# Define the data type and paths to the training and validation datasets.
# These must be the prefixes the data was uploaded to ({prefix}/train and
# {prefix}/validation); the previous "input-data/..." prefixes are never
# written by this notebook.
# NOTE: built-in XGBoost expects headerless CSV with the label in column 0.
content_type = "csv"
train_input = TrainingInput(f"s3://{bucket}/{prefix}/train/", content_type=content_type)
validation_input = TrainingInput(f"s3://{bucket}/{prefix}/validation/", content_type=content_type)

# Execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})

## Deploy model

In [None]:
# Host the trained estimator behind a real-time endpoint; `predictor` is the
# handle the later cells use to reference and finally delete that endpoint.
endpoint_instance_count = 1
endpoint_instance_type = 'ml.m5.xlarge'

predictor = estimator.deploy(
    instance_type=endpoint_instance_type,
    initial_instance_count=endpoint_instance_count,
)


## Post-Training: Model Bias Analysis with SageMaker Clarify


In [None]:
# Post-training bias analysis: re-describes the training dataset with a new
# report location, points Clarify at the trained model, and runs both the
# pre-training and post-training bias methods.

# Define output path
output_path = f's3://{bucket}/{prefix}/post-training-analysis'

# Define the data config
data_config = clarify.DataConfig(
    s3_data_input_path=f's3://{bucket}/{prefix}/train/{train_file}',
    s3_output_path=output_path,
    label=label_column,
    headers=employee_df.columns.to_list(),
    dataset_type='text/csv'
)


# Define the model configuration.
# `predictor.endpoint` is the deprecated SDK v1 spelling; `endpoint_name` is
# the attribute exposed by SDK v2, which the rest of this notebook targets.
# NOTE(review): `model_name` is a SageMaker *model* name. This relies on
# estimator.deploy() having given the model the same generated name as the
# endpoint -- confirm, or pass the model name explicitly.
model_config = clarify.ModelConfig(
    model_name=predictor.endpoint_name,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    accept_type='text/csv'
)

# Model scores are turned into 0/1 predicted labels at a 0.5 threshold.
predicted_label_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.5)

# Run the model bias analysis
clarify_processor.run_bias(
    data_config=data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predicted_label_config,
    pre_training_methods='all',
    post_training_methods='all'
)


## Explainability: Configure Output Path and DataConfig


In [None]:
# Where the explainability report is written, and which dataset it is
# computed on (the training CSV uploaded earlier).
explainability_report_output_path = f's3://{bucket}/{prefix}/output/explainability'
explainability_input_uri = f's3://{bucket}/{prefix}/train/{train_file}'

# Column names come from the prepared DataFrame; the label is `label_column`.
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=explainability_input_uri,
    s3_output_path=explainability_report_output_path,
    headers=employee_df.columns.to_list(),
    label=label_column,
    dataset_type='text/csv',
)

In [None]:
# SHAP settings for the explainability job.
shap_settings = {
    'num_samples': 100,        # number of samples used by the SHAP computation
    'agg_method': "mean_abs",  # aggregation method for the SHAP values
    'use_logit': False,        # do not apply the logit function to predictions
    'seed': 123,               # fixed seed for reproducibility
}
shap_config = clarify.SHAPConfig(**shap_settings)


In [None]:
# Re-create the Clarify processor (single ml.t3.medium instance, same role
# and session as before) and launch the SHAP explainability job.
explainability_instance_type = 'ml.t3.medium'
clarify_processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sagemaker_session,
    instance_type=explainability_instance_type,
    instance_count=1,
    role=role,
)

# Uses the dataset, model and SHAP settings defined in the cells above.
clarify_processor.run_explainability(
    explainability_config=shap_config,
    model_config=model_config,
    data_config=explainability_data_config,
)


In [None]:
# Clean up: delete the endpoint behind `predictor` (created by
# estimator.deploy() earlier in the notebook) once the analyses have run.
predictor.delete_endpoint()