# SageMaker Clarify

- **Read the Docs link:** https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-clarify/index.html
- **Pre-Training Bias:** https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-detect-data-bias.html

In [1]:
import sagemaker
import boto3



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# S3 key prefix under the default bucket where this project's data and reports live.
KEY = "FraudDetection_AutoInsurance"

# Execution role that SageMaker jobs launched from this notebook will run as.
ROLE = sagemaker.get_execution_role()
print(ROLE)

# A single boto3 session is shared by the SageMaker session and the clients,
# so everything targets the same region.
boto_session = boto3.session.Session()
REGION = boto_session.region_name
print(REGION)

sagemaker_session = sagemaker.session.Session(boto_session=boto_session)
print(sagemaker_session.boto_region_name)

# Default SageMaker bucket for this account/region.
BUCKET = sagemaker_session.default_bucket()
print(BUCKET)

arn:aws:iam::205930620783:role/service-role/AmazonSageMaker-ExecutionRole-20250401T145997
us-east-1
us-east-1
sagemaker-us-east-1-205930620783


In [3]:
# Low-level boto3 clients, both created from the shared boto3 session.
sagemaker_client = boto_session.client(service_name="sagemaker")
s3_client = boto_session.client(service_name="s3")

### First, check whether a model is already registered
### If not, create the model from the last training job that we ran

In [4]:
# Name of the SageMaker model that Clarify will evaluate.
model_name = "sagemaker-xgboost-v3"

# `NameContains` is a substring filter and the API is paginated, so collect every
# page of candidates and then look for an exact name match. A model such as
# "sagemaker-xgboost-v3-old" must not be mistaken for the one we need.
models = [
    model
    for page in sagemaker_client.get_paginator("list_models").paginate(NameContains=model_name)
    for model in page["Models"]
]
print(models)

model_exists = any(model["ModelName"] == model_name for model in models)
if not model_exists:
    # Training job whose artifacts back the model
    # (an earlier run was 'sagemaker-xgboost-2025-05-30-09-09-33-714').
    training_job_name = 'sagemaker-xgboost-2025-06-11-13-18-27-251'
    training_job_info = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)

    # Reuse the training container image as the inference image for the model.
    training_image = training_job_info["AlgorithmSpecification"]["TrainingImage"]
    print(training_job_info['TrainingJobName'])
    print(training_image)

    # Announce before the call so the message reflects what is about to happen.
    print("Creating Model")
    model1 = sagemaker_session.create_model_from_job(
        name=model_name,
        training_job_name=training_job_name,
        role=ROLE,
        image_uri=training_image,
    )

[]
sagemaker-xgboost-2025-06-11-13-18-27-251
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


Creating Model


#### Now set up a SageMaker Clarify processing job

In [5]:
import pandas as pd

# Compute used for the Clarify processing job; the same values are passed to the
# model config later in the notebook.
clarify_instance_type = "ml.c4.xlarge"
clarify_instance_count = 1

# S3 locations: input datasets and the destination for the bias report.
train_data_uri = f"s3://{BUCKET}/{KEY}/data/train.csv"
test_data_uri = f"s3://{BUCKET}/{KEY}/data/test.csv"
bias_report_1_output_path = f"s3://{BUCKET}/{KEY}/clarify-bias"

# Load the training data to inspect its columns and the label distribution.
train_data = pd.read_csv(train_data_uri)
print(train_data.columns)
print(train_data['fraud'].value_counts())

print(bias_report_1_output_path)

Index(['fraud', 'incident_severity', 'num_vehicles_involved', 'num_injuries',
       'num_witnesses', 'police_report_available', 'injury_claim',
       'vehicle_claim', 'total_claim_amount', 'incident_month', 'incident_day',
       'incident_dow', 'incident_hour', 'driver_relationship_self',
       'driver_relationship_na', 'driver_relationship_spouse',
       'driver_relationship_child', 'driver_relationship_other',
       'incident_type_collision', 'incident_type_breakin',
       'incident_type_theft', 'collision_type_front', 'collision_type_rear',
       'collision_type_side', 'collision_type_na',
       'authorities_contacted_police', 'authorities_contacted_none',
       'authorities_contacted_fire', 'authorities_contacted_ambulance',
       'eventtime_x', 'customer_age', 'customer_education',
       'months_as_customer', 'policy_deductable', 'policy_annual_premium',
       'policy_liability', 'auto_year', 'num_claims_past_year',
       'num_insurers_past_5_years', 'customer_gender

## Create a Clarify processor, providing the data config, model config, predictions config and bias config

In [6]:
# Processor that launches Clarify jobs with the role, instance settings and
# session defined earlier.
processor_settings = dict(
    role=ROLE,
    instance_count=clarify_instance_count,
    instance_type=clarify_instance_type,
    sagemaker_session=sagemaker_session,
)
clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(**processor_settings)
print(clarify_processor)

<sagemaker.clarify.SageMakerClarifyProcessor object at 0x7f6eb02a3ec0>


In [7]:
# Show the dtype of every column in the training data.
train_data.dtypes

fraud                                int64
incident_severity                  float64
num_vehicles_involved                int64
num_injuries                         int64
num_witnesses                        int64
police_report_available            float64
injury_claim                         int64
vehicle_claim                        int64
total_claim_amount                   int64
incident_month                       int64
incident_day                         int64
incident_dow                         int64
incident_hour                        int64
driver_relationship_self           float64
driver_relationship_na             float64
driver_relationship_spouse         float64
driver_relationship_child          float64
driver_relationship_other          float64
incident_type_collision            float64
incident_type_breakin              float64
incident_type_theft                float64
collision_type_front               float64
collision_type_rear                float64
collision_t

In [8]:
# DataConfig: describes the input data for the analysis — where the dataset is,
# where the report is written, which column is the target label, the list of
# column names, and the dataset format (CSV here).
bias_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path=train_data_uri,
    s3_output_path=bias_report_1_output_path,
    label="fraud",
    headers=list(train_data.columns),
    dataset_type="text/csv",
)

In [9]:
# ModelConfig: describes the model under analysis — the name of the model to
# assess for bias, the instance type/count to host it on, and the response
# format requested from it. `content_type` is not set here.
bias_model_config = sagemaker.clarify.ModelConfig(
    model_name=model_name,
    instance_type=clarify_instance_type,
    instance_count=clarify_instance_count,
    accept_type="text/csv",
)

In [10]:
# ModelPredictedLabelConfig: sets the score threshold used to turn the model's
# predicted probability into a predicted label.
predictions_config = sagemaker.clarify.ModelPredictedLabelConfig(
    probability_threshold=0.02,
)

In [11]:
# BiasConfig: defines what bias is measured against.
bias_config = sagemaker.clarify.BiasConfig(
    # Label value(s) counted as the outcome of interest for the `fraud` label.
    label_values_or_threshold=[1],
    # Facet (sensitive attribute) column and the value(s) identifying the facet group.
    facet_name="customer_gender_female",
    facet_values_or_threshold=[1],
    # Column used to form subgroups for the analysis.
    group_name="customer_age",
)

In [12]:
# Launch the Clarify bias analysis, computing every pre-training (data) and
# post-training (model prediction) bias method.
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=bias_model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)

.............................[34msagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml[0m
[34msagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml[0m
[34mWe are not in a supported iso region, /bin/sh exiting gracefully with no changes.[0m
[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_lo