In [None]:
"""
Integrated Gradients (IG) AOPC Calculation for BERT Sequence Classification Models
==================================================================
This script demonstrates how to compute IG attributions and AOPC (Area Over the Perturbation Curve)
for a HuggingFace Transformer model. IG helps explain predictions by identifying which parts of text
are most important for a model's decision.
The script is designed to be run locally or on a hosted runtime like Colab.

INSTRUCTIONS:
-------------
1. Install requirements (see below).
2. Set your own HuggingFace model and tokenizer, and provide the path to your test data.
3. Run the script!

CAUTION:
-------------
Installing transformers_interpret in the first notebook cell may cause dependency conflicts with numpy and pandas.
This can result in import errors or binary incompatibility issues.

Installing transformers_interpret in the order provided in this notebook helps avoid version conflicts and ensures stable imports.
If running in Colab, IGNORE THE MESSAGE TO RESTART THE RUNTIME after transformers_interpret is installed.
"""
import torch
import numpy as np
import pandas as pd

# To print NumPy scalars as Python scalars, i.e., without "np.float64"
#np.set_printoptions(legacy='1.25')


Optional: Login to your HuggingFace Hub account

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook itself: notebooks get shared and committed,
# and the token would travel with them. Export it as the HF_TOKEN environment variable
# instead; when the variable is not set, login() falls back to an interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

Optional: Check GPU and RAM availability

In [None]:
# -- Optional: Check GPU and RAM availability --
def print_gpu_ram_info():
    """
    Print a short hardware report: the `nvidia-smi` output and the total system RAM.

    Both checks are best-effort. Any failure while querying the GPU is reported as
    "no GPU"; a missing `psutil` package just skips the RAM check.
    """
    # GPU: run nvidia-smi and echo whatever it reports.
    try:
        import subprocess
        smi_report = subprocess.check_output(['nvidia-smi']).decode()
        print("GPU Info:\n", smi_report)
    except Exception:
        print('No GPU found or not connected to a GPU.')

    # RAM: psutil is optional, so only the import itself is guarded.
    try:
        from psutil import virtual_memory
    except ImportError:
        print('psutil not installed, skipping RAM check.')
    else:
        total_gb = virtual_memory().total / 1e9
        print('Your runtime has {:.1f} GB of available RAM\n'.format(total_gb))

# Optional: print the report right away
print_gpu_ram_info()

User Configuration

In [None]:
# --- User Configuration ---
# Everything a user is expected to edit lives in this cell.

# HuggingFace Transformers model to explain (hub id or local path)
MODEL_NAME = "your_model"  # <-- CHANGE THIS to your model

# Tokenizer that goes with the model
TOKENIZER = "your_tokenizer" # <-- CHANGE THIS if not the same as your MODEL_NAME

# Test CSV file: an 'EssayText' column is required; an 'essay_score' column is optional
# and, when present, is used for the group-wise AOPC report.
TEST_CSV_PATH = "path/to/your/test_data.csv"  # <-- CHANGE THIS to your test data path

# How many classes the classifier distinguishes
NUM_LABELS = 4  # <-- CHANGE THIS to your number of classes

# One name per class (must match your dataset); defaults to "0", "1", ...
CLASS_NAMES = list(map(str, range(NUM_LABELS)))  # or use your actual class names

# Seed for reproducibility
RANDOM_STATE = 0

# Number of steps to compute IG
# N_STEPS = 50 # <-- The default is 50

Load Data

In [None]:
# --- Load Data ---
# Alternative loader, if you prefer HuggingFace datasets:
# test_set = load_dataset('csv', data_files=TEST_CSV_PATH)['train']
# test_doc = list(test_set['EssayText'])

# Default loader: read the test CSV into a pandas DataFrame.
test = pd.read_csv(TEST_CSV_PATH)

# Fail early, with a readable message, when the text column is missing.
assert 'EssayText' in test.columns, "Your CSV file must have an 'EssayText' column."

# Plain Python list of the texts that will be explained, in file order.
test_doc = [essay for essay in test['EssayText']]

# Preview the first rows as the cell output.
test.head()

Load Model and Tokenizer

In [None]:
# --- Load Model and Tokenizer ---
# Import required libraries for model loading
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the classifier and its tokenizer from the names set in the configuration cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# Run on the first CUDA device when one is available, otherwise on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
print("Using device:", device)

# Move the weights to the chosen device and switch to inference mode;
# both calls return the model, so the cell still ends by displaying it.
model.to(device).eval()

Initialize the IG Explainer and Define Functions to Perturb the Text and Compute the AOPC

In [None]:
"""
Caution:
Installing transformers_interpret in the first notebook cell may cause dependency conflicts with numpy and pandas.
This can result in import errors or binary incompatibility issues.

Installing transformers_interpret in this cell helps avoid version conflicts and ensures stable imports.
If running in Colab, IGNORE THE MESSAGE TO RESTART THE RUNTIME after transformers_interpret is installed.
"""

# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip the shell finds first.
%pip install transformers_interpret

In [None]:
from transformers_interpret import SequenceClassificationExplainer
from transformers import TextClassificationPipeline

# Initialize IG explainer
explainer = SequenceClassificationExplainer(model=model, tokenizer=tokenizer)

# Pipeline to predict the class probabilities.
# return_all_scores=True is kept deliberately: compute_aopc_ig reads pipe(text)[0] as a
# list with one {'score': ...} entry per class and indexes it by class position.
# The device is passed to the constructor (instead of overwriting pipe.device afterwards)
# so the pipeline is built for the same device the model was moved to.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=device,
)

def perturb_text(tokens, indices_to_remove):
    """
    Blank out the tokens at the given positions and join the result into one string.

    Args:
        tokens: Sequence of token strings. It is NOT modified; the function works on a copy.
        indices_to_remove: Iterable of integer positions in `tokens` to blank out.

    Returns:
        The tokens joined with single spaces. A removed token is replaced by an empty
        string, so the separators around it remain (["a", "b", "c"] with [1] -> "a  c").

    Raises:
        IndexError: If a position is outside the token list.
    """
    # Copy first: the caller's list must stay intact so it can be reused for the next
    # perturbation step.
    masked_tokens = list(tokens)
    for idx in indices_to_remove:
        masked_tokens[idx] = ""  # You can use "[PAD]" or "" (empty string) to remove
    return " ".join(masked_tokens)

def compute_aopc_ig(text, top_fraction=0.2, verbose=True):
    """
    Computes the Area Over the Perturbation Curve (AOPC) for a given text, considering
    score drops for the predicted class only.

    The tokens are ranked by their Integrated Gradients attribution (highest first).
    For k = 1 .. K, the k top-ranked tokens are blanked out, the text is re-scored, and
    the drop in the predicted-class score is recorded. The AOPC is the mean of those K
    drops, where K is `top_fraction` of the scored tokens (at least 1).

    Tokens are ranked by POSITION, not by token string, so a token that occurs several
    times in the text keeps one attribution score and one position per occurrence.

    Args:
        text: The input text to explain.
        top_fraction: Fraction of the tokens to perturb, in (0, 1]. Defaults to 0.2.
        verbose: When True (default), print the attributions and every perturbation step.

    Returns:
        (attributions, aopc): the explainer output for `text`, unchanged, and the AOPC
        value for the predicted class.

    Raises:
        ValueError: If `top_fraction` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError("top_fraction must be in the interval (0, 1].")

    # Get original prediction and attribution scores.
    # The explainer output is consumed as a list of (token, score) pairs, one per token
    # occurrence.
    attributions = explainer(text)
    if verbose:
        print("Attributions:", attributions)
    original_prediction_scores = pipe(text)[0]

    # Identify the predicted class (the one with the highest score)
    predicted_class_idx = int(np.argmax([score['score'] for score in original_prediction_scores]))

    # Extract score for the predicted class
    original_prediction = original_prediction_scores[predicted_class_idx]['score']

    # Drop a special token at either end of the attribution list (e.g. "[CLS]" / "[SEP]"),
    # so that only the tokens of the text itself can be perturbed.
    # NOTE(review): assumes the explainer wraps the text tokens in at most one special
    # token per side - confirm for tokenizers other than BERT-style ones.
    special_tokens = set(tokenizer.all_special_tokens)
    token_attributions = list(attributions)
    if token_attributions and token_attributions[0][0] in special_tokens:
        token_attributions = token_attributions[1:]
    if token_attributions and token_attributions[-1][0] in special_tokens:
        token_attributions = token_attributions[:-1]

    # The tokens to perturb are exactly the tokens the explainer scored, so position i
    # in `tokens` always belongs to attribution i.
    tokens = [token for token, _ in token_attributions]

    # Rank the token positions by attribution score, most important first.
    # sorted() is stable, so tied scores keep their order of appearance in the text.
    indices_to_remove = sorted(
        range(len(tokens)),
        key=lambda position: token_attributions[position][1],
        reverse=True,
    )

    # Only consider the top fraction of tokens
    n_top_tokens = max(1, int(top_fraction * len(indices_to_remove)))  # Ensure at least one perturbation step
    top_indices_to_remove = indices_to_remove[:n_top_tokens]

    # Initialize the total score drop for the predicted class
    total_score_drop = 0

    for i in range(1, n_top_tokens + 1):
        # Perturb the input by removing top i important tokens.
        # A fresh copy is passed so the token list is intact for the next step.
        current_indices_to_remove = top_indices_to_remove[:i]
        perturbed_text = perturb_text(list(tokens), current_indices_to_remove)
        if verbose:
            print("Perturbed text: ", perturbed_text)
            print("Current indices to remove:", current_indices_to_remove)

        # Get new prediction scores for perturbed text using pipe
        new_prediction_scores = pipe(perturbed_text)[0]

        # Get the new score for the class that was predicted on the ORIGINAL text
        new_prediction = new_prediction_scores[predicted_class_idx]['score']

        # Calculate the drop in prediction score for the predicted class
        score_drop = original_prediction - new_prediction
        total_score_drop += score_drop
        if verbose:
            print("Score drop:", score_drop)

    # Calculate AOPC for the predicted class: mean score drop over the perturbation steps
    aopc = total_score_drop / n_top_tokens
    return attributions, aopc

Main Loop: Calculate AOPC for Test Set, Export Results to a CSV file, and Print Group-Wise Mean AOPC by Score

In [None]:
# --- Main Loop: Calculate AOPC for Test Set ---

# One entry per test example, in the order of test_doc.
attribution_scores_list = []
aopc_list = []
n_examples = len(test_doc)
for position, essay in enumerate(test_doc, start=1):
    print(f"Processing example {position} of {n_examples}")
    example_attributions, example_aopc = compute_aopc_ig(essay)
    attribution_scores_list.append(example_attributions)
    aopc_list.append(example_aopc)

# Report Mean AOPC
print('Average AOPC to report: ', np.mean(aopc_list))

# --- Save Results ---
# The human score is optional: without an 'essay_score' column the output column is NaN.
has_essay_score = 'essay_score' in test.columns
result_columns = {
    'response': test_doc,
    'attributions_scores': attribution_scores_list,
    'ig_aopc': aopc_list,
    'original_score': test['essay_score'] if has_essay_score else np.nan,
}
output_df = pd.DataFrame(result_columns)

output_df.to_csv('ig_aopc.csv', index=False)

# Print group-wise mean AOPC by score (if available)
if has_essay_score:
    print(output_df.groupby('original_score')['ig_aopc'].mean())