In [None]:
"""
Erasure-Based (Leave-one-out [LOO]) AOPC Calculation for BERT Sequence Classification Models
==================================================================
This script demonstrates how to compute LOO attributions and AOPC (Area Over the Perturbation Curve)
for a HuggingFace Transformer model. LOO helps explain predictions by identifying which parts of text
are most important for a model's decision.
The script is designed to be run locally or on a hosted runtime like Colab.

INSTRUCTIONS:
-------------
1. Install requirements (see below).
2. Set your own HuggingFace model and tokenizer, and provide paths to your train/test data.
3. Run the script!

REQUIREMENTS:
-------------
!pip install transformers datasets pandas torch

The `!pip install` line directly below this docstring installs them when this cell runs
(e.g., in Colab); remove that line if the packages are already installed.
"""

!pip install transformers datasets pandas torch

import torch
import numpy as np
import pandas as pd
import gc
import re

# Import required libraries for model loading
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# To print NumPy scalars as Python scalars, i.e., without "np.float64"
np.set_printoptions(legacy='1.25')


Optional: Login to your HuggingFace Hub account

In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: it would be saved (and shared) with the file.
# The token is read from the HF_TOKEN environment variable instead; when that variable is
# unset, `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

Optional: Check GPU and RAM availability

In [None]:
# -- Optional: report the GPU and RAM of the current runtime --
def print_gpu_ram_info():
    """
    Prints the output of `nvidia-smi` and the total system RAM reported by `psutil`.

    Any failure while querying the GPU is reported as "no GPU"; a missing `psutil`
    package is reported and the RAM check is skipped. Returns None.
    """
    # GPU check: every exception (missing binary, non-zero exit, decode error) lands in the fallback
    try:
        import subprocess
        completed = subprocess.run(['nvidia-smi'], check=True, stdout=subprocess.PIPE)
        smi_report = completed.stdout.decode()
        print("GPU Info:\n", smi_report)
    except Exception:
        print('No GPU found or not connected to a GPU.')

    # RAM check: only a failed psutil import is handled here
    try:
        from psutil import virtual_memory
        memory_stats = virtual_memory()
        # NOTE: the value printed is `.total` (installed RAM), although the message says "available"
        total_ram_gb = memory_stats.total / 1e9
        print('Your runtime has {:.1f} GB of available RAM\n'.format(total_ram_gb))
    except ImportError:
        print('psutil not installed, skipping RAM check.')

# Run the check once (optional)
print_gpu_ram_info()

User Configuration

In [None]:
# --- User Configuration ---
# Model identifier passed to AutoModelForSequenceClassification.from_pretrained
MODEL_NAME = "your_model"  # <-- CHANGE THIS to your model

# Tokenizer identifier passed to AutoTokenizer.from_pretrained
TOKENIZER = "your_tokenizer" # <-- CHANGE THIS if not the same as your MODEL_NAME

# Test CSV file. An 'EssayText' column is required; an 'essay_score' column is optional and,
# when present, is copied to the results and used for the group-wise AOPC summary.
TEST_CSV_PATH = "path/to/your/test_data.csv"  # <-- CHANGE THIS to your test data path

# How many classes the classifier distinguishes
NUM_LABELS = 4  # <-- CHANGE THIS to your number of classes

# Class names; defaults to "0", "1", ..., one per label (replace with your actual class names)
CLASS_NAMES = list(map(str, range(NUM_LABELS)))

# Seed value kept for reproducibility
RANDOM_STATE = 0

Load Model and Tokenizer

In [None]:
# --- Load Model and Tokenizer ---
# NUM_LABELS is forwarded to the model configuration as the number of output classes.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# Use GPU if available, else CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move the model to the device and switch it to evaluation mode. Both calls return the
# module itself; assigning the result (instead of ending the cell with a bare `model.eval()`)
# keeps the notebook from printing the entire model architecture as the cell output.
model = model.to(device).eval()

Load Data

In [None]:
# --- Load Data ---
# Alternative (disabled): read the CSV through HuggingFace datasets
# test_set = load_dataset('csv', data_files=TEST_CSV_PATH)['train']
# test_doc = list(test_set['EssayText'])

# Default: read the CSV into a pandas DataFrame
test = pd.read_csv(filepath_or_buffer=TEST_CSV_PATH)

# The essays to explain come from the 'EssayText' column, so fail early if it is missing
assert 'EssayText' in test.columns, "Your CSV file must have an 'EssayText' column."
test_doc = [essay for essay in test['EssayText']]

# Preview the first rows as the cell output
test.head(5)

Define Functions to Predict Class Probabilities, Compute LOO Attribution Scores, and Compute the AOPC

In [None]:
# --- Helper Functions ---

def predict(text):
    """
    Returns the model's class probabilities for `text` as a NumPy array.

    The text is tokenized (truncation and padding enabled), passed through the model with
    gradient tracking disabled, and the logits are turned into probabilities with a softmax
    over dim 1. Callers in this file index the result as ``[0, class_idx]``.
    """
    # Tokenize and place the encoded tensors on the same device as the model
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)

    # Forward pass without building the autograd graph
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension (multi-class probabilities)
    class_probabilities = torch.softmax(logits, dim=1)

    # NumPy conversion requires a CPU tensor
    return class_probabilities.detach().cpu().numpy()

def erasure_attribution(text):
    """
    Computes leave-one-out (erasure) attribution scores for the predicted class.

    The text is split into word tokens. Each token is removed in turn, the remaining tokens
    are re-joined with single spaces, and the token's score is the predicted-class probability
    of the original text minus that of the perturbed text.

    Args:
        text: The input string to explain.

    Returns:
        A tuple ``(unnormalized_token_attributions, attributions, tokens, predicted_class_idx)``:
          - unnormalized_token_attributions: dict mapping token text -> raw score. Because the
            dict is keyed by token text, a word that occurs more than once keeps only the
            score of its last occurrence.
          - attributions: np.ndarray with one score per token, aligned with `tokens`. Scores
            are divided by the largest score when that score is positive; otherwise they are
            left unscaled (see the comment at the normalization step).
          - tokens: list of word tokens extracted from `text`.
          - predicted_class_idx: argmax of the prediction for the unperturbed text.
    """
    # Extract maximal runs of word characters. Unlike re.split(r'\W+', text), this never
    # yields empty-string tokens when the text starts or ends with punctuation/whitespace,
    # so no forward passes or attribution slots are wasted on "tokens" that are not words.
    tokens = re.findall(r'\w+', text)

    original_prediction = predict(text)

    # Get the predicted class (the one with the highest score)
    predicted_class_idx = np.argmax(original_prediction)
    original_score = original_prediction[0, predicted_class_idx]

    attributions = []

    for i in range(len(tokens)):
        # Erase (remove) the token at position i
        perturbed_tokens = tokens[:i] + tokens[i+1:]
        perturbed_text = " ".join(perturbed_tokens)

        # Get the model's prediction for the perturbed text
        perturbed_prediction = predict(perturbed_text)

        # Calculate the change in prediction only for the predicted class
        score_change = original_score - perturbed_prediction[0, predicted_class_idx]
        attributions.append(score_change)

    # Create a dictionary of tokens and their respective unnormalized attributions
    # NOTE(review): repeated tokens overwrite each other in this dict (last occurrence wins).
    unnormalized_token_attributions = dict(zip(tokens, attributions))
    print("Unnormalized attributions", unnormalized_token_attributions, ' \n')

    # Normalize the attribution scores for the predicted class.
    # Only divide when the largest score is strictly positive:
    #   - a maximum of 0 would be a division by zero (NaN/inf scores),
    #   - a negative maximum would flip every sign and invert the importance ranking.
    # In both cases the raw scores are kept, which preserves their ordering.
    max_score_change = np.max(attributions) if attributions else 0.0
    if max_score_change > 0:
        attributions = [score / max_score_change for score in attributions]

    # Create a dictionary of tokens and their respective normalized attributions
    normalized_token_attributions = dict(zip(tokens, attributions))
    print("Normalized attributions", normalized_token_attributions, ' \n')

    return unnormalized_token_attributions, np.array(attributions), tokens, predicted_class_idx

def compute_aopc_erasure(text, top_fraction=0.2):
    """
    Computes the Area Over the Perturbation Curve (AOPC) for a text using erasure attributions.

    Tokens are ranked by their erasure attribution score (highest first). The top-ranked
    tokens are then removed cumulatively (top 1, top 2, ...), and the drop in the
    predicted-class probability relative to the original text is averaged over those steps.

    Args:
        text: The input string to evaluate.
        top_fraction: Fraction of tokens (by rank) to remove, in the interval (0, 1].
            Defaults to 0.2 (the top 20%). At least one perturbation step is always run.

    Returns:
        A tuple ``(unnormalized_token_attributions, aopc)``: the raw token -> score dict
        returned by `erasure_attribution`, and the mean score drop over the perturbation steps.

    Raises:
        ValueError: If `top_fraction` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError("top_fraction must be in the interval (0, 1].")

    # Obtain erasure-based attribution scores
    unnormalized_token_attributions, attribution_scores, tokens, predicted_class_idx = erasure_attribution(text)

    # Probability of the predicted class for the unperturbed text (the reference for all drops)
    original_prediction = predict(text)
    original_score = original_prediction[0, predicted_class_idx]

    # Sort tokens by their importance for the predicted class based on attribution scores
    sorted_indices = np.argsort(-attribution_scores)

    # Determine how many top-ranked tokens to remove
    n_top_tokens = max(1, int(top_fraction * len(sorted_indices)))  # Ensure at least one perturbation step
    top_indices_to_remove = sorted_indices[:n_top_tokens]

    total_score_drop = 0

    for i in range(1, n_top_tokens + 1):
        # Perturb the input by removing the top i important tokens
        indices_to_remove = top_indices_to_remove[:i]
        print('removed: ' ,indices_to_remove)
        # A set gives constant-time membership tests (vs. scanning the NumPy array per token)
        removed = set(indices_to_remove.tolist())
        perturbed_tokens = [token for j, token in enumerate(tokens) if j not in removed]
        perturbed_text = " ".join(perturbed_tokens)
        print('perturbed_text: ', perturbed_text)

        # Get new prediction for perturbed text
        new_prediction = predict(perturbed_text)

        # Calculate the drop in prediction score for the predicted class
        score_drop = original_score - new_prediction[0, predicted_class_idx]
        total_score_drop += score_drop
        print(score_drop)

    # AOPC: mean predicted-class score drop over the perturbation steps
    aopc = total_score_drop / n_top_tokens
    return unnormalized_token_attributions, aopc

Main Loop: Calculate AOPC for Test Set, Export Results to a CSV file, and Print Group-Wise Mean AOPC by Score

In [None]:
# --- Main Loop: Calculate AOPC for Test Set ---

aopc_list = []
attributions_scores_list = []
for position, essay in enumerate(test_doc, start=1):
    print(f"Processing example {position} of {len(test_doc)}")
    attributions_scores, aopc = compute_aopc_erasure(essay)
    attributions_scores_list.append(attributions_scores)
    aopc_list.append(aopc)

# Report the mean AOPC over all processed examples
mean_aopc = np.mean(aopc_list)
print('Average AOPC to report: ', mean_aopc)

# --- Save Results ---
# 'original_score' is copied from the 'essay_score' column when the CSV has one; otherwise it is NaN.
has_essay_score = 'essay_score' in test.columns
result_columns = {
    'response': test_doc,
    'attributions_scores': attributions_scores_list,
    'loo_aopc': aopc_list,
    'original_score': test['essay_score'] if has_essay_score else np.nan
}
output_df = pd.DataFrame(result_columns)

output_df.to_csv('loo_aopc.csv', index=False)

# Group-wise mean AOPC per score (only when scores are available)
if has_essay_score:
    print(output_df.groupby('original_score')['loo_aopc'].mean())