In [None]:
"""
LIME-Based AOPC Calculation for BERT Sequence Classification Models
==================================================================
This script demonstrates how to compute LIME attributions and AOPC (Area Over the Perturbation Curve)
for a HuggingFace Transformer model. LIME helps explain predictions by identifying which parts of text
are most important for a model's decision.
The script is designed to be run locally or on a hosted runtime like Colab.

INSTRUCTIONS:
-------------
1. Install requirements (see below).
2. Set your own HuggingFace model and tokenizer, and provide paths to your train/test data.
3. Run the script!

REQUIREMENTS:
-------------
!pip install transformers datasets lime pandas torch

If running in Colab, the pip install line directly below this docstring installs everything listed above.
"""

!pip install transformers datasets lime pandas torch

import torch
import numpy as np
import pandas as pd
import gc
import re

# Import required libraries for model loading
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Import required libraries for model interpretation
from lime.lime_text import LimeTextExplainer

# To print NumPy scalars as Python scalars, i.e., without "np.float64".
# The attribution dictionaries printed later hold NumPy scalars; the '1.25' legacy mode
# makes them print as plain numbers (e.g. 0.5 instead of np.float64(0.5)).
np.set_printoptions(legacy='1.25')


Optional: Login to your HuggingFace Hub account

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook source: it would be saved with the file
# and leak when the notebook is committed or shared.
# Set the HF_TOKEN environment variable before starting the kernel instead.
# If HF_TOKEN is not set, token=None makes login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

Optional: Check GPU and RAM availability

In [None]:
# -- Optional: Check GPU and RAM availability --
def print_gpu_ram_info():
    """
    Print a short hardware report for the current runtime.

    GPU: runs ``nvidia-smi`` and prints its output; if that fails for any reason
    a "no GPU" message is printed instead.
    RAM: prints the total memory reported by ``psutil`` (in GB, 1 GB = 1e9 bytes);
    if ``psutil`` cannot be imported the check is skipped with a message.

    Returns nothing; all results go to stdout.
    """
    import subprocess

    # --- GPU ---
    try:
        smi_output = subprocess.check_output(['nvidia-smi'])
        print("GPU Info:\n", smi_output.decode())
    except Exception:
        # Covers both a missing nvidia-smi binary and a non-zero exit status
        print('No GPU found or not connected to a GPU.')

    # --- RAM ---
    try:
        import psutil
        total_gb = psutil.virtual_memory().total / 1e9
        print(f'Your runtime has {total_gb:.1f} GB of available RAM\n')
    except ImportError:
        print('psutil not installed, skipping RAM check.')

# Optional: report the hardware once when this cell runs
print_gpu_ram_info()



User Configuration

In [None]:
# --- User Configuration ---
# Every value a user is expected to edit is collected in this cell.

# Model identifier; it is loaded with AutoModelForSequenceClassification.from_pretrained
MODEL_NAME = "your_model"  # <-- CHANGE THIS to your model

# Tokenizer identifier; it is loaded with AutoTokenizer.from_pretrained
TOKENIZER = "your_tokenizer" # <-- CHANGE THIS if not the same as your MODEL_NAME

# Test CSV location. The file needs an 'EssayText' column; an 'essay_score' column is
# used for the per-score summary when present.
TEST_CSV_PATH = "path/to/your/test_data.csv"  # <-- CHANGE THIS to your test data path

# How many classes the classifier distinguishes
NUM_LABELS = 2  # <-- CHANGE THIS to your number of classes

# One display name per class ("0", "1", ...); replace with your real class names if you like
CLASS_NAMES = list(map(str, range(NUM_LABELS)))

# Seed handed to the LIME explainer for reproducibility
RANDOM_STATE = 0

# Maximum number of words LIME reports per explanation (passed as num_features)
NUM_FEATURES = 10 # <-- The default is 10

# Number of perturbed samples LIME draws per explanation (passed as num_samples)
NUM_SAMPLES = 100 # <-- This notebook's default is 100

Load Model and Tokenizer

In [None]:
# --- Load Model and Tokenizer ---
# The tokenizer is loaded from its own identifier so it may differ from the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Pick the first CUDA device when one is available, otherwise fall back to the CPU
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

# Move the model to the chosen device and put it in evaluation mode
model.to(device)
model.eval()

Load Data

In [None]:
# --- Load Data ---
# Alternative loader based on HuggingFace datasets:
# test_set = load_dataset('csv', data_files=TEST_CSV_PATH)['train']
# test_doc = list(test_set['EssayText'])

# Default loader: read the CSV with pandas
test = pd.read_csv(TEST_CSV_PATH)
assert 'EssayText' in test.columns, "Your CSV file must have an 'EssayText' column."

# Only the first five responses are explained
test_doc = test['EssayText'].head(5).tolist()
test.head()

Define Functions to Predict Class Probabilities, Compute LIME Attribution Scores, and Compute the AOPC

In [None]:
# --- Helper Functions ---

def predict_proba(texts, batch_size=1):
    """
    Predict class probabilities for a sequence of texts.

    The texts are passed through the module-level ``tokenizer`` and ``model`` in
    small batches to limit memory use. Inputs are truncated to 512 tokens.

    Parameters
    ----------
    texts : sequence of str
        Texts to classify. LIME calls this function with the texts as its only argument.
    batch_size : int, default 1
        Number of texts per forward pass. Increase it if you have enough RAM/GPU memory.

    Returns
    -------
    numpy.ndarray
        One row per input text; each row is the softmax over the model's logits.
        For empty input an array with zero rows is returned.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # The tokenizer is given plain lists; this also accepts tuples, arrays and generators
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises on an empty list, so return an empty result explicitly
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    all_probs = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():  # Disable gradient calculation to save memory
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
        all_probs.append(probs)
    return np.vstack(all_probs)

def lime_attribution(text):
    """
    Compute word-level LIME attribution scores for the model's predicted class.

    LIME is asked to explain the single most probable class for ``text``
    (``top_labels=1``). Without this, LIME explains label 1 by default, regardless
    of what the model actually predicts.

    Parameters
    ----------
    text : str
        The text to explain.

    Returns
    -------
    attributions : numpy.ndarray
        One float score per entry of ``tokens``. Words that are not among the
        ``NUM_FEATURES`` features reported by LIME get a score of 0. Repeated
        words share the same score.
    tokens : list of str
        The non-empty words of ``text``, split on runs of non-word characters.
    """
    explainer = LimeTextExplainer(class_names=CLASS_NAMES, random_state=RANDOM_STATE)
    exp = explainer.explain_instance(
        text,
        predict_proba,
        top_labels=1,
        num_features=NUM_FEATURES,
        num_samples=NUM_SAMPLES,
    )

    # With top_labels=1 the only available label is the predicted class
    predicted_label = exp.available_labels()[0]
    importance_scores = dict(exp.as_list(label=predicted_label))

    # Split into words on the same pattern used for the explanation. Empty strings
    # (produced when the text starts or ends with punctuation) are dropped so they
    # are not counted as tokens.
    tokens = [token for token in re.split(r'\W+', text) if token]

    # Match tokens with their importance scores to pass to the compute_aopc_lime function
    attributions = np.array([importance_scores.get(token, 0) for token in tokens], dtype=float)
    return attributions, tokens

def compute_aopc_lime(text, top_fraction=0.2, verbose=True):
    """
    Compute the AOPC (Area Over the Perturbation Curve) of a text using LIME attributions.

    Tokens are ranked by LIME score, highest first. The top ``top_fraction`` of the
    tokens (at least one) are removed cumulatively: first the top 1, then the top 2,
    and so on. After each removal the drop in the probability of the originally
    predicted class is measured; the AOPC is the mean of these drops.

    Parameters
    ----------
    text : str
        The text to evaluate.
    top_fraction : float, default 0.2
        Fraction of tokens to perturb; must be in (0, 1].
    verbose : bool, default True
        Print the removed indices and the score drop of every perturbation step.

    Returns
    -------
    attributions_scores : dict
        Maps each token to its LIME score.
    aopc : float
        Mean drop in the predicted-class probability over the perturbation steps.

    Raises
    ------
    ValueError
        If ``top_fraction`` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction}")

    # Obtain LIME attribution scores
    lime_values, tokens = lime_attribution(text)
    lime_values = np.asarray(lime_values, dtype=float)
    attributions_scores = dict(zip(tokens, lime_values))
    print("Attribution scores: ", attributions_scores)

    # Obtain the predicted class (the one with the highest probability)
    original_prediction = predict_proba([text])[0]
    predicted_class_idx = np.argmax(original_prediction)
    original_score = original_prediction[predicted_class_idx]

    # Rank tokens by attribution score, highest first. Most tokens share a score of 0,
    # so a stable sort is used: tied tokens keep their order of appearance in the text.
    sorted_indices = np.argsort(-lime_values, kind="stable")

    # Determine the top fraction of tokens to remove (at least one perturbation step)
    n_top_tokens = max(1, int(top_fraction * len(sorted_indices)))
    top_indices_to_remove = sorted_indices[:n_top_tokens]

    total_score_drop = 0

    for i in range(1, n_top_tokens + 1):
        # Perturb the input by removing the top i important tokens
        indices_to_remove = top_indices_to_remove[:i]
        removed = set(indices_to_remove.tolist())  # set gives constant-time membership tests
        perturbed_tokens = [token for j, token in enumerate(tokens) if j not in removed]
        perturbed_text = " ".join(perturbed_tokens)
        if verbose:
            print(indices_to_remove)

        # Get new prediction for perturbed text
        new_prediction = predict_proba([perturbed_text])[0]

        # Calculate the drop in prediction score for the predicted class
        score_drop = original_score - new_prediction[predicted_class_idx]
        total_score_drop += score_drop
        if verbose:
            print(score_drop)

    # Average the drops over the perturbation steps
    aopc = total_score_drop / n_top_tokens
    return attributions_scores, aopc

Main Loop: Calculate AOPC for Test Set, Export Results to a CSV file, and Print Group-Wise Mean AOPC by Score

In [None]:
# --- Main Loop: Calculate AOPC for Test Set ---

examples = test_doc[0:5]
n_examples = len(examples)
has_scores = 'essay_score' in test.columns

aopc_list = []
attributions_scores_list = []
for position, essay in enumerate(examples, start=1):
    print(f"Processing example {position} of {n_examples}")
    attributions_scores, aopc = compute_aopc_lime(essay)
    attributions_scores_list.append(attributions_scores)
    aopc_list.append(aopc)

# Report the mean AOPC over all processed examples
mean_aopc = np.mean(aopc_list)
print('Average AOPC to report: ', mean_aopc)

# --- Save Results ---
# The 'original_score' column is filled from 'essay_score' when the CSV has it, else with NaN.
results = {
    'response': examples,
    'attributions_scores': attributions_scores_list,
    'lime_aopc': aopc_list,
    'original_score': test['essay_score'][0:5] if has_scores else np.nan,
}
output_df = pd.DataFrame(results)

output_df.to_csv('lime_aopc.csv', index=False)

# Mean AOPC per original score (only when scores are available)
if has_scores:
    print(output_df.groupby('original_score')['lime_aopc'].mean())