In [29]:
%load_ext autoreload
%autoreload 2

import sys
import torch
import math
import copy
sys.path.append('EAP-IG/src')

from functools import partial

from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from datasets import load_dataset

from eap.graph import Graph
from eap.attribute import attribute
from eap.attribute_node import attribute_node
from eap.evaluate import evaluate_graph, evaluate_baseline
from MIB_circuit_track.dataset import HFEAPDataset
from MIB_circuit_track.metrics import get_metric
from MIB_circuit_track.utils import MODEL_NAME_TO_FULLNAME, TASKS_TO_HF_NAMES
from MIB_circuit_track.evaluation import evaluate_area_under_curve

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading model and data

In [2]:
# Short model key chosen by the user; the next line overwrites it with the value
# that MODEL_NAME_TO_FULLNAME maps it to.
model_name = "gpt2" # One of ["gpt2", "llama3", "gemma2", "qwen2.5"]
model_name = MODEL_NAME_TO_FULLNAME[model_name]

# Short task key chosen by the user; the next line overwrites it with a
# "mib-bench/<name>" dataset identifier built from TASKS_TO_HF_NAMES.
dataset_name = "ioi" # One of ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]
dataset_name = f"mib-bench/{TASKS_TO_HF_NAMES[dataset_name]}"

In [3]:
# Load the model as a HookedTransformer in bfloat16. Prefer the GPU, but fall back to
# the CPU when CUDA is unavailable so the notebook still runs (slowly) without a GPU
# instead of failing at load time.
model = HookedTransformer.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
# Config switches enabled before any graph is built from the model.
# NOTE(review): presumably these expose the hook points the eap attribution code
# reads (split q/k/v inputs, per-head attention results, MLP inputs) — confirm
# against the TransformerLens / EAP-IG documentation.
model.cfg.use_split_qkv_input = True
model.cfg.use_attn_result = True
model.cfg.use_hook_mlp_in = True
model.cfg.ungroup_grouped_query_attention = True

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Loaded pretrained model gpt2-small into HookedTransformer


In [4]:
# Build the task dataset using the model's tokenizer.
dataset = HFEAPDataset(dataset_name, model.tokenizer)
# The return value is discarded, so head(500) presumably truncates the dataset
# in place to its first 500 examples — TODO confirm in HFEAPDataset.
dataset.head(500)
dataloader = dataset.to_dataloader(batch_size=64)
# NOTE(review): `task` is given a placeholder value; presumably it is not used for
# the "logit_diff" metric — confirm in get_metric.
metric_fn = get_metric(metric_name="logit_diff", task=["ignore this"], tokenizer=model.tokenizer, model=model)
# Mean of the per-example metric returned by evaluate_baseline; later cells divide
# circuit scores by this value to obtain faithfulness.
baseline = evaluate_baseline(model, dataloader, partial(metric_fn, loss=False, mean=False)).mean().item()

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Old caching folder /home/yaniv.n/.cache/huggingface/datasets/mib-bench___ioi/default/0.0.0/e5f3468f3af4c0883be35cd3bced8c711c95d286 for dataset ioi exists but no data were found. Removing it. 


train.json:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

dev.json:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

100%|██████████| 8/8 [00:08<00:00,  1.08s/it]


## Attribution, Pruning, and Evaluation

In [5]:
# Create two graphs from the model and run attribution on each:
# one receives edge-level scores, the other node-level scores.

g_edges = Graph.from_model(model) # For edge based attribution
g_nodes = Graph.from_model(model) # For node based attribution

# Both calls use the same metric (as a loss, averaged) and the same method / step count.
attribute(model, g_edges, dataloader, partial(metric_fn, loss=True, mean=True), method='EAP-IG-inputs', ig_steps=5)
attribute_node(model, g_nodes, dataloader, partial(metric_fn, loss=True, mean=True), method='EAP-IG-inputs', ig_steps=5)

100%|██████████| 8/8 [00:10<00:00,  1.34s/it]
100%|██████████| 8/8 [00:05<00:00,  1.44it/s]


In [6]:
# Build the graph given the calculated scores
# Some relevant information:
#   graph.in_graph and graph.nodes_in_graph need to be updated to reflect the edges or nodes that are in the graph
#   graph.scores and graph.nodes_scores contain the scores of the edges or nodes, post attribution (scores will be non-zero only if attribution was done on edges, same for nodes_scores if attribution was done on nodes)

def build_graph_from_attribution_scores(g_edges, g_nodes, n_edges=250, n_nodes=45):
    """Select a circuit from already-attributed graphs using fixed-size budgets.

    Note: a later cell in this notebook redefines this name with a different
    (percentage-based) signature; after that cell runs, this version is shadowed.

    Args:
        g_edges: graph carrying edge scores; `apply_greedy` is called on it.
        g_nodes: graph carrying node scores; `apply_node_topn` is called on it.
        n_edges: number of edges requested from `g_edges.apply_greedy`.
            Defaults to 250, the value that used to be hard-coded.
        n_nodes: number of nodes requested from `g_nodes.apply_node_topn`.
            Defaults to 45, the value that used to be hard-coded.

    Returns:
        `g_edges` itself (the same object that was passed in). The selection
        applied to `g_nodes` is not merged into the returned graph.
    """
    # NOTE: THIS FUNCTION SHOULD BE REPLACED TO BE BASED ON SOMETHING SMARTER (GRAPH ALGORITHM ETC)
    g_edges.apply_greedy(n_edges=n_edges, absolute=True, reset=True, prune=True)
    g_nodes.apply_node_topn(n_nodes=n_nodes, absolute=True, reset=True, prune=True)
    return g_edges

# The function returns its first argument, so `graph` is the same object as `g_edges`.
graph = build_graph_from_attribution_scores(g_edges, g_nodes)

In [None]:
# Evaluate the graph to find its faithfulness (higher = better):
# the mean per-example metric of the graph divided by the full-model baseline.

results = evaluate_graph(model, graph, dataloader, partial(metric_fn, loss=False, mean=False)).mean().item()
print(f"Faithfulness: {results / baseline}")

In [38]:
# Similar to the two cells above, but computes faithfulness at several graph sizes
# and then the AUC (area under the curve) of those faithfulness scores.
# This is the score that is actually evaluated and important in the end.

# Fractions swept by the loop at the bottom of this cell (used there as both the edge
# and the node fraction); auc() also reads this list as the x-coordinates of the curve.
percentages = [0, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0]

def auc(faithfulnesses, log_scale=False, xs=None):
    """Summarise a faithfulness curve with the trapezoidal rule.

    Args:
        faithfulnesses: sequence of faithfulness values, one per x-coordinate,
            in the same order as the x-coordinates.
        log_scale: if True, integrate over log(x) instead of x. Segments with a
            non-positive endpoint (e.g. x == 0) are skipped, because log is
            undefined there; previously this case raised a math domain error.
        xs: x-coordinates of the curve. Defaults to the module-level
            `percentages` list, which is what the original implementation
            always used.

    Returns:
        A tuple `(area_under, area_from_1, average)`:
        area under the curve, area between the curve and the line y = 1
        (using |1 - y|), and the plain mean of `faithfulnesses`.

    Raises:
        ValueError: if `faithfulnesses` is empty or has more points than `xs`.
    """
    if xs is None:
        xs = percentages
    ys = list(faithfulnesses)
    if not ys:
        raise ValueError("auc() needs at least one faithfulness value")
    if len(xs) < len(ys):
        raise ValueError(
            f"auc() got {len(ys)} faithfulness values but only {len(xs)} x-coordinates"
        )

    # zip() stops at the shorter sequence, so only the first len(ys) x-coordinates are used.
    points = list(zip(xs, ys))

    area_under = 0.
    area_from_1 = 0.
    for (x_1, y_1), (x_2, y_2) in zip(points, points[1:]):
        if log_scale:
            # A point at x <= 0 has no position on a log axis; drop its segment.
            if x_1 <= 0 or x_2 <= 0:
                continue
            x_1 = math.log(x_1)
            x_2 = math.log(x_2)
        width = x_2 - x_1
        # Area between the curve and y = 1 over this segment.
        area_from_1 += width * ((abs(1. - y_1) + abs(1. - y_2)) / 2)
        # Area under the curve over this segment.
        area_under += width * ((y_1 + y_2) / 2)
    average = sum(ys) / len(ys)

    return area_under, area_from_1, average


def build_graph_from_attribution_scores(g_edges, g_nodes, edge_percent, node_percent):
    """Select a circuit by combining edge-level and node-level attribution.

    Both graphs are modified in place.

    Args:
        g_edges: graph carrying edge scores; it is modified and returned.
        g_nodes: graph carrying node scores; used to decide which nodes survive.
        edge_percent: fraction of `g_edges.edges` to request from `apply_greedy`.
        node_percent: fraction of `g_nodes.nodes` to request from `apply_node_topn`.

    Returns:
        `g_edges` itself, after edge selection, node masking and a final prune.
    """
    # Clamp both budgets to a valid range so out-of-range fractions cannot request
    # a negative count or more items than exist.
    total_edges = len(g_edges.edges)
    n_edges = min(total_edges, max(0, int(total_edges * edge_percent)))
    # The node cap is len(nodes) - 1, as in the original code.
    # NOTE(review): presumably one node is never selectable — confirm in Graph.
    n_nodes = min(len(g_nodes.nodes) - 1, max(0, int(len(g_nodes.nodes) * node_percent)))

    g_edges.apply_greedy(n_edges=n_edges, absolute=True, reset=True, prune=True)

    # Prune nodes based on node graph
    g_nodes.apply_node_topn(n_nodes=n_nodes, absolute=True, reset=True, prune=True)

    # Take only nodes that were chosen also via node attribution. The mask is copied
    # so that later changes to g_edges do not write through to g_nodes.
    g_edges.nodes_in_graph = copy.deepcopy(g_nodes.nodes_in_graph)
    # Zero out edges going out of nodes that were pruned out
    g_edges.in_graph[~g_edges.nodes_in_graph] = 0

    # Clean up after the manual masking above.
    g_edges.prune()

    return g_edges


# Fresh graphs are created and attributed again here, using the same calls as the
# earlier attribution cell.
g_edges = Graph.from_model(model) # For edge based attribution
g_nodes = Graph.from_model(model) # For node based attribution
attribute(model, g_edges, dataloader, partial(metric_fn, loss=True, mean=True), method='EAP-IG-inputs', ig_steps=5)
attribute_node(model, g_nodes, dataloader, partial(metric_fn, loss=True, mean=True), method='EAP-IG-inputs', ig_steps=5)

# One faithfulness value per entry of `percentages`, in the same order.
faithfulnesses = []
for edge_percent in percentages:
    # The node budget uses the same fraction as the edge budget.
    node_percent = edge_percent
    # build_graph_from_attribution_scores returns g_edges itself, so `graph` aliases
    # g_edges and is rebuilt on every iteration.
    graph = build_graph_from_attribution_scores(g_edges, g_nodes, edge_percent, node_percent)
    # Faithfulness = mean per-example metric of the graph / full-model baseline.
    faith = evaluate_graph(model, graph, dataloader, partial(metric_fn, loss=False, mean=False), quiet=True).mean().item() / baseline
    faithfulnesses.append(faith)
    print(edge_percent, faithfulnesses[-1])

# auc() returns (area_under, area_from_1, average); only the area under the curve is reported.
print('AUC (Higher is better): ', auc(faithfulnesses, log_scale=False)[0])

# Logging some output results:
# Simple edge-based AUC: 0.9687
# Edge-based + pruning nodes based on node scores (node_percent=edge_percent): 0.5644
# NOTE(review): the output saved with this cell prints 0.9687, i.e. the "simple edge-based"
# figure, although the code above also prunes by node scores — the saved output may be
# stale; re-run the cell to confirm.

100%|██████████| 8/8 [00:07<00:00,  1.06it/s]
100%|██████████| 8/8 [00:05<00:00,  1.38it/s]


0 -0.6405529953917051
0.001 -0.6129032258064516
0.002 -0.2269585253456221
0.005 0.2569124423963134
0.01 0.6405529953917051
0.02 1.2258064516129032
0.05 0.9769585253456221
0.1 0.8847926267281107
0.2 0.9585253456221198
0.5 0.9815668202764977
1.0 1.0
AUC (Higher is better):  0.9687304147465438
