In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np
from captum.attr import IntegratedGradients, LayerIntegratedGradients, TokenReferenceBase, visualization
import pprint
import matplotlib.pyplot as plt
import thermostat


torch.manual_seed(123)
np.random.seed(123)

### Load a model:

In [2]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

### Define Inputs:

In [3]:
# the tokenizer also returns an attentions mask
inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
print(inputs)
# but we can just pass an tensor of input ids
input_array = inputs["input_ids"].numpy().squeeze()


{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


### Forward Pass:

In [4]:
outputs = model(torch.tensor(input_array[None]))
loss = outputs.loss
logits = outputs.logits
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.0763,  4.3676]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


### Baseline:

"Since padding is one of the most commonly used references for tokens, padding index is passed as reference token index." from https://captum.ai/tutorials/IMDB_TorchText_Interpret  
--> needs proof

In [5]:
# PAD token is mostly used as baseline
for token in tokenizer.special_tokens_map.values():
    print(f"{token}: {tokenizer.vocab[token]}")

[UNK]: 100
[SEP]: 102
[PAD]: 0
[CLS]: 101
[MASK]: 103


In [6]:
baseline_array = np.zeros_like(input_array)

In [7]:
baseline_output = forward_call(baseline_array)
print(baseline_output)

[ 0.89618343 -0.84337085]


In [8]:
# print("input", input_array)
reversed_vocab = {v:k for k,v in dict(tokenizer.vocab).items()}
for token_id in input_array:
    print(token_id, reversed_vocab[int(token_id)])

101 [CLS]
7592 hello
1010 ,
2026 my
3899 dog
2003 is
10140 cute
1012 .
102 [SEP]


### Setting up the Feature Attribution method

In [9]:
attributions = torch.softmax(torch.rand(input_array.shape), dim=0).numpy()-.05
attributions

array([0.12608357, 0.04261087, 0.03936442, 0.05564533, 0.10718265,
       0.02673848, 0.09511977, 0.03915822, 0.01809666], dtype=float32)

In [10]:
attributions = np.array([0,.1, 0, .05,.1, .05, .6, .1,0])
assert attributions.shape == input_array.shape

### Define TPN/TPS

In [11]:
from metrics import TPN
from typing import Callable, List, Tuple, Generator
from numpy import ma
from sklearn.metrics import auc

In [12]:
def blend_with_mask(observation, baseline, mask):
    blended_observation = observation.copy()
    blended_observation[mask] = baseline
    return blended_observation

In [13]:
def create_sorted_index(x: np.ndarray) -> List[Tuple]:
    """
    Sorts x and returns an index in ascending order.

    Each element is a tuple that corresponds to a location in x.
    x[sorted_index(x)[k] returns the k-th largest value in the array.

    :param x: Multi-dimensional array to be indexed.
    :return: List of indices in ascending order.
    """
    index = np.unravel_index(np.argsort(x, axis=None), x.shape)

    #return list(zip(*index))
    return index[0]

In [14]:
input_array[create_sorted_index(input_array)][1]

102

In [15]:
def iterate_masks_saliency_ratio(attributions: np.array,
                                 reverse: bool = False,
                                 saliency_ratio_per_step: float = 0.1) -> Generator[Tuple[np.array, float], None, None]:
    """
        Generates masks that cover a set of pixels according to the ratio of
        the saliency of these pixels compared to the total saliency.

        Returns a Generator that starts from an empty (or full if reverse is true)
            mask, i.e., all mask entries are False (or True), and flips the mask in
            decreasing order of saliency values.

        :param reverse: Sets the direction of masking,
            False means gradually masking more and more pixels in the image,
            True means the pixels are gradually unmasked
        :param saliency_ratio_per_step: The ratio of saliency of flipped mask entries compared
            to the total sum of saliency, per iteration.

        :yield: Saliency masks.
    """
    masked_saliency_map = ma.array(attributions)
    if reverse:
        masked_saliency_map.mask = np.ones_like(attributions)

    current_ratio = 0
    smap_rectified = np.maximum(attributions, 0)
    total_saliency = np.sum(smap_rectified)
    sorted_index = create_sorted_index(attributions)
    for index in sorted_index[::-1]:
        if smap_rectified[index] != 0:
            current_ratio += smap_rectified[index] / total_saliency
            masked_saliency_map[index] = ma.nomask if reverse else ma.masked

        # in the last step, we also return a saliency map to include all pixels
        end = index == sorted_index[::-1][-1]

        if saliency_ratio_per_step < current_ratio or end:
            yield masked_saliency_map, current_ratio
            current_ratio = 0
                   
masks_ratios = list(iterate_masks_saliency_ratio(attributions, saliency_ratio_per_step=.2))

In [16]:



def _get_proportionality_value(
    observation: np.array,
    model: Callable,
    masks_ratios: List[Tuple[np.array, float]],
    baseline: float = 0, # use zero corresponding to padding token
) -> float:
    """
    Calculates the total proportionality for a set of masks and their ratios of
    pixels covered.

    :param observation: Observation input to the model.
    :param model: Target model.
    :param masks_ratios: A list with masks and their corresponding ratio of pixels covered.
    :param baseline: A baseline to replace masked values with.
    :return: The total proportionality value that results from applying the masks and
        calculating the area under the curve
    """
    model = forward_call
    def _ablate_and_predict(mask: np.array) -> float:
        ablated_observation = blend_with_mask(observation, baseline, mask)
        return model(ablated_observation)[predicted_class]

    original_output = model(observation)
    predicted_class = np.argmax(original_output)
    baseline_output = model(np.full(observation.shape, baseline))
    baseline_confidence = baseline_output[predicted_class]

    # since zip returns a tuple of tuples, we have to convert the tuples to lists
    masks, ratios = zip(*[(m[0].mask, m[1]) for m in masks_ratios])

    proportionality_values = []
    ratio_values = []
    previous_ratio = 0
    last_output_value = 0.0
    for (mask_normal, ratio_normal), (mask_reverse, _) in zip(
        zip(masks, ratios), zip(masks[::-1], ratios[::-1])
    ):
        print(mask_normal, mask_reverse)
        ablated_prediction_normal = _ablate_and_predict(mask_normal)
        ablated_prediction_reverse = _ablate_and_predict(mask_reverse)
        proportionality_value = abs(ablated_prediction_normal - ablated_prediction_reverse)
        proportionality_values.append(proportionality_value)
        ratio_values.append(previous_ratio + ratio_normal)
        previous_ratio += ratio_normal
        last_output_value = ablated_prediction_normal

    normalizing_factor = 1 / (
        original_output[predicted_class] * min(1, baseline_confidence / last_output_value)
    )
    print(normalizing_factor)
    print(ratio_values)
    print(proportionality_values)
    return normalizing_factor * auc(
        x=np.asarray(ratio_values), y=np.asarray(proportionality_values)
    )

_get_proportionality_value(input_array, model, masks_ratios)

[False  True False  True  True  True  True  True False] [False  True False  True  True  True  True  True False]
[False  True False  True  True  True  True  True False] [False  True False  True  True  True  True  True False]
[False  True False  True  True  True  True  True False] [False  True False  True  True  True  True  True False]
0.22895762287375251
[0.6, 0.9, 1.0]
[0.0, 0.0, 0.0]


0.0

In [17]:
masks, ratios = zip(*[(m[0].mask, m[1]) for m in masks_ratios])
masks

(array([False,  True, False,  True,  True,  True,  True,  True, False]),
 array([False,  True, False,  True,  True,  True,  True,  True, False]),
 array([False,  True, False,  True,  True,  True,  True,  True, False]))

In [18]:
masks_ratios

[(masked_array(data=[0.0, --, 0.0, --, --, --, --, --, 0.0],
               mask=[False,  True, False,  True,  True,  True,  True,  True,
                     False],
         fill_value=1e+20),
  0.6),
 (masked_array(data=[0.0, --, 0.0, --, --, --, --, --, 0.0],
               mask=[False,  True, False,  True,  True,  True,  True,  True,
                     False],
         fill_value=1e+20),
  0.30000000000000004),
 (masked_array(data=[0.0, --, 0.0, --, --, --, --, --, 0.0],
               mask=[False,  True, False,  True,  True,  True,  True,  True,
                     False],
         fill_value=1e+20),
  0.1)]

Reusing dataset thermostat (/home/tim/.cache/huggingface/datasets/thermostat/imdb-albert-lig/1.0.1/0cbe93e1fbe5b8ed0217559442d8b49a80fd4c2787185f2d7940817c67d8707b)


Loading Thermostat configuration: imdb-albert-lig


In [96]:
attributions

[0.010888157412409782,
 0.013887383975088596,
 0.012377026490867138,
 0.015713095664978027,
 0.0018300628289580345,
 -0.005079985596239567,
 0.025114858523011208,
 -0.02422262914478779,
 0.003960959613323212,
 0.0038968694861978292,
 -0.004924846347421408,
 0.0067788721062242985,
 0.004412396345287561,
 -0.004911642521619797,
 0.010200000368058681,
 0.01687535271048546,
 0.009621708653867245]

# 