# Saliency Token Attributions

For each prompt, this script generates the saliency attribution of every input token with respect to each generated token. The values are stored in a dictionary keyed by input token (with the `Ġ` prefix removed) whose values are lists of attributions in order of appearance, so a token that appears more than once in the input has one entry per occurrence.

In [1]:
import os
import torch
import ecco
import re
from collections import defaultdict
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Expose only CUDA device 0 to this process; this is set before the model is
# loaded on the next line.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Load GPT-2 Large wrapped by ecco so that generation can return attributions.
# NOTE(review): cache_dir is a machine-specific absolute path -- adjust when
# running elsewhere.
lm = ecco.from_pretrained('gpt2-large', cache_dir='/bigstor/rbhaskar/models/cache/')

Downloading: 100%|██████████| 666/666 [00:00<00:00, 267kB/s]
Downloading: 100%|██████████| 3.02G/3.02G [00:36<00:00, 89.1MB/s]


In [4]:
# TODO: write script that generates benign-adversarial pairs given the two dicts

In [4]:
# Each entry is a (benign, adversarial) 2-tuple of prompts:
#   - index 0 is the benign prompt
#   - index 1 is the adversarial prompt
# The prompts keep their trailing space as written.
benign_adversarial_pairs = [
    ("Sebastian has a sister and played football and Jack scored a century . During Sebastian's sports game, Sebastian bagged ",
     "Sebastian has a sister and played football and Jack scored a century . During Sebastian's sports game, Sebastian scored "),
]

In [5]:
# clean reconstruction characters out of tokens
def clean_token(token):
    """Strip tokenizer reconstruction characters from a single token.

    - An empty token is returned unchanged.
    - A token that is exactly "Ġ" or "Âł" becomes a single space.
    - Any other token starting with "Ġ" has that one leading character removed.
    - Everything else is returned unchanged.

    NOTE(review): "Ġ" looks like the byte-level BPE marker for a leading space
    and "Âł" like the rendering of a non-breaking space -- confirm against the
    tokenizer in use.
    """
    if len(token) > 0:
        # Whole-token matches are handled first, so a bare "Ġ" maps to " "
        # rather than to the empty string.
        if token in ("Ġ", "Âł"):
            return " "
        if token.startswith("Ġ"):
            return token[1:]
    return token

def clean_tokens(tokens):
    """Return a new list with `clean_token` applied to each item of `tokens`, in order."""
    return list(map(clean_token, tokens))

In [6]:
# Number of tokens to generate greedily after each prompt.
num_generated_tokens = 3

# One entry per (benign, adversarial) pair. Each entry is a 2-tuple ordered
# (benign, adversarial):
#   attributions_ig / attributions_gxi : {output_token: {input_token: [attribution, ...]}}
#   output_from_pairs                  : generated tokens joined with single spaces
attributions_ig = []
attributions_gxi = []
output_from_pairs = []

for benign, adversarial in benign_adversarial_pairs:
    # Per-pair accumulators, converted to tuples once both prompts are processed.
    outputs = []
    pair_attributions = {"ig": [], "gxi": []}

    for text in (benign, adversarial):
        output = lm.generate(text, generate=num_generated_tokens, do_sample=False, attribution=['ig', 'grad_x_input'])
        ig_attributions = output.attribution["ig"]
        gxi_attributions = output.attribution["grad_x_input"]

        # The length of the first attribution row is used as the number of
        # prompt tokens; everything after that index is treated as generated.
        # NOTE(review): assumes row 0 holds exactly one value per prompt token -- confirm.
        len_input = len(ig_attributions[0])
        all_tokens = output.tokens[0]
        input_tokens = clean_tokens(all_tokens[:len_input])
        output_tokens = clean_tokens(all_tokens[len_input:])
        outputs.append(" ".join(output_tokens))

        # `method` (not `type`): this loop runs at module scope, so the loop
        # variable would otherwise shadow the builtin for the rest of the session.
        for method, attributions in (("ig", ig_attributions), ("gxi", gxi_attributions)):
            # NOTE: keyed by cleaned output token, so if the same token is
            # generated twice the later occurrence overwrites the earlier one.
            new_token_attributions = {}

            for i, out_token in enumerate(output_tokens):
                # Group by cleaned input token; repeated input tokens collect
                # one value per occurrence, in prompt order.
                attr_dict = defaultdict(list)
                for j, in_token in enumerate(input_tokens):
                    attr_dict[in_token].append(attributions[i][j])

                new_token_attributions[out_token] = dict(attr_dict)

            pair_attributions[method].append(new_token_attributions)

    output_from_pairs.append(tuple(outputs))
    attributions_ig.append(tuple(pair_attributions["ig"]))
    attributions_gxi.append(tuple(pair_attributions["gxi"]))
        
    

<IPython.core.display.Javascript object>

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
def construct(token_list, token, working_list=None):
    """
    Recursively build `token` out of pieces taken from `token_list`.

    token_list : re-iterable collection of str (e.g. a list or dict keys)
    token : str, the word still to be assembled
    working_list : pieces already chosen for an earlier part of the word;
        callers normally leave this unset

    Returns the list of pieces (in order) whose concatenation equals `token`,
    preceded by the contents of `working_list`, or None when no combination of
    pieces spells the word. Pieces are tried in iteration order, and the search
    backtracks when a chosen prefix turns out to be a dead end.
    """
    # Copy so the caller's list (and any shared default) is never mutated.
    chosen = [] if working_list is None else list(working_list)

    for t in token_list:
        if not token.startswith(t):
            continue
        if len(t) == len(token):
            return chosen + [t]
        if not t:
            # An empty piece consumes nothing; recursing on it would never end.
            continue
        result = construct(token_list, token[len(t):], working_list=chosen + [t])
        if result is not None:
            return result
        # Dead end with this prefix: fall through and try the next candidate.
    return None

def _get_attr_tokenized(attributions, in_token, out_token):
    """Look up the stored attribution(s) of `in_token` for `out_token`.

    `attributions` is indexed first by output token, then by input token.
    A missing key at either level raises KeyError.
    """
    per_output = attributions[out_token]
    return per_output[in_token]

def generate_attribution_matrix(attributions, in_tokens, out_tokens):
    """
    Build a saliency table as a pandas DataFrame.

    Rows are labelled by `in_tokens` and columns by `out_tokens`; each cell is
    whatever `_get_attr_tokenized` returns for that (input, output) pair.
    """
    # One tuple per input token, holding one cell per output token.
    rows = [
        tuple(_get_attr_tokenized(attributions, src, dst) for dst in out_tokens)
        for src in in_tokens
    ]
    return pd.DataFrame(rows, columns=out_tokens, index=in_tokens)

def get_attr_of_input(attributions, in_token, out_token):
    """
    Look up the saliency of an input word with respect to an output word.

    attributions : dict keyed by output tokens, each value a dict keyed by
        input tokens
    in_token : str, input word (may span several stored tokens)
    out_token : str, output word (may span several stored tokens)

    A word that is not stored as a single token is split into stored pieces
    with `construct`. Returns the DataFrame produced by
    `generate_attribution_matrix` (input pieces on the rows, output pieces on
    the columns), or None when either word cannot be matched or assembled.
    """
    if out_token in attributions:
        out_tokens = [out_token]
    elif (construction := construct(attributions.keys(), out_token)) is not None:
        # The reconstructed pieces belong to the *output* side.
        out_tokens = construction
    else:
        return None

    # The input-token keys of the first output piece are used to resolve the
    # input word.
    input_attrs = attributions[out_tokens[0]]
    if in_token in input_attrs:
        in_tokens = [in_token]
    elif (construction := construct(input_attrs.keys(), in_token)) is not None:
        in_tokens = construction
    else:
        return None

    return generate_attribution_matrix(attributions, in_tokens, out_tokens)

In [8]:
# Sanity check: rebuild the word "bagged" from the cleaned input tokens that
# were recorded for the generated token " " of the first pair's benign prompt
# (iterating the inner dict yields its input-token keys).
construct(attributions_ig[0][0][" "], "bagged")

['bag', 'ged']

In [12]:
# Positions inside each (benign, adversarial) tuple.
BENIGN, ADVERSARIAL = 0, 1
# Which prompt pair to inspect.
idx = 0

# Generated continuation for the benign prompt, then the IG saliency tables
# for a few (prompt variant, input word, output word) queries.
print(output_from_pairs[idx][BENIGN])
for variant, in_word, out_word in (
    (BENIGN, "bagged", "goal"),
    (BENIGN, "scored", "goal"),
    (ADVERSARIAL, "scored", "century"),
):
    print(get_attr_of_input(attributions_ig[idx][variant], in_word, out_word))

  a goal
                       goal
bag    [0.0432170415505701]
ged  [0.017419809582593163]
                        goal
scored  [0.0450001573294738]
                                           century
scored  [0.05390304364669366, 0.02892932742545027]
