## Compute BoW Importances offline

In [1]:
import os
# Switch the working directory to the parent of the current one, so that the
# relative paths used further down (e.g. "ground_truth/...") are resolved from there.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel moves up one more level each time.
os.chdir("..")

In [2]:
from slalom_explanations.attribution_methods import get_groundtruth_importance, BoW, NaiveBayesEstim
from datasets import load_dataset
from transformers import AutoTokenizer
import torch

In [17]:
from copy import deepcopy
def compute_ref_importances(gt_list, use_dataset, tokenizer, max_seq_len) -> dict:
    """Collect reference importance scores for every entry of ``gt_list``.

    A single ``BoW`` object is built from ``use_dataset``, ``tokenizer`` and
    ``max_seq_len`` and shared by all entries. Each entry is handed to
    ``get_groundtruth_importance`` together with that ``BoW`` object, and a deep
    copy of the returned object's ``get_importance()`` result is stored.

    Args:
        gt_list: sized iterable of identifiers forwarded as the first argument
            of ``get_groundtruth_importance`` (presumably ground-truth model
            names -- confirm against callers).
        use_dataset: dataset passed to ``BoW`` as ``ds``.
        tokenizer: tokenizer passed to ``BoW``.
        max_seq_len: value passed to ``BoW`` as ``max_seq_len``.

    Returns:
        dict mapping each entry of ``gt_list`` to its copied importances;
        an empty dict when ``gt_list`` is empty (``BoW`` is not built then).
    """
    reference_scores = {}

    # Only build the (shared) BoW representation when something was requested.
    if len(gt_list) > 0:
        print(f"Tokenizing with max_seq_len = {max_seq_len}")
        bag_of_words = BoW(ds=use_dataset, tokenizer=tokenizer, max_seq_len=max_seq_len)

        for gt_name in gt_list:
            print("getting ground_truth for model", gt_name)
            # Deep-copy so the stored scores are independent of the estimator object.
            reference_scores[gt_name] = deepcopy(
                get_groundtruth_importance(gt_name, bag_of_words).get_importance()
            )

        print("Got reference importances.")

    return reference_scores

In [8]:
from copy import deepcopy
# Create the output directory up front so that torch.save below cannot fail on a
# missing folder after the expensive BoW / Naive Bayes computation has finished.
os.makedirs("ground_truth", exist_ok=True)

# For every (dataset, model type) pair: build a BoW object with the matching
# tokenizer, run NaiveBayesEstim on it and store the result of
# get_signed_importance() under the key "nb" in
# ground_truth/gt_<model_type>_<dataset_name>.pt.
for dataset_name in ["imdb", "yelp"]:
    if dataset_name == "imdb":
        imdb = load_dataset('imdb').with_format('torch', device="cpu") # format to pytorch tensors, but leave data on cpu
        # Fixed-seed shuffle, then keep the first 5000 train / 20000 test examples.
        imdb["train"] = imdb["train"].shuffle(seed=42).select(range(5000))
        imdb["test"] = imdb["test"].shuffle(seed=42).select(range(20000))
        dataset = imdb
    elif dataset_name == "yelp":
        yelp = load_dataset('yelp_polarity').with_format('torch', device='cpu')
        # Same subsampling as for IMDB.
        yelp["train"] = yelp["train"].shuffle(seed=42).select(range(5000))
        yelp["test"] = yelp["test"].shuffle(seed=42).select(range(20000))
        dataset = yelp
    else:
        # Report the loop variable; the previously referenced `config` object is
        # not defined in this notebook and would have raised a NameError instead.
        raise ValueError(f"Unknown dataset {dataset_name}.")
    for model_type in ["distilbert", "bert", "gpt2", "roberta"]:
        # NOTE(review): `padding=512` is forwarded to the tokenizer constructor as an
        # extra keyword; it is unclear whether it has any effect here -- confirm.
        if model_type == "gpt2":
            tokenizer = AutoTokenizer.from_pretrained('gpt2', use_fast=True, padding=512)
            # Fall back to the EOS token when the tokenizer defines no pad token.
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
        elif model_type == "distilbert":
            # distilbert is given the bert-base-uncased tokenizer here.
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True, padding=512)
            # NOTE(review): use_cls is never read in this cell -- possibly a leftover.
            use_cls = True
        elif model_type == "roberta":
            tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-base', use_fast=True, padding=512)
            use_cls = True
        elif model_type == "bert":
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True, padding=512)
        else:
            # Without this branch an unknown model type would silently reuse the
            # tokenizer left over from the previous iteration.
            raise ValueError(f"Unknown model type {model_type}.")

        bow = BoW(ds=dataset, tokenizer=tokenizer)
        bow_nb_mult = NaiveBayesEstim(bow, multiplicities=True)
        # Deep-copy so the saved object is independent of the estimator instance.
        importances_nb_mult = deepcopy(bow_nb_mult.get_signed_importance())
        torch.save({"nb": importances_nb_mult}, f"ground_truth/gt_{model_type}_{dataset_name}.pt")

self.tokenizer: BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (936 > 512). Running this sequence through the model will result in indexing errors


(5000, 21377)
self.tokenizer: BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (936 > 512). Running this sequence through the model will result in indexing errors


(5000, 21377)
self.tokenizer: GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
} type(self.tokenizer: <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (1088 > 1024). Running this sequence through the model will result in indexing errors


(5000, 23669)
self.tokenizer: RobertaTokenizerFast(name_or_path='FacebookAI/roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.roberta.token

Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors


(5000, 23669)
self.tokenizer: BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (977 > 512). Running this sequence through the model will result in indexing errors


(5000, 14979)
self.tokenizer: BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (977 > 512). Running this sequence through the model will result in indexing errors


(5000, 14979)
self.tokenizer: GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
} type(self.tokenizer: <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


Token indices sequence length is longer than the specified maximum sequence length for this model (1178 > 1024). Running this sequence through the model will result in indexing errors


(5000, 16709)
self.tokenizer: RobertaTokenizerFast(name_or_path='FacebookAI/roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
} type(self.tokenizer: <class 'transformers.models.roberta.token

Token indices sequence length is longer than the specified maximum sequence length for this model (963 > 512). Running this sequence through the model will result in indexing errors


(5000, 16709)


In [None]:
# Sanity check: reload the file written for model type "bert" on "imdb" and display
# the object stored under the "nb" key (the only key written by the loop above).
torch.load("ground_truth/gt_bert_imdb.pt")["nb"]

In [5]:
# Re-create the bert-base-uncased fast tokenizer to inspect its tokenization below.
# NOTE(review): this rebinds the global `tokenizer` used by the loop cell above.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True, padding=512)

In [8]:
# Encode one example sentence into token ids (decoded again in the next cell).
res_l = tokenizer.encode("This is a fantastic movie starring Benedict Cumberbatch.")

In [9]:
# Map the ids back to token strings to see how the sentence was split, including
# the added [CLS]/[SEP] tokens and the '##'-prefixed sub-word pieces.
tokenizer.convert_ids_to_tokens(res_l)

['[CLS]',
 'this',
 'is',
 'a',
 'fantastic',
 'movie',
 'starring',
 'benedict',
 'cum',
 '##ber',
 '##bat',
 '##ch',
 '.',
 '[SEP]']