In [1]:
import sys, time, pickle, torch
sys.path.insert(0, '../../Models')
sys.path.insert(0, '../../Utils')
sys.path.insert(0, '../../Preprocess')
import numpy as np
import pandas as pd
from preload_models import get_sst2_tok_n_model
from _utils import sample_random_glue_sst2, get_continuation_mapping, \
                    get_continuous_attributions, get_continuous_raw_inputs, \
                    collect_info_for_metric, save_info

In [2]:
sst2_data_raw, targets, idxs = sample_random_glue_sst2()

Reusing dataset glue (/home/user/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/user/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3b24abff24d1d8c0.arrow
Loading cached processed dataset at /home/user/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-5960909ab3834668.arrow


In [3]:
tokenizer, model = get_sst2_tok_n_model()

In [4]:
#define some containers to save some info
model_out_list, raw_attr_list, conti_attr_list, raw_input_list = [], [], [], []

In [5]:
from captum.attr import KernelShap
from captum.attr import visualization 

In [6]:
model.model.roberta.embeddings

RobertaEmbeddings(
  (word_embeddings): Embedding(50265, 1024, padding_idx=1)
  (position_embeddings): Embedding(514, 1024, padding_idx=1)
  (token_type_embeddings): Embedding(1, 1024)
  (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [7]:
# lime = Lime(forward_func=model.forward)
ks   = KernelShap(model)

In [8]:
def generate_record(raw_review, target):
    #tokenizer operations
    tokenized = tokenizer(raw_review, truncation=True, return_offsets_mapping=True)
    offset_mapping = tokenized['offset_mapping']
    conti_map = get_continuation_mapping(offset_mapping)
    input_ids = torch.tensor(tokenized['input_ids']).unsqueeze(0)
    detokenized = [t.replace('Ġ', '') for t in tokenizer.convert_ids_to_tokens(input_ids[0])]
    
    #feeding input forward 
    input_emb = model.get_embeddings(input_ids)
    pred_prob = model(input_emb).item()
    
    #categorizing results
    pred_class = 'Pos' if pred_prob > 0.5 else 'Neg' 
    true_class = 'Pos' if target > 0.5 else 'Neg' 
    
    #attribution algorithm working
    attribution = ks.attribute(input_emb, n_samples=1000)
    word_attributions = attribution.squeeze(0).sum(dim=1)
    word_attributions /= torch.norm(word_attributions)
    attr_score = torch.sum(word_attributions)
    attr_class = 'Pos' if attr_score > 0.5 else 'Neg'
    convergence_score = None
    
    
    #re-organizing tensors and arrays because words get split down
    conti_attr = get_continuous_attributions(conti_map, word_attributions)
    raw_input = get_continuous_raw_inputs(conti_map, detokenized)

#     print(f'word attributions {word_attributions}')
#     print(f'pred_prob {pred_prob}')
#     print(f'pred_class {pred_class}')
#     print(f'true_class {true_class}')
#     print(f'attribution {attribution}')
#     print(f'attr_class {attr_class}')
#     print(f'attr_score {attr_score}')
#     print(f'raw_input {raw_input}')

        
#     collect info for metrics later
    collect_info_for_metric(model_out_list, pred_prob, raw_attr_list, attribution, conti_attr_list, conti_attr, raw_input_list, raw_input)
        
    
    visual_record = visualization.VisualizationDataRecord(word_attributions=word_attributions,
                                                         pred_prob=pred_prob,
                                                         pred_class=pred_class,
                                                         true_class=true_class,
                                                         attr_class=attr_class,
                                                         attr_score=attr_score,
                                                         raw_input=raw_input,
                                                         convergence_score=convergence_score)
        
        
    return visual_record
      
    

In [9]:
for i, (datum_raw, target) in enumerate(zip(sst2_data_raw, targets), start=1):
    print(f'Raw review: {datum_raw}')
    print(f'GT target: {target}')
    visual_record=generate_record(datum_raw, target)
    print(visualization.visualize_text([visual_record]))
   

Raw review: its oscar nomination 
GT target: 1
word attr tensor([-0.2282, -0.0059,  0.2472, -0.1398, -0.6072,  0.5969,  0.3770])
conti attr [tensor(-0.2282), tensor(-0.0059), tensor(0.1074), tensor(-0.6072), tensor(0.5969), tensor(0.3770)]
detokenized ['<s>', 'its', 'o', 'scar', 'nomination', '', '</s>']
len conti_raw 6
conti_raw ['<s>', 'its', 'oscar', 'nomination', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.24,#s its oscar nomination #/s
,,,,


<IPython.core.display.HTML object>
Raw review: shenanigans and slapstick 
GT target: 1




word attr tensor([ 0.0596,  0.4509, -0.0267, -0.3448, -0.1808, -0.1986,  0.7460, -0.1062,
         0.0491, -0.1764])
conti attr [tensor(0.0596), tensor(-0.1014), tensor(-0.1986), tensor(0.6398), tensor(0.0491), tensor(-0.1764)]
detokenized ['<s>', 's', 'hen', 'an', 'igans', 'and', 'slap', 'stick', '', '</s>']
len conti_raw 6
conti_raw ['<s>', 'shenanigans', 'and', 'slapstick', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.27,#s shenanigans and slapstick #/s
,,,,


<IPython.core.display.HTML object>
Raw review: an unsettling sight , 
GT target: 0
word attr tensor([-0.5382, -0.4155, -0.2795, -0.2632, -0.4037, -0.2949, -0.3746])
conti attr [tensor(-0.5382), tensor(-0.4155), tensor(-0.2795), tensor(-0.2632), tensor(-0.4037), tensor(-0.2949), tensor(-0.3746)]
detokenized ['<s>', 'an', 'unsettling', 'sight', ',', '', '</s>']
len conti_raw 7
conti_raw ['<s>', 'an', 'unsettling', 'sight', ',', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Neg,Neg (0.01),Neg,-2.57,"#s an unsettling sight , #/s"
,,,,


<IPython.core.display.HTML object>
Raw review: the climactic hourlong cricket match 
GT target: 1




word attr tensor([ 0.1501, -0.1215, -0.3540,  0.3166, -0.1100,  0.5941,  0.3661,  0.2015,
        -0.1872, -0.4030])
conti attr [tensor(0.1501), tensor(-0.1215), tensor(-0.0373), tensor(0.4841), tensor(0.3661), tensor(0.2015), tensor(-0.1872), tensor(-0.4030)]
detokenized ['<s>', 'the', 'clim', 'actic', 'hour', 'long', 'cricket', 'match', '', '</s>']
len conti_raw 8
conti_raw ['<s>', 'the', 'climactic', 'hourlong', 'cricket', 'match', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.45,#s the climactic hourlong cricket match #/s
,,,,


<IPython.core.display.HTML object>
Raw review: alternating between facetious comic parody and pulp melodrama , this smart-aleck movie ... tosses around some intriguing questions about the difference between human and android life 
GT target: 1




word attr tensor([0.0124, 0.2986, 0.0718, 0.0894, 0.1130, 0.2143, 0.2033, 0.1901, 0.2320,
        0.2064, 0.2247, 0.0657, 0.1736, 0.1469, 0.2022, 0.0366, 0.2077, 0.2843,
        0.1067, 0.0634, 0.0218, 0.0842, 0.1824, 0.2375, 0.1439, 0.2232, 0.1063,
        0.1691, 0.0882, 0.1729, 0.0544, 0.2318, 0.1243, 0.0742, 0.0462, 0.1009,
        0.2038])
conti attr [tensor(0.0124), tensor(0.3704), tensor(0.0894), tensor(0.3273), tensor(0.2033), tensor(0.1901), tensor(0.2320), tensor(0.2064), tensor(0.4640), tensor(0.1469), tensor(0.2022), tensor(0.6353), tensor(0.0634), tensor(0.0218), tensor(0.2667), tensor(0.2375), tensor(0.1439), tensor(0.2232), tensor(0.1063), tensor(0.1691), tensor(0.0882), tensor(0.1729), tensor(0.0544), tensor(0.2318), tensor(0.1243), tensor(0.0742), tensor(0.0462), tensor(0.1009), tensor(0.2038)]
detokenized ['<s>', 'altern', 'ating', 'between', 'facet', 'ious', 'comic', 'parody', 'and', 'pulp', 'mel', 'od', 'rama', ',', 'this', 'smart', '-', 'ale', 'ck', 'movie', '...',

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Pos,5.41,"#s alternating between facetious comic parody and pulp melodrama , this smart-aleck movie ... tosses around some intriguing questions about the difference between human and android life #/s"
,,,,


<IPython.core.display.HTML object>
Raw review: to be a part of that elusive adult world 
GT target: 1




word attr tensor([-0.1938,  0.0122, -0.2976,  0.3255,  0.2467, -0.2805,  0.0850, -0.0619,
         0.4945,  0.4801, -0.2899, -0.2410])
conti attr [tensor(-0.1938), tensor(0.0122), tensor(-0.2976), tensor(0.3255), tensor(0.2467), tensor(-0.2805), tensor(0.0850), tensor(-0.0619), tensor(0.4945), tensor(0.4801), tensor(-0.2899), tensor(-0.2410)]
detokenized ['<s>', 'to', 'be', 'a', 'part', 'of', 'that', 'elusive', 'adult', 'world', '', '</s>']
len conti_raw 12
conti_raw ['<s>', 'to', 'be', 'a', 'part', 'of', 'that', 'elusive', 'adult', 'world', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.28,#s to be a part of that elusive adult world #/s
,,,,


<IPython.core.display.HTML object>
Raw review: emotional power 
GT target: 1
word attr tensor([ 0.2888,  0.6338, -0.5665,  0.1771,  0.0568, -0.3993])
conti attr [tensor(0.2888), tensor(0.0673), tensor(0.1771), tensor(0.0568), tensor(-0.3993)]
detokenized ['<s>', 'em', 'otional', 'power', '', '</s>']
len conti_raw 5
conti_raw ['<s>', 'emotional', 'power', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.19,#s emotional power #/s
,,,,


<IPython.core.display.HTML object>
Raw review: reminds you of why animation is such a perfect medium for children , because of the way it allows the mind to enter and accept another world 
GT target: 1




word attr tensor([0.8114, 0.0846, 0.1080, 0.0955, 0.1080, 0.0922, 0.1135, 0.0100, 0.1359,
        0.0552, 0.1173, 0.0975, 0.0641, 0.2150, 0.0744, 0.0354, 0.1064, 0.0942,
        0.1121, 0.0268, 0.0620, 0.0082, 0.0695, 0.0750, 0.0420, 0.0480, 0.1864,
        0.0854, 0.1774, 0.0920, 0.1394, 0.1590])
conti attr [tensor(0.8114), tensor(0.2881), tensor(0.1080), tensor(0.0922), tensor(0.1135), tensor(0.0100), tensor(0.1359), tensor(0.0552), tensor(0.1173), tensor(0.0975), tensor(0.0641), tensor(0.2150), tensor(0.0744), tensor(0.0354), tensor(0.1064), tensor(0.0942), tensor(0.1121), tensor(0.0268), tensor(0.0620), tensor(0.0082), tensor(0.0695), tensor(0.0750), tensor(0.0420), tensor(0.0480), tensor(0.1864), tensor(0.0854), tensor(0.1774), tensor(0.0920), tensor(0.1394), tensor(0.1590)]
detokenized ['<s>', 'rem', 'ind', 's', 'you', 'of', 'why', 'animation', 'is', 'such', 'a', 'perfect', 'medium', 'for', 'children', ',', 'because', 'of', 'the', 'way', 'it', 'allows', 'the', 'mind', 'to', 'ente

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Pos,3.7,"#s reminds you of why animation is such a perfect medium for children , because of the way it allows the mind to enter and accept another world #/s"
,,,,


<IPython.core.display.HTML object>
Raw review: unparalleled proportions , writer-director parker 
GT target: 1




word attr tensor([ 0.2106,  0.5672, -0.0475,  0.3904, -0.0495,  0.0833,  0.3831,  0.3428,
         0.2253,  0.2392, -0.0434,  0.3094])
conti attr [tensor(0.2106), tensor(0.5197), tensor(0.3904), tensor(-0.0495), tensor(0.8093), tensor(0.4645), tensor(-0.0434), tensor(0.3094)]
detokenized ['<s>', 'un', 'paralleled', 'proportions', ',', 'writer', '-', 'director', 'park', 'er', '', '</s>']
len conti_raw 8
conti_raw ['<s>', 'unparalleled', 'proportions', ',', 'writer-director', 'parker', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Pos,2.61,"#s unparalleled proportions , writer-director parker #/s"
,,,,


<IPython.core.display.HTML object>
Raw review: this surprisingly decent flick 
GT target: 1
word attr tensor([ 0.2167,  0.1579,  0.6679, -0.5364, -0.2136, -0.3219,  0.2122])
conti attr [tensor(0.2167), tensor(0.1579), tensor(0.6679), tensor(-0.5364), tensor(-0.2136), tensor(-0.3219), tensor(0.2122)]
detokenized ['<s>', 'this', 'surprisingly', 'decent', 'flick', '', '</s>']
len conti_raw 7
conti_raw ['<s>', 'this', 'surprisingly', 'decent', 'flick', '', '</s>']


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Pos,Pos (1.00),Neg,0.18,#s this surprisingly decent flick #/s
,,,,


<IPython.core.display.HTML object>
Raw review: about the best thing you could say about narc is that it 's a rock-solid little genre picture . 
GT target: 1




KeyboardInterrupt: 

In [None]:
save_info(idxs, sst2_data_raw, targets, model_out_list, raw_attr_list, conti_attr_list, raw_input_list, fname='deep_lift_out.pkl')