In [1]:
from transformers import BertTokenizer
from BERT.BERT_explainability.modules.BERT.ExplanationGenerator import Generator
from BERT.BERT_explainability.modules.BERT.BertForSequenceClassification import BertForSequenceClassification
from transformers import BertTokenizer
from BERT.BERT_explainability.modules.BERT.ExplanationGenerator import Generator
from transformers import AutoTokenizer

from captum.attr import (
    visualization
)
import torch

In [2]:
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2").to("cuda")
model.eval()
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")
# initialize the explanations generator
explanations = Generator(model)

classifications = ["NEGATIVE", "POSITIVE"]


## Positive sentiment example

In [3]:
# encode a sentence
text_batch = ["This movie was the best movie I have ever seen! some scenes were ridiculous, but acting was great."]
encoding = tokenizer(text_batch, return_tensors='pt')
input_ids = encoding['input_ids'].to("cuda")
attention_mask = encoding['attention_mask'].to("cuda")

# true class is positive - 1
true_class = 1

# generate an explanation for the input
expl = explanations.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=0)[0]
# normalize scores
expl = (expl - expl.min()) / (expl.max() - expl.min())

# get the model classification
output = torch.nn.functional.softmax(model(input_ids=input_ids, attention_mask=attention_mask)[0], dim=-1)
classification = output.argmax(dim=-1).item()
# get class name
class_name = classifications[classification]
# if the classification is negative, higher explanation scores are more negative
# flip for visualization
if class_name == "NEGATIVE":
  expl *= (-1)

tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())
print([(tokens[i], expl[i].item()) for i in range(len(tokens))])
vis_data_records = [visualization.VisualizationDataRecord(
                                expl,
                                output[0][classification],
                                classification,
                                true_class,
                                true_class,
                                1,       
                                tokens,
                                1)]
visualization.visualize_text(vis_data_records)

[('[CLS]', 0.0), ('this', 0.42324313521385193), ('movie', 0.2664182186126709), ('was', 0.26587697863578796), ('the', 0.2951054871082306), ('best', 0.6248753070831299), ('movie', 0.2689264714717865), ('i', 0.1721399873495102), ('have', 0.08651026338338852), ('ever', 0.1403694748878479), ('seen', 0.17908667027950287), ('!', 0.5743358135223389), ('some', 0.0019824712071567774), ('scenes', 0.03306420147418976), ('were', 0.024729151278734207), ('ridiculous', 0.04171764850616455), (',', 0.0), ('but', 0.4711207449436188), ('acting', 0.4313846528530121), ('was', 0.5168573260307312), ('great', 1.0), ('.', 0.017998818308115005), ('[SEP]', 0.07626653462648392)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (1.00),1.0,1.0,"[CLS] this movie was the best movie i have ever seen ! some scenes were ridiculous , but acting was great . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,1 (1.00),1.0,1.0,"[CLS] this movie was the best movie i have ever seen ! some scenes were ridiculous , but acting was great . [SEP]"
,,,,


## Negative sentiment example

In [4]:
# encode a sentence
text_batch = ["I really didn't like this movie. Some of the actors were good, but overall the movie was boring."]
encoding = tokenizer(text_batch, return_tensors='pt')
input_ids = encoding['input_ids'].to("cuda")
attention_mask = encoding['attention_mask'].to("cuda")

# generate an explanation for the input
expl = explanations.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=0)[0]
# normalize scores
expl = (expl - expl.min()) / (expl.max() - expl.min())

# get the model classification
output = torch.nn.functional.softmax(model(input_ids=input_ids, attention_mask=attention_mask)[0], dim=-1)
classification = output.argmax(dim=-1).item()
# get class name
class_name = classifications[classification]
# if the classification is negative, higher explanation scores are more negative
# flip for visualization
if class_name == "NEGATIVE":
  expl *= (-1)

tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())
print([(tokens[i], expl[i].item()) for i in range(len(tokens))])
vis_data_records = [visualization.VisualizationDataRecord(
                                expl,
                                output[0][classification],
                                classification,
                                1,
                                1,
                                1,       
                                tokens,
                                1)]
visualization.visualize_text(vis_data_records)

[('[CLS]', -0.0), ('i', -0.16696225106716156), ('really', -0.17400380969047546), ('didn', -0.2917846739292145), ("'", -0.0), ('t', -0.3639659881591797), ('like', -0.1539672464132309), ('this', -0.17454388737678528), ('movie', -0.055396538227796555), ('.', -0.018834155052900314), ('some', -0.010470278561115265), ('of', -0.002222779905423522), ('the', -0.006709620356559753), ('actors', -0.03405332937836647), ('were', -0.0161523949354887), ('good', -0.02452881447970867), (',', -0.004610343836247921), ('but', -0.05264212563633919), ('overall', -0.3095993995666504), ('the', -0.20328642427921295), ('movie', -0.04986630380153656), ('was', -0.6278334259986877), ('boring', -1.0), ('.', -0.014142977073788643), ('[SEP]', -0.06643891334533691)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (1.00),1.0,1.0,"[CLS] i really didn ' t like this movie . some of the actors were good , but overall the movie was boring . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (1.00),1.0,1.0,"[CLS] i really didn ' t like this movie . some of the actors were good , but overall the movie was boring . [SEP]"
,,,,


## Choosing class for visualization example

In [5]:
# encode a sentence
text_batch = ["I hate that I love you."]
encoding = tokenizer(text_batch, return_tensors='pt')
input_ids = encoding['input_ids'].to("cuda")
attention_mask = encoding['attention_mask'].to("cuda")

# true class is positive - 1
true_class = 1

# generate an explanation for the input
target_class = 0
expl = explanations.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=11, index=target_class)[0]
# normalize scores
expl = (expl - expl.min()) / (expl.max() - expl.min())

# get the model classification
output = torch.nn.functional.softmax(model(input_ids=input_ids, attention_mask=attention_mask)[0], dim=-1)

# get class name
class_name = classifications[target_class]
# if the classification is negative, higher explanation scores are more negative
# flip for visualization
if class_name == "NEGATIVE":
  expl *= (-1)

tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())
print([(tokens[i], expl[i].item()) for i in range(len(tokens))])
vis_data_records = [visualization.VisualizationDataRecord(
                                expl,
                                output[0][classification],
                                classification,
                                true_class,
                                true_class,
                                1,       
                                tokens,
                                1)]
visualization.visualize_text(vis_data_records)

[('[CLS]', -0.0), ('i', -0.25013232231140137), ('hate', -1.0), ('that', -0.48772862553596497), ('i', -0.17279672622680664), ('love', -0.17464807629585266), ('you', -0.04768778756260872), ('.', -0.00017949726316146553), ('[SEP]', -0.0)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (0.90),1.0,1.0,[CLS] i hate that i love you . [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (0.90),1.0,1.0,[CLS] i hate that i love you . [SEP]
,,,,


In [6]:
# encode a sentence
text_batch = ["I hate that I love you."]
encoding = tokenizer(text_batch, return_tensors='pt')
input_ids = encoding['input_ids'].to("cuda")
attention_mask = encoding['attention_mask'].to("cuda")

# true class is positive - 1
true_class = 1

# generate an explanation for the input
target_class = 1
expl = explanations.generate_LRP(input_ids=input_ids, attention_mask=attention_mask, start_layer=11, index=target_class)[0]
# normalize scores
expl = (expl - expl.min()) / (expl.max() - expl.min())

# get the model classification
output = torch.nn.functional.softmax(model(input_ids=input_ids, attention_mask=attention_mask)[0], dim=-1)

# get class name
class_name = classifications[target_class]
# if the classification is negative, higher explanation scores are more negative
# flip for visualization
if class_name == "NEGATIVE":
  expl *= (-1)

tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten())
print([(tokens[i], expl[i].item()) for i in range(len(tokens))])
vis_data_records = [visualization.VisualizationDataRecord(
                                expl,
                                output[0][classification],
                                classification,
                                true_class,
                                true_class,
                                1,       
                                tokens,
                                1)]
visualization.visualize_text(vis_data_records)

[('[CLS]', 0.0), ('i', 0.24049873650074005), ('hate', 0.1725553274154663), ('that', 0.18329688906669617), ('i', 0.19432033598423004), ('love', 1.0), ('you', 0.22601081430912018), ('.', 0.0), ('[SEP]', 0.000259235268458724)]


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (0.90),1.0,1.0,[CLS] i hate that i love you . [SEP]
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,0 (0.90),1.0,1.0,[CLS] i hate that i love you . [SEP]
,,,,
