In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

from collections import Counter

from IPython.core.display import HTML, display
from models.generators import DiffusionModel
from models.discriminators import DiscriminatorNetCIFAR10
from torchvision import transforms
import torch.nn.functional as F
from captum.attr import visualization as viz
from captum.attr import Lime, LimeBase
from captum._utils.models.linear_model import SkLearnLinearRegression, SkLearnLasso
import os
import json
from torchtext import vocab
from torchtext.data import get_tokenizer
import numpy as np

In [2]:
#generator = DiffusionModel()
#Discriminator = DiscriminatorNetCIFAR10

class model():
    def __init__(self,pretrained_path):
        
        self.encoder = DiffusionModel()
        #self.decoder = DiscriminatorNetCIFAR10()
        self.decoder = DiscriminatorNetCIFAR10()
        #discriminator = discriminator.load_state_dict(torch.load(pretrained_path, map_location=lambda storage, loc: storage)['model_state_dict'])
        self.decoder.load_state_dict(torch.load(pretrained_path, map_location=lambda storage, loc: storage)['model_state_dict'])
        self.transform = transforms.Compose([
            transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),transforms.Resize((32,32))])
    
    def forward(self, x):
        label = self.encoder.forward(x)
        
        label = self.transform(label)
        label = label.unsqueeze(0)
        #print(x.shape)
        
        label = self.decoder(label)
        return label
    
    def encode_part(self,input):
        return self.encoder.forward(input)

        



In [3]:
pretrained_path = r"D:\Praktikum\xai-diffusion\xai-praktikum\xaigan\src\results\cifar-10\CIFAR10_only_SaliencyTrain\discriminator.pt"
model = model(pretrained_path)
text_inputs = ["A glass of beer is sitting next to a vase full of flowers."]
label = model.forward(text_inputs)
print(label)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


  0%|          | 0/50 [00:00<?, ?it/s]

tensor([[[[0.8927]]]], grad_fn=<SigmoidBackward0>)


In [None]:
ag_ds = list(AG_NEWS(split='train'))

ag_train, ag_val = ag_ds[:100000], ag_ds[100000:]

tokenizer = get_tokenizer('basic_english')
word_counter = Counter()
for (label, line) in ag_train:
    word_counter.update(tokenizer(line))
voc = Vocab(word_counter, min_freq=10)

print('Vocabulary size:', len(voc))

num_class = len(set(label for label, _ in ag_train))
print('Num of classes:', num_class)



tokenizer = get_tokenizer('basic_english')
word_counter = Counter()
def collate_batch(batch):
    labels = torch.tensor([label - 1 for label, _ in batch]) 
    text_list = [tokenizer(line) for _, line in batch]
    
    # flatten tokens across the whole batch
    text = torch.tensor([voc[t] for tokens in text_list for t in tokens])
    # the offset of each example
    offsets = torch.tensor(
        [0] + [len(tokens) for tokens in text_list][:-1]
    ).cumsum(dim=0)

    return labels, text, offsets


test_label = 2  # {1: World, 2: Sports, 3: Business, 4: Sci/Tec}
test_line = ('US Men Have Right Touch in Relay Duel Against Australia THENS, Aug. 17 '
            '- So Michael Phelps is not going to match the seven gold medals won by Mark Spitz. '
            'And it is too early to tell if he will match Aleksandr Dityatin, '
            'the Soviet gymnast who won eight total medals in 1980.')

test_labels, test_text, test_offsets = collate_batch([(test_label, test_line)])
print(test_labels)

In [42]:
voc = vocab.GloVe(name='6B', dim=50)
tokenizer = get_tokenizer("basic_english")
text_input_batch = ["A glass of beer is sitting next to a vase full of flowers."]
text_list = [tokenizer(line) for line in text_input_batch]
for tokens in text_list:
    for t in tokens:
        #print(voc[t])
        pass
text = [voc[t] for tokens in text_list for t in tokens]
print(len(text))

#text = np.array(text)
embedding_list = []
for i in text:
    i = i.numpy()
    i = i.tolist()
    
    embedding_list.append(i)

text = np.array(embedding_list)
text_embedding = torch.tensor(text)
print(text_embedding.shape)
offsets = torch.tensor(
        [0] + [len(tokens) for tokens in text_list][:-1]
    ).cumsum(dim=0)





#print(type(text))
#print(text)
#text = text.astype(np.float64)
#print(text.shape)
#text = torch.tensor(text, dtype=torch.float64)



14
torch.Size([14, 50])


In [43]:
# remove the batch dimension for the embedding-bag model
def forward_func(text):
    return model(text)

# encode text indices into latent representations & calculate cosine similarity
def exp_embedding_cosine_distance(original_inp, perturbed_inp, _, **kwargs):
    print(original_inp.shape)
    original_emb = model.encode_part(original_inp, None)
    perturbed_emb = model.encode_part(perturbed_inp, None)
    distance = 1 - F.cosine_similarity(original_emb, perturbed_emb, dim=1)
    return torch.exp(-1 * (distance ** 2) / 2)

# binary vector where each word is selected independently and uniformly at random
def bernoulli_perturb(text, **kwargs):
    probs = torch.ones_like(text) * 0.5
    return torch.bernoulli(probs).long()

# remove absenst token based on the intepretable representation sample
def interp_to_input(interp_sample, original_input, **kwargs):
    return original_input[interp_sample.bool()].view(original_input.size(0), -1)

lasso_lime_base = LimeBase(
    forward_func, 
    interpretable_model=SkLearnLasso(alpha=0.08),
    similarity_func=exp_embedding_cosine_distance,
    perturb_func=bernoulli_perturb,
    perturb_interpretable_space=True,
    from_interp_rep_transform=interp_to_input,
    to_interp_rep_transform=None
)

In [44]:
attrs = lasso_lime_base.attribute(
    text_input_batch, # add batch dimension for Captum
    target=torch.tensor([0]),
    #additional_forward_args=(offsets,),
    n_samples=5,
    show_progress=True
).squeeze(0)

print('Attribution range:', attrs.min().item(), 'to', attrs.max().item())

AttributeError: 'str' object has no attribute 'device'

In [21]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

class_names = ["false","real"]
text = "A glass of beer is sitting next to a vase full of flowers."

explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(text, model.forward, num_features=3, num_samples=100,labels=[0,1])
#print('Document id: %d' % idx)
print(text)
#print('Predicted class =', class_names[mobj.get_pred_list(text).reshape(1,-1)[0,0]])
#print('True class: %s' % true_label)

  0%|          | 0/50 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 24.41 GiB (GPU 0; 16.00 GiB total capacity; 5.77 GiB already allocated; 4.93 GiB free; 8.62 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF