# 1. Results

In [4]:
import pandas as pd
# display CSV file
file_name = "extension_res.csv"
df = pd.read_csv(file_name, index_col=0) 
print(df)

               Pixel acc  mIoU   mAP       seconds
gradcam             0.62  0.41  0.72   1052.000000
layercam            0.65  0.41  0.71    977.612902
eigengradcam        0.63  0.38  0.70   1107.848377
eigencam            0.53  0.34  0.66   1160.776685
xgradcam            0.55  0.35  0.70   1094.452638
gradcam++           0.58  0.36  0.69   1045.633355
gradcam++           0.55  0.32  0.65   1054.720808
scorecam            0.68  0.48  0.79   9361.606697
ablationcam         0.64  0.46  0.65   9306.145710
DDS+layerCAM        0.65  0.41  0.71  22914.000000
DDS+scorecam        0.67  0.48  0.79  46800.000000


# 2. Load libraries


In [4]:
import numpy as np
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from numpy import *
import argparse
from PIL import Image
import imageio
import os
from tqdm import tqdm
from utilities.metrices import *
from utilities import render
from utilities.saver import Saver
from utilities.iou import IoU
from data.Imagenet import Imagenet_Segmentation
import cv2
import numpy as np
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import torch.nn.functional as F
import csv

# needed for extension explanation methods
from pytorch_grad_cam import GradCAM, \
    ScoreCAM, \
    GradCAMPlusPlus, \
    AblationCAM, \
    XGradCAM, \
    EigenCAM, \
    EigenGradCAM, \
    LayerCAM, \
    FullGrad
from pytorch_grad_cam.utils.image import show_cam_on_image, \
    preprocess_image
from pytorch_grad_cam.ablation_layer import AblationLayerVit
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

# modified ViT model
from baselines_mod.ViT.ViT_pytorch_cam import vit_base_patch16_224 as vitmodel

# For creating segmentation masks and attack
# from Extensions_Table1 import generate_new_explanation_method, reshape_transform
from baselines_mod.ViT.Extensions_Table1 import generate_new_explanation_method, reshape_transform

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 42
set_seed(seed)

cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

# Data transsformations
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
test_img_trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,
])
test_lbl_trans = transforms.Compose([
    transforms.Resize((224, 224), Image.NEAREST),
])


# Dataloader
ds = Imagenet_Segmentation('datasets/gtsegs_ijcv.mat',
                           transform=test_img_trans, target_transform=test_lbl_trans)
dl = DataLoader(ds, batch_size=1, shuffle=False, num_workers=1, drop_last=False)
model = vitmodel(pretrained=True).cuda().eval()
metric = IoU(2, ignore_index=-1)
iterator = tqdm(dl)

######################################################################################################################

# DDS method
import sys
sys.path.append("./guided_diffusion")
import argparse
import os

import numpy as np
import torch as th
import torch.distributed as dist

from guided_diffusion import dist_util, logger
from guided_diffusion.script_util import (
    NUM_CLASSES,
    # model_and_diffusion_defaults,
    create_model_and_diffusion,
    add_dict_to_argparser,
    args_to_dict,
)

def diffusion_defaults():
    """
    Defaults for image and classifier training.
    """
    return dict(
        learn_sigma=True,
        diffusion_steps=1000,
        noise_schedule="linear",
        timestep_respacing="250",
        use_kl=False,
        predict_xstart=False,
        rescale_timesteps=False,
        rescale_learned_sigmas=False,
    )


def classifier_defaults():
    """
    Defaults for classifier models.
    """
    return dict(
        image_size=256,
        classifier_use_fp16=False,
        classifier_width=128,
        classifier_depth=2,
        classifier_attention_resolutions="32,16,8",  # 16
        classifier_use_scale_shift_norm=True,  # False
        classifier_resblock_updown=True,  # False
        classifier_pool="attention",
    )


def model_and_diffusion_defaults():
    """
    Defaults for image training.
    """
    res = dict(
        image_size=256,
        num_channels=256,
        num_res_blocks=2,
        num_heads=4,
        num_heads_upsample=-1,
        num_head_channels=64,
        attention_resolutions="32,16,8",
        channel_mult="",
        dropout=0.0,
        class_cond=False,
        use_checkpoint=False,
        use_scale_shift_norm=True,
        resblock_updown=True,
        use_fp16=True,
        use_new_attention_order=False,
    )
    res.update(diffusion_defaults())
    return res


def create_argparser():
    defaults = dict(
        clip_denoised=True,
        num_samples=1,
        batch_size=4,
        use_ddim=False,
        model_path="./guided_diffusion/models/256x256_diffusion_uncond.pt",
    )
    defaults.update(model_and_diffusion_defaults())
    parser = argparse.ArgumentParser()
    add_dict_to_argparser(parser, defaults)
    return parser

args_diff = create_argparser().parse_args([])

dist_util.setup_dist()
logger.configure()

logger.log("creating model and diffusion...")
d_model, diffusion = create_model_and_diffusion(
    **args_to_dict(args_diff, model_and_diffusion_defaults().keys())
)
d_model.load_state_dict(
    dist_util.load_state_dict(args_diff.model_path, map_location="cpu")
)
d_model.to(dist_util.dev())
if args_diff.use_fp16:
    d_model.convert_to_fp16()
d_model.eval()
device = next(d_model.parameters()).device

shape = (1, 3, 256, 256)
steps =  1000
start = 0.0001
end = 0.02
trial_num = 2

def range_of_delta(beta_s, beta_e, steps):
    def delta_value(beta):
        return (beta/(1-beta))**(0.5)
    return (delta_value(beta_s), delta_value(beta_e))

def beta(t, steps, start, end):
    return (t-1)/(steps-1)*(end-start)+start

def add_noise(x, delta, opt_t, steps, start, end):
    return np.sqrt(1-beta(opt_t, steps, start, end))*(x + th.randn_like(x) * delta)

def get_opt_t(delta, start, end, steps):
    return np.clip(int(np.around(1+(steps-1)/(end-start)*(1-1/(1+delta**2)-start))), 0, steps)

# opt_t = get_opt_t(delta, start, end, steps)

def denoise(img, opt_t, steps, start, end, delta, direct_pred=False):
    # Extra line of code that again prevents mismatch because some data is on cpu and some on gpu
    #img = img.to(device).float()

    img_xt = add_noise(img, delta, opt_t, steps, start, end).unsqueeze(0).to(device)

    indices = list(range(opt_t))[::-1]
    from tqdm.auto import tqdm
    indices = tqdm(indices)
    img_iter = img_xt
    for i in indices:
        t = th.tensor([i]*shape[0], device=device)
        # t = t.to(device)
        with th.no_grad():
            out = diffusion.p_sample(
                d_model,
                img_iter,
                t,
                clip_denoised=args_diff.clip_denoised,
                denoised_fn=None,
                cond_fn=None,
                model_kwargs={},
            )
            img_iter = out['sample']
            if direct_pred:
                return out['pred_xstart']
    # img_iter = ((img_iter + 1) * 127.5).clamp(0, 255).to(th.uint8)
    # img_iter = img_iter.permute(0, 2, 3, 1)
    # img_iter = img_iter.contiguous()
    return img_iter
trans_to_256= transforms.Compose([
   transforms.Resize((256, 256)),])
trans_to_224= transforms.Compose([
   transforms.Resize((224, 224)),])
delta_range = range_of_delta(start, end, steps)


def drop_lowest_max_fuse(arr, ratio=10):
    arr_out = arr[0]
    arr_out[arr_out<np.percentile(arr_out, ratio)]=0
    for i in range(1,len(arr)):
        arr_new = arr[i]
        arr_new[arr_new<np.percentile(arr_new, ratio)]=0
        arr_out = np.maximum(arr_out,arr_new)
    return arr_out
def max_fuse(arr):
    arr_out = arr[0]
    for i in range(1,len(arr)):
        arr_out = np.maximum(arr_out,arr[i])
    return arr_out
def mean_fuse(arr):
    arr_out = arr[0]
    for i in range(1,len(arr)):
        arr_out = arr_out + arr[i]
    return arr_out/len(arr)
def normal(arr):
    return (arr - arr.min())/(arr.max() - arr.min())
######################################################################################################################


# functions needed
def attack(image, model, noise_level,label_index=None,mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5]):
    import torchattacks
    torch.backends.cudnn.deterministic = True
    atk = torchattacks.PGD(model, eps=noise_level, alpha=noise_level/5, steps=10)
    atk.set_normalization_used(mean, std)
    labels = torch.FloatTensor([0]*1000)
    if label_index == None:
        # with torch.no_grad():
        logits = model(image)
        label_index = logits.argmax()
        # print(label_index)

    labels[label_index] = 1
    labels = labels.reshape(1, 1000)
    adv_images = atk(image, labels.float())
    return adv_images
    

def eval_batch(image, labels, evaluator, index, method, DDS=False):
    evaluator.zero_grad()

    image.requires_grad = True

    image = image.requires_grad_()
    # predictions = evaluator(image)


    if DDS is True:
        noise_level=7/255
        opt_t = get_opt_t(noise_level, start, end, steps)
        res_lists = []
        for trial in range(trial_num):
            seed = 42 + trial
            set_seed(seed)
            image_denoised = trans_to_224(
                denoise(trans_to_256(image.cuda()).squeeze(0), opt_t, steps, start, end, noise_level)
            ).detach().cpu()
            image_denoise_normal = image_denoised + torch.randn_like(image_denoised) * noise_level
            image_denoise_normal = torch.squeeze(image_denoise_normal)
            image_denoise_normal = torch.clamp(image_denoise_normal, -1, 1)

            Res = generate_new_explanation_method(evaluator, image.cuda(), method, None)
            Res = torch.from_numpy(Res)
            Res = Res.unsqueeze(0).unsqueeze(0)

            res_lists.append(Res)

        Res = drop_lowest_max_fuse(res_lists)
        Res = torch.tensor(Res).cuda()
    else:
        Res = generate_new_explanation_method(evaluator, image.cuda(), method, None)
        Res = torch.from_numpy(Res)
        Res = Res.unsqueeze(0).unsqueeze(0)


    Res = (Res - Res.min()) / (Res.max() - Res.min())

    ret = Res.mean()

    Res_1 = Res.gt(ret).type(Res.type())
    Res_0 = Res.le(ret).type(Res.type())

    Res_1_AP = Res
    Res_0_AP = 1-Res

    Res_1[Res_1 != Res_1] = 0
    Res_0[Res_0 != Res_0] = 0
    Res_1_AP[Res_1_AP != Res_1_AP] = 0
    Res_0_AP[Res_0_AP != Res_0_AP] = 0


    # TEST
    pred = Res.clamp(min=0.0) / Res.max()
    pred = pred.view(-1).data.cpu().numpy()
    target = labels.view(-1).data.cpu().numpy()

    output = torch.cat((Res_0, Res_1), 1)
    output_AP = torch.cat((Res_0_AP, Res_1_AP), 1)

    # Evaluate Segmentation
    batch_inter, batch_union, batch_pix_correct, batch_label = 0, 0, 0, 0
    batch_ap = 0

    # Segmentation results
    pix_correct, labeled = batch_pix_accuracy(output[0].data.cpu(), labels[0])
    inter, union = batch_intersection_union(output[0].data.cpu(), labels[0], 2)
    batch_pix_correct += pix_correct
    batch_label += labeled
    batch_inter += inter
    batch_union += union

    ap = np.nan_to_num(get_ap_scores(output_AP, labels))
    batch_ap += ap

    return batch_pix_correct, batch_label, batch_inter, batch_union, batch_ap, pred, target


# ONLY IMPORTANT FOR MAKING A NEW FILE

# import csv

# # File name
# file_name = "extension_res.csv"

# # Data to write
# header = ["Pixel acc", "mIoU", "mAP", "seconds"]  # Column names
# data = [["gradcam", round(pixAcc, 2), round(mIoU, 2), round(mAp, 2), 1052]]  # Row with "gradcam" as the first value

# # Write to CSV
# with open(file_name, "w", newline="") as file:
#     writer = csv.writer(file)
#     writer.writerow([""] + header)  # Add header row with an empty cell for row labels
#     writer.writerows(data)         # Add data row

# print(f"CSV file '{file_name}' created successfully!")

  transforms.Resize((224, 224), Image.NEAREST),
  0%|          | 0/4276 [00:00<?, ?it/s]

Logging to /scratch-local/ytjun.9676348/openai-2025-01-29-16-57-39-031237
creating model and diffusion...


# 3. Calculate metrics

In [5]:
## CHANGE Method!
##########################################################

# A single method can be:
# 20 min - 4 hours
# Check extension_res for explcit time

# METHOD = 'gradcam'
METHOD = 'scorecam'
# METHOD = 'gradcam++'
# METHOD = 'ablationcam'
# METHOD = 'xgradcam'
# METHOD = 'eigencam'
# METHOD = 'eigengradcam'
# METHOD = 'layercam'

DDS = True
# DDS = False


#########################################################

import time
# import os

if DDS is False:
    start = time.time()

total_inter, total_union, total_pix_correct, total_label = np.int64(0), np.int64(0), np.int64(0), np.int64(0)
total_ap, total_acc = [], []
predictions, targets = [], []
for batch_idx, (image, labels) in enumerate(iterator):
     
    #  attack
    images = attack(image.cuda(), model, 7/255)
    labels = labels.cuda()   

    # evaluate batch
    correct, labeled, inter, union, ap, pred, target = eval_batch(images, labels, model, batch_idx, METHOD, DDS)

    # calculate metrics
    predictions.append(pred)
    targets.append(target)
    total_pix_correct += correct.astype('int64')
    total_label += labeled.astype('int64')
    total_inter += inter.astype('int64')
    total_union += union.astype('int64')
    total_ap += [ap]
    pixAcc = np.float64(1.0) * total_pix_correct / (np.spacing(1, dtype=np.float64) + total_label)
    IoU = np.float64(1.0) * total_inter / (np.spacing(1, dtype=np.float64) + total_union)
    mIoU = IoU.mean()
    mAp = np.mean(total_ap)
    # iterator.set_description('pixAcc: %.4f, mIoU: %.4f, mAP: %.4f' % (pixAcc, mIoU, mAp))
    
    # count how many iteration
    print(batch_idx, end=' ')

    # clear loading bars
    os.system('clear')



print("Pixel-wise Accuracy: %.2f\n" % (pixAcc))
print("Mean IoU over %d classes: %.2f\n" % (2, mIoU))
print("Mean AP over %d classes: %.2f\n" % (2, mAp))

# check time
if DDS is False:
    end = time.time()
    duration = end - start
    print("Duration of this cell in seconds: ", duration)
    new_row = [METHOD, round(pixAcc, 2), round(mIoU, 2), round(mAp, 2), duration]
else:
    # add new row to csv
    new_row = [METHOD, round(pixAcc, 2), round(mIoU, 2), round(mAp, 2), 0]
    
with open(file_name, "a", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(new_row)  

# display CSV file
df = pd.read_csv(file_name, index_col=0) 
print(df)


100%|██████████| 34/34 [00:02<00:00, 14.04it/s]
100%|██████████| 24/24 [00:02<00:00, 11.01it/s]
100%|██████████| 34/34 [00:02<00:00, 14.12it/s]
100%|██████████| 24/24 [00:02<00:00, 11.72it/s]
  Res = torch.tensor(Res).cuda()
  0%|          | 1/4276 [00:19<23:06:16, 19.46s/it]

0 [H[2J

100%|██████████| 34/34 [00:02<00:00, 14.13it/s]
100%|██████████| 24/24 [00:01<00:00, 12.21it/s]
100%|██████████| 34/34 [00:02<00:00, 14.11it/s]
100%|██████████| 24/24 [00:01<00:00, 12.12it/s]
  Res = torch.tensor(Res).cuda()
  0%|          | 2/4276 [00:28<15:48:32, 13.32s/it]

1 [H[2J

100%|██████████| 34/34 [00:02<00:00, 14.13it/s]
100%|██████████| 24/24 [00:01<00:00, 12.28it/s]
100%|██████████| 34/34 [00:02<00:00, 14.14it/s]
100%|██████████| 24/24 [00:01<00:00, 12.23it/s]
  Res = torch.tensor(Res).cuda()
  0%|          | 3/4276 [00:37<13:27:26, 11.34s/it]

2 [H[2J

100%|██████████| 34/34 [00:02<00:00, 14.14it/s]
100%|██████████| 24/24 [00:01<00:00, 12.28it/s]
 85%|████████▌ | 29/34 [00:02<00:00, 13.91it/s]
  0%|          | 3/4276 [00:44<17:28:37, 14.72s/it]


KeyboardInterrupt: 