## Loading the Custom CLIP Model

In [10]:
# Load necessary libraries
import torch
from clip_finetune_model.model import CLIPFineTuner  # Ensure this points to your model definition
import os
from transformers import CLIPProcessor

# Set device
device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")

# Define the number of classes
num_classes = 7  # As per your previous setup

# Initialize the model
clip_model = CLIPFineTuner(num_classes=num_classes).to(device)

# Load the saved state dictionary
clip_model.load_state_dict(torch.load('/data/huzhengyu/github_repo/tony_csml/csml-final-project/clip_finetune_model/clip_finetuned_model_v2.pth', map_location=device))

# Set the model to evaluation mode
clip_model.eval()

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


  clip_model.load_state_dict(torch.load('/data/huzhengyu/github_repo/tony_csml/csml-final-project/clip_finetune_model/clip_finetuned_model_v2.pth', map_location=device))


In [4]:
from blip_finetune_model.model import BLIPFineTuner  # Ensure this points to your model definition

# Initialize the model
blip_model = BLIPFineTuner(num_classes=num_classes).to(device)

# Load the saved state dictionary
blip_model.load_state_dict(torch.load('/data/huzhengyu/github_repo/tony_csml/csml-final-project/blip_finetune_model/blip_finetuned_model.pth', map_location=device))

# Set the model to evaluation mode
blip_model.eval()


  blip_model.load_state_dict(torch.load('/data/huzhengyu/github_repo/tony_csml/csml-final-project/blip_finetune_model/blip_finetuned_model.pth', map_location=device))


BLIPFineTuner(
  (blip): BlipForConditionalGeneration(
    (vision_model): BlipVisionModel(
      (embeddings): BlipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (encoder): BlipEncoder(
        (layers): ModuleList(
          (0-11): 12 x BlipEncoderLayer(
            (self_attn): BlipAttention(
              (dropout): Dropout(p=0.0, inplace=False)
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (projection): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): BlipMLP(
              (activation_fn): GELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_features=768, bias=True)
            )
            (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
         

In [None]:
import torch
# from captum.attr.visualization import LayerGradCam, visualize_image_attr
from captum.attr import LayerGradCam 
from captum.attr import visualization as viz
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms

# Define the preprocessing transforms
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_processor.image_mean, std=processor.image_processor.image_std),
])

# Load and preprocess the image
def load_image(image_path):
    image = Image.open(image_path).convert('RGB')
    input_tensor = transform(image)

    # Add a batch dimension
    input_tensor = input_tensor.unsqueeze(0)  # Shape: [1, 3, 224, 224]
    
    # Move the tensor to the specified device
    input_tensor = input_tensor.to(device)
    return image, input_tensor

# Choose the target layer
target_layer = clip_model.clip_model.vision_model.encoder.layers[-1].self_attn

# Initialize LayerGradCam
layer_gc = LayerGradCam(clip_model, target_layer)

# Function to get prediction and index
def get_prediction(input_tensor):
    outputs = clip_model(input_tensor)
    _, pred_idx = torch.max(outputs, 1)
    return outputs, pred_idx

# Compute attributions
def compute_gradcam(input_tensor, target_class=None):
    if target_class is None:
        outputs, pred_idx = get_prediction(input_tensor)
        target_class = pred_idx.item()
    else:
        pred_idx = torch.tensor([target_class]).to(device)
    attributions = layer_gc.attribute(input_tensor, target=target_class)
    return attributions

# Visualize the attributions
def visualize_attributions(original_image, attributions):
    attribution = attributions.squeeze(0).cpu().detach().numpy()
    attribution = np.mean(attribution, axis=0)  # Average over channels
    attribution = np.maximum(attribution, 0)
    attribution = cv2.resize(attribution, (original_image.size[0], original_image.size[1]))
    attribution = (attribution - attribution.min()) / (attribution.max() - attribution.min())
    plt.imshow(original_image)
    plt.imshow(attribution, cmap='jet', alpha=0.5)
    plt.axis('off')
    plt.show()


# load image

In [19]:
# Assuming you've already loaded the model as per Section 1

# Load an example image
image_path = '/data/huzhengyu/github_repo/tony_csml/csml-final-project/split_data/test/akiec/ISIC_0024329.jpg'
original_image, input_tensor = load_image(image_path)


In [20]:
from torchcam.methods import ScoreCAM
from torchvision.transforms.functional import normalize

# Initialize ScoreCAM
scorecam = ScoreCAM(model=clip_model, target_layer=target_layer)

# Forward pass
outputs = clip_model(input_tensor)
_, pred_idx = torch.max(outputs, 1)

# Get CAM
cams = scorecam(input_tensor)

# Visualize CAM
cam = cams[0][pred_idx.item()].cpu().numpy()
cam = cv2.resize(cam, (original_image.size[0], original_image.size[1]))
plt.imshow(original_image)
plt.imshow(cam, cmap='jet', alpha=0.5)
plt.axis('off')
plt.show()


ValueError: not enough values to unpack (expected 4, got 3)

In [15]:
from captum.attr import NoiseTunnel, Saliency

# Initialize Saliency and NoiseTunnel
saliency = Saliency(clip_model)
nt = NoiseTunnel(saliency)

# Compute attributions
attributions = nt.attribute(input_tensor, nt_type='smoothgrad', stdevs=0.02, n_samples=50, target=pred_idx)

# Visualize attributions
attribution = attributions.squeeze(0).cpu().detach().numpy()
attribution = np.mean(attribution, axis=0)  # Average over channels
attribution = (attribution - attribution.min()) / (attribution.max() - attribution.min())
plt.imshow(original_image)
plt.imshow(attribution, cmap='jet', alpha=0.5)
plt.axis('off')
plt.show()


NameError: name 'input_tensor' is not defined