The code used in this project is inspired from the original project https://github.com/serendipity1122/Pre-trained-Model-Guided-Fine-Tuning-for-Zero-Shot-Adversarial-Robustness

In [1]:
import os
import torch.nn.functional as F
import torch
import open_clip
import matplotlib.pyplot as plt
import json
import warnings
import time
warnings.filterwarnings("ignore")
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Function to parse the labels from the label file
def load_labels(label_file):
    labels = []
    with open(label_file, "r") as f:
        for line in f.readlines():
            filename, description = line.strip().split("|")
            labels.append((filename, description))
    return labels

In [3]:
# Dataset class for loading images and their corresponding descriptions
class SceneDataset(Dataset):
    def __init__(self, image_dir, labels, transform=None):
        self.image_dir = image_dir
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        filename, description = self.labels[idx]
        image_path = os.path.join(self.image_dir, filename)
        
        # Load image
        image = Image.open(image_path).convert("RGB")
        
        # Apply transformations
        if self.transform:
            image = self.transform(image)
        
        return image, description

In [4]:
# Main function to load the dataset for Training
def prepare_dataset(image_dir, label_file):
    # Load labels
    labels = load_labels(label_file)
    
    # Filter images that exist in the directory
    filtered_labels = []
    for filename, description in labels:
        if os.path.exists(os.path.join(image_dir, filename)):
            filtered_labels.append((filename, description))
        else:
            print(f"Warning: {filename} not found in {image_dir}")
    
    # Define transformations for the images
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to 224x224 for CLIP
        transforms.ToTensor(),         # Convert to PyTorch tensor
        transforms.Normalize(mean=[0.481, 0.457, 0.408], std=[0.268, 0.261, 0.275])  # Normalize for CLIP
    ])
    
    # Create the dataset and dataloader
    dataset = SceneDataset(image_dir, filtered_labels, transform)
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    
    return dataset, dataloader

In [5]:
image_dir = "./pmgaft_dataset/images"
label_file = "./pmgaft_dataset/pmgaft_labels.txt"

In [6]:
dataset, dataloader = prepare_dataset(image_dir, label_file)

In [7]:
# Load the OpenCLIP model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
model = model.to(device)

# Load tokenizer for text input
tokenizer = open_clip.get_tokenizer("ViT-B-32")

print("OpenCLIP Model and Preprocessor Loaded Successfully!")

OpenCLIP Model and Preprocessor Loaded Successfully!


In [8]:
def extract_text_embeddings(descriptions, tokenizer, model):
    """
    Generate text embeddings for scene descriptions.
    
    """
    # Tokenize descriptions
    text_tokens = tokenizer(descriptions).to(device)
    
    # Generate embeddings
    text_embeddings = model.encode_text(text_tokens).detach()  # Detach to freeze embeddings
    return text_embeddings

In [9]:
def extract_image_embeddings(images, model):
    """
    Generate image embeddings for scene images.
    
    """
    # Generate embeddings
    image_embeddings = model.encode_image(images).detach()  # Detach to freeze embeddings
    return image_embeddings

In [10]:
def compute_similarity(image_embeddings, text_embeddings):
    """
    Compute cosine similarity between image and text embeddings.
    
    """
    similarity = F.cosine_similarity(image_embeddings, text_embeddings)
    return similarity

In [11]:
def generate_adversarial_examples(model, images, text_embeddings, epsilon=0.03, alpha=0.01, steps=10):
    """
    Generate adversarial examples using PGD.
    
    """
    images_adv = images.clone().detach().requires_grad_(True).to(device)

    for step in range(steps):
        outputs = model.encode_image(images_adv)
        logits = outputs @ text_embeddings.T
        loss = F.cross_entropy(logits, torch.arange(images.shape[0], device=device))

        # Compute gradients
        loss.backward()
        grad = images_adv.grad.data

        # Update adversarial examples
        images_adv = images_adv + alpha * grad.sign()
        images_adv = torch.clamp(images_adv, images - epsilon, images + epsilon).detach().requires_grad_(True)

    return images_adv

In [12]:
def compute_losses(model, images_adv, images_clean, text_embeddings):
    """
    Compute robustness, generalization, and regularization losses.
    
    """
    # Robustness Loss
    logits_adv = model.encode_image(images_adv) @ text_embeddings.T
    labels = torch.arange(images_adv.shape[0], device=device)
    loss_robust = F.cross_entropy(logits_adv, labels)

    # Generalization Loss
    logits_clean = model.encode_image(images_clean) @ text_embeddings.T
    loss_general = F.kl_div(F.log_softmax(logits_adv, dim=-1), F.softmax(logits_clean, dim=-1), reduction="batchmean")

    # Regularization Loss
    loss_regular = F.mse_loss(logits_adv, logits_clean)

    return loss_robust, loss_general, loss_regular

In [13]:
alpha = 0.7  # Weight for generalization loss
beta = 0.3   # Weight for regularization loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 5

In [14]:
start_time = time.time()
for epoch in range(epochs):
    for images, descriptions in dataloader:
        # Extract text embeddings
        text_embeddings = extract_text_embeddings(descriptions, tokenizer, model)

        # Generate clean and adversarial image embeddings
        images_clean = images.to(device)
        images_adv = generate_adversarial_examples(model, images_clean, text_embeddings)

        # Compute losses
        loss_robust, loss_general, loss_regular = compute_losses(model, images_adv, images_clean, text_embeddings)

        # Combine losses
        total_loss = loss_robust + alpha * loss_general + beta * loss_regular

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}")
end_time = time.time()
time_taken_seconds = end_time - start_time

# Convert to minutes
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Epoch 1/5, Loss: 0.8661
Epoch 2/5, Loss: 1.1564
Epoch 3/5, Loss: 0.7847
Epoch 4/5, Loss: 0.5517
Epoch 5/5, Loss: 0.7715
Time taken: 137.45 minutes


In [15]:
# Save the fine-tuned model
torch.save(model.state_dict(), "pmgaft_self_driving_finetuned.pth")
print("Fine-tuned model saved successfully!")

Fine-tuned model saved successfully!


In [17]:
import json

# Load test data from JSON file
with open("./pmgaft_dataset/test/pmgaft_test_data.json", "r") as json_file:
    test_data = json.load(json_file)

print(f"Loaded {len(test_data)} test samples!")

Loaded 1500 test samples!


In [18]:
class TestDataset(Dataset):
    def __init__(self, test_data, image_dir, transform=None):
        self.test_data = test_data
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.test_data)

    def __getitem__(self, idx):
        entry = self.test_data[idx]
        image_path = os.path.join(self.image_dir, entry["image"])
        image = Image.open(image_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        descriptions = entry["descriptions"]
        ground_truth_index = entry["ground_truth_index"]

        return image, descriptions, ground_truth_index

In [19]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.481, 0.457, 0.408], std=[0.268, 0.261, 0.275])
])

In [20]:
# Directory containing the test images
image_dir = "./pmgaft_dataset/images"

# Create the dataset and dataloader
test_dataset = TestDataset(test_data, image_dir, transform)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
print("Test dataloader created successfully!")

Test dataloader created successfully!


In [23]:
def get_predictions(model, image, descriptions, tokenizer):
    """
    Predict the index of the best matching description.
    """
    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()

    # Encode the image
    image = image.unsqueeze(0).to(device)  # Add batch dimension
    image_embedding = model.encode_image(image)

    # Compute similarities and find the best match
    similarities = image_embedding @ text_embeddings.T
    predicted_index = torch.argmax(similarities, dim=1).item()

    return predicted_index

# Evaluate accuracy on clean images
clean_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Move data to device
    ground_truth_index = ground_truth_index.to(device)

    # Get the predicted description index
    predicted_index = get_predictions(model, image.squeeze(0), descriptions[0], tokenizer)

    # Compare with the ground truth
    if predicted_index == ground_truth_index.item():
        clean_correct += 1
    total += 1

clean_accuracy = clean_correct / total
print(f"Accuracy on Clean Images: {clean_accuracy:.4f}")

Accuracy on Clean Images: 0.2807


In [22]:
def get_predictions_for_adversarial(model, adversarial_image, descriptions, tokenizer):
    """
    Predict the index of the best matching description for adversarial images.
    """
    # Ensure descriptions is a list of strings
    descriptions = descriptions[0] if isinstance(descriptions, tuple) else descriptions

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Encode the adversarial image
    adversarial_image = adversarial_image.unsqueeze(0).to(device)  # Add batch dimension
    image_embedding = model.encode_image(adversarial_image)  # Shape: (1, embedding_dim)

    # Compute cosine similarities
    similarities = image_embedding @ text_embeddings.T  # Shape: (1, num_descriptions)
    predicted_index = torch.argmax(similarities, dim=1).item()  # Get the index of the best match

    return predicted_index

In [22]:
adversarial_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Flatten descriptions to ensure a list of strings
    descriptions = [desc[0] if isinstance(desc, tuple) else desc for desc in descriptions]

    # Move ground truth index to device
    ground_truth_index = ground_truth_index.to(device)

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Ensure image is correctly shaped
    image_batch = image.to(device)  # No need to unsqueeze as it already has batch dimension [1, 3, 224, 224]

    # Generate adversarial examples
    images_adv = generate_adversarial_examples(model, image_batch, text_embeddings)  # Shape: [1, 3, 224, 224]

    # Get predictions for adversarial images
    predicted_index_adv = get_predictions_for_adversarial(model, images_adv.squeeze(0), descriptions, tokenizer)

    # Compare predictions with ground truth
    if predicted_index_adv == ground_truth_index.item():
        adversarial_correct += 1
    total += 1

# Compute accuracy
adversarial_accuracy = adversarial_correct / total
print(f"Accuracy on Adversarial Images: {adversarial_accuracy:.4f}")

Accuracy on Adversarial Images: 0.6113


#### Retraining the Model to test the generalization loss and the regularization loss with equal weights

In [23]:
alpha = 1.0  # Weight for generalization loss
beta = 1.0   # Weight for regularization loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 5

In [24]:
# Load the OpenCLIP model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
model = model.to(device)

# Load tokenizer for text input
tokenizer = open_clip.get_tokenizer("ViT-B-32")

print("OpenCLIP Model and Preprocessor Loaded Successfully!")

OpenCLIP Model and Preprocessor Loaded Successfully!


In [25]:
start_time = time.time()
for epoch in range(epochs):
    for images, descriptions in dataloader:
        # Extract text embeddings
        text_embeddings = extract_text_embeddings(descriptions, tokenizer, model)

        # Generate clean and adversarial image embeddings
        images_clean = images.to(device)
        images_adv = generate_adversarial_examples(model, images_clean, text_embeddings)

        # Compute losses
        loss_robust, loss_general, loss_regular = compute_losses(model, images_adv, images_clean, text_embeddings)

        # Combine losses
        total_loss = loss_robust + beta * loss_regular

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}")
end_time = time.time()
time_taken_seconds = end_time - start_time

# Convert to minutes
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Epoch 1/5, Loss: 37.2035
Epoch 2/5, Loss: 41.3302
Epoch 3/5, Loss: 51.7592
Epoch 4/5, Loss: 64.9808
Epoch 5/5, Loss: 50.6268
Time taken: 138.51 minutes


In [26]:
# Evaluate accuracy on clean images
clean_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Move data to device
    ground_truth_index = ground_truth_index.to(device)

    # Get the predicted description index
    predicted_index = get_predictions(model, image.squeeze(0), descriptions[0], tokenizer)

    # Compare with the ground truth
    if predicted_index == ground_truth_index.item():
        clean_correct += 1
    total += 1

clean_accuracy = clean_correct / total
print(f"Accuracy on Clean Images: {clean_accuracy:.4f}")

Accuracy on Clean Images: 0.2807


In [27]:
adversarial_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Flatten descriptions to ensure a list of strings
    descriptions = [desc[0] if isinstance(desc, tuple) else desc for desc in descriptions]

    # Move ground truth index to device
    ground_truth_index = ground_truth_index.to(device)

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Ensure image is correctly shaped
    image_batch = image.to(device)  # No need to unsqueeze as it already has batch dimension [1, 3, 224, 224]

    # Generate adversarial examples
    images_adv = generate_adversarial_examples(model, image_batch, text_embeddings)  # Shape: [1, 3, 224, 224]

    # Get predictions for adversarial images
    predicted_index_adv = get_predictions_for_adversarial(model, images_adv.squeeze(0), descriptions, tokenizer)

    # Compare predictions with ground truth
    if predicted_index_adv == ground_truth_index.item():
        adversarial_correct += 1
    total += 1

# Compute accuracy
adversarial_accuracy = adversarial_correct / total
print(f"Accuracy on Adversarial Images: {adversarial_accuracy:.4f}")

Accuracy on Adversarial Images: 0.4567


In [28]:
# Save the fine-tuned model
torch.save(model.state_dict(), "pmgaft_self_driving_finetuned_ab_1.pth")
print("Fine-tuned model saved successfully!")

Fine-tuned model saved successfully!


In [29]:
alpha = 0.1  # Weight for generalization loss
beta = 0.1  # Weight for regularization loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 5

In [30]:
# Load the OpenCLIP model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
model = model.to(device)

# Load tokenizer for text input
tokenizer = open_clip.get_tokenizer("ViT-B-32")

print("OpenCLIP Model and Preprocessor Loaded Successfully!")

OpenCLIP Model and Preprocessor Loaded Successfully!


In [31]:
start_time = time.time()
for epoch in range(epochs):
    for images, descriptions in dataloader:
        # Extract text embeddings
        text_embeddings = extract_text_embeddings(descriptions, tokenizer, model)

        # Generate clean and adversarial image embeddings
        images_clean = images.to(device)
        images_adv = generate_adversarial_examples(model, images_clean, text_embeddings)

        # Compute losses
        loss_robust, loss_general, loss_regular = compute_losses(model, images_adv, images_clean, text_embeddings)

        # Combine losses
        total_loss = loss_robust + beta * loss_regular

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}")
end_time = time.time()
time_taken_seconds = end_time - start_time

# Convert to minutes
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Epoch 1/5, Loss: 12.0053
Epoch 2/5, Loss: 18.5700
Epoch 3/5, Loss: 15.7165
Epoch 4/5, Loss: 21.9792
Epoch 5/5, Loss: 14.1834
Time taken: 135.44 minutes


In [32]:
# Evaluate accuracy on clean images
clean_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Move data to device
    ground_truth_index = ground_truth_index.to(device)

    # Get the predicted description index
    predicted_index = get_predictions(model, image.squeeze(0), descriptions[0], tokenizer)

    # Compare with the ground truth
    if predicted_index == ground_truth_index.item():
        clean_correct += 1
    total += 1

clean_accuracy = clean_correct / total
print(f"Accuracy on Clean Images: {clean_accuracy:.4f}")

Accuracy on Clean Images: 0.2807


In [33]:
adversarial_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Flatten descriptions to ensure a list of strings
    descriptions = [desc[0] if isinstance(desc, tuple) else desc for desc in descriptions]

    # Move ground truth index to device
    ground_truth_index = ground_truth_index.to(device)

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Ensure image is correctly shaped
    image_batch = image.to(device)  # No need to unsqueeze as it already has batch dimension [1, 3, 224, 224]

    # Generate adversarial examples
    images_adv = generate_adversarial_examples(model, image_batch, text_embeddings)  # Shape: [1, 3, 224, 224]

    # Get predictions for adversarial images
    predicted_index_adv = get_predictions_for_adversarial(model, images_adv.squeeze(0), descriptions, tokenizer)

    # Compare predictions with ground truth
    if predicted_index_adv == ground_truth_index.item():
        adversarial_correct += 1
    total += 1

# Compute accuracy
adversarial_accuracy = adversarial_correct / total
print(f"Accuracy on Adversarial Images: {adversarial_accuracy:.4f}")

Accuracy on Adversarial Images: 0.4567


In [13]:
alpha = 0.3  # Weight for generalization loss
beta = 0.7  # Weight for regularization loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 5

In [14]:
# Load the OpenCLIP model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
model = model.to(device)

# Load tokenizer for text input
tokenizer = open_clip.get_tokenizer("ViT-B-32")

print("OpenCLIP Model and Preprocessor Loaded Successfully!")

OpenCLIP Model and Preprocessor Loaded Successfully!


In [15]:
start_time = time.time()
for epoch in range(epochs):
    for images, descriptions in dataloader:
        # Extract text embeddings
        text_embeddings = extract_text_embeddings(descriptions, tokenizer, model)

        # Generate clean and adversarial image embeddings
        images_clean = images.to(device)
        images_adv = generate_adversarial_examples(model, images_clean, text_embeddings)

        # Compute losses
        loss_robust, loss_general, loss_regular = compute_losses(model, images_adv, images_clean, text_embeddings)

        # Combine losses
        total_loss = loss_robust + beta * loss_regular

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}")
end_time = time.time()
time_taken_seconds = end_time - start_time

# Convert to minutes
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Epoch 1/5, Loss: 42.4631
Epoch 2/5, Loss: 32.4958
Epoch 3/5, Loss: 40.5326
Epoch 4/5, Loss: 22.3420
Epoch 5/5, Loss: 40.9245
Time taken: 132.97 minutes


In [24]:
# Evaluate accuracy on clean images
clean_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Move data to device
    ground_truth_index = ground_truth_index.to(device)

    # Get the predicted description index
    predicted_index = get_predictions(model, image.squeeze(0), descriptions[0], tokenizer)

    # Compare with the ground truth
    if predicted_index == ground_truth_index.item():
        clean_correct += 1
    total += 1

clean_accuracy = clean_correct / total
print(f"Accuracy on Clean Images: {clean_accuracy:.4f}")

Accuracy on Clean Images: 0.2807


In [25]:
adversarial_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Flatten descriptions to ensure a list of strings
    descriptions = [desc[0] if isinstance(desc, tuple) else desc for desc in descriptions]

    # Move ground truth index to device
    ground_truth_index = ground_truth_index.to(device)

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Ensure image is correctly shaped
    image_batch = image.to(device)  # No need to unsqueeze as it already has batch dimension [1, 3, 224, 224]

    # Generate adversarial examples
    images_adv = generate_adversarial_examples(model, image_batch, text_embeddings)  # Shape: [1, 3, 224, 224]

    # Get predictions for adversarial images
    predicted_index_adv = get_predictions_for_adversarial(model, images_adv.squeeze(0), descriptions, tokenizer)

    # Compare predictions with ground truth
    if predicted_index_adv == ground_truth_index.item():
        adversarial_correct += 1
    total += 1

# Compute accuracy
adversarial_accuracy = adversarial_correct / total
print(f"Accuracy on Adversarial Images: {adversarial_accuracy:.4f}")

Accuracy on Adversarial Images: 0.4567


## Loss with Cosine Similarity

In [26]:
def compute_losses(model, images_adv, images_clean, text_embeddings):
    """
    Compute robustness, generalization, and regularization losses.
    """
    # Robustness Loss (Cross-Entropy on adversarial examples)
    logits_adv = model.encode_image(images_adv) @ text_embeddings.T
    labels = torch.arange(images_adv.shape[0], device=device)
    loss_robust = F.cross_entropy(logits_adv, labels)

    # Generalization Loss (KL Divergence between adversarial and clean outputs)
    logits_clean = model.encode_image(images_clean) @ text_embeddings.T
    loss_general = F.kl_div(
        F.log_softmax(logits_adv, dim=-1),
        F.softmax(logits_clean, dim=-1),
        reduction="batchmean"
    )

    # Regularization Loss (Replaced MSE with Cosine Similarity)
    loss_regular = 1 - F.cosine_similarity(logits_adv, logits_clean, dim=-1).mean()

    return loss_robust, loss_general, loss_regular

In [27]:
alpha = 0.5  # Weight for generalization loss
beta = 0.5  # Weight for regularization loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 5

In [28]:
start_time = time.time()
for epoch in range(epochs):
    for images, descriptions in dataloader:
        # Extract text embeddings
        text_embeddings = extract_text_embeddings(descriptions, tokenizer, model)

        # Generate clean and adversarial image embeddings
        images_clean = images.to(device)
        images_adv = generate_adversarial_examples(model, images_clean, text_embeddings)

        # Compute losses
        loss_robust, loss_general, loss_regular = compute_losses(model, images_adv, images_clean, text_embeddings)

        # Combine losses
        total_loss = loss_robust + beta * loss_regular

        # Optimize
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss.item():.4f}")
end_time = time.time()
time_taken_seconds = end_time - start_time

# Convert to minutes
time_taken_minutes = time_taken_seconds / 60
print(f"Time taken: {time_taken_minutes:.2f} minutes")

Epoch 1/5, Loss: 1.4457
Epoch 2/5, Loss: 1.1075
Epoch 3/5, Loss: 1.4279
Epoch 4/5, Loss: 1.2815
Epoch 5/5, Loss: 0.7323
Time taken: 140.03 minutes


In [29]:
# Evaluate accuracy on clean images
clean_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Move data to device
    ground_truth_index = ground_truth_index.to(device)

    # Get the predicted description index
    predicted_index = get_predictions(model, image.squeeze(0), descriptions[0], tokenizer)

    # Compare with the ground truth
    if predicted_index == ground_truth_index.item():
        clean_correct += 1
    total += 1

clean_accuracy = clean_correct / total
print(f"Accuracy on Clean Images: {clean_accuracy:.4f}")

Accuracy on Clean Images: 0.2807


In [30]:
adversarial_correct = 0
total = 0

for image, descriptions, ground_truth_index in test_dataloader:
    # Flatten descriptions to ensure a list of strings
    descriptions = [desc[0] if isinstance(desc, tuple) else desc for desc in descriptions]

    # Move ground truth index to device
    ground_truth_index = ground_truth_index.to(device)

    # Tokenize and encode descriptions
    text_tokens = tokenizer(descriptions).to(device)
    text_embeddings = model.encode_text(text_tokens).detach()  # Shape: (num_descriptions, embedding_dim)

    # Ensure image is correctly shaped
    image_batch = image.to(device)  # No need to unsqueeze as it already has batch dimension [1, 3, 224, 224]

    # Generate adversarial examples
    images_adv = generate_adversarial_examples(model, image_batch, text_embeddings)  # Shape: [1, 3, 224, 224]

    # Get predictions for adversarial images
    predicted_index_adv = get_predictions_for_adversarial(model, images_adv.squeeze(0), descriptions, tokenizer)

    # Compare predictions with ground truth
    if predicted_index_adv == ground_truth_index.item():
        adversarial_correct += 1
    total += 1

# Compute accuracy
adversarial_accuracy = adversarial_correct / total
print(f"Accuracy on Adversarial Images: {adversarial_accuracy:.4f}")

Accuracy on Adversarial Images: 0.6133
