In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from zmq import device


torch.set_default_device("cuda")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # CLIP expects 224x224 images
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5071, 0.4867, 0.4408], std=[0.2675, 0.2565, 0.2761])  # CLIP normalization
])


In [None]:
from torch.utils.data import Subset
cifar_data_train = datasets.CIFAR100(root="./data", train=True, download=True, transform=transform)
cifar_data_test = datasets.CIFAR100(root="./data", train=False, download=True, transform=transform)
calib_data = Subset(cifar_data_train, range(1000))  # for demonstration purposes
# train_data = Subset(cifar_data, range(250,1000))  # for demonstration purposes
# test_data = Subset(cifar_data_test, range(2000))
test_data = cifar_data_test
calib_loader = DataLoader(calib_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
class_names = cifar_data_train.classes  # CIFAR-10 class names
text_inputs = processor(text=class_names, return_tensors="pt", padding=True)


In [7]:
class_names

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [8]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def denormalize(img: torch.Tensor, mean: torch.Tensor, std: torch.Tensor):
    """Denormalizes the image given the mean and standard deviation."""
    return img * torch.tensor(std, device="cpu").view(3, 1, 1) + torch.tensor(mean, device="cpu").view(3, 1, 1)

# Forward pass through CLIP

all_labels = []
all_predictions = []
scores = []

# Compute nonconformity scores

for images, labels in calib_loader:

    pil_images = [transforms.ToPILImage()(denormalize(img, processor.image_processor.image_mean, processor.image_processor.image_std)) for img in images]
    
    # Process images using CLIP's processor (automatically normalizes them)
    inputs = processor(images=pil_images, return_tensors="pt").to("cuda")
    input_image_processed = inputs['pixel_values'].squeeze(0)

    outputs = model(**inputs, **text_inputs)
    logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
    probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
    predictions = probs.argmax(dim=1)
    all_labels.extend(labels.tolist())
    all_predictions.extend(predictions.tolist())
    scores += logits_per_image.take_along_dim(torch.tensor(labels).unsqueeze(-1),dim=1).squeeze().tolist()
    


print(scores)



  return func(*args, **kwargs)


[25.368732452392578, 26.452966690063477, 25.229278564453125, 25.598283767700195, 25.935531616210938, 27.184476852416992, 26.504697799682617, 29.575803756713867, 28.322383880615234, 28.204208374023438, 24.665494918823242, 25.596296310424805, 27.46965217590332, 28.72529411315918, 26.64809799194336, 28.156179428100586, 25.208477020263672, 27.227535247802734, 28.48607635498047, 29.22666358947754, 29.37409210205078, 26.907634735107422, 23.374475479125977, 25.624265670776367, 24.328384399414062, 28.5148983001709, 26.394155502319336, 25.76992416381836, 29.074817657470703, 21.481353759765625, 24.412639617919922, 27.268293380737305, 28.223346710205078, 25.43918228149414, 29.101478576660156, 25.410106658935547, 27.693742752075195, 27.118778228759766, 26.9343204498291, 25.910694122314453, 27.50600814819336, 27.50728416442871, 27.578697204589844, 29.839933395385742, 27.475521087646484, 28.36693000793457, 27.00468635559082, 24.789363861083984, 26.871427536010742, 24.7781925201416, 25.73372077941894

In [10]:
import numpy as np
alphas = [0.02, 0.05, 0.1, 0.2]
for alpha in alphas:
    print("\n\n")
    print(f"alpha =\t\t\t {alpha}")
    # Compute the quantile for the nonconformity scores
    n = len(scores)
    threshold = np.quantile(scores, np.ceil((n+1)*(alpha))/n, method="inverted_cdf")
    prediction_sets = []
    all_labels = []
    all_predictions = []

    for images, labels in test_loader:

        pil_images = [transforms.ToPILImage()(denormalize(img, processor.image_processor.image_mean, processor.image_processor.image_std)) for img in images]
        
        # Process images using CLIP's processor (automatically normalizes them)
        inputs = processor(images=pil_images, return_tensors="pt").to("cuda")
        input_image_processed = inputs['pixel_values'].squeeze(0)

        outputs = model(**inputs, **text_inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
        predictions = probs.argmax(dim=1)
        all_labels.extend(labels.tolist())
        all_predictions.extend(predictions.tolist())
        indices = (logits_per_image > threshold).nonzero(as_tuple=True)
        row_indices = [indices[1][indices[0] == i] for i in range(logits_per_image.size(0))]
        prediction_sets.extend(row_indices)

    pred_sets = [x.tolist() for x in prediction_sets]
    coverage = np.mean([all_labels[i] in pred_sets[i] for i in range(len(all_labels))])
    avg_set_size = np.mean([len(s) for s in pred_sets])
    median_set_size = np.median([len(s) for s in pred_sets])
    acc_score = accuracy_score(all_labels, all_predictions)
    print(f"accuracy =\t\t {acc_score}")
    print(f"coverage =\t\t {coverage}")
    print(f"mean set size =\t\t {avg_set_size}")
    print(f"median set size =\t {median_set_size}")




alpha =			 0.02
accuracy =		 0.8498
coverage =		 0.9797
mean set size =		 3.5607
median set size =	 3.0



alpha =			 0.05
accuracy =		 0.8498
coverage =		 0.9518
mean set size =		 2.4033
median set size =	 2.0



alpha =			 0.1
accuracy =		 0.8498
coverage =		 0.8904
mean set size =		 1.5704
median set size =	 1.0



alpha =			 0.2
accuracy =		 0.8498
coverage =		 0.7985
mean set size =		 1.0915
median set size =	 1.0
