In [9]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel


In [10]:
# Load the pretrained CLIP model and its matching processor from one checkpoint id,
# so the two can never drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


In [11]:
# Preprocessing applied to every CIFAR-10 image before it reaches the data loaders.
CLIP_IMAGE_SIZE = (224, 224)  # CLIP expects 224x224 images
CLIP_NORM_MEAN = [0.4815, 0.4578, 0.4082]  # CLIP normalization (per-channel mean)
CLIP_NORM_STD = [0.2686, 0.2613, 0.2757]  # CLIP normalization (per-channel std)

_preprocess_steps = [
    transforms.Resize(CLIP_IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_NORM_MEAN, std=CLIP_NORM_STD),
]
transform = transforms.Compose(_preprocess_steps)


In [29]:
from torch.utils.data import Subset
# CIFAR-10 train/test splits; every image goes through `transform` on access.
cifar_data_train = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
cifar_data_test = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

# Small fixed-index subsets keep the demo fast and deterministic (shuffle=False below).
calib_data = Subset(cifar_data_train, range(250))  # for demonstration purposes
# train_data = Subset(cifar_data_train, range(250, 1000))  # for demonstration purposes
test_data = Subset(cifar_data_test, range(500))

calib_loader = DataLoader(calib_data, batch_size=32, shuffle=False)
# The test loader must iterate the held-out test subset; it previously wrapped
# `calib_data`, so any "test" pass silently re-read the calibration images.
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [30]:
# `cifar_data` is never defined in this notebook (only `cifar_data_train` and
# `cifar_data_test` are), so read the class names from the train split.
class_names = cifar_data_train.classes  # CIFAR-10 class names

# Tokenise the class names once; the resulting tensors are reused as the text side
# of every CLIP forward pass. padding=True pads the prompts to a common length.
text_inputs = processor(text=class_names, return_tensors="pt", padding=True)


In [31]:
# Inspect the class-name list that was passed to the processor as text prompts.
class_names

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def denormalize(
    img: torch.Tensor,
    mean: "list[float] | torch.Tensor",
    std: "list[float] | torch.Tensor",
) -> torch.Tensor:
    """Invert a per-channel normalisation: return ``img * std + mean``.

    Args:
        img: Image tensor whose channel axis is third from last, i.e. ``(C, H, W)``
            or ``(N, C, H, W)``.
        mean: Per-channel means, as a sequence of floats or a 1-D tensor of length C.
        std: Per-channel standard deviations, same form as ``mean``.

    Returns:
        A new tensor with the same shape as ``img``; ``img`` is not modified.
    """
    # as_tensor accepts both lists and tensors without the copy-construct warning
    # that torch.tensor(<Tensor>) emits, and places the stats on img's device.
    # reshape(-1, 1, 1) broadcasts over H and W for any channel count.
    mean_t = torch.as_tensor(mean, device=img.device).reshape(-1, 1, 1)
    std_t = torch.as_tensor(std, device=img.device).reshape(-1, 1, 1)
    return img * std_t + mean_t

# Calibration pass: run CLIP zero-shot classification over the calibration loader and
# record, for each image, the logit CLIP assigns to its ground-truth class.
# NOTE(review): a larger true-class logit means a *better* fit, so these values act as
# conformity rather than nonconformity scores -- confirm how they are thresholded later.

MAX_CALIB_BATCHES = 1  # demo setting: stop after this many batches; None = whole loader

all_labels = []
all_predictions = []
scores = []

to_pil = transforms.ToPILImage()
image_mean = processor.image_processor.image_mean
image_std = processor.image_processor.image_std

with torch.no_grad():  # inference only: skip building the autograd graph
    for batch_idx, (images, labels) in enumerate(calib_loader):
        # The loader yields already-normalised tensors, while the processor expects raw
        # images: undo the normalisation and convert back to PIL. The clamp keeps values
        # inside [0, 1] before ToPILImage converts them to uint8.
        pil_images = [
            to_pil(denormalize(img, image_mean, image_std).clamp(0, 1))
            for img in images
        ]

        # Process images using CLIP's processor (automatically normalizes them)
        inputs = processor(images=pil_images, return_tensors="pt")

        outputs = model(**inputs, **text_inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
        predictions = probs.argmax(dim=1)

        all_labels.extend(labels.tolist())
        all_predictions.extend(predictions.tolist())

        # `labels` is already a tensor, so index with it directly (torch.tensor(labels)
        # triggers a copy-construct warning). squeeze(-1) drops only the gathered column,
        # so a batch of one still yields a list rather than a bare float.
        true_class_logits = logits_per_image.take_along_dim(labels.unsqueeze(-1), dim=1).squeeze(-1)
        scores.extend(true_class_logits.tolist())

        if MAX_CALIB_BATCHES is not None and batch_idx + 1 >= MAX_CALIB_BATCHES:
            break


print(scores)



[25.365989685058594, 26.455493927001953, 25.2419376373291, 25.60202407836914, 25.934114456176758, 27.183713912963867, 26.499649047851562, 29.578855514526367, 28.320209503173828, 28.197999954223633, 24.66594696044922, 25.592741012573242, 27.471712112426758, 28.72464370727539, 26.64839744567871, 28.159019470214844, 25.213783264160156, 27.23236083984375, 28.481182098388672, 29.226364135742188, 29.374387741088867, 26.899513244628906, 23.37452507019043, 25.62276840209961, 24.326765060424805, 28.5065975189209, 26.390464782714844, 25.771800994873047, 29.074600219726562, 21.49118995666504, 24.411808013916016, 27.261476516723633]


  scores += logits_per_image.take_along_dim(torch.tensor(labels).unsqueeze(-1),dim=1).squeeze().tolist()


In [None]:
alpha = 0.05  # 1 - alpha is the quantile level taken over the calibration scores
n = len(scores)
if n == 0:
    raise ValueError("`scores` is empty; run the calibration cell before computing the quantile.")

# Compute the quantile for the calibration scores.
# NOTE(review): `scores` holds true-class logits (larger = better fit) and `n` is not
# used for a finite-sample correction such as ceil((n + 1) * (1 - alpha)) / n --
# confirm that the plain upper (1 - alpha) quantile is the intended threshold.
quantile = torch.quantile(torch.tensor(scores), 1 - alpha)

print("Quantile:", quantile.item())

MAX_TEST_BATCHES = 1  # demo setting: stop after this many batches; None = whole loader

# Test-pass results go into their own lists. Appending them to `scores`, `all_labels`
# or `all_predictions` would mix test data into the calibration results and change the
# quantile every time this cell is re-run.
test_labels = []
test_predictions = []
test_scores = []

to_pil = transforms.ToPILImage()
image_mean = processor.image_processor.image_mean
image_std = processor.image_processor.image_std

with torch.no_grad():  # inference only: skip building the autograd graph
    for batch_idx, (images, labels) in enumerate(test_loader):
        # Undo the loader's normalisation and rebuild PIL images for the processor; the
        # clamp keeps values inside [0, 1] before ToPILImage converts them to uint8.
        pil_images = [
            to_pil(denormalize(img, image_mean, image_std).clamp(0, 1))
            for img in images
        ]

        # Process images using CLIP's processor (automatically normalizes them)
        inputs = processor(images=pil_images, return_tensors="pt")

        outputs = model(**inputs, **text_inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
        predictions = probs.argmax(dim=1)

        test_labels.extend(labels.tolist())
        test_predictions.extend(predictions.tolist())

        # Index with the label tensor directly and squeeze only the gathered column so
        # a batch of one still produces a list.
        true_class_logits = logits_per_image.take_along_dim(labels.unsqueeze(-1), dim=1).squeeze(-1)
        test_scores.extend(true_class_logits.tolist())

        if MAX_TEST_BATCHES is not None and batch_idx + 1 >= MAX_TEST_BATCHES:
            break


Quantile: 29.2929744720459


In [None]:

# Zero-shot accuracy check on the calibration loader (first batch only).
all_labels = []
all_predictions = []

to_pil = transforms.ToPILImage()
image_mean = processor.image_processor.image_mean
image_std = processor.image_processor.image_std

with torch.no_grad():  # inference only: skip building the autograd graph
    for images, labels in calib_loader:
        # Undo the loader's normalisation and rebuild PIL images for the processor; the
        # clamp keeps values inside [0, 1] before ToPILImage converts them to uint8.
        pil_images = [
            to_pil(denormalize(img, image_mean, image_std).clamp(0, 1))
            for img in images
        ]

        # Process images using CLIP's processor (automatically normalizes them)
        inputs = processor(images=pil_images, return_tensors="pt")

        outputs = model(**inputs, **text_inputs)
        logits_per_image = outputs.logits_per_image  # Image-to-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # Convert to probabilities
        predictions = probs.argmax(dim=1)

        all_labels.extend(labels.tolist())
        all_predictions.extend(predictions.tolist())

        # Logit of the ground-truth class for each image. The attribute is
        # `logits_per_image`, and take_along_dim takes (indices, dim) with indices of
        # the same rank as the input, hence the unsqueeze.
        print(logits_per_image.take_along_dim(labels.unsqueeze(-1), dim=1))
        break  # demo: evaluate the first batch only


print(all_labels, all_predictions)
print(accuracy_score(all_labels, all_predictions))