In [8]:
def mask_score(cm_score):
    positive_or_zero_mask = cm_score >= 0
    negative_mask = cm_score < 0
    cm_score[positive_or_zero_mask] = 1
    cm_score[negative_mask] = 0
    return cm_score

In [7]:
# Image Method

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset, Dataset
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr

from utils.dataset import get_dataset
from utils.model_utils import load_model, get_text_embeddings, get_image_embeddings
import os
import glob
from PIL import Image

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"

datasets_info = [
    {"name": "NWPU-RESISC45", "template": "a satellite photo containing {}."},
    {"name": "Stanford_dogs", "template": "a photo of {}, a type of dog."},
    {"name": "CUB_200_2011", "template": "a photo of {}, a type of bird."},
    {"name": "Flower102", "template": "a photo of {}, a type of flower."},
]

class PseudoImageDataset(Dataset):
    def __init__(self, image_folder, class_names, transform=None):
        self.image_paths = []
        self.labels = []
        self.transform = transform
        self.class_names = class_names

        # Store class_names in a set for faster lookup
        class_name_set = set(class_names)

        for image_path in glob.glob(os.path.join(image_folder, '*.png')):
            filename = os.path.basename(image_path)
            class_name = filename.split('-')[0].replace('_', ' ')
            if class_name in class_name_set:
                self.image_paths.append(image_path)
                self.labels.append(class_name)

        # Convert labels to indices
        self.labels = [class_names.index(label) for label in self.labels]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

def get_image_loader(dataset, class_indices, batch_size, shuffle=False):
    if hasattr(dataset, 'targets'):
        selected_indices = [i for i, label in enumerate(dataset.targets) if label in class_indices]
    else:
        selected_indices = [i for i, (_, label) in enumerate(dataset.imgs) if label in class_indices]
    selected_dataset = Subset(dataset, selected_indices)

    # Ensure no transform or compatible transform is applied
    selected_dataset.dataset.transform = None

    loader = DataLoader(
        selected_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
    )
    return loader

def pairwise_difference(a, b):
  """
  Computes pairwise difference between rows of two matrices.

  Args:
    a: NumPy array of shape (N_a, D)
    b: NumPy array of shape (N_b, D)

  Returns:
    NumPy array of shape (N_a, N_b, D) containing pairwise differences.
  """
  return a[:, np.newaxis, :] - b[np.newaxis, :, :]

def run_experiment(dataset_name, template, use_pseudo_data):
    # 1. Load dataset (test split)
    dataset = get_dataset(dataset_name, data_root='data', split='test')
    class_names = dataset.classes
    num_classes = len(class_names)

    if use_pseudo_data:
        PSEUDO_IMAGES_FOLDER = f'pseudo_images/{dataset_name}'
        pseudo_dataset = PseudoImageDataset(
            image_folder=PSEUDO_IMAGES_FOLDER,
            class_names=class_names,
            transform=None
        )
        dataloader = DataLoader(
            pseudo_dataset,
            batch_size=64,
            shuffle=False,
            pin_memory=True,
            collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
        )
    else:
        selected_class_indices = list(range(num_classes))
        dataloader = get_image_loader(dataset, selected_class_indices, 64, shuffle=False)

    # 2. Load model
    model_info = load_model(model_name, device=device)

    # 3. Create captions and compute text embeddings
    captions = [template.format(c) for c in class_names]
    text_embeddings = get_text_embeddings(captions, model_info, device, batch_size=64)

    # 4. Compute image embeddings
    image_embeddings, all_targets = get_image_embeddings(dataloader, model_info, device)
    all_targets = np.array(all_targets)

    # Normalize embeddings
    image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)

    # img shape: (N_img, D), text shape: (N_text, D)

    # 5. Compute similarities and predictions
    temperature = 0.01
    similarities = np.matmul(image_embeddings, text_embeddings.T) / temperature
    probs = np.exp(similarities) / np.sum(np.exp(similarities), axis=1, keepdims=True)

    preds = np.argmax(probs, axis=1)
    confidences = np.max(probs, axis=1)

    predicted_cm_scores = []
    # do some class-wise operations
    for c in range(num_classes):
        # Select rows of image_embeddings where the target is class c
        c_embeddings = image_embeddings[all_targets == c] # shape (N_c, D)

        other_class_text_embeddings = np.concatenate([text_embeddings[i:i+1] for i in range(num_classes) if i != c], axis=0) # shape (N_t-1, D)
        text_diff = pairwise_difference(text_embeddings[c:c+1], other_class_text_embeddings) # shape (1, N_t-1, D)
        
        c_embeddings = c_embeddings / np.linalg.norm(c_embeddings, axis=-1, keepdims=True)
        text_diff = text_diff / np.linalg.norm(text_diff, axis=-1, keepdims=True)

        cm_scores = np.einsum('cd, td -> ct', c_embeddings, text_diff[0]) # shape (N_c, N_t-1)
        
        # make softmax
        def softmax(x):
            """Compute softmax values for each sets of scores in x."""
            e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
            return e_x / np.sum(e_x, axis=-1, keepdims=True)
        
        def softmin(x):
            """Compute softmin values for each sets of scores in x (with numerical stability)."""
            neg_x = -x
            e_neg_x = np.exp(neg_x - np.max(neg_x, axis=-1, keepdims=True))
            return e_neg_x / np.sum(e_neg_x, axis=-1, keepdims=True)
        
        cm_scores = softmax(1/cm_scores)

        cm_score = 1 / np.max(cm_scores, axis=-1) # shape (N_c, 1)
        # cm_score = cm_score / (1 + cm_score) # shape (N_c, 1)
        cm_score = np.mean(cm_score, axis=0) # shape (1,)
        predicted_cm_scores.append(cm_score)

    # Compute class-wise actual and predicted accuracies
    actual_accuracies = []
    # predicted_accuracies = []
    for c in range(num_classes):
        cur_class_indices = (all_targets == c)
        cur_class_pred = preds[cur_class_indices]
        cur_class_accuracy = (cur_class_pred == c).mean()

        if np.isnan(cur_class_accuracy):
            print(f"Class {c}, {class_names[c]} has NaN accuracy")
            cur_class_accuracy = 0.0
        # predicted_cur_class_indices = (preds == c)
        # predicted_confidences = confidences[predicted_cur_class_indices]
        # predicted_acc = predicted_confidences.mean() if predicted_confidences.size > 0 else np.nan

        actual_accuracies.append(cur_class_accuracy)
        # predicted_accuracies.append(predicted_acc if not np.isnan(predicted_acc) else 0.0)
            
    # Compute Spearman correlation
    spearman_corr = spearmanr(np.array(actual_accuracies), predicted_cm_scores )# np.array(predicted_accuracies))

    # Return data for plotting outside
    return np.array(actual_accuracies), np.array(predicted_cm_scores), spearman_corr

# Now we call run_experiment for pseudo and real data and plot them together
for dinfo in datasets_info:
    actual_pseudo, predicted_pseudo, spearman_pseudo = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=True)
    actual_real, predicted_real, spearman_real = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=False)

    # Recompute pseudo Spearman correlation using actual_real for the y-values
    pseudo_spearman_corr = spearmanr(actual_real, predicted_pseudo)

    # Create a single plot for both pseudo and real
    fig, ax = plt.subplots(figsize=(8,8))
    # For pseudo, use actual_real as y-axis
    ax.scatter(predicted_pseudo, actual_real, color='tab:orange', label='Pseudo Images', alpha=0.5)
    ax.scatter(predicted_real, actual_real, color='tab:blue', label='Real Images', alpha=0.5)

    ax.set_xlabel("Predicted Accuracy (Confidence)")
    ax.set_ylabel("Actual Accuracy (Real Data)")
    pseudo_corr_str = f"{pseudo_spearman_corr.correlation:.2f}" if pseudo_spearman_corr.correlation is not None else "N/A"
    real_corr_str = f"{spearman_real.correlation:.2f}" if spearman_real.correlation is not None else "N/A"
    ax.set_title(f"Calibration Approach: {dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    print(f"Calibration Approach: {dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    ax.grid(True)
    # ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.legend()

    # Save the figure
    plot_filename = f"figures/{dinfo['name']}_CM_softmax.png"
    plt.savefig(plot_filename)
    plt.close(fig)

Processed batch 1/1


Processing Images: 100%|██████████| 25/25 [00:23<00:00,  1.07it/s]


Processed batch 1/1


Processing Images: 100%|██████████| 493/493 [02:22<00:00,  3.46it/s]


Calibration Approach: NWPU-RESISC45
Spearman (Pseudo vs Real Accuracy): 0.48, Spearman (Real vs Real Accuracy): 0.89
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 59/59 [00:50<00:00,  1.18it/s]


Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 322/322 [03:05<00:00,  1.74it/s]


Calibration Approach: Stanford_dogs
Spearman (Pseudo vs Real Accuracy): 0.48, Spearman (Real vs Real Accuracy): 0.82
Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4


Processing Images: 100%|██████████| 98/98 [04:58<00:00,  3.05s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  cur_class_accuracy = (cur_class_pred == c).mean()
  ret = ret.dtype.type(ret / rcount)


Class 93, White breasted Nuthatch has NaN accuracy
Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4


Processing Images: 100%|██████████| 185/185 [02:33<00:00,  1.21it/s]


Calibration Approach: CUB_200_2011
Spearman (Pseudo vs Real Accuracy): nan, Spearman (Real vs Real Accuracy): 0.85
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 51/51 [01:11<00:00,  1.39s/it]


Class 11, black-eyed susan has NaN accuracy
Class 26, colt's foot has NaN accuracy
Class 32, desert-rose has NaN accuracy
Class 44, globe-flower has NaN accuracy
Class 47, hard-leaved pocket orchid has NaN accuracy
Class 72, pink-yellow dahlia has NaN accuracy
Class 79, ruby-lipped cattleya has NaN accuracy
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 128/128 [01:21<00:00,  1.56it/s]


Calibration Approach: Flower102
Spearman (Pseudo vs Real Accuracy): nan, Spearman (Real vs Real Accuracy): 0.79


In [2]:
# Image Method

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset, Dataset
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr

from utils.dataset import get_dataset
from utils.model_utils import load_model, get_text_embeddings, get_image_embeddings
import os
import glob
from PIL import Image

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"

datasets_info = [
    {"name": "NWPU-RESISC45", "template": "a satellite photo containing {}."},
    {"name": "Stanford_dogs", "template": "a photo of {}, a type of dog."},
    {"name": "CUB_200_2011", "template": "a photo of {}, a type of bird."},
    {"name": "Flower102", "template": "a photo of {}, a type of flower."},
]

class PseudoImageDataset(Dataset):
    def __init__(self, image_folder, class_names, transform=None):
        self.image_paths = []
        self.labels = []
        self.transform = transform
        self.class_names = class_names

        # Store class_names in a set for faster lookup
        class_name_set = set(class_names)

        for image_path in glob.glob(os.path.join(image_folder, '*.png')):
            filename = os.path.basename(image_path)
            class_name = filename.split('-')[0].replace('_', ' ')
            if class_name in class_name_set:
                self.image_paths.append(image_path)
                self.labels.append(class_name)

        # Convert labels to indices
        self.labels = [class_names.index(label) for label in self.labels]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

def get_image_loader(dataset, class_indices, batch_size, shuffle=False):
    if hasattr(dataset, 'targets'):
        selected_indices = [i for i, label in enumerate(dataset.targets) if label in class_indices]
    else:
        selected_indices = [i for i, (_, label) in enumerate(dataset.imgs) if label in class_indices]
    selected_dataset = Subset(dataset, selected_indices)

    # Ensure no transform or compatible transform is applied
    selected_dataset.dataset.transform = None

    loader = DataLoader(
        selected_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
    )
    return loader

def pairwise_difference(a, b):
  """
  Computes pairwise difference between rows of two matrices.

  Args:
    a: NumPy array of shape (N_a, D)
    b: NumPy array of shape (N_b, D)

  Returns:
    NumPy array of shape (N_a, N_b, D) containing pairwise differences.
  """
  return a[:, np.newaxis, :] - b[np.newaxis, :, :]

def run_experiment(dataset_name, template, use_pseudo_data):
    # 1. Load dataset (test split)
    dataset = get_dataset(dataset_name, data_root='data', split='test')
    class_names = dataset.classes
    num_classes = len(class_names)

    if use_pseudo_data:
        PSEUDO_IMAGES_FOLDER = f'pseudo_images/{dataset_name}'
        pseudo_dataset = PseudoImageDataset(
            image_folder=PSEUDO_IMAGES_FOLDER,
            class_names=class_names,
            transform=None
        )
        dataloader = DataLoader(
            pseudo_dataset,
            batch_size=64,
            shuffle=False,
            pin_memory=True,
            collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
        )
    else:
        selected_class_indices = list(range(num_classes))
        dataloader = get_image_loader(dataset, selected_class_indices, 64, shuffle=False)

    # 2. Load model
    model_info = load_model(model_name, device=device)

    # 3. Create captions and compute text embeddings
    captions = [template.format(c) for c in class_names]
    text_embeddings = get_text_embeddings(captions, model_info, device, batch_size=64)

    # 4. Compute image embeddings
    image_embeddings, all_targets = get_image_embeddings(dataloader, model_info, device)
    all_targets = np.array(all_targets)

    # Normalize embeddings
    image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)

    # img shape: (N_img, D), text shape: (N_text, D)

    # 5. Compute similarities and predictions
    temperature = 0.01
    similarities = np.matmul(image_embeddings, text_embeddings.T) / temperature
    probs = np.exp(similarities) / np.sum(np.exp(similarities), axis=1, keepdims=True)

    preds = np.argmax(probs, axis=1)
    confidences = np.max(probs, axis=1)

    predicted_cm_scores = []
    # do some class-wise operations
    for c in range(num_classes):
        # Select rows of image_embeddings where the target is class c
        c_embeddings = image_embeddings[all_targets == c] # shape (N_c, D)

        other_class_text_embeddings = np.concatenate([text_embeddings[i:i+1] for i in range(num_classes) if i != c], axis=0) # shape (N_t-1, D)
        text_diff = pairwise_difference(text_embeddings[c:c+1], other_class_text_embeddings) # shape (1, N_t-1, D)
        
        c_embeddings = c_embeddings / np.linalg.norm(c_embeddings, axis=-1, keepdims=True)
        text_diff = text_diff / np.linalg.norm(text_diff, axis=-1, keepdims=True)

        cm_scores = np.einsum('cd, td -> ct', c_embeddings, text_diff[0]) # shape (N_c, N_t-1)
        
        # make softmax
        def softmax(x):
            """Compute softmax values for each sets of scores in x."""
            e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
            return e_x / np.sum(e_x, axis=-1, keepdims=True)
        
        def softmin(x):
            """Compute softmin values for each sets of scores in x (with numerical stability)."""
            neg_x = -x
            e_neg_x = np.exp(neg_x - np.max(neg_x, axis=-1, keepdims=True))
            return e_neg_x / np.sum(e_neg_x, axis=-1, keepdims=True)
        
        cm_scores = softmin(cm_scores)

        cm_score = np.min(cm_scores, axis=-1) # shape (N_c, 1)
        # cm_score = cm_score / (1 + cm_score) # shape (N_c, 1)
        
        cm_score = np.mean(cm_score, axis=0) # shape (1,)
        predicted_cm_scores.append(cm_score)

    # Compute class-wise actual and predicted accuracies
    actual_accuracies = []
    # predicted_accuracies = []
    for c in range(num_classes):
        cur_class_indices = (all_targets == c)
        cur_class_pred = preds[cur_class_indices]
        cur_class_accuracy = (cur_class_pred == c).mean()

        if np.isnan(cur_class_accuracy):
            print(f"Class {c}, {class_names[c]} has NaN accuracy")
            cur_class_accuracy = 0.0
        # predicted_cur_class_indices = (preds == c)
        # predicted_confidences = confidences[predicted_cur_class_indices]
        # predicted_acc = predicted_confidences.mean() if predicted_confidences.size > 0 else np.nan

        actual_accuracies.append(cur_class_accuracy)
        # predicted_accuracies.append(predicted_acc if not np.isnan(predicted_acc) else 0.0)
            
    # Compute Spearman correlation
    spearman_corr = spearmanr(np.array(actual_accuracies), predicted_cm_scores )# np.array(predicted_accuracies))

    # Return data for plotting outside
    return np.array(actual_accuracies), np.array(predicted_cm_scores), spearman_corr

# Now we call run_experiment for pseudo and real data and plot them together
for dinfo in datasets_info:
    actual_pseudo, predicted_pseudo, spearman_pseudo = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=True)
    actual_real, predicted_real, spearman_real = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=False)

    # Recompute pseudo Spearman correlation using actual_real for the y-values
    pseudo_spearman_corr = spearmanr(actual_real, predicted_pseudo)

    # Create a single plot for both pseudo and real
    fig, ax = plt.subplots(figsize=(8,8))
    # For pseudo, use actual_real as y-axis
    ax.scatter(predicted_pseudo, actual_real, color='tab:orange', label='Pseudo Images', alpha=0.5)
    ax.scatter(predicted_real, actual_real, color='tab:blue', label='Real Images', alpha=0.5)

    ax.set_xlabel("Predicted Accuracy (Confidence)")
    ax.set_ylabel("Actual Accuracy (Real Data)")
    pseudo_corr_str = f"{pseudo_spearman_corr.correlation:.2f}" if pseudo_spearman_corr.correlation is not None else "N/A"
    real_corr_str = f"{spearman_real.correlation:.2f}" if spearman_real.correlation is not None else "N/A"
    ax.set_title(f"Calibration Approach: {dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    print(f"Calibration Approach: {dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    ax.grid(True)
    # ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.legend()

    # Save the figure
    plot_filename = f"figures/{dinfo['name']}_CM_softmin.png"
    plt.savefig(plot_filename)
    plt.close(fig)

Processed batch 1/1


Processing Images: 100%|██████████| 25/25 [00:23<00:00,  1.08it/s]


Processed batch 1/1


Processing Images: 100%|██████████| 493/493 [04:20<00:00,  1.89it/s]


Calibration Approach: NWPU-RESISC45
Spearman (Pseudo vs Real Accuracy): 0.22, Spearman (Real vs Real Accuracy): 0.40
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 59/59 [00:54<00:00,  1.09it/s]


Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 322/322 [03:19<00:00,  1.61it/s]


Calibration Approach: Stanford_dogs
Spearman (Pseudo vs Real Accuracy): 0.23, Spearman (Real vs Real Accuracy): 0.47


In [5]:
# Image Method

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Subset, Dataset
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr

from utils.dataset import get_dataset
from utils.model_utils import load_model, get_text_embeddings, get_image_embeddings
import os
import glob
from PIL import Image

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"

datasets_info = [
    # {"name": "NWPU-RESISC45", "template": "a satellite photo containing {}."},
    # {"name": "Stanford_dogs", "template": "a photo of {}, a type of dog."},
    {"name": "CUB_200_2011", "template": "a photo of {}, a type of bird."},
    {"name": "Flower102", "template": "a photo of {}, a type of flower."},
]

class PseudoImageDataset(Dataset):
    def __init__(self, image_folder, class_names, transform=None):
        self.image_paths = []
        self.labels = []
        self.transform = transform
        self.class_names = class_names

        # Store class_names in a set for faster lookup
        class_name_set = set(class_names)

        for image_path in glob.glob(os.path.join(image_folder, '*.png')):
            filename = os.path.basename(image_path)
            class_name = filename.split('-')[0].replace('_', ' ')
            if class_name in class_name_set:
                self.image_paths.append(image_path)
                self.labels.append(class_name)

        # Convert labels to indices
        self.labels = [class_names.index(label) for label in self.labels]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

def get_image_loader(dataset, class_indices, batch_size, shuffle=False):
    if hasattr(dataset, 'targets'):
        selected_indices = [i for i, label in enumerate(dataset.targets) if label in class_indices]
    else:
        selected_indices = [i for i, (_, label) in enumerate(dataset.imgs) if label in class_indices]
    selected_dataset = Subset(dataset, selected_indices)

    # Ensure no transform or compatible transform is applied
    selected_dataset.dataset.transform = None

    loader = DataLoader(
        selected_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
    )
    return loader

def pairwise_difference(a, b):
  """
  Computes pairwise difference between rows of two matrices.

  Args:
    a: NumPy array of shape (N_a, D)
    b: NumPy array of shape (N_b, D)

  Returns:
    NumPy array of shape (N_a, N_b, D) containing pairwise differences.
  """
  return a[:, np.newaxis, :] - b[np.newaxis, :, :]

def run_experiment(dataset_name, template, use_pseudo_data):
    # 1. Load dataset (test split)
    dataset = get_dataset(dataset_name, data_root='data', split='test')
    class_names = dataset.classes
    num_classes = len(class_names)

    if use_pseudo_data:
        PSEUDO_IMAGES_FOLDER = f'pseudo_images/{dataset_name}'
        pseudo_dataset = PseudoImageDataset(
            image_folder=PSEUDO_IMAGES_FOLDER,
            class_names=class_names,
            transform=None
        )
        dataloader = DataLoader(
            pseudo_dataset,
            batch_size=64,
            shuffle=False,
            pin_memory=True,
            collate_fn=lambda batch: (list(zip(*batch))[0], list(zip(*batch))[1])
        )
    else:
        selected_class_indices = list(range(num_classes))
        dataloader = get_image_loader(dataset, selected_class_indices, 64, shuffle=False)

    # 2. Load model
    model_info = load_model(model_name, device=device)

    # 3. Create captions and compute text embeddings
    captions = [template.format(c) for c in class_names]
    text_embeddings = get_text_embeddings(captions, model_info, device, batch_size=64)

    # 4. Compute image embeddings
    image_embeddings, all_targets = get_image_embeddings(dataloader, model_info, device)
    all_targets = np.array(all_targets)

    # Normalize embeddings
    image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)

    # img shape: (N_img, D), text shape: (N_text, D)

    # 5. Compute similarities and predictions
    temperature = 0.01
    similarities = np.matmul(image_embeddings, text_embeddings.T) / temperature
    probs = np.exp(similarities) / np.sum(np.exp(similarities), axis=1, keepdims=True)

    preds = np.argmax(probs, axis=1)
    confidences = np.max(probs, axis=1)

    predicted_cm_scores = []
    # do some class-wise operations
    for c in range(num_classes):
        # Select rows of image_embeddings where the target is class c
        c_embeddings = image_embeddings[all_targets == c] # shape (N_c, D)

        other_class_text_embeddings = np.concatenate([text_embeddings[i:i+1] for i in range(num_classes) if i != c], axis=0) # shape (N_t-1, D)
        text_diff = pairwise_difference(text_embeddings[c:c+1], other_class_text_embeddings) # shape (1, N_t-1, D)
        
        c_embeddings = c_embeddings / np.linalg.norm(c_embeddings, axis=-1, keepdims=True)
        text_diff = text_diff / np.linalg.norm(text_diff, axis=-1, keepdims=True)

        cm_scores = np.einsum('cd, td -> ct', c_embeddings, text_diff[0]) # shape (N_c, N_t-1)
        
        # make softmax
        def softmax(x):
            """Compute softmax values for each sets of scores in x."""
            e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
            return e_x / np.sum(e_x, axis=1, keepdims=True)
        
        def softmin(x):
            """Compute softmin values for each sets of scores in x (with numerical stability)."""
            neg_x = -x
            e_neg_x = np.exp(neg_x - np.max(neg_x, axis=1, keepdims=True))
            return e_neg_x / np.sum(e_neg_x, axis=1, keepdims=True)
        
        # cm_scores = softmax(1/cm_scores)

        cm_score = np.min(cm_scores, axis=-1) # shape (N_c, 1)
        # # cm_score = cm_score / (1 + cm_score) # shape (N_c, 1)
        
        def mask_score(cm_score):
            positive_or_zero_mask = cm_score >= 0
            negative_mask = cm_score < 0
            cm_score[positive_or_zero_mask] = 1
            cm_score[negative_mask] = 0
            return cm_score

        cm_score = mask_score(cm_score)
        cm_score = np.mean(cm_score, axis=0) # shape (1,)

        if np.isnan(cm_score):
            print(f"Class {c}, {class_names[c]} has NaN cm_score")
            cm_score = 0.0

        predicted_cm_scores.append(cm_score)

    # Compute class-wise actual and predicted accuracies
    actual_accuracies = []
    # predicted_accuracies = []
    for c in range(num_classes):
        cur_class_indices = (all_targets == c)
        cur_class_pred = preds[cur_class_indices]
        cur_class_accuracy = (cur_class_pred == c).mean()

        if np.isnan(cur_class_accuracy):
            print(f"Class {c}, {class_names[c]} has NaN accuracy")
            cur_class_accuracy = 0.0
        # predicted_cur_class_indices = (preds == c)
        # predicted_confidences = confidences[predicted_cur_class_indices]
        # predicted_acc = predicted_confidences.mean() if predicted_confidences.size > 0 else np.nan

        actual_accuracies.append(cur_class_accuracy)
        # predicted_accuracies.append(predicted_acc if not np.isnan(predicted_acc) else 0.0)
            
    # Compute Spearman correlation
    spearman_corr = spearmanr(np.array(actual_accuracies), predicted_cm_scores )# np.array(predicted_accuracies))

    # Return data for plotting outside
    return np.array(actual_accuracies), np.array(predicted_cm_scores), spearman_corr

# Now we call run_experiment for pseudo and real data and plot them together
for dinfo in datasets_info:
    actual_pseudo, predicted_pseudo, spearman_pseudo = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=True)
    actual_real, predicted_real, spearman_real = run_experiment(dinfo["name"], dinfo["template"], use_pseudo_data=False)

    # Recompute pseudo Spearman correlation using actual_real for the y-values
    pseudo_spearman_corr = spearmanr(actual_real, predicted_pseudo)

    # Create a single plot for both pseudo and real
    fig, ax = plt.subplots(figsize=(8,8))
    # For pseudo, use actual_real as y-axis
    ax.scatter(predicted_pseudo, actual_real, color='tab:orange', label='Pseudo Images', alpha=0.5)
    ax.scatter(predicted_real, actual_real, color='tab:blue', label='Real Images', alpha=0.5)

    ax.set_xlabel("Predicted Accuracy (Confidence)")
    ax.set_ylabel("Actual Accuracy (Real Data)")
    pseudo_corr_str = f"{pseudo_spearman_corr.correlation:.2f}" if pseudo_spearman_corr.correlation is not None else "N/A"
    real_corr_str = f"{spearman_real.correlation:.2f}" if spearman_real.correlation is not None else "N/A"
    ax.set_title(f"{dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    print(f"{dinfo['name']}\nSpearman (Pseudo vs Real Accuracy): {pseudo_corr_str}, Spearman (Real vs Real Accuracy): {real_corr_str}")
    ax.grid(True)
    # ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.legend()

    # Save the figure
    plot_filename = f"figures/{dinfo['name']}_CM_count.png"
    plt.savefig(plot_filename)
    plt.close(fig)

Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4


Processing Images: 100%|██████████| 98/98 [04:33<00:00,  2.79s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Class 93, White breasted Nuthatch has NaN cm_score
Class 93, White breasted Nuthatch has NaN accuracy


  cur_class_accuracy = (cur_class_pred == c).mean()
  ret = ret.dtype.type(ret / rcount)


Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4


Processing Images: 100%|██████████| 185/185 [02:26<00:00,  1.26it/s]


CUB_200_2011
Spearman (Pseudo vs Real Accuracy): 0.73, Spearman (Real vs Real Accuracy): 1.00
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 51/51 [00:49<00:00,  1.03it/s]


Class 11, black-eyed susan has NaN cm_score
Class 26, colt's foot has NaN cm_score
Class 32, desert-rose has NaN cm_score
Class 44, globe-flower has NaN cm_score
Class 47, hard-leaved pocket orchid has NaN cm_score
Class 72, pink-yellow dahlia has NaN cm_score
Class 79, ruby-lipped cattleya has NaN cm_score
Class 11, black-eyed susan has NaN accuracy
Class 26, colt's foot has NaN accuracy
Class 32, desert-rose has NaN accuracy
Class 44, globe-flower has NaN accuracy
Class 47, hard-leaved pocket orchid has NaN accuracy
Class 72, pink-yellow dahlia has NaN accuracy
Class 79, ruby-lipped cattleya has NaN accuracy
Processed batch 1/2
Processed batch 2/2


Processing Images: 100%|██████████| 128/128 [01:16<00:00,  1.67it/s]


Flower102
Spearman (Pseudo vs Real Accuracy): 0.48, Spearman (Real vs Real Accuracy): 1.00


In [10]:
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Dataset", "Spearman (Pseudo vs Real) - Discretization Method", "Spearman (Pseudo vs Real) - Default method", "Spearman (Real vs Real) - Discretization Method", "Spearman (Real vs Real) - Default method"]
table.add_row(["Stanford_dogs", "0.59", "**0.62**", "**1.00**", "0.98"])
table.add_row(["NWPU-RESISC45", "**0.61**", "0.58", "**1.00**", "0.98"])
table.add_row(["Flower102", "**0.48**", "0.35", "**1.00**", "0.96"])
table.add_row(["CUB_200_2011", "0.73", "**0.76**", "**1.00**", "0.99"])

print(table)

+---------------+---------------------------------------------------+--------------------------------------------+-------------------------------------------------+------------------------------------------+
|    Dataset    | Spearman (Pseudo vs Real) - Discretization Method | Spearman (Pseudo vs Real) - Default method | Spearman (Real vs Real) - Discretization Method | Spearman (Real vs Real) - Default method |
+---------------+---------------------------------------------------+--------------------------------------------+-------------------------------------------------+------------------------------------------+
| Stanford_dogs |                        0.59                       |                  **0.62**                  |                     **1.00**                    |                   0.98                   |
| NWPU-RESISC45 |                      **0.61**                     |                    0.58                    |                     **1.00**                    |    