# script used to estimate model performance on benchmark w/ a linear readout

# environment setup

In [None]:
import pickle, pandas, os, json, numpy as np
from PIL import Image
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import torch
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [None]:
! pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.24.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting timm (from open_clip_torch)
  Downloading timm-1.0.3-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy, timm, open_clip_torch
Successfully installed ftfy-6.2.0 open_clip_torch-2.24.0 timm-1.0.3


# define common classes to extract features from DINOv2, CLIP, and MAE models

In [None]:
from transformers import AutoImageProcessor, ViTMAEModel
from transformers import AutoImageProcessor, AutoModel
import open_clip

In [None]:
class define_clip:
    """
    Wrap an open_clip image encoder behind a single `extract` method.

    use: X = define_clip(modelname, pretrained_tag)
         X.extract(image)
         e.g. define_clip('ViT-B-32', 'laion2b_s34b_b79k').extract(image)

    """
    def __init__(self, model, dataset):

        # keep the factory, the architecture name and the pretrained tag around
        self.make = open_clip.create_model_and_transforms
        self.name = model
        self.dataset = dataset
        # the factory returns three values; the second one is not used here
        encoder, _unused, transform = self.make(model, pretrained=dataset)
        self.model = encoder
        self.prep = transform
        # inference mode
        self.model.eval()

    def extract(self, images):
        """Return the encoder output for one image as a squeezed numpy array."""
        # gradients are never needed for feature extraction
        with torch.no_grad():
            # preprocess, then add a leading batch dimension
            batch = self.prep(images).unsqueeze(0)
            encoded = self.model.encode_image(batch)
        as_array = encoded.detach().numpy()
        return np.squeeze(as_array)

class define_dino:

    """
    Feature extractor for checkpoints loaded from "facebook/<model>".
    # https://huggingface.co/docs/transformers/model_doc/dinov2

    use: define_dino('dinov2-base').extract(image)
    """

    def __init__(self, model):

        # model weights and image processor come from the same checkpoint name
        checkpoint = "facebook/%s" % model
        self.model = AutoModel.from_pretrained(checkpoint)
        self.prep = AutoImageProcessor.from_pretrained(checkpoint)
        # inference mode
        self.model.eval()

    def extract(self, images):
        """Return the model's `pooler_output` as a numpy array (not squeezed)."""
        with torch.no_grad():
            batch = self.prep(images=images, return_tensors="pt")
            outputs = self.model(**batch)
            pooled = outputs.pooler_output.detach().numpy()
        return pooled

class define_mae:

    """
    extract the first token (index 0) of the last hidden state from maes
    from: https://github.com/facebookresearch/mae/blob/main/models_mae.py#L161

    NOTE(review): the model is loaded with its default config; ViTMAE configs
    presumably apply random patch masking by default, which would make these
    features stochastic -- confirm whether that is intended here.
    """

    def __init__(self, model):

        # model weights and image processor come from the same checkpoint name
        checkpoint = "facebook/%s" % model
        self.model = ViTMAEModel.from_pretrained(checkpoint)
        self.prep = AutoImageProcessor.from_pretrained(checkpoint)
        # inference mode
        self.model.eval()


    def extract(self, images):
        """Return token 0 of `last_hidden_state` as a squeezed numpy array."""
        with torch.no_grad():
            batch = self.prep(images=images, return_tensors="pt")
            hidden = self.model(**batch).last_hidden_state
        # keep only the first token along the sequence axis
        first_token = hidden[:, 0, :]
        return first_token.detach().numpy().squeeze()

# define functions to extract features and generate same-different training data

In [None]:
def extract_features0(df, model, single=True):
  """
  Open every image referenced by the rows of `df` and run `model` on them.

  df:     dataframe whose rows provide `.trial`, `.oddity_index` and `.images`
          (`.images` is parsed with `extract_list`)
  model:  callable returning features for one image (single=True) or for a
          list of images (single=False)
  single: if True call `model` once per image and flatten each result,
          otherwise call it once on the list of all opened images

  returns (features, oddities, images):
    features: np.array(...).squeeze() of whatever the model produced
    oddities: dict trial -> oddity_index
    images:   dict '<trial>-<position in trial>' -> opened PIL image

  Image files are looked up under the module-level `image_directory`.
  """

  images = {}
  oddities = {}
  features = []

  for _, row in df.iterrows():

    # remember which image position is the odd one out for this trial
    oddities[row.trial] = row.oddity_index

    for position, filename in enumerate(extract_list(row.images)):
      # open image
      picture = Image.open(os.path.join(image_directory, filename))
      # per-image feature extraction
      if single:
        features.append(model(picture).flatten())
      # store image under '<trial>-<position>'
      images['%s-%s' % (row.trial, position)] = picture

  # batched feature extraction over everything that was opened
  if not single:
    features = model(list(images.values()))

  return np.array(features).squeeze(), oddities, images

def generate_delta_vectors0(trials, features, oddities, images):
  """
  Build same/different training data from per-image feature vectors.

  For every trial, each unordered pair (i < j) of its images contributes one
  delta vector (features_i - features_j). The pair is labelled 0 ('different')
  when either member is the trial's oddity, and 1 ('same') otherwise.

  trials:   iterable of trial identifiers
  features: array with one row per key of `images`, in the same order
  oddities: dict trial -> position of the odd image within that trial
  images:   mapping keyed '<trial>-<position>'; only the keys are used

  returns a dict with
    'deltas': 2-D array, one row per pair (all trials concatenated)
    'labels': 1-D array of 0/1, aligned with 'deltas'
    'trials': list with the trial id of every row of 'deltas'
    'inds':   array of image positions 0..n-1, taken from the LAST trial
    'locs':   dict position -> row offsets (within one trial's block of pairs)
              of the pairs that contain that position, also from the last
              trial; valid for every trial only if all trials contain the
              same number of images
  """

  def delta(a, b, xtype='diff'):
    # supported ways of combining two feature vectors
    if xtype == 'diff':
      return a - b
    if xtype == 'abs':
      return np.abs(a - b)
    if xtype == 'sqrt':
      return np.sqrt(np.abs(a - b))
    if xtype == 'sqr':
      return np.abs(a - b) ** 2
    if xtype == 'product':
      return a * b
    # previously fell through and failed with UnboundLocalError
    raise ValueError('unknown delta type: %r' % (xtype,))

  features = np.asarray(features)

  # group feature rows by trial; split on the LAST hyphen so that trial ids
  # that themselves contain '-' are still recovered correctly
  rows_by_trial = {}
  for row, key in enumerate(images):
    rows_by_trial.setdefault(key.rsplit('-', 1)[0], []).append(row)

  trial_markers = []
  deltas = []
  labels = []

  # defined up-front so an empty `trials` does not raise NameError below
  n_images = 0
  pairs = []

  for i_trial in trials:

    trial_rows = np.array(rows_by_trial.get(i_trial, []), dtype=int)
    i_features = features[trial_rows]
    oddity_index = oddities[i_trial]

    # all unordered pairs, in the same order as a nested i < j loop
    n_images = len(i_features)
    pairs = [(a, b) for a in range(n_images) for b in range(a + 1, n_images)]

    for a, b in pairs:
      deltas.append(delta(i_features[a], i_features[b]))
      trial_markers.append(i_trial)
      # 0: pair involves the oddity ('different'), 1: it does not ('same')
      labels.append(0 if (a == oddity_index) or (b == oddity_index) else 1)

  all_inds = np.arange(n_images)

  # explicit (n_pairs, 2) table; the reshape keeps it 2-D even with no pairs
  pair_table = np.asarray(pairs, dtype=int).reshape(-1, 2)
  location_of_indices = {i: np.nonzero((pair_table == i).any(axis=1))[0]
                         for i in all_inds}

  # always 2-D (one row per pair), even when there is only a single pair
  if deltas:
    delta_matrix = np.asarray(deltas).reshape(len(deltas), -1)
  else:
    delta_matrix = np.empty((0, 0))

  info = {'deltas': delta_matrix,
          'labels': np.array(labels),
          'trials': trial_markers,
          'inds': all_inds,
          'locs': location_of_indices}

  return info

def extract_list(l):
  """
  Parse a string such as "['a.png', 'b.png']" into ['a.png', 'b.png'].

  The first and last character of `l` are dropped, the remainder is split on
  ', ', and the first and last character of every piece are dropped as well.
  """
  inner = l[1:-1]
  names = []
  for token in inner.split(', '):
    names.append(token[1:-1])
  return names

# define model to evaluate

In [None]:
# checkpoint names handled by define_dino (loaded as "facebook/<name>")
dinos = [
    'dinov2-base',
    'dinov2-large',
    'dinov2-giant',
]

# checkpoint names handled by define_mae (loaded as "facebook/<name>")
maes = [
    'vit-mae-base',
    'vit-mae-large',
    'vit-mae-huge',
]

# open_clip architecture name -> pretrained tag passed to define_clip
clips = dict([
    ('ViT-B-32', 'laion2b_s34b_b79k'),
    ('ViT-B-16', 'laion2b_s34b_b88k'),
    ('ViT-L-14', 'laion2b_s32b_b82k'),
    ('ViT-H-14', 'laion2b_s32b_b79k'),
    ('ViT-g-14', 'laion2b_s12b_b42k'),
])

In [None]:
# names of the models to run; each must appear in `dinos`, `maes` or `clips`
models_to_evaluate = ['dinov2-base']

n_params = {}
# model name -> feature-extraction callable (the wrapper's bound `extract`)
models = {}
n_flops = {}

for i_model in models_to_evaluate:

  if i_model in dinos:
    models[i_model] = define_dino(i_model).extract
  elif i_model in maes:
    models[i_model] = define_mae(i_model).extract
  elif i_model in clips:
    models[i_model] = define_clip(i_model, clips[i_model]).extract
  else:
    # an unrecognised name used to be skipped silently, so a typo produced
    # no results and no error
    raise ValueError('unknown model %r: expected one of %s'
                     % (i_model, sorted(list(dinos) + list(maes) + list(clips))))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

# set paths to images + load metadata

In [None]:
# mount google drive so the benchmark files are reachable from this runtime
from google.colab import drive
drive.mount('/content/gdrive')

# locations of the benchmark images and metadata (relative to the working directory)
base_directory = './gdrive/MyDrive/neurips_benchmark/for_reviewers/'
image_directory = os.path.join(base_directory, 'images')
data_directory = os.path.join(base_directory, 'data')

# load trial metadata; the unnamed first csv column is exposed as 'idx'
df = pandas.read_csv(os.path.join(data_directory, 'benchmark.csv'))
df = df.rename(columns={'Unnamed: 0': 'idx'})

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Leave-one-trial-out evaluation of a same/different readout.
# For every (dataset, condition, model): extract features, build pairwise delta
# vectors, then for each trial train an SVM on random subsets of the OTHER
# trials' deltas and check whether the image it rates as most 'different'
# is the true oddity. Per-trial summary statistics are written back into `df`.

# number of independent splits used to train the svm on each trial
n_permutations = 100
# fraction of data to include in each split
subset_fraction = 0.75

# iterate through each dataset
for i_dataset in df.dataset.unique():

  # iterate through each condition
  for i_condition in df[df.dataset==i_dataset].condition.unique():
    # select subset of data to train and test on; filter on dataset AND
    # condition so datasets that share a condition name are never mixed
    idf = df[(df.dataset==i_dataset) & (df.condition==i_condition)]

    # identify all trials
    trials0 = idf.trial.unique()

    for i_model in list(models):

      # extract features from this dataset
      features0, oddities0, images0 = extract_features0(idf, models[i_model])

      # for each trial, generate vector of differences between each image
      diffs = generate_delta_vectors0(trials0, features0, oddities0, images0)

      # trial id of every delta vector; object dtype keeps '==' elementwise
      delta_trials = np.asarray(diffs['trials'], dtype=object)

      for i_index, i_trial in idf.iterrows():

        # determine the index/image of the oddity
        i_oddity_index = oddities0[i_trial.trial]

        # delta vectors belonging to the held-out trial vs. all other trials
        # (diffs['trials'] already holds plain trial ids, so compare directly)
        test_mask = delta_trials == i_trial.trial
        train_mask = ~test_mask

        # vectors / same-different labels used to train the svm
        X_train = diffs['deltas'][train_mask]
        y_train = diffs['labels'][train_mask]

        # vectors / same-different labels of the held-out trial
        X_test = diffs['deltas'][test_mask]
        y_test = diffs['labels'][test_mask]

        # for each iteration use X% of the available training vectors
        len_subset = int(subset_fraction*len(X_train))

        # outcome (correct / incorrect) of each iteration
        choices = []

        for _ in range(n_permutations):

          # identify random subset of delta vectors to train on
          random_subset = np.random.permutation(len(X_train))[:len_subset]

          # define model used as the readout
          # NOTE(review): SVC is built without `kernel=`, so it uses sklearn's
          # default kernel rather than an explicitly linear one -- confirm
          # that this matches the intended "linear readout"
          clf = make_pipeline(StandardScaler(),
                              SVC(class_weight='balanced', probability=True))
          # fit training data
          clf.fit(X_train[random_subset], y_train[random_subset])
          # class probabilities for every pair of the held-out trial
          y_hat = clf.predict_proba(X_test)

          # for each image, average P(label 0 = 'different') over its pairs
          i_diffs = [y_hat[:,0][diffs['locs'][i]].mean() for i in diffs['inds']]
          # determine whether the model-selected oddity matches ground truth
          i_trial_accuracy = i_oddity_index == np.argmax(i_diffs)
          # save
          choices.append(i_trial_accuracy)

        # store in original dataframe, addressed by the row's own index label
        # (not by the csv-derived 'idx' column)
        df.loc[i_index,'%s_svm_avg'%i_model] = np.mean(choices)
        df.loc[i_index,'%s_svm_med'%i_model] = np.median(choices)
        df.loc[i_index,'%s_svm_std'%i_model] = np.std(choices)
        df.loc[i_index,'%s_svm_sem'%i_model] = stats.sem(choices)
        # save after every trial so partial results survive an interrupted run
        df.to_csv(os.path.join(data_directory, 'your_results.csv'))