In [1]:
### System Requirement ###
# At least 32 GB of CPU RAM

# Set up the data needed for the experiments in playground.ipynb

In [None]:
# Configuration shared by the cells of this notebook; the values are
# interpolated into the `!python -m utils.scripts...` commands below.
dataset = "imagenet"
device = 'cuda'
model_name = 'ViT-L-14-336' # 'ViT-H-14'
seed = 0
num_last_layers_ = 4
subset_dim = 10  # forwarded as --samples_per_class
tot_samples_per_class = 50
dataset_text_name = "top_1500_nouns_5_sentences_imagenet_clean"
# NOTE: the misspelled name is kept for backward compatibility; a correctly
# spelled alias is provided right after it.
datataset_image_name = "imagenet"
dataset_image_name = datataset_image_name
algorithm = "svd_data_approx"
path = './datasets'
# Additional params for LLAVA
full_output = False # If we want intermediate hidden tokens, def False
vision_proj = True # If we want to project in shared space, def True

# Supported models: model name -> (pretrained checkpoint tag, precision).
# `pretrained` is forwarded as --pretrained and `precision` as --quantization.
MODEL_CHECKPOINTS = {
    "ViT-H-14": ("laion2B-s32B-b79K", "fp32"),
    "ViT-L-14": ("laion2B-s32B-b82K", "fp32"),
    "ViT-B-16": ("laion2B-s34B-b88K", "fp32"),
    "ViT-B-32": ("laion2B-s34B-b79K", "fp32"),
    "ViT-L-14-336": ("openai", "fp16"),
}

# Fail fast on an unsupported model instead of leaving `pretrained` and
# `precision` undefined for the cells that follow.
if model_name not in MODEL_CHECKPOINTS:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; "
        f"expected one of {sorted(MODEL_CHECKPOINTS)}"
    )

pretrained, precision = MODEL_CHECKPOINTS[model_name]

In [11]:
# Retrieve the activations (hooks) for each head in each layer of the given ViT model
# by running utils.scripts.compute_activation_values with the values from the configuration cell.
# Optional: set a seed (--seed) and use only a subset of the dataset (ImageNet)
# via --samples_per_class / --tot_samples_per_class.
# `precision` from the configuration cell is forwarded as --quantization.
# NOTE(review): the saved output below lists ViT-B-32 / seed_1 files, which does not match the
# current configuration (ViT-L-14-336, seed 0) -- presumably left over from an earlier run; re-run to refresh.
!python -m utils.scripts.compute_activation_values --dataset {dataset} --device {device} --model {model_name} --pretrained {pretrained} --seed {seed} --batch_size 1  --samples_per_class {subset_dim} --tot_samples_per_class {tot_samples_per_class} --quantization {precision} --cache_dir "../cache/" --full_output {full_output} --vision_proj {vision_proj} 

Using local files
Model parameters: 151,277,313
Context length: 77
Vocab size: 49408
Len of res: 12
We are using a dataset containing 1000 images.
100%|███████████████████████████████████████| 1000/1000 [01:29<00:00, 11.23it/s]

Concatenating chunk files into final .npy arrays...
Final single-file arrays created:
  ./output_dir/imagenet_attn_ViT-B-32_seed_1.npy
  ./output_dir/imagenet_mlp_ViT-B-32_seed_1.npy
  ./output_dir/imagenet_cls_attn_ViT-B-32_seed_1.npy
  ./output_dir/imagenet_labels_ViT-B-32_seed_1.npy
Deleting chunk files...
Chunk files removed.
Done.


In [12]:
# Use the previous outputs (i.e. all the activations) to derive the final CLIP embeddings for the image dataset.
# Runs utils.scripts.compute_images_embedding for the configured dataset, model and seed.
# NOTE(review): presumably reads the arrays written by the compute_activation_values cell,
# so that cell must have been run with the same dataset/model/seed -- TODO confirm.
!python -m utils.scripts.compute_images_embedding --dataset {dataset} --model {model_name} --seed {seed}

(1000, 12, 12, 512) (1000, 13, 512)


In [12]:
# Derive the CLIP embeddings of the class labels of the given dataset (these enable zero-shot classification).
# Runs utils.scripts.compute_classes_embeddings with the configured dataset, device, model and pretrained checkpoint.
!python -m utils.scripts.compute_classes_embeddings --dataset {dataset} --device {device} --model {model_name} --pretrained {pretrained} --cache_dir "../cache/"

Model parameters: 427,944,193
Context length: 77
Vocab size: 49408
100%|███████████████████████████████████████| 1000/1000 [00:25<00:00, 38.49it/s]


In [13]:
# Given a dataset of texts, compute their CLIP embeddings.
# The texts are read from utils/text_descriptions/{dataset_text_name}.txt, where
# `dataset_text_name` comes from the configuration cell.
!python -m utils.scripts.compute_text_embeddings --device {device} --model {model_name} --pretrained {pretrained} --cache_dir "../cache/" --data_path "utils/text_descriptions/{dataset_text_name}.txt"

Model parameters: 427,944,193
Context length: 77
Vocab size: 49408
100%|█████████████████████████████████████████████| 8/8 [00:06<00:00,  1.30it/s]


# Test text explanations

In [13]:
# Test an algorithm to explain the CLIP-embeddings with text (ours: svd_data_approx, their: text_span)
!python -m utils.scripts.compute_text_explanations --device {device} --model {model_name} --algorithm svd_data_approx --dataset {dataset} --seed {seed} --num_of_last_layers 4 --text_descriptions {dataset_text_name}

Number of layers: 12
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 50.86it/s]
  0%|                                                     | 0/4 [00:00<?, ?it/s]
Layer [8], Head: 0

Layer [8], Head: 1

Layer [8], Head: 2

Layer [8], Head: 3

Layer [8], Head: 4

Layer [8], Head: 5

Layer [8], Head: 6

Layer [8], Head: 7

Layer [8], Head: 8

Layer [8], Head: 9

Layer [8], Head: 10

Layer [8], Head: 11
 25%|███████████▎                                 | 1/4 [00:05<00:15,  5.19s/it]
Layer [9], Head: 0

Layer [9], Head: 1

Layer [9], Head: 2

Layer [9], Head: 3

Layer [9], Head: 4

Layer [9], Head: 5

Layer [9], Head: 6

Layer [9], Head: 7

Layer [9], Head: 8

Layer [9], Head: 9

Layer [9], Head: 10

Layer [9], Head: 11
 50%|██████████████████████▌                      | 2/4 [00:09<00:09,  4.91s/it]
Layer [10], Head: 0

Layer [10], Head: 1

Layer [10], Head: 2

Layer [10], Head: 3

Layer [10], Head: 4

Layer [10], Head: 5

Layer [10], Head: 6

Layer [10], Head: 7

Layer

In [None]:
# Print the components we have generated: one block of output per JSONL entry.
import json

# NOTE(review): the file name refers to CIFAR10 while the notebook is configured
# for imagenet -- confirm this is the intended results file.
results_file = "output_dir/CIFAR10_completeness_top_1500_nouns_5_sentences_imagenet_clean_ViT-L-14-336_algo_svd_data_approx_seed_0_max_text80.jsonl"

with open(results_file) as results_stream:
    # Each line of the file is an independent JSON object.
    for entry in map(json.loads, results_stream):
        layer, head, texts = entry["layer"], entry["head"], entry["embeddings_sort"]

        # Entries with head == -1 additionally carry an "accuracy" field.
        if head == -1:
            print(entry.keys())
            print(entry["accuracy"])

        print(len(entry["s"]))
        print(f"Layer: {layer}, Head: {head}")
        print("Texts:")
        for text in texts:
            print(text)

In [None]:
# Go on playground to analyze the results.
# Here: print the accuracy stored in the results file of every (model, algorithm) pair.
import json
import os

models = ["ViT-B-32", "ViT-B-16", "ViT-L-14", "ViT-H-14", "ViT-L-14-336"]
algorithms = ["svd_data_approx", "text_span"]
seed = 0

# Path pattern of the results files; model, algorithm and seed are filled in per run.
RESULTS_TEMPLATE = (
    "output_dir/imagenet_completeness_top_1500_nouns_5_sentences_imagenet_clean"
    "_{model}_algo_{algorithm}_seed_{seed}.jsonl"
)

for model in models:
    # `algo_name` (not `algorithm`) so the configuration variable is not overwritten.
    for algo_name in algorithms:
        results_path = RESULTS_TEMPLATE.format(model=model, algorithm=algo_name, seed=seed)

        # Not every combination may have been computed; report and move on.
        if not os.path.exists(results_path):
            print(f"Skipping {model} / {algo_name}: {results_path} not found")
            continue

        print(f"Model: {model}, Algorithm: {algo_name}")
        with open(results_path, "r") as json_file:
            for line in json_file:
                entry = json.loads(line)  # Parse each line as a JSON object

                # Entries with head == -1 are the ones carrying the "accuracy" field.
                if entry["head"] == -1:
                    print(entry.keys())
                    print(entry["accuracy"])