In [1]:
import lightning as L

from fusion_bench import (
    CLIPVisionModelPool,
    CLIPVisionModelTaskPool,
    get_default_config_path,
    initialize_hydra_config,
    instantiate,
)
from fusion_bench.models.hf_clip import HFCLIPClassifier
from fusion_bench.tasks.clip_classification import (
    get_classnames_and_templates,
    get_num_classes,
)

In [2]:
# Create a single-device Fabric runner; "auto" leaves the accelerator choice to Lightning.
fabric = L.Fabric(devices=1, accelerator="auto")
fabric.launch()

In [3]:
# Hydra overrides selecting the merging method, the model pool and the task pool.
HYDRA_OVERRIDES = [
    "method=emr_merging/emr_merging",
    "modelpool=CLIPVisionModelPool/clip-vit-base-patch32_TA8_model_only",
    "taskpool=CLIPVisionModelTaskPool/clip-vit-classification_TA8.yaml",
]

# Compose the `fabric_model_fusion` config from fusion_bench's default config
# directory, applying the overrides above.
config = initialize_hydra_config(
    config_path=get_default_config_path(),
    config_name="fabric_model_fusion",
    overrides=HYDRA_OVERRIDES,
)

In [4]:
# Build the three components from their sections of the composed config:
# the merging algorithm (`method`), the pool of models to merge (`modelpool`)
# and the pool of evaluation tasks (`taskpool`).
algorithm = instantiate(config.method)
modelpool: CLIPVisionModelPool = instantiate(config.modelpool)
taskpool: CLIPVisionModelTaskPool = instantiate(config.taskpool)
# Hand the task pool the Fabric instance created earlier in the notebook.
# NOTE(review): presumably the task pool uses it for device placement during
# evaluation — confirm against CLIPVisionModelTaskPool.
taskpool.fabric = fabric

Unused argument: base_model=openai/clip-vit-base-patch32


In [5]:
# Run the configured merging algorithm over the model pool. The recorded output
# shows "Fetching ... files" progress bars for this cell, and the repr displayed
# in the next cell shows the result is an `EMRModulatedModel`.
emr_model = algorithm.run(modelpool)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  self.rescaler = nn.Parameter(torch.tensor(rescaler), requires_grad=False)


In [6]:
# Display the merged model's module tree (bare last expression -> repr output).
emr_model

EMRModulatedModel(
  (backbone): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
        (position_embedding): Embedding(50, 768)
      )
      (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickGELUActivation()
            

In [None]:
# Evaluate the merged model on every test dataset known to the task pool.
# NOTE(review): this cell relies on underscore-prefixed task pool members
# (`_is_setup`, `_test_datasets`, `_evaluate`) — confirm whether a public
# equivalent exists before reusing this pattern.

# Make sure the task pool has been set up before its attributes are read below.
if not taskpool._is_setup:
    taskpool.setup()

# Wrap the task pool's CLIP model in a classifier, then swap in the merged
# vision model *before* moving the classifier to the Fabric device.
classifier = HFCLIPClassifier(taskpool.clip_model, processor=taskpool.processor)
classifier.vision_model = emr_model
classifier = fabric.to_device(classifier)

# Per-task evaluation results, keyed by task name.
results = {}
for task_name in taskpool._test_datasets:
    # Switch the merged model to the current task before evaluating it.
    emr_model.set_task(task_name)

    # Configure the classifier with this task's class names and prompt templates.
    classnames, templates = get_classnames_and_templates(task_name)
    classifier.set_classification_task(classnames=classnames, templates=templates)

    result = taskpool._evaluate(
        classifier,
        test_loader=taskpool.test_dataloaders[task_name],
        task_name=task_name,
        num_classes=get_num_classes(task_name),
    )
    results[task_name] = result
    print(f"Results for task {task_name}:\n{result}")

print("Final results:", results)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
