In [15]:
import torch
from colpali_engine.models import BiModernVBert, BiModernVBertProcessor
from PIL import Image
from huggingface_hub import hf_hub_download
from torch.nn.functional import normalize

def get_device():
    """Pick the torch device string to run on.

    Returns:
        "cuda:0" when CUDA is available, otherwise "mps" when the MPS
        backend is available, otherwise "cpu".
    """
    # CUDA takes priority over every other backend.
    if torch.cuda.is_available():
        return "cuda:0"
    return "mps" if torch.backends.mps.is_available() else "cpu"

model_id = "ModernVBERT/modernvbert-embed"

# Load the processor and the bi-encoder weights for the same checkpoint id.
processor = BiModernVBertProcessor.from_pretrained(model_id)
# NOTE(review): trust_remote_code=True permits code shipped with the checkpoint
# to execute locally; confirm it is actually required for this model class.
model = BiModernVBert.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)


# One sample image (downloaded from a Hub Space) and one sample sentence.
image = Image.open(hf_hub_download("HuggingFaceTB/SmolVLM", "example_images/rococo.jpg", repo_type="space"))
text = "This is a text"

# Prepare inputs
text_inputs = processor.process_texts([text]).to(get_device())
image_inputs = processor.process_images([image]).to(get_device())

# Inference only: run the forward passes under no_grad so no autograd graph is
# recorded for the outputs. The outputs are not moved again afterwards; the
# inputs were already placed on the target device above.
with torch.no_grad():
    text_outputs = model(**text_inputs)
    image_outputs = model(**image_inputs)

def get_embeddings(outputs):
    """Return the first embedding in ``outputs``, normalized, as a Python list.

    Args:
        outputs: Either a dict holding the embeddings under the
            ``"embeddings"`` key, or the embeddings object itself. It must be
            indexable and item 0 must be a tensor.

    Returns:
        ``embeddings[0]`` normalized along its last dimension with
        ``torch.nn.functional.normalize`` and converted via ``.tolist()``.
    """
    embeddings = outputs["embeddings"] if isinstance(outputs, dict) else outputs
    # Normalize inside no_grad (and on a detached tensor) so nothing is tracked
    # by autograd. No device move is done here: .tolist() produces host-side
    # Python objects regardless of where the tensor lives.
    with torch.no_grad():
        return normalize(embeddings[0].detach(), dim=-1).tolist()

# Convert both model outputs to normalized Python lists in one pass.
text_emb, image_emb = (get_embeddings(out) for out in (text_outputs, image_outputs))

# Report each vector's length together with its first three components.
print(f"text_emb: {len(text_emb)} first3: {text_emb[:3]}")
print(f"image_emb: {len(image_emb)} first3: {image_emb[:3]}")

text_emb: 768 first3: [-0.004450473003089428, 0.04333425685763359, -0.03766343742609024]
image_emb: 768 first3: [0.006753838155418634, -0.0058644916862249374, -0.046822741627693176]


In [None]:
# Inspect the loaded model's configuration, rendered as a plain dict.
model.config.to_dict()


{'image_token_id': 50407,
 'use_cache': True,
 'tie_word_embeddings': False,
 'scale_factor': 4,
 'additional_vocab_size': 40,
 'text_config': {'return_dict': True,
  'output_hidden_states': False,
  'torchscript': False,
  'dtype': None,
  'pruned_heads': {},
  'tie_word_embeddings': True,
  'chunk_size_feed_forward': 0,
  'is_encoder_decoder': False,
  'is_decoder': False,
  'cross_attention_hidden_size': None,
  'add_cross_attention': False,
  'tie_encoder_decoder': False,
  'architectures': None,
  'finetuning_task': None,
  'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},
  'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
  'task_specific_params': None,
  'problem_type': None,
  'tokenizer_class': None,
  'prefix': None,
  'bos_token_id': None,
  'pad_token_id': None,
  'eos_token_id': None,
  'sep_token_id': None,
  'decoder_start_token_id': None,
  'max_length': 20,
  'min_length': 0,
  'do_sample': False,
  'early_stopping': False,
  'num_beams': 1,
  'temperature': 1.0,
  'top_k': 50,
  'to

In [None]:
# Encode a batch of two texts to check the batched output.
text2 = "Other text"
text_inputs = processor.process_texts([text, text2]).to(get_device())
# Inference only: run the forward pass under no_grad so no autograd graph is
# recorded; the inputs are already on the target device.
with torch.no_grad():
    text_outputs = model(**text_inputs)
text_outputs


In [None]:
# Encode a batch of three texts (the last two are identical strings).
text2 = "Other text"
text3 = "Other text"
text_inputs = processor.process_texts([text, text2, text3]).to(get_device())
# Inference only: run the forward pass under no_grad so no autograd graph is
# recorded; the inputs are already on the target device.
with torch.no_grad():
    text_outputs = model(**text_inputs)
text_outputs


In [None]:
# Encode a batch of four texts and display the first entry of the output.
text2 = "Other text"
text3 = "Other text"
text4 = "Other text"
text_inputs = processor.process_texts([text, text2, text3, text4]).to(get_device())
# Inference only: run the forward pass under no_grad so no autograd graph is
# recorded; the inputs are already on the target device.
with torch.no_grad():
    text_outputs = model(**text_inputs)
text_outputs[0]


In [None]:
# Display the first entry of the most recently computed text_outputs.
text_outputs[0]

In [12]:
def _get_embeddings(outputs):
		with torch.no_grad():
			if isinstance(outputs, dict):
				embeddings = outputs["embeddings"]
			else:
				embeddings = outputs
			vectors = []
			for emb in embeddings:
				vectors.append(normalize(emb, dim=-1).to(get_device()).tolist())
			return vectors

In [14]:
# Convert the current text_outputs batch into a list of normalized vectors
# and show how many vectors came back.
embs = _get_embeddings(text_outputs)
len(embs)

4

In [None]:
# NOTE(review): confirm that `Config` is the intended name and that it is
# exported by colpali_engine.models — TODO verify before relying on it.
from colpali_engine.models import Config