QLIP
====

**QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation**

* Paper: https://arxiv.org/pdf/2502.05178

![QLIP Overview](../assets/qlip_overview.png)


```bash
git clone https://github.com/NVlabs/QLIP.git

pip install torch torchvision
pip install transformers
```

### Load model

In [1]:
import sys
import torch
from transformers import CLIPImageProcessor, CLIPTokenizer
from torchvision.transforms import Normalize
from PIL import Image

sys.path.append("QLIP/QLIP")  # must clone repo
from modeling_qlip import QLIPModel

# Checkpoint identifier shared by the image processor, the tokenizer and the
# model weights. Keeping it in one place guarantees the three always match.
MODEL_ID = "nvidia/QLIP-B-8-256"

processor = CLIPImageProcessor.from_pretrained(MODEL_ID)
tokenizer = CLIPTokenizer.from_pretrained(MODEL_ID)
model = QLIPModel.from_pretrained(MODEL_ID)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# This notebook only runs inference: switch to eval mode and move the model
# to the selected device. The trailing semicolon hides the module repr that
# would otherwise be echoed as the cell output.
model.eval().to(device);

  from .autonotebook import tqdm as notebook_tqdm
Some weights of QLIPModel were not initialized from the model checkpoint at nvidia/QLIP-B-8-256 and are newly initialized: ['vision_decoder.encoder.layers.0.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.0.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.1.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.1.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.10.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.10.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.11.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.11.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.2.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.2.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.3.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.3.self_attn.rope.freqs_sin', 'vision_decoder.encoder.layers.4.self_attn.rope.freqs_cos', 'vision_decoder.encoder.layers.4.self_attn

### Inference example

In [None]:
image_path = "../samples/plants.jpg"
# Decode the pixels eagerly inside a context manager so the underlying file
# handle is released immediately; `convert("RGB")` forces the load and gives
# the processor a three-channel image regardless of the file's native mode.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")
captions = ["a dog", "a cat", "a plant"]

# The captions are batched into one tensor, so ask the tokenizer to pad them.
input_ids = tokenizer(
    captions, return_tensors="pt", padding=True
).input_ids.to(device)
# `padding` is a tokenizer option. The image processor does not recognise it
# and only emits "Unused or unrecognized kwargs: padding.", so it is omitted.
input_pixels = processor(
    images=image, return_tensors="pt"
).pixel_values.to(device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids, input_pixels)

print(f"model outputs: {outputs.keys()}")

Unused or unrecognized kwargs: padding.


model outputs: odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output', 'reconstructions'])


In [8]:
# Show the image-to-text and text-to-image logits as NumPy arrays
# (moved to the CPU first so this also works when the model ran on a GPU).
for _field in ("logits_per_image", "logits_per_text"):
    _label = _field.replace("_", " ") + ":"
    print(_label, getattr(outputs, _field).cpu().numpy())

logits per image: [[-1.5476025  -0.23974146 11.993626  ]]
logits per text: [[-1.5476025 ]
 [-0.23974146]
 [11.993626  ]]


In [10]:
with torch.no_grad():
    # Embed the image and the captions with the model's feature extractors.
    # NOTE(review): outputs["image_embeds"] / outputs["text_embeds"] from the
    # forward pass are presumably interchangeable with these — confirm.
    image_features = model.get_image_features(input_pixels)
    text_features = model.get_text_features(input_ids)

    # Scale every embedding to unit norm along the last dimension so the
    # matrix product below is a cosine similarity.
    image_norm = image_features.norm(dim=-1, keepdim=True)
    text_norm = text_features.norm(dim=-1, keepdim=True)
    image_features = image_features / image_norm
    text_features = text_features / text_norm

# Scaled image-to-caption similarities, turned into a distribution over captions.
similarity = 100.0 * image_features @ text_features.T
label_probs = similarity.softmax(dim=-1)
print(f"model label_probs: {label_probs}")

model label_probs: tensor([[1.3209e-06, 4.8831e-06, 9.9999e-01]], device='cuda:0')
