In [1]:
from PIL import Image
import requests
import torch
from torchvision import io
import numpy as np
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

In [2]:
# Single source of truth for the checkpoint: the model and its processor are
# loaded from the same id so the two cannot drift apart when it is changed.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# Load the model in half-precision (bfloat16) on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
# Processor that turns chat text + images into model inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# _preprocess in Qwen2VLImageProcessor
# image can be visualized till `patches`
# data_format == ChannelDimension.LAST -> False
# patches.shape[0] % self.temporal_patch_size != 0 -> True
# TODO: why does processed_image look different with imshow?

# patches -> (2, 3, 1372, 2044) = (frames, c, h, w); the single image is repeated along dim 0
# grid_t = 1, grid_h = 98, grid_w = 146
# temporal_patch_size = 2
# patch_size = 14
# merge_size = 2
# resized_height = 1372
# resized_width = 2044
# channel = 3

#processor

In [None]:
# patches -> (1, 3, 1372, 2044)

#if patches.shape[0] % self.temporal_patch_size != 0:
    #repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
    #patches = np.concatenate([patches, repeats], axis=0)

# patches -> (2, 3, 1372, 2044)

In [14]:
def flatten_image_patches(
    patches, temporal_patch_size, patch_size, merge_size, verbose=False
):
    """Rearrange stacked frames into one flat row per vision patch.

    The frames are cut into ``patch_size x patch_size`` tiles, tiles are
    grouped ``merge_size x merge_size`` at a time, and every tile (spanning
    ``temporal_patch_size`` frames and all channels) is flattened into a row.

    Args:
        patches: array of shape (frames, channel, height, width).
        temporal_patch_size: number of consecutive frames folded into one patch.
        patch_size: side length, in pixels, of a square patch.
        merge_size: side length, in patches, of a group kept adjacent in the
            output ordering.
        verbose: when True, print the intermediate shapes after the 9-D
            reshape and after the transpose.

    Returns:
        A tuple ``(flatten_patches, (grid_t, grid_h, grid_w))`` where
        ``flatten_patches`` has shape
        ``(grid_t * grid_h * grid_w,
        channel * temporal_patch_size * patch_size * patch_size)``.

    Raises:
        ValueError: if ``patches`` is not 4-D, a size argument is < 1, or the
            frame count / height / width cannot be divided evenly.
    """
    patches = np.asarray(patches)
    if patches.ndim != 4:
        raise ValueError(
            f"expected (frames, channel, height, width), got shape {patches.shape}"
        )
    if min(temporal_patch_size, patch_size, merge_size) < 1:
        raise ValueError(
            "temporal_patch_size, patch_size and merge_size must all be >= 1"
        )

    frames, channel, height, width = patches.shape
    if frames % temporal_patch_size:
        raise ValueError(
            f"frame count {frames} is not a multiple of "
            f"temporal_patch_size={temporal_patch_size}"
        )
    # The reshape below splits each spatial axis into
    # (groups, merge_size, patch_size), so both must divide evenly.
    window = patch_size * merge_size
    if height % window or width % window:
        raise ValueError(
            f"height={height} and width={width} must both be multiples of "
            f"patch_size * merge_size = {window}"
        )

    grid_t = frames // temporal_patch_size
    grid_h = height // patch_size
    grid_w = width // patch_size

    blocks = patches.reshape(
        grid_t,
        temporal_patch_size,
        channel,
        grid_h // merge_size,
        merge_size,
        patch_size,
        grid_w // merge_size,
        merge_size,
        patch_size,
    )
    if verbose:
        print(blocks.shape)

    # Bring the patch-locating axes to the front:
    # (grid_t, h_group, w_group, merge_h, merge_w) followed by the per-patch
    # payload (channel, temporal, patch_h, patch_w).
    blocks = blocks.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
    if verbose:
        print(blocks.shape)

    flat = blocks.reshape(
        grid_t * grid_h * grid_w,
        channel * temporal_patch_size * patch_size * patch_size,
    )
    return flat, (grid_t, grid_h, grid_w)


# Sizes observed for the demo image (see the notes in the cells above).
height = 1372
width = 2044
channel = 3
temporal_patch_size = 2
patch_size = 14
merge_size = 2

# Random stand-in for the preprocessed frames; seeded so that re-running the
# cell reproduces the same array. Only the shapes matter for this experiment.
rng = np.random.default_rng(0)
patches = rng.random((2, channel, height, width))

# patches -> (2, 3, 1372, 2044); it is left untouched so it can be compared
# against flatten_patches afterwards.

flatten_patches, (grid_t, grid_h, grid_w) = flatten_image_patches(
    patches, temporal_patch_size, patch_size, merge_size, verbose=True
)
print(flatten_patches.shape)

(1, 2, 3, 49, 2, 14, 73, 2, 14)
(1, 49, 73, 2, 2, 3, 2, 14, 14)
(14308, 1176)


In [None]:
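# image_embeds in Qwen2VLForConditionalGeneration -> 3577, 3884  (NOTE: double-check 3884 against image_embeds.shape; Qwen2-VL-7B's hidden size is 3584, so this may be a typo)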
# (image_grid_thw[index].prod() // merge_length) = 3577

# TODO:
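# 3577 image tokens (same as image_embeds) -> how to get from flatten_patches (14308, 1176)?
# (likely answer: 14308 // merge_size**2 = 14308 // 4 = 3577, i.e. each merge_size x merge_size group of patches becomes one token -- confirm that merge_length == merge_size**2)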
# pixel_values fed into image_embeds in Qwen2VisionTransformerPretrainedModel
# map image_tokens (3577, 3884) -> image_embeds (3577, 3884)
# -> pixel_values (14308, 1176) -> flatten_patches (14308, 1176)
# -> image patches (2, 3, 1372, 2044) -> original image (2, c, h, w)
# can print (c, h, w)

# map image_embeds (3577, 3884) to image (2, 3, 1372, 2044) :

# map pixel_values (14308, 1176) to image (2, 3, 1372, 2044)
# visualize which part of pixel_values correspond to which part of image

# map image_embeds (3577, 3884) to pixel_values (14308, 1176)

# hidden states have 3577 image token embeddings

In [None]:
# llava image inputs -> channel x height x width (3, 336, 336)
# image to embeddings :
# image size = (336, 336)
# kernel size = (14, 14) (stride = 14)
# (336 * 336) / (14 * 14) = 576 embeddings
# softmax(embeddings) -> interpolate and plot over image

In [5]:
# Display the processor's `image_token` attribute, i.e. the placeholder string
# it uses for image positions in the prompt.
processor.image_token

'<|image_pad|>'

In [3]:
# Image
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() surfaces an HTTP error directly instead
# of handing an error page to Image.open.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
# Decode straight from the streamed body (no intermediate buffer needed).
image = Image.open(response.raw)

# Single-turn chat: one image placeholder followed by the text instruction.
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]


# Preprocess the inputs
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
# Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

# Tokenize the prompt and preprocess the image together, as PyTorch tensors.
inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
# Move the batch to the device the model lives on.
inputs = inputs.to(model.device)

(1, 3, 1372, 2044)
(2, 3, 1372, 2044)


RuntimeError: No active exception to reraise

In [8]:
# List the entries the processor returned in `inputs`.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])

In [None]:
# image_grid_thw -> tensor([[  1,  98, 146]])

In [12]:
# Sanity check with torch: the product of image_grid_thw's single row
# (t, h, w) = (1, 98, 146) is the total number of patches.
# torch is already imported in the first cell, so it is not re-imported here.
a = torch.tensor([[1, 98, 146]])
a[0].prod()

tensor(14308)

In [9]:
# Same sanity check with NumPy: the product over all entries of the (1, 3)
# grid array gives the total number of patches.
# numpy is already imported in the first cell, so it is not re-imported here.
a = np.array([[1, 98, 146]])
a.prod()

14308

In [4]:
# Display the model's `visual` submodule to inspect the layers of the vision tower.
model.visual

Qwen2VisionTransformerPretrainedModel(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
  )
  (rotary_pos_emb): VisionRotaryEmbedding()
  (blocks): ModuleList(
    (0-31): 32 x Qwen2VLVisionBlock(
      (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (attn): VisionFlashAttention2(
        (qkv): Linear(in_features=1280, out_features=3840, bias=True)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (mlp): VisionMlp(
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): QuickGELUActivation()
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      )
    )
  )
  (merger): PatchMerger(
    (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
    (mlp): Sequential(
      (0): Linear(in_features=5120, out_features=5120, bias=True)
      (1): G

In [None]:
# Inference: generate up to 128 new tokens for the prepared batch.
output_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the first len(prompt) tokens of every returned sequence so that only
# the part after the prompt is decoded (presumably generate() echoes the
# prompt at the start of each sequence -- confirm).
generated_ids = [
    sequence[len(prompt):]
    for prompt, sequence in zip(inputs.input_ids, output_ids)
]

# Turn the remaining token ids back into text, omitting special tokens.
output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(output_text)



In [32]:
# Count how many positions of the first sequence hold token id 151655
# (presumably the id of the image placeholder token -- TODO confirm).
(inputs["input_ids"][0] == 151655).long().count_nonzero()

tensor(3577, device='cuda:0')

In [27]:
# Spot-check: does position 50 of the first sequence hold token id 151655?
inputs["input_ids"][0, 50] == 151655

tensor(True, device='cuda:0')

In [2]:
# Scratch arithmetic: 1176 matches the per-patch row length printed by the
# reshape cell (channel * temporal_patch_size * patch_size * patch_size).
# NOTE(review): the reason for multiplying by 14 is not evident here -- confirm or remove.
1176*14

16464