In [94]:
import torch
from transformers import AutoProcessor, CLIPModel

# Single source for the checkpoint directory shared by the CLIP weights and processor.
clip_checkpoint_dir = "/120040051/MLLM_Repos/clip-vit-large-patch14-336"

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

clip_processor = AutoProcessor.from_pretrained(clip_checkpoint_dir)
clip_model = CLIPModel.from_pretrained(clip_checkpoint_dir).to(device)



In [2]:
# Show the CLIP module tree. The CLIP weights are bound to `clip_model`; a bare
# `model` is not defined at this point when the notebook runs top to bottom.
clip_model

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [None]:
from PIL import Image
import os

# Directory and file name are kept separate so either can be swapped independently.
image_folder = '/120040051/merged0728'
image_file = 'P00424_a_prisoner_taking_a_shower_in_a_board_room.jpg'
image_path = os.path.join(image_folder, image_file)

# Open the file, then force a three-channel RGB image.
opened_image = Image.open(image_path)
image = opened_image.convert("RGB")

# Last expression of the cell: render the image inline.
image

In [5]:
# Display the device string chosen when the CLIP model was loaded.
device

'cuda'

In [6]:
# Two candidate captions that are processed together with the single image.
text_caps = ["a prisoner taking a shower in a board room", "a prisoner taking a shower in jail"]

# The CLIP objects loaded at the top of the notebook are `clip_processor` and
# `clip_model`; the bare names `processor` / `model` are not bound to CLIP anywhere.
inputs = clip_processor(
    text=text_caps, images=image, return_tensors="pt", padding=True
)

# Move the whole batch to the device the CLIP weights were placed on.
inputs = inputs.to(device)
outputs = clip_model(**inputs)

In [7]:
!pip show transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: transformers
Version: 4.45.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /root/miniconda3/envs/llava/lib/python3.12/site-packages
Editable project location: /120040051/github_repos/transformers-4.44.0
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: transformers-stream-generator


In [8]:
# Image-to-text logits from the CLIP forward pass (here: one image against two captions).
outputs.logits_per_image

tensor([[15.3011,  9.1208]], device='cuda:0', grad_fn=<TBackward0>)

In [9]:
# Show the CLIP module tree. `model` is only bound later, to the LLaVA model,
# so the CLIP tree has to be displayed through `clip_model`.
clip_model

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [10]:
from transformers import LlavaForConditionalGeneration

# Local directory of the LLaVA checkpoint (llava-1.5-13b-hf, per the directory name).
model_dir = "/120040051/MLLM_Repos/llava-1.5-13b-hf"
# device_map='auto' leaves weight placement to the library instead of an explicit .to(device).
# From this cell on, `model` refers to the LLaVA model.
model = LlavaForConditionalGeneration.from_pretrained(model_dir, device_map='auto')

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
# Show the LLaVA module tree (vision tower first, per the repr).
model

LlavaForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): Q

In [12]:
from transformers import AutoProcessor

# Load the processor from the same checkpoint directory as the LLaVA weights.
llava_processor = AutoProcessor.from_pretrained(model_dir)

In [15]:
# Show the processor configuration: its image processor settings and its tokenizer.
llava_processor

LlavaProcessor:
- image_processor: CLIPImageProcessor {
  "crop_size": {
    "height": 336,
    "width": 336
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "LlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 336
  }
}

- tokenizer: LlamaTokenizerFast(name_or_path='/120040051/MLLM_Repos/llava-1.5-13b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_wor

In [21]:
# Two test images from the same folder; file names live in their own list so that
# `image_1` / `image_2` only ever hold decoded images.
image_folder = "/120040051/test_resource/challset_llava-1.5-13b"
image_files = [
    'A00079_cows_in_a_field_paddling_the_boat.jpg',
    'A00108_the_firemen_in_the_fire_mailing_a_letter.jpg',
]

image_path_1, image_path_2 = (os.path.join(image_folder, file_name) for file_name in image_files)

# Decode both files as RGB and expose them individually and as a list.
images = [Image.open(path).convert("RGB") for path in (image_path_1, image_path_2)]
image_1, image_2 = images

In [41]:
# Run only the image-processor half of the LLaVA processor on the first image.
# NOTE(review): this rebinds `outputs`, which earlier held the CLIP forward result.
outputs = llava_processor.image_processor(images=image_1, return_tensors="pt")

In [42]:
# Shape of the preprocessed image batch.
outputs.pixel_values.shape

torch.Size([1, 3, 336, 336])

In [50]:
# Call the vision tower directly and request the hidden states of every layer.
vision_tower_outputs = model.vision_tower(outputs.pixel_values, output_hidden_states=True)

In [52]:
# Take the second-to-last hidden state and drop index 0 along the sequence axis
# (presumably the class token — confirm against the vision tower's layout).
selected_vision_feature = vision_tower_outputs.hidden_states[-2][:, 1:, :]

In [53]:
# Shape after dropping the first sequence position.
selected_vision_feature.shape

torch.Size([1, 576, 1024])

In [54]:
# Shape of the same hidden state before slicing, for comparison.
vision_tower_outputs.hidden_states[-2].shape

torch.Size([1, 577, 1024])

In [55]:
# Pass the selected vision features through the model's multi-modal projector.
mm_projector_outputs = model.multi_modal_projector(selected_vision_feature)

In [56]:
# Shape of the projected image features.
mm_projector_outputs.shape

torch.Size([1, 576, 5120])

In [18]:
# Re-check of the preprocessed pixel tensor shape (same expression as an earlier cell).
outputs.pixel_values.shape

torch.Size([1, 3, 336, 336])

In [61]:
# Show the tokenizer half of the processor (special tokens and added-token table).
# The full `llava_processor` repr is already displayed by an earlier cell.
llava_processor.tokenizer

LlamaTokenizerFast(name_or_path='/120040051/MLLM_Repos/llava-1.5-13b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [83]:
# Two chat-style prompts around the same image placeholder; the second adds a
# step-by-step instruction to the question.
text_template_1 = (
    "USER: <image>\n"
    "What is the person doing?\n"
    "Answer no more than 5 words. ASSISTANT:"
)

text_template_2 = (
    "USER: <image>\n"
    "What is the person doing? Please think step by step.\n"
    "Answer no more than 5 words. ASSISTANT:"
)

texts = [text_template_1, text_template_2]
# Only the first template is tokenized in this cell; `texts` is built but not passed on.
text_inputs = llava_processor.tokenizer(text_template_1, return_tensors="pt", padding=True)

In [84]:
# Shape of the tokenized prompt.
text_inputs.input_ids.shape

torch.Size([1, 28])

In [69]:
# Look up the language model's input embeddings for the prompt token ids.
inputs_embeds = model.language_model.get_input_embeddings()(text_inputs.input_ids)

In [71]:
# Shape of the prompt embeddings.
inputs_embeds.shape

torch.Size([1, 28, 5120])

In [76]:
# Token ids of the prompt, taken straight from the tokenizer output.
input_ids = text_inputs.input_ids
# The mask must be the tokenizer's `attention_mask`. Aliasing it to `input_ids`
# would pass raw token ids wherever the merge step and the language model expect a mask.
attention_mask = text_inputs.attention_mask

In [77]:
# Combine the projected image features with the text embeddings through the model's
# merge helper, which returns merged embeddings plus a mask, labels and position ids.
# NOTE(review): the helper is underscore-prefixed (internal); its signature and return
# tuple may differ between transformers versions — confirm against the installed one.
# The trailing None is assumed to fill the labels slot — confirm.
merged_inputs_embeds, attention_mask, labels, position_ids = model._merge_input_ids_with_image_features(
    mm_projector_outputs, inputs_embeds, input_ids, attention_mask, None
)

In [79]:
# Shape of the merged (text + image) embedding sequence.
merged_inputs_embeds.shape

torch.Size([1, 603, 5120])

In [88]:
# Run the language model directly on the merged embeddings. Hidden states of all
# layers are requested; attention maps are not; no cache is supplied.
mm_outputs = model.language_model(
    inputs_embeds=merged_inputs_embeds,
    attention_mask=attention_mask,
    position_ids=position_ids,
    past_key_values=None,
    output_hidden_states=True,
    output_attentions=False,
    return_dict=True,
)

In [89]:
# List which fields the language-model output carries.
mm_outputs.keys()

odict_keys(['logits', 'past_key_values', 'hidden_states'])

In [90]:
# Number of hidden-state tensors returned by the language model.
len(mm_outputs.hidden_states)

41

In [91]:
# Shape of the last returned hidden state.
mm_outputs.hidden_states[-1].shape

torch.Size([1, 603, 5120])

In [57]:
# Show the module tree of the LLaVA language model.
model.language_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32064, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
      )
    )
    (no

In [96]:
# Show CLIP's visual projection layer.
clip_model.visual_projection

Linear(in_features=1024, out_features=768, bias=False)

In [None]:
# NOTE(review): unexecuted repeat of the previous cell's expression — candidate for removal.
clip_model.visual_projection

In [97]:
# Show the module tree of CLIP's text encoder.
clip_model.text_model

CLIPTextTransformer(
  (embeddings): CLIPTextEmbeddings(
    (token_embedding): Embedding(49408, 768)
    (position_embedding): Embedding(77, 768)
  )
  (encoder): CLIPEncoder(
    (layers): ModuleList(
      (0-11): 12 x CLIPEncoderLayer(
        (self_attn): CLIPSdpaAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): CLIPMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (final_layer_norm

In [1]:
import os
from PIL import Image
from transformers import LlavaForConditionalGeneration, AutoProcessor

# Same checkpoint directory that an earlier cell loads as `model`.
model_dir = "/120040051/MLLM_Repos/llava-1.5-13b-hf"
# NOTE(review): on a top-to-bottom run this loads the checkpoint a second time under a
# new name; presumably this section was written for a fresh kernel — confirm and
# consider reusing one model object.
llava_model = LlavaForConditionalGeneration.from_pretrained(model_dir, device_map='auto')
llava_processor = AutoProcessor.from_pretrained(model_dir)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [2]:
# Two test images from one folder. File names get their own variables so that
# `image_1` / `image_2` are bound only to the decoded images.
image_folder = "/120040051/test_resource/challset_llava-1.5-13b"
file_name_1 = 'A00079_cows_in_a_field_paddling_the_boat.jpg'
file_name_2 = 'A00108_the_firemen_in_the_fire_mailing_a_letter.jpg'

image_path_1 = os.path.join(image_folder, file_name_1)
image_path_2 = os.path.join(image_folder, file_name_2)

# Decode both as RGB, then unpack the list into the individual names.
images = [Image.open(path).convert("RGB") for path in (image_path_1, image_path_2)]
image_1, image_2 = images

In [6]:
# Build a batch whose text is only the image placeholder, paired with the first image.
# return_tensors is spelled out (as in the text-only tokenizer call) so the result
# does not depend on the processor's version-specific default.
visual_only_inputs = llava_processor(text="<image>", images=image_1, return_tensors="pt")

In [27]:
# Tokenize the caption on its own (no image placeholder) for a text-only forward pass.
text_only_inputs = llava_processor.tokenizer(text="cows in a field paddling the boat", return_tensors="pt")

In [13]:
# Token ids of the image-only prompt. The batch built for this experiment is
# `visual_only_inputs`; `inputs` belongs to the earlier CLIP cell.
visual_only_inputs.input_ids

tensor([[    1, 32000]])

In [15]:
# Map the image-only prompt ids back to token strings. Reads from
# `visual_only_inputs`, not from `inputs`, which belongs to the earlier CLIP cell.
llava_processor.tokenizer.convert_ids_to_tokens(visual_only_inputs.input_ids[0])

['<s>', '<image>']

In [22]:
# Token ids of the text-only prompt.
text_only_inputs.input_ids

[[1, 274, 1242, 297, 263, 1746, 282, 1202, 1847, 278, 13006],
 [1, 274, 1242, 297, 263, 1746, 2646, 19583]]

In [23]:
# Full tokenizer output for the text-only prompt (ids and attention mask).
text_only_inputs

{'input_ids': [[1, 274, 1242, 297, 263, 1746, 282, 1202, 1847, 278, 13006], [1, 274, 1242, 297, 263, 1746, 2646, 19583]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [19]:
# Show the tokenizer configuration, including the added special tokens.
llava_processor.tokenizer

LlamaTokenizerFast(name_or_path='/120040051/MLLM_Repos/llava-1.5-13b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<image>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# Forward pass on the image-only prompt, keeping the hidden states of every layer.
# Feeds `visual_only_inputs`; `inputs` is the CLIP batch from the earlier experiment
# and is not what this model should receive.
outputs_vision_only = llava_model(**visual_only_inputs, output_hidden_states=True)

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [28]:
# Forward pass on the caption tokens only, keeping the hidden states of every layer.
outputs_text_only = llava_model(**text_only_inputs, output_hidden_states=True)

In [11]:
# Number of hidden-state tensors from the image-only pass.
len(outputs_vision_only.hidden_states)

41

In [29]:
# Number of hidden-state tensors from the text-only pass.
len(outputs_text_only.hidden_states)

41

In [30]:
# Print the last hidden-state shape of the image-only pass, then of the text-only pass.
for forward_result in (outputs_vision_only, outputs_text_only):
    print(forward_result.hidden_states[-1].shape)

torch.Size([1, 577, 5120])
torch.Size([1, 11, 5120])


In [109]:
# Device reported by the model object bound to `model`.
# NOTE(review): the surrounding cells use `llava_model`; `model` is the copy loaded by an
# earlier cell — confirm which object was meant here.
model.device

device(type='cuda', index=0)