### 使用带有`visual_projection`的clip模型，这样的clip模型允许输出`image_embeds`信息

In [1]:
from transformers import (  AutoModel, 
                            AutoModelForCausalLM, 
                            AutoTokenizer, 
                            AutoProcessor,
                            CLIPVisionModelWithProjection,
                            LlavaForConditionalGeneration,
                            LlavaConfig
                        )
import torch

In [2]:
clip_model_name_or_path = (
    "./openai/clip-vit-large-patch14-336"
)
qwen_model_name_or_path = "./Qwen/Qwen2.5-3B-Instruct"
device = "cuda:0"

#### 加载`clip`，`qwen`， `qwen_tokenizer`
#### 使用`CLIPVisionModelWithProjection`来加载clip模型

In [3]:
clip_model = CLIPVisionModelWithProjection.from_pretrained(clip_model_name_or_path, device_map=device)
llm_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name_or_path, device_map=device
)
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)

  return self.fget.__get__(instance, owner)()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
llm_tokenizer.encode("<image>"), clip_model

([151665],
 CLIPVisionModelWithProjection(
   (vision_model): CLIPVisionTransformer(
     (embeddings): CLIPVisionEmbeddings(
       (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
       (position_embedding): Embedding(577, 1024)
     )
     (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
     (encoder): CLIPEncoder(
       (layers): ModuleList(
         (0-23): 24 x CLIPEncoderLayer(
           (self_attn): CLIPSdpaAttention(
             (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
             (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
             (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
             (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
           )
           (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
           (mlp): CLIPMLP(
             (activation_fn): QuickGELUActivation()
             (fc1): 

#### 将clip模型和llm_model模型的config拿出来，初始化一个llava config

In [6]:
# Initializing a CLIP-vision config
vision_config = clip_model.vision_model.config

# Initializing a Llama config
text_config = llm_model.config

# Initializing a Llava llava-1.5-7b style configuration
configuration = LlavaConfig(vision_config, text_config)

# Initializing a model from the llava-1.5-7b style configuration
model = LlavaForConditionalGeneration(configuration)

#### 但是上面，只是把llava模型的形状初始化好了，模型权重都还是随机生成的，需要把两个模型的权重，复制过去

In [7]:
model.vision_tower.vision_model = clip_model.vision_model
model.language_model = llm_model

#### 将Qwen tokenizer的`pad_token_id`及`image_token_index`赋值给llava架构的config

In [8]:
model.config.pad_token_id = llm_tokenizer.pad_token_id
model.config.image_token_index = llm_tokenizer.encode("<image>")[0]
model.config.pad_token_id, model.config.image_token_index

(151643, 151665)

#### 保存`model`，`tokenizer`，`图像模型processor`，并且需要把`model002`里面的`preprocessor_config.json`文件，放在`model001`里面

In [13]:
model.save_pretrained("qwen2.5_3B_Instruct_clipvL14_model/clip_proj/model001")
llm_tokenizer.save_pretrained("qwen2.5_3B_Instruct_clipvL14_model/clip_proj/model001")
autoprocessor = AutoProcessor.from_pretrained(clip_model_name_or_path)
autoprocessor.save_pretrained("qwen2.5_3B_Instruct_clipvL14_model/clip_proj/model002")

[2025-02-16 16:33:01,223] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[]

#### reboot并且测试模型是否可以正常工作

In [2]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch


model_name_or_path = "./qwen2.5_3B_Instruct_clipvL14_model/clip_proj/model001"
# model_name_or_path = "test_model_copy/model001"  #

llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path, device_map="cuda:0", torch_dtype=torch.bfloat16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from PIL import Image

prompt_text = "<image>\nWhat are these?"


messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
prompt = llava_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)


image_path = "./data/000000039769.jpg"
image = Image.open(image_path)

inputs = llava_processor(text=prompt, images=image, return_tensors="pt")

for tk in inputs.keys():
    inputs[tk] = inputs[tk].to(model.device)

generate_ids = model.generate(**inputs, max_new_tokens=200)
gen_text = llava_processor.batch_decode(
    generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)[0]

print(gen_text)

Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
What are these?<|im_end|>
<|im_start|>assistant
These are aa series of Chinese characters and words mixed with English words and phrases, which appear to be a jumbled mess of Chinese text and English. It doesn't form a coherent sentence or meaningful phrase. The text seems to be a random combination of terms related to "travel" (旅游), "business" (商务), "law" (法律), "education" (教育), and "communication" (沟通) in Chinese, mixed with some English words and phrases.

It's not clear what the original message was trying to convey, as it appears to be a mix-up of different concepts and terms. Without more context, it's difficult to provide a specific interpretation or meaning.<|im_end|>


In [5]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [7]:
inputs['input_ids'].shape, inputs['attention_mask'].shape, inputs['pixel_values'].shape

(torch.Size([1, 25]), torch.Size([1, 25]), torch.Size([1, 3, 336, 336]))

In [8]:
inputs['input_ids'], inputs['attention_mask'], inputs['pixel_values']

(tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
          151645,    198, 151644,    872,    198, 151665,    198,   3838,    525,
            1493,     30, 151645,    198, 151644,  77091,    198]],
        device='cuda:0'),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1]], device='cuda:0'),
 tensor([[[[ 0.5435,  0.6457,  0.5581,  ...,  0.0909,  0.0033, -0.0696],
           [ 0.5435,  0.6165,  0.5435,  ...,  0.1201,  0.0179,  0.0617],
           [ 0.5581,  0.5581,  0.6603,  ...,  0.0909,  0.0763,  0.0617],
           ...,
           [ 1.8281,  1.8865,  1.8281,  ...,  1.4048,  1.4486,  1.5654],
           [ 1.8573,  1.9011,  1.8719,  ...,  1.4778,  1.4048,  1.4924],
           [ 1.8719,  1.9011,  1.9011,  ...,  1.4048,  1.2150,  1.4778]],
 
          [[-1.3619, -1.2718, -1.3769,  ..., -1.4219, -1.4820, -1.5120],
           [-1.3319, -1.2418, -1.3469,  ..., -1.4219, -1.4820, -1.4219],
           [-1.2418, -1.