# Check token addition

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Path (or hub id) of the Qwen checkpoint whose tokenizer is inspected here.
# The same name is reused by the model-loading cell below.
modify_qwen_tokenizer_dir = "Qwen1.5-4B-Chat"
modify_qwen_tokenizer = AutoTokenizer.from_pretrained(modify_qwen_tokenizer_dir)

# Display the token ids the tokenizer produces for "<image>".
# NOTE(review): presumably a single id is expected if "<image>" was added to the
# vocabulary as one token — confirm against the cell output.
modify_qwen_tokenizer.encode("<image>")

In [None]:
# Load the causal LM from the same checkpoint directory as the tokenizer above,
# in bfloat16 and placed on the first CUDA device.
qwen_model = AutoModelForCausalLM.from_pretrained(
    modify_qwen_tokenizer_dir,
    torch_dtype=torch.bfloat16,
    device_map='cuda:0',
)

In [None]:
# Show the input token-embedding module (wrapped in a one-element tuple, so the
# cell output is the tuple's repr).
(qwen_model.model.embed_tokens,)

In [None]:
# Show the output head module of the language model.
# NOTE(review): presumably compared by eye with the embedding module to check the
# vocabulary dimension — confirm.
qwen_model.lm_head

In [None]:
# Display the full module tree of the loaded language model.
qwen_model

# Initialization for the LLaVA model

Restart the kernel here to avoid loading the same model multiple times.

In [None]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoProcessor

# Checkpoint locations for the two pretrained components that are combined below.
clip_model_name_or_path = "clip-vit-large-patch14-336"
qwen_model_name_or_path = "Qwen1.5-4B-Chat"

# Vision side: the CLIP checkpoint, placed on the first CUDA device.
clip_model = AutoModel.from_pretrained(
    clip_model_name_or_path,
    device_map="cuda:0",
)

# Language side: the Qwen causal LM, same device. No torch_dtype is passed here,
# so the library default is used.
llm_model = AutoModelForCausalLM.from_pretrained(
    qwen_model_name_or_path,
    device_map="cuda:0",
)

In [None]:
# Tokenizer that belongs to the Qwen checkpoint; it is saved alongside the new
# model later in the notebook.
llm_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name_or_path)
# Display the token ids produced for the "<image>" placeholder.
llm_tokenizer.encode("<image>")

In [None]:
from transformers import LlavaForConditionalGeneration, LlavaConfig

In [None]:
# Reuse the configs of the already-loaded components so the new model's
# sub-module architectures are built from the same settings.
text_config = llm_model.config
vision_config = clip_model.vision_model.config

# * Initialize the model
# The model is constructed from the config only; no checkpoint is loaded here.
configuration = LlavaConfig(
    vision_config=vision_config,
    text_config=text_config,
)
model = LlavaForConditionalGeneration(configuration)

Check that the following two are the same:

In [None]:
# Embeddings module of the vision tower inside the newly built LLaVA model.
model.vision_tower.vision_model.embeddings

In [None]:
# Embeddings module of the standalone CLIP vision model, for comparison with the
# LLaVA vision tower's embeddings.
clip_model.vision_model.embeddings

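Swap the pretrained CLIP vision model and the pretrained Qwen language model into the new model. The modules are assigned directly, so the new model shares them with `clip_model` and `llm_model` rather than holding copies.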

In [None]:
# Replace the LLaVA vision tower's inner vision model with the loaded CLIP one.
# This is a module assignment, not a tensor copy: both names refer to the same object.
model.vision_tower.vision_model = clip_model.vision_model
# Likewise replace the language model with the loaded Qwen causal LM.
model.language_model = llm_model

Check that the following two are the same:

In [None]:
# First two columns of the standalone LLM's token-embedding weight, for comparison.
llm_model.model.embed_tokens.weight.data[:,:2]

In [None]:
# Same slice taken through the LLaVA wrapper; it should match the standalone LLM
# because `model.language_model` was assigned from `llm_model`.
model.language_model.model.embed_tokens.weight.data[:,:2]

Copy `pad_token_id` from the LLM tokenizer into the new model's config.

In [None]:
# Show the pad token id currently stored in the new model's config (before it is overwritten).
model.config.pad_token_id

In [None]:
# Take the pad token id from the LLM tokenizer so the model config and the
# tokenizer saved with it agree.
model.config.pad_token_id = llm_tokenizer.pad_token_id
# Display the value that was just set.
model.config.pad_token_id

Set `image_token_index` in the new model's config to the id the LLM tokenizer assigns to `<image>`.

In [None]:
# Show the image token index currently stored in the config (before it is overwritten).
model.config.image_token_index

In [None]:
# Token ids the LLM tokenizer produces for "<image>"; the first one is used as
# the image token index in the next cell.
llm_tokenizer.encode("<image>")

In [None]:
# Look up the id the tokenizer assigns to the "<image>" placeholder.
image_token_ids = llm_tokenizer.encode("<image>")
# Guard: if "<image>" is not a single vocabulary token (or the tokenizer prepends
# a special token), taking element [0] would silently store the wrong id.
assert len(image_token_ids) == 1, (
    f'"<image>" must encode to exactly one token id, got {image_token_ids}; '
    "add it to the tokenizer as a special token first"
)
model.config.image_token_index = image_token_ids[0]
# Display the value that was just set.
model.config.image_token_index

## Save the new model

In [None]:
# Write the assembled LLaVA model (weights and config) to the output directory.
model.save_pretrained('my-model/model-01')

In [None]:
# Save the LLM tokenizer into the same directory as the model so both can be
# loaded from one path.
llm_tokenizer.save_pretrained('my-model/model-01')

In [None]:
# Load the processor that ships with the CLIP checkpoint.
autoprocessor = AutoProcessor.from_pretrained(clip_model_name_or_path)

# Full CLIP processor goes to a separate directory (kept as before): saving it
# into the model directory would also write CLIP's tokenizer files there and
# clobber the Qwen tokenizer saved above.
autoprocessor.save_pretrained('my-model/model-02')

# The test section loads `LlavaProcessor` from 'my-model/model-01', which needs
# the image-processor config next to the tokenizer files. Save only the image
# processor there so the directory is self-contained without a manual copy.
autoprocessor.image_processor.save_pretrained('my-model/model-01')

# Restart to test the new model

In [1]:
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import torch

# Directory written by the "Save the new model" section.
model_name_or_path = "my-model/model-01"

# Reload the assembled model in bfloat16 on the first CUDA device.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
# The processor is read from the same directory as the model.
llava_processor = LlavaProcessor.from_pretrained(model_name_or_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from PIL import Image

# "<image>" is the placeholder the model config's image_token_index refers to.
prompt_text = "<image>\nWhat are these?"

# Render the conversation with the tokenizer's chat template; keep it as text
# (tokenize=False) because the processor tokenizes it together with the image.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt_text},
]
prompt = llava_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

image_path = "data/australia.jpg"
# Open through a context manager so the file handle is released; convert("RGB")
# returns a fully loaded image that stays usable after the file is closed.
with Image.open(image_path) as image_file:
    image = image_file.convert("RGB")

inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
# Move every tensor in the batch to the model's device in one call.
inputs = inputs.to(model.device)

generate_ids = model.generate(**inputs, max_new_tokens=20)
# Special tokens are kept on purpose so the chat-template markers are visible
# in the printed text.
gen_text = llava_processor.batch_decode(
    generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)[0]

print(gen_text)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
What are these?<|im_end|>
<|im_start|>assistant
These are all the different ways to say "I love you" in Chinese.<|im_end|>


In [4]:
# Print the chat-templated prompt exactly as it was passed to the processor.
print(prompt)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
What are these?<|im_end|>
<|im_start|>assistant

