In [1]:
# create the base model and processor for mini1o
from mini1o import Mini1oMLLM,Mini1oMLLMConfig, Mini1o
config = Mini1oMLLMConfig(
    pretrained_model_name_or_path="OpenGVLab/InternVL3-1B",
    num_image_gen_tokens=64,
    img_context_token_id=151655,
    image_gen_start_token_id=15000,
    image_gen_context_token_id=15000,
    image_gen_end_token_id=15001,
)
model = Mini1oMLLM(config=config)

In [2]:
path = "OpenGVLab/InternVL3-1B"
from mini1o.processor import Mini1oProcessor, Mini1oImageProcessor
from transformers import AutoTokenizer
from diffusers.image_processor import PixArtImageProcessor

image_processor = Mini1oImageProcessor()
gen_image_processor = PixArtImageProcessor()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}\n{% if content['type'] == 'image_gen' or 'image_gen' in content %}<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n{% elif content['type'] == 'video_gen' or 'video_gen' in content %}<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>\n{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}\n"

processor1o = Mini1oProcessor(image_processor=image_processor, 
                              tokenizer=tokenizer, 
                              chat_template=chat_template)

In [3]:
from PIL import Image
messages = [
    {
        "role": "user",
        "content": [
            {
                "image": Image.open('1.png').convert('RGB'),
            },
            {
                "text": "Please describe the image shortly.\n"
            },
            # {
            #     'image_gen': Image.open('1.png').convert('RGB'),
            # }
        ],
    }
]

In [4]:
text = processor1o.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
x = processor1o(
    text=[text],
    images=[Image.open('1.png').convert('RGB')],
    return_tensors="pt",
)
print(processor1o.batch_decode(x.input_ids, skip_special_tokens=False)[0])

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>
Please describe the image shortly.
<|im_end|>
<|im_start|>assistant



In [7]:
import torch
for key, value in x.items():
    if isinstance(value, torch.Tensor):
        x[key] = value.half()

output = model.generate(**x)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.HalfTensor instead (while checking arguments for embedding)

In [None]:
# 加入特殊token
from transformers import AutoTokenizer, AutoProcessor, Qwen2_5_VLForConditionalGeneration

# 加载 tokenizer 和 processor
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# 定义你要加的 special tokens
special_tokens = [
    "<|image_gen_start|>", "<|image_gen_pad|>", "<|image_gen_end|>",
    "<|video_gen_start|>", "<|video_gen_pad|>", "<|video_gen_end|>"
]

# 添加 token，并获得它们的 ID
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens+tokenizer.additional_special_tokens})
processor.tokenizer = tokenizer  # 更新 processor 的 tokenizer

# 映射到 ID
token_ids = {token: tokenizer.convert_tokens_to_ids(token) for token in special_tokens}
