# Prepare configuration

In [None]:
# Build a Mini1oConfig that embeds the configs of the pretrained Sana
# components (transformer, VAE, scheduler) and save it under ./ckpt.
from mini1o import DitConfig
import torch
from diffusers import SanaTransformer2DModel, AutoencoderDC, DPMSolverMultistepScheduler
from transformers import AutoConfig
from mini1o import Mini1oMLLM, Mini1oConfig, Mini1o

# All three Sana components are pulled from the same hub repository.
SANA_REPO = "Efficient-Large-Model/Sana_600M_512px_diffusers"

# Diffusion transformer.
model = SanaTransformer2DModel.from_pretrained(
    SANA_REPO,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
# VAE, used to encode images into the latent space.
vae = AutoencoderDC.from_pretrained(
    SANA_REPO,
    subfolder="vae",
    torch_dtype=torch.bfloat16,
)
# Scheduler, which wraps the noise-adding step and the timestep information.
scheduler = DPMSolverMultistepScheduler.from_pretrained(
    SANA_REPO,
    subfolder="scheduler",
    torch_dtype=torch.bfloat16,
)

# Snapshot each component's config as a plain dict.
model_config, vae_config, scheduler_config = (
    dict(component.config) for component in (model, vae, scheduler)
)
dit_config = DitConfig(
    model_config=model_config,
    vae_config=vae_config,
    scheduler_config=scheduler_config,
)

path = "OpenGVLab/InternVL3-1B"
# Loading the MLLM config from `path` is currently disabled:
# mllm_config = AutoConfig.from_pretrained(path, trust_remote_code=True)

# Wrap the DiT config in the top-level config and persist it.
config = Mini1oConfig(dit_config=dit_config.to_dict())
config.save_pretrained('ckpt', trust_remote_code=True)

# processor

In [None]:
# Register the image-generation special tokens on the base tokenizer.
from transformers import AutoTokenizer, AutoProcessor

path = "OpenGVLab/InternVL3-1B"

# Load the tokenizer and the processor from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(path)
processor = AutoProcessor.from_pretrained(path)

# Special tokens to add (the video-generation ones are currently disabled).
special_tokens = [
    "<|image_gen_start|>",
    "<|image_gen_pad|>",
    "<|image_gen_end|>",
    # "<|video_gen_start|>", "<|video_gen_pad|>", "<|video_gen_end|>"
]

# New tokens go first, followed by the additional special tokens the
# tokenizer already had, so the existing ones are passed along as well.
merged_special_tokens = [*special_tokens, *tokenizer.additional_special_tokens]
tokenizer.add_special_tokens({"additional_special_tokens": merged_special_tokens})

# Point the processor at the updated tokenizer.
processor.tokenizer = tokenizer

# Map each newly added token to its id.
token_ids = dict(zip(special_tokens, tokenizer.convert_tokens_to_ids(special_tokens)))


In [2]:
# Assemble the Mini1o processor: image processor + tokenizer + chat template.
from transformers import AutoTokenizer
from diffusers.image_processor import PixArtImageProcessor
from mini1o.processor import Mini1oProcessor, Mini1oImageProcessor

path = "OpenGVLab/InternVL3-1B"

image_processor = Mini1oImageProcessor()
gen_image_processor = PixArtImageProcessor()

# NOTE(review): this reloads the tokenizer from `path`, so it does not carry
# the <|image_gen_*|> tokens added to the tokenizer in the earlier cell, even
# though the chat template below emits them. Confirm whether Mini1oProcessor
# registers them itself.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Jinja chat template, split into adjacent literals for readability; the
# pieces concatenate to exactly the original single-line template.
chat_template = (
    "{% set image_count = namespace(value=0) %}"
    "{% set video_count = namespace(value=0) %}"
    "{% for message in messages %}"
    # Inject a default system prompt when the conversation has none.
    "{% if loop.first and message['role'] != 'system' %}"
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "{% endif %}"
    "<|im_start|>{{ message['role'] }}\n"
    # Plain-string content is emitted verbatim.
    "{% if message['content'] is string %}"
    "{{ message['content'] }}<|im_end|>\n"
    "{% else %}"
    # Structured content: one placeholder group per item.
    "{% for content in message['content'] %}\n"
    "{% if content['type'] == 'image_gen' or 'image_gen' in content %}"
    "<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n"
    "{% elif content['type'] == 'video_gen' or 'video_gen' in content %}"
    "<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n"
    "{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}"
    "{% set image_count.value = image_count.value + 1 %}"
    "{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}"
    "<|vision_start|><|image_pad|><|vision_end|>\n"
    "{% elif content['type'] == 'video' or 'video' in content %}"
    "{% set video_count.value = video_count.value + 1 %}"
    "{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}"
    "<|vision_start|><|video_pad|><|vision_end|>\n"
    "{% elif 'text' in content %}{{ content['text'] }}{% endif %}"
    "{% endfor %}"
    "<|im_end|>\n"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}\n"
)

processor1o = Mini1oProcessor(
    image_processor=image_processor,
    tokenizer=tokenizer,
    chat_template=chat_template,
)
# Saving the processor next to the config is currently disabled:
# processor1o.save_pretrained('ckpt')

[]

In [3]:
# Build a single-turn user message containing one image and one text prompt.
from PIL import Image

# Open the file in a context manager so the handle is closed right away;
# convert() loads the pixel data and returns an independent image, so the
# result stays usable after the file is closed.
with Image.open('1.png') as image_file:
    user_image = image_file.convert('RGB')

messages = [
    {
        "role": "user",
        "content": [
            {
                "image": user_image,
            },
            {
                "text": "Please describe the image shortly.\n"
            },
            # Optional image-generation item (currently disabled):
            # {
            #     'image_gen': Image.open('1.png').convert('RGB'),
            # }
        ],
    }
]

In [4]:
# Render the chat template to text, encode text + image into model inputs,
# and print the decoded prompt (special tokens kept) as a sanity check.
text = processor1o.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Open the file in a context manager so the handle is closed right away;
# convert() loads the pixel data, so the image stays valid afterwards.
with Image.open('1.png') as image_file:
    prompt_image = image_file.convert('RGB')

x = processor1o(
    text=[text],
    images=[prompt_image],
    return_tensors="pt",
)

decoded_prompts = processor1o.batch_decode(x.input_ids, skip_special_tokens=False)
print(decoded_prompts[0])

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>
Please describe the image shortly.
<|im_end|>
<|im_start|>assistant



In [None]:
# Instantiate the Mini1o MLLM from the config saved under ./ckpt.
import torch

from mini1o import Mini1oMLLM, Mini1oConfig

config = Mini1oConfig.from_pretrained('ckpt')

# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# Sana transformer.
# The original cell had a full-width comma (U+FF0C) after torch.bfloat16,
# which is a SyntaxError; it is an ASCII comma now.
model = Mini1oMLLM.from_config(
    config,
    torch_dtype=torch.bfloat16,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map='cuda:0',
).eval()



In [None]:
# Align the processor outputs with the model's device/dtype, then sample.
import torch

# Use the model's own weights as the reference for device and dtype.
# Assumes `model` is a torch.nn.Module with at least one parameter
# (.eval() is called on it when it is loaded) — TODO confirm.
_reference_param = next(model.parameters())

# Iterate over a snapshot because entries are reassigned inside the loop.
for key, value in list(x.items()):
    if not isinstance(value, torch.Tensor):
        continue
    if value.is_floating_point():
        # Only floating-point inputs are cast, and to the model's dtype
        # rather than a hard-coded float16.
        x[key] = value.to(device=_reference_param.device, dtype=_reference_param.dtype)
    else:
        # Integer tensors such as input_ids must keep their dtype: float16
        # cannot represent large token ids, and embedding lookups need
        # integer indices.
        x[key] = value.to(_reference_param.device)

output = model.generate(**x, max_new_tokens=1024, do_sample=True)