In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from mini1o.processor import Mini1oProcessor

# Model identifier handed to from_pretrained below.
path = 'OpenGVLab/InternVL2_5-1B-MPO'
# Load the model in bfloat16 with remote code enabled, switch it to eval mode
# and move it to the GPU.
# NOTE(review): the recorded output says "FlashAttention2 is not installed" even
# though use_flash_attn=True is requested — confirm the fallback is acceptable.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


FlashAttention2 is not installed.


In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoProcessor
from mini1o.processor import Mini1oProcessor

# Model identifier shared by every from_pretrained call in this cell.
path = 'OpenGVLab/InternVL2_5-1B-MPO'
# Load the model in bfloat16 with remote code enabled, switch it to eval mode
# and move it to the GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
# Slow (use_fast=False) tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# Two processors are loaded from the same path: the one AutoProcessor resolves
# and the project's Mini1oProcessor.
processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
processor1o = Mini1oProcessor.from_pretrained(path, trust_remote_code=True)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoProcessor

# Model identifier used to load the tokenizer below.
path = 'OpenGVLab/InternVL2_5-1B-MPO'
from mini1o.processor import Mini1oProcessor, Mini1oImageProcessor

# Build the processor by hand from its three parts instead of from_pretrained:
# a default-constructed image processor, the slow tokenizer, and an inline
# Jinja chat template.
image_processor = Mini1oImageProcessor()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# The template emits a default system turn when the first message is not a
# system message, renders image/video/image_gen/video_gen entries as their
# placeholder token triples, and appends an assistant header when
# add_generation_prompt is set.
chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}\n{% if content['type'] == 'image_gen' or 'image_gen' in content %}<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n{% elif content['type'] == 'video_gen' or 'video_gen' in content %}<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>\n{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}\n"

processor1o = Mini1oProcessor(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

In [None]:
# Write the processor files into the local 'kirp' directory.
# The original line lacked a comma between its arguments and could not be
# parsed. trust_remote_code is a loading option, so it is not passed when saving.
processor1o.save_pretrained('kirp')

In [3]:

# Three-value mean and standard deviation handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return the torchvision pipeline that turns one PIL image into a tensor.

    The pipeline converts non-RGB images to RGB, resizes to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts to a
    tensor and normalizes with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    def _ensure_rgb(img):
        # leave images that are already RGB untouched
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) pair whose ratio is nearest to ``aspect_ratio``.

    Candidates are scanned in the order given. A strictly closer candidate
    always wins. A candidate that ties with the current best replaces it only
    when the image area (``width * height``) exceeds half the area of that
    candidate's grid of ``image_size`` squares. Returns ``(1, 1)`` when
    ``target_ratios`` is empty.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    pixel_area = width * height
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # tie: take the later grid only if the image is big enough to fill it
        if gap == smallest_gap and pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` onto a tile grid and cut it into square tiles.

    The (columns, rows) grid is chosen by ``find_closest_aspect_ratio`` among
    all pairs whose product lies in ``[min_num, max_num]``. The image is
    resized to exactly fill that grid and cropped into ``image_size`` squares
    in row-major order.

    Returns the list of tiles. When ``use_thumbnail`` is true and more than one
    tile was produced, the whole image resized to ``image_size`` x
    ``image_size`` is appended as a final entry.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # every grid shape with an allowed tile count, ordered by tile count
    grid_options = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    grid_options = sorted(grid_options, key=lambda grid: grid[0] * grid[1])

    # grid whose aspect ratio best matches the source image
    cols, rows = find_closest_aspect_ratio(
        src_ratio, grid_options, src_w, src_h, image_size)

    # pixel size of the full grid and the number of tiles it holds
    canvas_w = image_size * cols
    canvas_h = image_size * rows
    tile_count = cols * rows

    # stretch the image over the grid, then slice it tile by tile
    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for idx in range(tile_count):
        row, col = divmod(idx, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == tile_count
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12):
    """Open ``image_file`` and return its tiles as one stacked tensor.

    The image is converted to RGB, tiled by ``dynamic_preprocess`` (thumbnail
    enabled, at most ``max_num`` grid tiles), and each tile is passed through
    ``build_transform(input_size)``. The transformed tiles are stacked along a
    new first dimension.
    """
    rgb = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(rgb, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])

In [9]:
# Tile '1.png' (up to 12 grid tiles), cast to bfloat16 and move to the GPU.
pixel_values = load_image('1.png', max_num=12).to(torch.bfloat16).cuda()
# Generation settings passed to model.chat: sampling enabled, up to 1024 new tokens.
generation_config = dict(max_new_tokens=1024, do_sample=True)

In [19]:
# Reload the project processor from the model path, rebinding processor1o.
processor1o = Mini1oProcessor.from_pretrained(path, trust_remote_code=True)

In [10]:
# Single-image, single-round chat: the prompt carries one <image> placeholder
# and pixel_values holds the tiles of that image.
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image shortly.
Assistant: The image shows details of a HivePOT token with a balance of 0ETH. It includes information like ETH balance, transaction history, token tracking links, and a contract creator. There's a note indicating the token is reported to be a honeyypoten. The user interface includes options for buying, exchanging, and playing.


In [17]:
# Load the chat template from kirp/mini4o/chat_template.jinja.
# Forward slashes replace the original backslashes: "\m" and "\c" are invalid
# string escapes, and a backslash-separated path only resolves on Windows,
# whereas forward slashes are accepted by open() on every platform.
with open('kirp/mini4o/chat_template.jinja', 'r', encoding='utf-8') as f:
    template = f.read()
# Install the template on the tokenizer and on the processor, and make the
# processor use this tokenizer instance.
tokenizer.chat_template = template
processor.tokenizer = tokenizer
processor.chat_template = template

In [18]:
# Display the template currently attached to the processor.
processor.chat_template

"{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}\n{% if content['type'] == 'image_gen' or 'image_gen' in content %}<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n{% elif content['type'] == 'video_gen' or 'video_gen' in content %}<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>\n{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if 

In [22]:
# Point the project processor at the notebook's tokenizer instance.
processor1o.tokenizer = tokenizer

In [23]:
# Attach the template text read from disk to the project processor.
processor1o.chat_template = template

In [27]:
# save processor1o to kirp/mini1o
# NOTE(review): the recorded run of this cell raised
# "TypeError: Object of type Compose is not JSON serializable" — presumably the
# processor holds a torchvision Compose object; confirm in the mini1o package.
processor1o.save_pretrained('kirp/mini1o')

TypeError: Object of type Compose is not JSON serializable

In [None]:
# Same single-image prompt as the earlier chat cell, run again.
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')


In [21]:
# One user turn made of an image entry followed by a text entry, in the
# list-of-content-dicts form that the chat template iterates over.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": Image.open('1.png').convert('RGB'),
            },
            {
                "type": "text", 
                "text": "Please describe the image shortly.\n"
            },
            # Disabled image_gen entry, kept for reference.
            # {
            #     'type': 'image_gen',
            #     'image_gen': Image.open('1.png').convert('RGB'),
            # }
        ],
    }
]

In [22]:
# Render the messages to a prompt string (no tokenization) and append the
# assistant header so the model continues from there.
text = processor1o.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(text)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>
Please describe the image shortly.
<|im_end|>
<|im_start|>assistant



In [23]:
# Run only the image processor on a single RGB image, requesting PyTorch tensors.
x = processor1o.image_processor([Image.open('1.png').convert('RGB')], return_tensors='pt')

In [24]:
# Run the full processor on the rendered prompt plus images; rebinds x.
# NOTE(review): two copies of the image are passed while the rendered prompt
# contains a single image placeholder — confirm this mismatch is intentional.
x = processor1o(
    text=[text],
    images=[Image.open('1.png').convert('RGB')]*2,
    return_tensors="pt",
)

In [25]:
# Display the token ids produced by the processor call.
x.input_ids

tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151653,    198,   5501,
           7512,    279,   2168,  19620,    624, 151645,    198, 151644,  77091,
            198]])

In [26]:
# Decode the first sequence with special tokens kept, to show how the image
# placeholder appears in the tokenized prompt.
print(processor1o.batch_decode(x.input_ids, skip_special_tokens=False)[0])

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>
Please describe the image shortly.
<|im_end|>
<|im_start|>assistant



In [2]:
# set the max number of tiles in `max_num`
pixel_values = load_image('1.png', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation
# (None is passed in place of pixel_values; history carries the first turn into the second)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images
# NOTE(review): the recorded run stopped in this section with
# FileNotFoundError for './examples/image1.jpg'; the example files must exist
# for the rest of the cell to run.
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images
# (one <image> placeholder per image; num_patches_list gives the tile count of each)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# one identical question per image in the batch
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Return ``num_segments`` frame indices spread over a time window.

    ``bound`` is an optional ``(start, end)`` pair that is multiplied by
    ``fps`` to obtain frame positions; when it is falsy a very wide window is
    used instead. The window is clamped to ``[first_idx, max_frame]``, divided
    into equal segments, and one index near the middle of each segment is
    returned as a NumPy array.
    """
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    half_seg = seg_size / 2
    picks = []
    for idx in range(num_segments):
        # offset into the window: half a segment plus the rounded segment start
        picks.append(int(start_idx + half_seg + np.round(seg_size * idx)))
    return np.array(picks)

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    """Sample frames from a video and return their tiles plus per-frame counts.

    Frame indices come from ``get_index`` using the reader's average fps and
    last frame index. Each sampled frame is converted to an RGB PIL image,
    tiled with ``dynamic_preprocess`` (thumbnail enabled) and transformed with
    ``build_transform(input_size)``.

    Returns ``(pixel_values, num_patches_list)``: the tiles of all frames
    concatenated along the first dimension, and the number of tiles
    contributed by each frame.
    """
    reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    last_frame = len(reader) - 1
    avg_fps = float(reader.get_avg_fps())

    to_tensor = build_transform(input_size=input_size)
    sampled = get_index(bound, avg_fps, last_frame, first_idx=0, num_segments=num_segments)

    per_frame = []
    for frame_no in sampled:
        frame = Image.fromarray(reader[frame_no].asnumpy()).convert('RGB')
        tiles = dynamic_preprocess(frame, image_size=input_size, use_thumbnail=True, max_num=max_num)
        per_frame.append(torch.stack([to_tensor(tile) for tile in tiles]))

    num_patches_list = [block.shape[0] for block in per_frame]
    return torch.cat(per_frame), num_patches_list

# Sample 8 frames from the example video, one grid tile per frame, then cast
# the tiles to bfloat16 on the GPU.
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
# One "FrameN: <image>" line per sampled frame, placed ahead of the question.
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# Follow-up turn on the same frames, continuing from the returned history.
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: Hello, who are you?
Assistant: Hello! I'm an AI assistant whose name is InternVL, developed jointly by Shanghai AI Lab, Tsinghua University and other partners.
User: Can you tell me a story?
Assistant: Sure! Here's a story for you:

In a distant galaxy bound for the distant corners of the Milky Way, there lived a planet called Terra. Terra was a lush, vibrant world where animals roamed freely, trees swayed in the wind, and rivers flowed with crystal clear water. The inhabitants of Terra, known as the Terrae, had their own unique language, rhythms, and customs.

One day, under the watchful eye of an ancient Lumina, a force that could alter the course of the universe itself, a great cosmic event occurred. A massive black hole was detected in the distant center of the galaxy, and its gravitational pull began to warp space-time. This, in turn, disrupted the flow of energy that powered Terra.

Without a fixed source of power, the Terrae were plunged into chaos. The once-universe-rotat

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image shortly.
Assistant: The image shows a web page from an online exchange displaying a trading contract. It is part of a giveaway for "Rainbet Casino." It details an Ethereum (ETH) contract with a balance of 0 ETH. There's discussion about the contract, including an entry for participation. Important data includes an approval status, transaction history, and network tags like "Bella." There's a mention of a honeyypot token on the overview. Tabs for internal transactions, events, analytics, etc., are visible, but no actual transactions are listed.


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: <image>
Please describe the image in detail.
Assistant: This image appears to be a screenshot from a contract interface for a cryptocurrency or blockchain trading platform. Here's a detailed breakdown of the features and contents:

1. **Header Information**:
   - **Contract ID**: 0x34C6211621f2763c6E0Eb007dc2aE9E1090A2d22f6
   - **Sponsored Event**: Rainbet Casino - 20K USD WEEKLY RAFFLE, promoting a free entry to claim a game ticket.
   - **Options Tabs**: Buy, Exchange, Play, Gaming

2. **Token Details**:
   - **Token Type**: Honeypont Token, specifically labeled as "Belle".
   - **Source Code**: Tap on a button to get more details.
   - **Overview**:
     - **ETH Balance**: 0 ETH
     - **ETH Value**: $0.00
     - **Token Holdings**: 0 (1 Tokens)
   - **More Info**:
     - Private Name Tags: "+ Add", indicating the contract can be associated with multiple name tags.
     - Contract Creator Name: BELLE (Honeypont Rug P)
     - ERC-20 Token Identifier: BTC-20 (associated with BE

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


User: Please write a poem according to the image.
Assistant: In the tapestry of blockchain dreams,
A token whispers with a gentle sway,
Bello Gentle from BELLE, rare within.
A token’s hush in cryptic, silent form,
A token-holding tale born in May.

An ethereal hash didst enter its land,
“Approval” in its heart, a token’s embrace.
Tokens’ tales unbound, in a digital tide,
A tale unfurled beneath lines, in token streams.

With blocks stately, in lines they play,
From honesies in an art of unyielding grace.
Each block, a token’s latest revelation,
In tweets and fees, a token’s silent sigh.

In the heart of exchanges and in gaming's might,
BELLE hails from the Honeypont Forge.
A token tokenized promise, in chains entwined,
In tokens of Ether, a token’s eternal keep.

A network of footings, intertwined and free,
The token soars with a rhythm divine.
With every block it writes, in the game of the blockchain wide,
Bello Gentle holds the token, in its ethereal way.

Honey, tokens, token-holdin

FileNotFoundError: [Errno 2] No such file or directory: './examples/image1.jpg'

In [1]:
import torch
from diffusers import SanaSprintPipeline

# Load the Sana Sprint pipeline in bfloat16 and move it to the GPU.
# NOTE(review): the recorded run failed with OSError because the cached
# snapshot's vae folder had no config.json — presumably an incomplete
# download; confirm by clearing the cache entry and re-downloading.
pipe = SanaSprintPipeline.from_pretrained(
    "Efficient-Large-Model/Sana_Sprint_0.6B_1024px_diffusers", torch_dtype=torch.bfloat16
)
pipe.to("cuda")

# Element 0 of the pipeline result is indexed again to get one image, which is
# written to output.png (presumably element 0 is the list of generated images).
image = pipe(prompt="a tiny astronaut hatching from an egg on the moon")[0]
image[0].save("output.png")

    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.5.1+cu121)
    Python  3.10.11 (you have 3.10.16)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "c:\Users\29058\miniconda3\envs\hi\lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
ModuleNotFoundError: No module named 'triton'


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

OSError: Error no file named config.json found in directory C:\Users\29058\.cache\huggingface\hub\models--Efficient-Large-Model--Sana_Sprint_0.6B_1024px_diffusers\snapshots\a7d9fc31dd5c3f5e22dbfd78360777ceed56ae97\vae.