In [5]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoProcessor

# Model id (or local path) handed to AutoTokenizer.from_pretrained below.
path = 'OpenGVLab/InternVL2_5-1B-MPO'
from minigemini.processor import Mini1oProcessor, Mini1oImageProcessor

# Image processor created with its default arguments.
image_processor = Mini1oImageProcessor()
# trust_remote_code=True permits custom code shipped with the checkpoint to run while loading;
# use_fast=False asks for the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# Jinja chat template. As written in the string below it:
#   * prepends a default "You are a helpful assistant." system turn when the first message is not a system message,
#   * wraps every message in <|im_start|>role ... <|im_end|>,
#   * emits one placeholder group per content entry: image_gen, video_gen, image (also matched by
#     'image_url') or video, and prints 'text' entries verbatim,
#   * optionally numbers images/videos ("Picture N: " / "Video N: ") when add_vision_id is set,
#   * appends the assistant header when add_generation_prompt is set.
chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}\n{% if content['type'] == 'image_gen' or 'image_gen' in content %}<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n{% elif content['type'] == 'video_gen' or 'video_gen' in content %}<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>\n{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}\n"

# Combine image processor, tokenizer and chat template into a single processor object.
processor1o = Mini1oProcessor(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

In [9]:
# Persist the processor configuration under 'kirp/mini1o'; requires processor1o from the setup cell.
processor1o.save_pretrained('kirp/mini1o')

[]

In [6]:

# Per-channel (3-value) mean and standard deviation handed to T.Normalize in build_transform.
# The names indicate these are the usual ImageNet statistics.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Create the preprocessing pipeline applied to every image tile.

    The returned torchvision transform, in order:
      1. converts the PIL image to RGB unless its mode already is 'RGB',
      2. resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
      3. converts it to a tensor,
      4. normalizes it with IMAGENET_MEAN / IMAGENET_STD.

    Args:
        input_size: edge length used for both height and width of the resize.

    Returns:
        A ``T.Compose`` instance.
    """
    def _ensure_rgb(img):
        # RGB inputs pass through untouched; every other mode is converted.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio is nearest to the image's.

    Args:
        aspect_ratio: width / height of the source image.
        target_ratios: iterable of candidate grids; element [0] is divided by
            element [1] to get the candidate's aspect ratio. Iteration order
            matters for ties.
        width: source image width, used only for the tie-break area test.
        height: source image height, used only for the tie-break area test.
        image_size: edge length of a single tile.

    Returns:
        The chosen candidate, or ``(1, 1)`` when no candidate is selected
        (e.g. ``target_ratios`` is empty).

    A strictly closer candidate always wins. A candidate that ties exactly with
    the current best distance replaces it only when the image area exceeds half
    of the pixel area that candidate's grid would cover.
    """
    pixel_area = width * height
    half_tile_area = 0.5 * image_size * image_size
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Exact tie: prefer this grid only if the image is large enough for it.
        if gap == smallest_gap and pixel_area > half_tile_area * candidate[0] * candidate[1]:
            chosen = candidate
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image onto a grid of square tiles and cut it into those tiles.

    Args:
        image: PIL-style image exposing ``size``, ``resize`` and ``crop``.
        min_num: minimum number of tiles (columns * rows) a grid may have.
        max_num: maximum number of tiles a grid may have.
        image_size: edge length of each square tile.
        use_thumbnail: when true and more than one tile was produced, append the
            whole image resized to ``image_size`` x ``image_size`` as a final entry.

    Returns:
        List of tile images in row-major order (left to right, top to bottom),
        optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_aspect = src_w / src_h

    # Enumerate every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    grid_candidates = set(
        (cols, rows)
        for total in range(min_num, max_num + 1)
        for cols in range(1, total + 1)
        for rows in range(1, total + 1)
        if min_num <= cols * rows <= max_num)
    grid_candidates = sorted(grid_candidates, key=lambda grid: grid[0] * grid[1])

    # Choose the grid whose aspect ratio best matches the source image.
    grid = find_closest_aspect_ratio(
        src_aspect, grid_candidates, src_w, src_h, image_size)

    # Canvas dimensions and tile count implied by the chosen grid.
    canvas_w = image_size * grid[0]
    canvas_h = image_size * grid[1]
    tile_count = grid[0] * grid[1]

    # Stretch the image onto the canvas, then cut it tile by tile.
    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for idx in range(tile_count):
        row, col = divmod(idx, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(canvas.crop(crop_box))
    assert len(tiles) == tile_count

    # A single-tile result never gets an extra thumbnail.
    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles

def load_image(image_file, input_size=448, max_num=12):
    """Load an image and return its preprocessed tiles as one stacked tensor.

    Args:
        image_file: anything accepted by ``PIL.Image.open`` (path or file object).
        input_size: tile edge length, forwarded to ``build_transform`` and
            ``dynamic_preprocess``.
        max_num: maximum number of grid tiles, forwarded to ``dynamic_preprocess``.

    Returns:
        ``torch.stack`` of the transformed tiles. Thumbnails are enabled, so an
        image split into more than one tile contributes one extra entry.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixels are decoded; convert() returns an independent in-memory image,
    # so it stays usable after the block exits.
    with Image.open(image_file) as raw_image:
        image = raw_image.convert('RGB')
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])

In [11]:
# original
# Reference path: tile '1.png' with the hand-written load_image pipeline (up to 12 grid tiles),
# then cast the result to bfloat16 and move it to the GPU.
pixel_values = load_image('1.png', max_num=12).to(torch.bfloat16).cuda()
# Prompt text; no cell visible in this excerpt consumes it.
question = '<image>\nPlease describe the image shortly.'

In [12]:
# Echo the tensor built by load_image(...) to inspect its normalized values (prints the whole tensor).
pixel_values

tensor([[[[ 2.2500,  2.2500,  2.2500,  ...,  2.2500,  2.2500,  2.2500],
          [ 2.2500,  2.2500,  2.2500,  ...,  2.2500,  2.2500,  2.2500],
          [ 2.2500,  2.2500,  2.2500,  ...,  2.2500,  2.2500,  2.2500],
          ...,
          [ 2.2031,  2.2031,  2.2031,  ...,  2.2500,  2.2500,  2.2500],
          [ 2.1875,  2.1875,  2.1875,  ...,  2.2500,  2.2500,  2.2500],
          [ 2.1875,  2.2031,  2.2031,  ...,  2.2500,  2.2500,  2.2500]],

         [[ 2.4219,  2.4219,  2.4219,  ...,  2.4219,  2.4219,  2.4219],
          [ 2.4219,  2.4219,  2.4219,  ...,  2.4219,  2.4219,  2.4219],
          [ 2.4219,  2.4219,  2.4219,  ...,  2.4219,  2.4219,  2.4219],
          ...,
          [ 2.3750,  2.3906,  2.3906,  ...,  2.4219,  2.4219,  2.4219],
          [ 2.3750,  2.3750,  2.3750,  ...,  2.4219,  2.4219,  2.4219],
          [ 2.3750,  2.3750,  2.3750,  ...,  2.4219,  2.4219,  2.4219]],

         [[ 2.6406,  2.6406,  2.6406,  ...,  2.6406,  2.6406,  2.6406],
          [ 2.6406,  2.6406,  

In [17]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoProcessor

# Model id (or local path) handed to AutoTokenizer.from_pretrained below.
path = 'OpenGVLab/InternVL2_5-1B-MPO'
from minigemini.processor import Mini1oProcessor, Mini1oImageProcessor

# Image processor created with its default arguments.
image_processor = Mini1oImageProcessor()
# trust_remote_code=True permits custom code shipped with the checkpoint to run while loading;
# use_fast=False asks for the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# Jinja chat template: adds a default system turn when the first message is not a system message,
# wraps each message in <|im_start|>role ... <|im_end|>, and emits placeholder groups for
# image_gen / video_gen / image / video content entries ('text' entries are printed verbatim).
chat_template = "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}\n{% if content['type'] == 'image_gen' or 'image_gen' in content %}<|image_gen_start|><|image_gen_pad|><|image_gen_end|>\n{% elif content['type'] == 'video_gen' or 'video_gen' in content %}<|video_gen_start|><|video_gen_pad|><|video_gen_end|>\n{% elif content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>\n{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}\n"
# Attach the same template to the tokenizer itself, in addition to passing it to the processor.
tokenizer.chat_template = chat_template
processor1o = Mini1oProcessor(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

# Save all three objects into the same directory, 'kirp/mini1o'.
processor1o.save_pretrained('kirp/mini1o')
image_processor.save_pretrained('kirp/mini1o')
tokenizer.save_pretrained('kirp/mini1o')

## new
# One user turn carrying an input image, a text instruction and an image_gen entry.
# '1.png' is opened separately for the image and the image_gen entries.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=Image.open('1.png').convert('RGB')),
            dict(type="text", text="Please describe the image shortly."),
            dict(type="image_gen", image_gen=Image.open('1.png').convert('RGB')),
        ],
    )
]
# Render to a prompt string only (tokenize=False) and append the assistant header.
text = processor1o.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(text)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>
Please describe the image shortly.<|image_gen_start|><|image_gen_pad|><|image_gen_end|>
<|im_end|>
<|im_start|>assistant



In [None]:
# List holding the single RGB input image. The original line ended with a stray
# trailing comma, which bound `images` to a 1-tuple wrapping this list.
images = [Image.open('1.png').convert('RGB')]

In [None]:
from minigemini.processor import Mini1oImageProcessor
from PIL import Image
# Fresh image processor with default arguments, saved on its own under 'kirp/mini1o'.
image_processor = Mini1oImageProcessor()
image_processor.save_pretrained('kirp/mini1o')

In [4]:
# Open the sample image without mode conversion; later cells rebind x to derived values.
x = Image.open('1.png')

In [6]:
# Note: resize is imported here but not used by any cell visible in this excerpt.
from transformers.image_transforms import (
    convert_to_rgb,
    resize
)
# Rebinds x to the RGB-converted image; x must already hold the image opened in an earlier cell.
x = convert_to_rgb(x)

In [10]:
# Run the image processor's own dynamic_preprocess on the image and rebind x to its result
# (an indexable collection; the following cell reads x[0]). Not idempotent: re-running this
# cell would feed the previous result back in.
x = image_processor.dynamic_preprocess(x)

In [14]:
import numpy as np
# Convert the first element returned by dynamic_preprocess to an array and show its shape.
y = np.array(x[0])
y.shape

(448, 448, 3)

In [15]:
# Value range and basic statistics of the first tile; the recorded output shows raw uint8
# pixels spanning 0..255, i.e. the tile has not been rescaled or normalized at this point.
y.min(), y.max(), y.mean(), y.std()

(np.uint8(0),
 np.uint8(255),
 np.float64(244.9112474091199),
 np.float64(28.72002901373138))

In [12]:
# Run the combined processor on the rendered prompt plus one freshly opened RGB image,
# requesting PyTorch tensors. x is rebound to the processor output (later cells read
# x['input_ids'] and x['pixel_values']).
# NOTE(review): the prompt contains both an image and an image_gen placeholder while only one
# image is supplied here — confirm that is the intended call shape.
x = processor1o(
    text=[text],
    images=[Image.open('1.png').convert('RGB')],
    return_tensors="pt",
)

In [13]:
# Decode the first sequence of token ids back to text, keeping special tokens, to see how
# the placeholders were expanded by the processor.
z = processor1o.batch_decode(x['input_ids'], skip_special_tokens=False)[0]
print(z)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|image_pad|><|vision_end|>
Please describe the image shortly.<|image_gen_start|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_gen_pad|><|image_ge

In [14]:
# Shape of the image tensor returned by the processor call.
x['pixel_values'].shape

torch.Size([9, 3, 448, 448])

In [16]:
# Save processor and tokenizer into the same directory; the tokenizer call's return value
# (the written file paths) becomes the cell output.
processor1o.save_pretrained('kirp/mini1o')
tokenizer.save_pretrained('kirp/mini1o')

('kirp/mini1o\\tokenizer_config.json',
 'kirp/mini1o\\special_tokens_map.json',
 'kirp/mini1o\\vocab.json',
 'kirp/mini1o\\merges.txt',
 'kirp/mini1o\\added_tokens.json')

In [18]:
# Echo the full processor output.
# NOTE(review): in the recorded output the image placeholder appears as one repeated id
# (151655), whereas the image_gen section appears as a repeating multi-id pattern
# (91, 1805, 16322, 30290, 91, 1784). This looks like the <|image_gen_*|> markers are not
# registered as single tokens in this tokenizer — confirm.
x

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151653,    198,   5501,
           7512,    279,   2168,  19620,  15757,     91,   1805,  16322,   4906,
             91,   1784,     91,   1805,  16322,  30290,     91,   1784,     91,
           1805,  16322,  30290,     91,   1784,     91,   1805,  16322,  30290,
             91,   1784,     91,   1805,  16322,  30290,     91,   1784,     91,
           1805,  16322,  30290,     91,   1784,     91,   1805,  16322,  30290,
             91,   1784,     91,   1805,  16322,  30290,     91,   1784,     91,
           1805,  16322,  30290,     91,   1784,     91,   1805,  16322,  30290,
             91,   1784,     91,   1805,  16322,  30290,     91,   1784,     91,
           1805,  16322,  30290,     91,   1784,     91,   1805,  16322,  30290,
             9