# -----------------------------------------------------------------------------
#
# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
from io import BytesIO
from typing import List

import requests
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, TextStreamer

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.logging_utils import logger

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


# Process the input messages into a generation prompt for the model.
def get_prompt(messages) -> str:
    """Get the prompt for generation."""
    ## Chat template used for InternVL. The (Chinese) system prompt reads:
    ## "You are InternVL ('书生·万象'), a multimodal large language model jointly developed
    ## by Shanghai AI Laboratory, Tsinghua University, and several partner institutions."
    system_prompt = "<|im_start|>system\n你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
    sep = "<|im_end|>\n"
    ret = system_prompt + sep
    for role, message in messages:
        if message:
            if type(message) is tuple:
                message, _, _ = message
            ret += role + message + sep
        else:
            ret += role
    return ret
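

# Illustrative example: for messages = [["<|im_start|>user\n", "Hi <image>"],
# ["<|im_start|>assistant\n", None]], get_prompt returns
# "<system prompt><|im_end|>\n<|im_start|>user\nHi <image><|im_end|>\n<|im_start|>assistant\n",
# leaving the assistant turn open for generation.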


# Processor class for InternVL models
class InternProcessor:
    """
    InternVL ships only an AutoTokenizer, so this class performs the preprocessing
    that an AutoProcessor would normally handle. The methods here are borrowed from
    the original InternVL modeling files:
    https://huggingface.co/OpenGVLab/InternVL2_5-1B/
    """

    def __init__(self, model: nn.Module, tokenizer):
        self.model = model
        image_size = self.model.config.force_image_size or self.model.config.vision_config.image_size
        patch_size = self.model.config.vision_config.patch_size
        self.template = model.config.template
        self.num_image_token = int((image_size // patch_size) ** 2 * (self.model.config.downsample_ratio**2))
        self.tokenizer = tokenizer
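        # For reference (illustrative numbers): InternVL2_5-1B uses image_size=448,
        # patch_size=14, and downsample_ratio=0.5, so each 448x448 tile contributes
        # (448 // 14) ** 2 * 0.5**2 = 256 image tokens.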

    def build_transform(self, input_size):
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose(
            [
                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
                T.ToTensor(),
                T.Normalize(mean=MEAN, std=STD),
            ]
        )
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # On a tie, prefer the larger grid when the image is big enough to fill it
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio
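
    # Illustrative: a 1000x747 image has aspect ratio ~1.339; among grids with at most
    # 12 tiles, 4/3 ~ 1.333 is the closest candidate ratio, so (4, 3) is selected.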

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height
        # enumerate the candidate tile grids (i columns x j rows) with min_num <= i * j <= max_num
        target_ratios = set(
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if i * j <= max_num and i * j >= min_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
        # find the closest aspect ratio to the target
        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )
        # calculate the target width and height
        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
        # resize the image
        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size,
            )
            # split the image
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images
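
    # Illustrative: with max_num=12 and use_thumbnail=True, a 1000x747 image is resized
    # to 1792x1344 (a 4x3 grid), split into 12 tiles of 448x448, and a 13th 448x448
    # thumbnail of the full image is appended, for 13 tiles in total.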

    def load_image(self, image, input_size=448, max_num=12):
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def __call__(
        self,
        pixel_values,
        question,
        messages,
        roles,
        history=None,
        num_patches_list=None,
        IMG_START_TOKEN="<img>",
        IMG_END_TOKEN="</img>",
        IMG_CONTEXT_TOKEN="<IMG_CONTEXT>",
        verbose=False,
    ) -> str:
        if history is None and pixel_values is not None and "<image>" not in question:
            question = "<image>\n" + question
        if num_patches_list is None:
            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
        img_context_token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
        self.model.img_context_token_id = img_context_token_id
        messages.append([roles[0], question])
        messages.append([roles[1], None])
        query = get_prompt(messages)
        if verbose and pixel_values is not None:
            image_bs = pixel_values.shape[0]
            logger.info(f"dynamic ViT batch size: {image_bs}")
        for num_patches in num_patches_list:
            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
            query = query.replace("<image>", image_tokens, 1)
        return query
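
    # Illustrative: each "<image>" placeholder expands to <img>, followed by
    # num_image_token * num_patches IMG_CONTEXT tokens, followed by </img>; with
    # 13 tiles at 256 tokens each, that is 3328 image-context tokens in the query.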


def run_intern_on_aic(
    model_name,
    prompt,
    image_url,
    messages,
    roles,
    kv_offload=False,
    prefill_seq_len=3840,
    num_devices=1,
    num_cores=16,
):
    ## STEP 1 -- LOAD THE MODEL
    # The original InternVL model, despite being multimodal, is loaded via `AutoModelForCausalLM`
    # in Hugging Face Transformers. To maintain compatibility, we load it with `QEFFAutoModelForCausalLM`.
    model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=kv_offload, trust_remote_code=True)

    ## STEP 2 -- EXPORT & COMPILE THE MODEL
    model.compile(
        num_cores=num_cores,
        num_devices=num_devices,
        prefill_seq_len=prefill_seq_len,
        mxfp6_matmul=False,
    )
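    # Note: `compile` exports the model to ONNX and then compiles it into a QPC
    # (program binary) that runs on Cloud AI 100 devices.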

    ## STEP 3 -- SET UP THE PROCESSOR
    # InternVL doesn't have an AutoProcessor yet, so we use our own processor class, "InternProcessor".
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
    internProcessor = InternProcessor(model.model, tokenizer)

    ## STEP 4 -- PREPROCESS THE INPUTS
    img = requests.get(image_url, stream=True)
    image = Image.open(BytesIO(img.content)).convert("RGB")
    # Resize the image to (1000, 747) for inference
    image = image.resize((1000, 747))
    # Preprocess the resized image into tiles of pixel values
    pixel_values = internProcessor.load_image(image, max_num=12)
    question = "<image>\n" + prompt
    query = internProcessor(pixel_values, question, messages, roles)
    inputs = tokenizer(
        query, return_tensors="pt", padding="max_length", max_length=prefill_seq_len, padding_side="right"
    )
    inputs["pixel_values"] = pixel_values
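    # The query is right-padded to `prefill_seq_len` so that it matches the fixed
    # prefill window the model was compiled with (see the note on `prefill_seq_len`
    # in `__main__` below).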

    ## STEP 5 -- RUN INFERENCE VIA THE GENERATE FUNCTION
    streamer = TextStreamer(tokenizer)
    model.generate(inputs=inputs, streamer=streamer, generation_len=128)


if __name__ == "__main__":
    model_name = "OpenGVLab/InternVL2_5-1B"

    # Chat template information for prompt preprocessing
    messages: List[List[str]] = []
    roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")

    # Inputs for the model
    prompt = "Please describe the image in detail."
    image_url = "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg"

    ## Compilation parameters
    # `kv_offload` selects whether the model is compiled into a single QPC or two QPCs.
    # The dual-QPC approach splits the model so that image encoding and output generation
    # run in two separate QPCs, with the vision encoder's outputs passed to the language
    # model via the host.
    kv_offload = False
    # InternVL is an early-fusion model: placeholder tokens within the input_ids are replaced
    # by image embeddings, which are interleaved with the text embeddings to form the final
    # input_embeds for output generation. Hence we need a large prefill_seq_len (3840 here)
    # to make room for the merged embeddings.
    prefill_seq_len = 3840
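    # Illustrative budget: 13 tiles x 256 image tokens = 3328 positions of image
    # context, which leaves roughly 512 of the 3840 prefill positions for text.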
    num_devices = 4
    num_cores = 16

    run_intern_on_aic(
        model_name=model_name,
        prompt=prompt,
        image_url=image_url,
        messages=messages,
        roles=roles,
        kv_offload=kv_offload,
        prefill_seq_len=prefill_seq_len,
        num_devices=num_devices,
        num_cores=num_cores,
    )
"""
Expected Response:
The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme.
On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters.
Below the logo, there is text that reads:
- "By Dinesh Kumar Wick
"""