In [1]:
!pip install  -U -q git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git qwen-vl-utils accelerate
!pip install -q torch==2.4.1+cu121 torchvision==0.19.1+cu121 torchaudio==2.4.1+cu121 --extra-index-url https://download.pytorch.org/whl/cu121

In [9]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from prompts import *
# Model identifier handed to from_pretrained() when the model and processor are loaded.
model_id = "phmtung/Qwen2"

In [3]:
# Load the model weights in bfloat16 with automatic device placement.
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)

# Load the matching processor from the same identifier.
processor = Qwen2VLProcessor.from_pretrained(model_id)


config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [4]:
from typing import List, Union, Sequence

def build_sample(
    prompt: str,
    images: Union[str, Sequence[str]]
):
    """Build a single-turn chat sample holding image entries followed by a text prompt.

    Args:
        prompt: Text placed in the final ``{"type": "text"}`` content entry.
        images: One image reference (a ``str`` or ``bytes`` value is treated as
            a single image) or an iterable of image references.

    Returns:
        A one-element list containing a ``"user"`` message whose ``content``
        lists one ``{"type": "image"}`` entry per image, then the text entry.
    """
    # A lone str/bytes value is wrapped so it is not iterated element by element.
    image_refs = [images] if isinstance(images, (str, bytes)) else images

    parts = []
    for ref in image_refs:
        parts.append({"type": "image", "image": ref})

    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": parts + [text_part]}
    return [user_message]

In [5]:
from qwen_vl_utils import process_vision_info


def generate_text_from_sample(model, processor, sample, max_new_tokens=2048, device="cuda"):
    """Generate a text reply for one chat-formatted sample.

    Args:
        model: Model exposing ``generate`` (and ``device`` when ``device`` is None).
        processor: Processor providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        sample: List of chat messages, e.g. as returned by ``build_sample``.
        max_new_tokens: Value forwarded to ``model.generate`` as ``max_new_tokens``.
        device: Device the processed inputs are moved to. Pass ``None`` to use
            ``model.device`` instead of a hard-coded device.

    Returns:
        The decoded text of the first generated sequence, with the prompt
        tokens removed and special tokens skipped.
    """
    # Render the chat messages into one prompt string ending with the generation prompt
    text_input = processor.apply_chat_template(
        sample, tokenize=False, add_generation_prompt=True
    )

    # Collect the image inputs referenced by the messages; the second return value is unused here
    image_inputs, _ = process_vision_info(sample)

    # With device=None, follow the model so inputs land wherever its weights were placed
    target_device = model.device if device is None else device

    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(target_device)

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Drop the leading input ids from each output so only newly generated tokens remain
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Only one prompt was submitted, so return its single decoded text

In [17]:
# Pair the vehicle-registration prompt with the example image in one chat sample.
prompt = VEHICLE_REGISTRATION_PROMPT
sample = build_sample(prompt=prompt, images=["examples/vehicle_registration.jpg"])

In [18]:
# Run generation for the prepared sample, placing the inputs on the CUDA device.
response = generate_text_from_sample(
    model=model,
    processor=processor,
    sample=sample,
    max_new_tokens=2048,
    device="cuda",
)

In [19]:
# Display the raw string returned by the model.
# NOTE(review): the saved output of this cell contains personal details read from the
# sample document (name, address, plate number) — clear or redact it before sharing.
response

'{\n    "Tên chủ xe (Owner\'s full name)": "PHẠM VĂN HẢO - 1995",\n    "Địa chỉ (Address)": "Lạc Thọ Bắc, TT Hồ, Thuận Thành, BN",\n    "Nhãn hiệu (Brand)": "HYUNDAI",\n    "Số loại (Model code)": "ACCENT",\n    "Loại xe (Type)": "Ô tô con",\n    "Màu sơn (Color)": "Bạc",\n    "Số máy (Engine number)": "G4LCJF701482",\n    "Số khung (Chassis number)": "41BAKN012085",\n    "Số chỗ ngồi (Seats)": "5",\n    "Trọng tải (Gross weight)": "kg",\n    "KL toàn bộ (Total mass)": "1368",\n    "KL kéo theo (Towed mass)": "",\n    "Hoạt động trong phạm vi": "",\n    "Biển số đăng kí (Number plate)": "99A-280.96",\n    "Giá trị đến ngày (Date of expiry)": "",\n    "Nơi đăng kí, ngày đăng kí": "Bắc ninh, ngày 11 tháng 03 năm 2019"\n}'

In [None]:
# Generation configs
# This binds the model's own generation config object (no copy is made), so the
# assignments below mutate model.generation_config directly.
# NOTE(review): this cell sits after the generation cells — run it before generating
# if these settings are meant to apply.
generation_config =  model.generation_config
generation_config.do_sample   = True
generation_config.temperature = 1.0
# NOTE(review): top_k=1 keeps only the single most likely token at each step, which
# makes sampling effectively greedy and leaves temperature/top_p/min_p without
# influence — confirm this is intended.
generation_config.top_k       = 1
generation_config.top_p       = 0.9
generation_config.min_p       = 0.1
# `best_of = 5` was removed: it is not a Transformers GenerationConfig option, so
# setting it had no effect on model.generate().
# generate_text_from_sample passes max_new_tokens explicitly to model.generate(), so this
# value only applies to calls that omit that argument.
generation_config.max_new_tokens     = 2048
generation_config.repetition_penalty = 1.06