### Image Question with Qwen3-Omni

In [1]:
import os
# The environment variables below are assigned before torch is imported here
# (and before vllm is imported lazily in _load_model_processor) — presumably so
# those libraries pick them up when they initialise. Keep this ordering.
# NOTE(review): '0' presumably selects vLLM's legacy (V0) engine — confirm
# against the installed vLLM version.
os.environ['VLLM_USE_V1'] = '0'
# Start method requested for vLLM worker processes.
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
# Restrict vLLM logging to errors to keep the notebook output short.
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
# Expose only GPU 0 to this process; torch.cuda.device_count(), used later for
# tensor_parallel_size, is computed from the devices made visible here.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import torch
import warnings
import numpy as np

# Silence all Python warnings for the notebook session. The first call uses the
# default category (Warning), which already covers the three category-specific
# filters that follow; they are kept as explicit documentation of intent.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

from qwen_omni_utils import process_mm_info
from transformers import Qwen3OmniMoeProcessor

def _load_model_processor():
    """Load the model and its processor for the backend chosen in the config cell.

    Reads the notebook-level globals ``USE_TRANSFORMERS``,
    ``TRANSFORMERS_USE_FLASH_ATTN2`` and ``MODEL_PATH``, so those must be
    defined before this function is called.

    Returns:
        tuple: ``(model, processor)`` where ``model`` is a vLLM ``LLM`` when
        ``USE_TRANSFORMERS`` is falsy, otherwise a
        ``Qwen3OmniMoeForConditionalGeneration``; ``processor`` is always a
        ``Qwen3OmniMoeProcessor`` loaded from ``MODEL_PATH``.
    """
    if not USE_TRANSFORMERS:
        # vLLM backend: imported lazily so a Transformers-only environment
        # never needs vllm installed.
        from vllm import LLM
        engine_options = dict(
            model=MODEL_PATH,
            trust_remote_code=True,
            gpu_memory_utilization=0.95,
            # One tensor-parallel shard per visible CUDA device.
            tensor_parallel_size=torch.cuda.device_count(),
            # Upper bound on media items accepted in a single prompt.
            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},
            max_num_seqs=1,
            max_model_len=32768,
            seed=1234,
        )
        model = LLM(**engine_options)
    else:
        from transformers import Qwen3OmniMoeForConditionalGeneration
        # Options shared by both attention implementations.
        load_options = {'dtype': 'auto', 'device_map': "auto"}
        if TRANSFORMERS_USE_FLASH_ATTN2:
            load_options['attn_implementation'] = 'flash_attention_2'
        model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH, **load_options)

    # The processor is needed by both backends (chat template + preprocessing).
    return model, Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

def run_model(model, processor, messages, return_audio, use_audio_in_video,
              max_new_tokens=8192, speaker="Ethan"):
    """Run a single chat request through the loaded model.

    The backend is selected by the notebook-level global ``USE_TRANSFORMERS``
    and must match the backend ``model`` was loaded with.

    Args:
        model: Object returned by ``_load_model_processor`` (a Transformers
            model or a vLLM ``LLM``).
        processor: Processor used for the chat template and, on the
            Transformers path, for input preprocessing and decoding.
        messages: Chat messages in the format accepted by
            ``processor.apply_chat_template`` and ``process_mm_info``.
        return_audio: Forwarded to ``model.generate`` on the Transformers
            path. Not used on the vLLM path, which never returns audio.
        use_audio_in_video: Forwarded to media extraction, the processor and
            generation.
        max_new_tokens: Generation budget; passed as
            ``thinker_max_new_tokens`` (Transformers) or ``max_tokens`` (vLLM).
            Defaults to the previously hard-coded 8192.
        speaker: Voice name forwarded to ``model.generate`` on the
            Transformers path. Defaults to the previously hard-coded "Ethan".

    Returns:
        tuple: ``(response, audio)``. ``response`` is the decoded text of the
        first sequence. ``audio`` is a flat ``np.int16`` array when the
        Transformers path produced audio, otherwise ``None``.
    """
    # Prompt rendering and media extraction are identical for both backends.
    text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)

    if USE_TRANSFORMERS:
        inputs = processor(text=text, audio=audios, images=images, videos=videos,
                           return_tensors="pt", padding=True,
                           use_audio_in_video=use_audio_in_video)
        inputs = inputs.to(model.device).to(model.dtype)
        text_ids, audio = model.generate(**inputs,
                                         thinker_return_dict_in_generate=True,
                                         thinker_max_new_tokens=max_new_tokens,
                                         thinker_do_sample=False,
                                         speaker=speaker,
                                         use_audio_in_video=use_audio_in_video,
                                         return_audio=return_audio)
        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_len = inputs["input_ids"].shape[1]
        response = processor.batch_decode(text_ids.sequences[:, prompt_len:],
                                          skip_special_tokens=True,
                                          clean_up_tokenization_spaces=False)[0]
        if audio is not None:
            samples = audio.reshape(-1).detach().cpu().numpy()
            # Clip before scaling: a float sample outside [-1, 1] would overflow
            # the int16 cast instead of saturating.
            audio = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
        return response, audio

    # vLLM backend (text output only).
    from vllm import SamplingParams
    sampling_params = SamplingParams(temperature=1e-2, top_p=0.1, top_k=1, max_tokens=max_new_tokens)
    inputs = {'prompt': text, 'multi_modal_data': {}, "mm_processor_kwargs": {"use_audio_in_video": use_audio_in_video}}
    # Attach only the modalities that are actually present in the messages.
    for modality, items in (('image', images), ('video', videos), ('audio', audios)):
        if items is not None:
            inputs['multi_modal_data'][modality] = items
    outputs = model.generate(inputs, sampling_params=sampling_params)
    response = outputs[0].outputs[0].text
    return response, None
    

In [2]:
import librosa
import audioread

from IPython.display import Video
from IPython.display import Image
from IPython.display import Audio

# Checkpoint to load; swap the two lines below to try the Thinking variant.
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

# Backend switches read by _load_model_processor() and run_model():
# False selects vLLM; the flash-attention flag only matters for Transformers.
USE_TRANSFORMERS = False
TRANSFORMERS_USE_FLASH_ATTN2 = True

# Per-request flags passed to run_model() by the cells below.
USE_AUDIO_IN_VIDEO, RETURN_AUDIO = True, False

# Expensive step: loads the checkpoint onto the GPU.
model, processor = _load_model_processor()

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'interleaved', 'mrope_section'}
`torch_dtype` is deprecated! Use `dtype` instead!
2025-09-14 20:32:11,498	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   7% Completed | 1/15 [00:00<00:09,  1.52it/s]
Loading safetensors checkpoint shards:  20% Completed | 3/15 [00:01<00:06,  1.78it/s]
Loading safetensors checkpoint shards:  27% Completed | 4/15 [00:03<00:09,  1.20it/s]
Loading safetensors checkpoint shards:  33% Completed | 5/15 [00:03<00:06,  1.65it/s]
Loading safetensors checkpoint shards:  40% Comp

In [3]:
image_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/2621.jpg"

# One user turn: the image first, then the question about it.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=image_path),
            dict(type="text", text="What style does this image depict?"),
        ],
    )
]

# Show the input image in the notebook before querying the model.
display(Image(image_path, width=640, height=360))

response, audio = run_model(
    model=model,
    processor=processor,
    messages=messages,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# run_model returns None for audio when no speech was generated.
if audio is not None:
    display(Audio(audio, rate=24000))

<IPython.core.display.Image object>

Adding requests: 100%|██████████| 1/1 [00:05<00:00,  5.44s/it]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it, est. speed input: 72.69 toks/s, output: 113.04 toks/s]

Based on the visual characteristics of the image, this artwork is a prime example of **Surrealism**.

Here are the key elements that point to this style:

*   **Dreamlike and Illogical Imagery:** The central figure is a bizarre, non-realistic creature. It appears to be a combination of a human head, a large red tower or column, and a spinning top. This impossible fusion of disparate objects is a hallmark of Surrealism.
*   **Juxtaposition of Unrelated Objects:** The artist places a human-like figure with a tower for a body and a spinning top for a base in a barren, desert landscape. This creates a strange and unsettling scene that defies the laws of nature and logic.
*   **Symbolism:** The image is rich with symbolic content. The tower could represent a fortress, a prison, or a monument to the past. The spinning top suggests instability, the passage of time, or a precarious existence. The figure's hands are raised in a gesture that could be interpreted as prayer, surrender, or despair.




In [4]:
image_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/2233.jpg"

# One user turn: the image first, then a question asking the model to predict
# what happens next in the scene.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=image_path),
            dict(type="text", text="Based on this image, what do you think will happen next?"),
        ],
    )
]

# Show the input image in the notebook before querying the model.
display(Image(image_path, width=640, height=360))

response, audio = run_model(
    model=model,
    processor=processor,
    messages=messages,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# run_model returns None for audio when no speech was generated.
if audio is not None:
    display(Audio(audio, rate=24000))

<IPython.core.display.Image object>

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 103.81it/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it, est. speed input: 67.97 toks/s, output: 119.23 toks/s]

Based on the visual cues in the image, the most likely event to happen next is a storm.

Here is a breakdown of the reasoning:

1.  **The Sky:** The sky is filled with dark, heavy, and turbulent clouds. This type of cloud formation is characteristic of a cumulonimbus or a dense stratocumulus layer, both of which are associated with storms. The ominous, low-hanging clouds suggest that precipitation is imminent.

2.  **The Field:** The field is a lush, green crop, likely wheat or barley, that appears healthy and ready for harvest. This type of vegetation is highly susceptible to damage from strong winds and heavy rain.

3.  **The Atmosphere:** The overall lighting is dim and gray, indicating that the sun is completely obscured by the thick cloud cover. This lack of direct sunlight is typical of the period just before a storm hits.

Given these elements, the most probable sequence of events is that the storm will arrive, bringing with it heavy rain and strong winds. This could lead to sev




In [6]:
image_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/val_IQ_Test_113.jpg"

# One user turn: the puzzle image first, then the instruction.
# NOTE(review): the prompt reads "Please directly the correct answer" — a verb
# (e.g. "give") appears to be missing. It is left untouched here because the
# recorded output below was produced with this exact wording.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image", image=image_path),
            dict(type="text", text="Identify the one picture that follows the same pattern or rule established by the previous pictures. Please directly the correct answer from the options above."),
        ],
    )
]

# Show the input image in the notebook before querying the model.
display(Image(image_path, width=640, height=360))

response, audio = run_model(
    model=model,
    processor=processor,
    messages=messages,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# run_model returns None for audio when no speech was generated.
if audio is not None:
    display(Audio(audio, rate=24000))

<IPython.core.display.Image object>

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 241.89it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  9.17it/s, est. speed input: 1351.22 toks/s, output: 18.51 toks/s]

D



