### Audio Caption with Qwen3-Omni

In [None]:
# Process-wide configuration. These environment variables are assigned before
# torch is imported here (and before vllm is imported later, inside the loader),
# presumably so the libraries observe them at initialisation — TODO confirm
# against the vLLM documentation.
import os
os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
os.environ["VLLM_LOGGING_LEVEL"] = "ERROR"
# Expose only CUDA device index 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import torch
import warnings
import numpy as np

# Silence Python warnings to keep the notebook output short. The first call
# already ignores every category; the three category-specific filters that
# follow are redundant with it.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Helpers used by run_model(): multimodal input extraction and the processor class.
from qwen_omni_utils import process_mm_info
from transformers import Qwen3OmniMoeProcessor

def _load_model_processor():
    """Create the model for the selected backend together with its processor.

    Reads the notebook-level globals ``USE_TRANSFORMERS``,
    ``TRANSFORMERS_USE_FLASH_ATTN2`` and ``MODEL_PATH``; they must be defined
    before this function is called.

    Returns:
        tuple: ``(model, processor)``. ``model`` is a
        ``Qwen3OmniMoeForConditionalGeneration`` when ``USE_TRANSFORMERS`` is
        truthy, otherwise a ``vllm.LLM``. ``processor`` is a
        ``Qwen3OmniMoeProcessor`` loaded from the same ``MODEL_PATH``.
    """
    if not USE_TRANSFORMERS:
        # vLLM backend; imported lazily so it is only required when selected.
        from vllm import LLM
        engine_options = dict(
            model=MODEL_PATH,
            trust_remote_code=True,
            gpu_memory_utilization=0.95,
            # One tensor-parallel shard per CUDA device visible to torch.
            tensor_parallel_size=torch.cuda.device_count(),
            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},
            max_num_seqs=1,
            max_model_len=32768,
            seed=1234,
        )
        model = LLM(**engine_options)
    else:
        # Hugging Face Transformers backend.
        from transformers import Qwen3OmniMoeForConditionalGeneration
        load_options = {'dtype': 'auto', 'device_map': "auto"}
        if TRANSFORMERS_USE_FLASH_ATTN2:
            load_options['attn_implementation'] = 'flash_attention_2'
        model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH, **load_options)

    # The processor is backend-independent and always comes from the same checkpoint.
    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
    return model, processor

def run_model(model, processor, messages, return_audio, use_audio_in_video):
    """Run one chat request through the loaded model and return its reply.

    The backend is chosen by the notebook-level global ``USE_TRANSFORMERS``.

    Args:
        model: Object returned by ``_load_model_processor`` for the active backend.
        processor: Processor used to render the chat template (and, on the
            Transformers path, to build model inputs and decode output ids).
        messages: Chat messages passed to ``apply_chat_template`` and
            ``process_mm_info``.
        return_audio: Forwarded to ``model.generate`` on the Transformers path;
            not used on the vLLM path.
        use_audio_in_video: Forwarded to ``process_mm_info`` and to the
            processor / generation call.

    Returns:
        tuple: ``(response, audio)``. ``response`` is the decoded text.
        ``audio`` is a flat ``int16`` NumPy array when the Transformers
        backend produced audio, otherwise ``None`` (always ``None`` on the
        vLLM path).
    """
    if not USE_TRANSFORMERS:
        # ---- vLLM path: text-only output ----
        from vllm import SamplingParams
        decoding = SamplingParams(temperature=1e-2, top_p=0.1, top_k=1, max_tokens=8192)
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)

        # Only attach the modalities that are actually present in the messages.
        mm_data = {}
        for modality, payload in (('image', images), ('video', videos), ('audio', audios)):
            if payload is not None:
                mm_data[modality] = payload

        request = {
            'prompt': prompt,
            'multi_modal_data': mm_data,
            "mm_processor_kwargs": {"use_audio_in_video": use_audio_in_video},
        }
        results = model.generate(request, sampling_params=decoding)
        return results[0].outputs[0].text, None

    # ---- Transformers path: text plus optional audio ----
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
    batch = processor(
        text=prompt,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=use_audio_in_video,
    )
    batch = batch.to(model.device).to(model.dtype)

    generated, audio = model.generate(
        **batch,
        thinker_return_dict_in_generate=True,
        thinker_max_new_tokens=8192,
        thinker_do_sample=False,
        speaker="Ethan",
        use_audio_in_video=use_audio_in_video,
        return_audio=return_audio,
    )

    # Drop the prompt tokens so that only the newly generated ids are decoded.
    prompt_length = batch["input_ids"].shape[1]
    new_token_ids = generated.sequences[:, prompt_length:]
    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    response = decoded[0]

    if audio is not None:
        # Flatten the waveform and scale it by 32767 before casting to int16.
        scaled = audio.reshape(-1).detach().cpu().numpy() * 32767
        audio = np.array(scaled).astype(np.int16)
    return response, audio
    

In [None]:
import librosa
import audioread

from IPython.display import Video
from IPython.display import Audio

# Checkpoint identifier passed to from_pretrained() / vllm.LLM by the loader.
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# Alternative checkpoint; uncomment to use it instead of the one above.
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

# Backend switch read by _load_model_processor() and run_model():
# False selects the vLLM code path, True the Hugging Face Transformers one.
USE_TRANSFORMERS = False
# Only consulted when USE_TRANSFORMERS is True: load with attn_implementation='flash_attention_2'.
TRANSFORMERS_USE_FLASH_ATTN2 = True

# The settings above must already exist here because the loader reads them as globals.
model, processor = _load_model_processor()

# Flags forwarded to run_model() by the captioning cells.
USE_AUDIO_IN_VIDEO = True
# run_model() returns audio=None on the vLLM path regardless of this value.
RETURN_AUDIO = False

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section', 'interleaved'}
`torch_dtype` is deprecated! Use `dtype` instead!
2025-09-14 18:08:37,090	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   7% Completed | 1/15 [00:00<00:08,  1.57it/s]
Loading safetensors checkpoint shards:  20% Completed | 3/15 [00:01<00:06,  1.85it/s]
Loading safetensors checkpoint shards:  27% Completed | 4/15 [00:02<00:07,  1.39it/s]
Loading safetensors checkpoint shards:  33% Completed | 5/15 [00:02<00:05,  1.91it/s]
Loading safetensors checkpoint shards:  40% Comp

In [22]:
# Clip to caption (fetched from this URL).
audio_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/caption1.mp3"

# Single user turn: the audio clip followed by the captioning instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(type="audio", audio=audio_path),
            dict(type="text", text="Give the detailed description of the audio."),
        ],
    )
]

# Show a player for the input clip, loaded at 16 kHz.
display(
    Audio(
        librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=16000)[0],
        rate=16000,
    )
)

response, audio = run_model(
    model=model,
    messages=messages,
    processor=processor,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# Audio is only returned by the Transformers backend when it was requested.
if audio is not None:
    display(Audio(audio, rate=24000))

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 178.24it/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it, est. speed input: 98.75 toks/s, output: 130.67 toks/s]

The audio clip is a 21-second, high-fidelity field recording capturing the intense, layered soundscape of a busy industrial or construction site. The recording is dominated by the powerful, rhythmic pounding of a pneumatic jackhammer, its percussive impacts sharp and metallic, occurring in irregular bursts that suggest manual operation. These impacts are accompanied by a constant, low-frequency hum from heavy machinery—likely a diesel engine—providing a steady background drone.

At the start, a brief, high-pitched metallic squeal is heard, possibly from a piece of equipment being engaged or adjusted. As the jackhammer work progresses, the soundscape becomes increasingly complex: the jackhammer’s rhythm is punctuated by the clatter of loose debris and the scraping of metal on metal, indicating active demolition or excavation. Around the 12-second mark, a new sound emerges—a high-pitched, continuous whine, characteristic of a power saw or grinder cutting through hard material. This sound




In [23]:
# Clip to caption (fetched from this URL).
audio_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/caption2.mp3"

# Single user turn: the audio clip followed by the captioning instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(type="audio", audio=audio_path),
            dict(type="text", text="Please provide a detailed description of the audio."),
        ],
    )
]

# Show a player for the input clip, loaded at 16 kHz.
display(
    Audio(
        librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=16000)[0],
        rate=16000,
    )
)

response, audio = run_model(
    model=model,
    messages=messages,
    processor=processor,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# Audio is only returned by the Transformers backend when it was requested.
if audio is not None:
    display(Audio(audio, rate=24000))

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 86.54it/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.15s/it, est. speed input: 90.65 toks/s, output: 130.40 toks/s]

The audio clip is a 20-second, low-fidelity recording of a solo female vocalist performing a gentle, acoustic pop song. The recording is intimate and lo-fi, with a noticeable background hiss and a slightly muffled, compressed sound, suggesting it was captured on a consumer device like a smartphone or basic digital recorder in a small, untreated room.

The musical arrangement is sparse and centered on a solo acoustic guitar. The guitar is played in a fingerstyle pattern, providing a steady, arpeggiated accompaniment with a warm, mellow tone. The rhythm is unhurried and consistent, supporting the song’s calm, reflective mood.

The vocalist is a young adult female with a soft, breathy, and slightly melancholic tone. Her delivery is emotionally expressive, with subtle dynamic shifts and gentle vibrato, conveying vulnerability and introspection. Her accent is non-native English, with a distinct melodic intonation and pronunciation that suggests she is likely a native speaker of a language s




In [24]:
# Clip to caption (fetched from this URL).
audio_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/caption3.mp3"

# Single user turn: the audio clip followed by the captioning instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(type="audio", audio=audio_path),
            dict(type="text", text="Give a thorough description of the audio."),
        ],
    )
]

# Show a player for the input clip, loaded at 16 kHz.
display(
    Audio(
        librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=16000)[0],
        rate=16000,
    )
)

response, audio = run_model(
    model=model,
    messages=messages,
    processor=processor,
    return_audio=RETURN_AUDIO,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

print(response)
# Audio is only returned by the Transformers backend when it was requested.
if audio is not None:
    display(Audio(audio, rate=24000))

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 143.41it/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.13s/it, est. speed input: 80.85 toks/s, output: 130.07 toks/s]

The audio clip is a 17-second, intimate, and emotionally charged recording, likely captured on a smartphone or similar consumer device in a quiet, private indoor space—most probably a bedroom. The recording begins with a soft, breathy sigh, immediately establishing a mood of melancholy and introspection.

A young woman, speaking in a gentle, slightly breathy, and emotionally vulnerable tone, delivers a monologue in English. Her accent is non-rhotic, suggesting a British or Australian origin. She says, "I always listen to this song in my bedroom, missing about those past times. Why does time fly by so quickly? I find it hard to let go." Her speech is slow and deliberate, with pauses and sighs that underscore her wistful and nostalgic state of mind.

Throughout the clip, a soft, ethereal ambient music track plays in the background. The music features a high-pitched, breathy female vocal line with heavy reverb and delay, creating a dreamy, spacious, and slightly haunting atmosphere. The i


