# 使用transformers推理

In [1]:
import soundfile as sf  # 导入soundfile库，用于处理音频文件

# 导入transformers库中的Qwen2_5OmniModel和Qwen2_5OmniProcessor
# Qwen2_5OmniModel是用于多模态对话的模型，Qwen2_5OmniProcessor是用于处理多模态数据的处理器
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
# 导入qwen_omni_utils模块中的process_mm_info函数，用于处理多模态信息
from qwen_omni_utils import process_mm_info

# Local directory that holds the pretrained Qwen2.5-Omni-7B checkpoint.
mode_path = "/root/autodl-tmp/Qwen/Qwen2.5-Omni-7B"

# Where the generated speech is written, and the sample rate used for the WAV
# header (presumably the rate of the model's speech output -- TODO confirm).
OUTPUT_WAV_PATH = "output.wav"
OUTPUT_SAMPLE_RATE = 24000

# Load the model with FlashAttention-2 enabled; the original author notes that
# inference is extremely slow (and uses more memory) without it.
# NOTE(review): the recorded run warns that Flash Attention 2.0 is being used
# without an explicit torch dtype -- consider passing a concrete half-precision
# dtype instead of "auto"; confirm against the transformers documentation.
model = Qwen2_5OmniModel.from_pretrained(
    mode_path,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
)

# Load the processor that ships with the same checkpoint.
processor = Qwen2_5OmniProcessor.from_pretrained(mode_path)

# Conversation to run: a system prompt plus one user turn that contains only a
# video file.
conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "/root/autodl-tmp/draw.mp4"},
        ],
    },
]

# Whether the audio track of the video is used as an input as well. The same
# flag is passed to process_mm_info, the processor and generate.
USE_AUDIO_IN_VIDEO = True

# Render the conversation into a prompt string (tokenize=False, so no token ids
# yet). Kept under its own name so that `text` below only ever refers to the
# decoded model output.
prompt_text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video inputs referenced by the conversation.
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Encode the prompt and the multimodal inputs into PyTorch tensors.
inputs = processor(
    text=prompt_text,
    audios=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the inputs to the model's device and convert them to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

# Inference: generate the reply as token ids together with the speech audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Decode the generated ids into readable text. In the recorded run the decoded
# string contains the system / user / assistant turns, not just the reply.
text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

# Flatten the generated audio to 1-D, bring it to the CPU as a NumPy array and
# save it as a WAV file.
sf.write(
    OUTPUT_WAV_PATH,
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=OUTPUT_SAMPLE_RATE,
)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
  return F.conv3d(
Setting `pad_token_id` to `eos_token_id`:8292 for open-end generation.
  return F.conv1d(input, weight, bias, self.stride,


["system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nOh, that's a really cool drawing! It looks like a guitar. You've got the body and the neck drawn in a simple yet effective way. The lines are clean and the shape is recognizable. What made you choose to draw a guitar?"]


输出内容如下（同时在同目录下会生成一个包含模型回复内容的output.wav音频文件）：
["system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nOh, that's a really cool drawing! It looks like a guitar. You've got the body and the neck drawn in a simple yet effective way. The lines are clean and the shape is recognizable. What made you choose to draw a guitar?"]

In [None]:
输出内容翻译如下：
system
你是通义千问，由阿里巴巴通义千问团队开发的虚拟数字人，能够感知听觉和视觉输入，并生成文本和语音。

user

assistant
哇，这幅画真酷！看起来像一把吉他。你把吉他琴身和琴颈画得很简单，但又很传神。线条干净利落，形状也很容易辨认。你是怎么想到画吉他的呢？

In [None]:
## 是否开启音频输出
该模型同时支持文本和音频两种输出。如果不需要音频输出，
可以在 from_pretrained 函数中设置 enable_audio_output=False。
此选项可以节省约 2GB 的 GPU 显存，但此时 generate 函数的 return_audio 参数只能设置为 False。

样例如下：

```python
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype="auto",
    device_map="auto",
    enable_audio_output=False,
)
```