In [1]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import whisper  # 用于语音转文本

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Local checkpoint directory shared by the GLM-4V tokenizer and model weights.
GLM_4V_PATH = "/root/autodl-tmp/glm-4v-9b"

# Load the GLM-4V tokenizer and model (the checkpoint ships custom code,
# hence trust_remote_code=True), move the model to `device`, and switch it
# to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(GLM_4V_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    GLM_4V_PATH,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model = model.to(device)
model = model.eval()

# Load the Whisper speech-recognition model ("base" checkpoint).
whisper_model = whisper.load_model("base")

def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``) means "no audio".

    Returns:
        The recognised text exactly as Whisper reports it, or ``""`` when no
        path was given or the recognised text is blank. If anything raises
        during transcription, the exception is not propagated; a string
        starting with ``语音识别失败: `` and carrying the error is returned.
    """
    if audio_path:
        # Debug trace so the path handed over by the UI can be checked.
        print(f"🔍 正在处理音频文件: {audio_path}")

        try:
            recognized = whisper_model.transcribe(audio_path)["text"]
            if not recognized.strip():
                return ""
            return recognized
        except Exception as err:
            return f"语音识别失败: {str(err)}"

    return ""

def generate_description(image, query):
    """Generate a text answer for a query, optionally grounded on an image.

    Args:
        image: A PIL image or ``None``. When given, it is converted to RGB
            and attached to the user message.
        query: The user's question. ``None``, empty, or whitespace-only
            values are rejected.

    Returns:
        The decoded model output with the prompt tokens removed, or a
        user-facing error string when no usable query was supplied.
    """
    # Guard against None as well as blank text so an unset textbox value
    # yields the friendly error instead of an AttributeError.
    if query is None or not query.strip():
        return "错误：请输入文本或语音输入问题。"

    # Build the single user message once; the image key is only present when
    # an image was supplied (key order matches role / image / content).
    message = {"role": "user"}
    if image is not None:
        message["image"] = image.convert('RGB')
    message["content"] = query

    inputs = tokenizer.apply_chat_template(
        [message],
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True
    ).to(device)

    gen_kwargs = {"max_length": 1000, "do_sample": True, "top_k": 1}
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
        # Drop the prompt tokens so only the newly generated part is decoded.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return description

def update_query_from_audio(audio):
    """Return the transcription of `audio` for the speech-to-text textbox.

    The value is whatever `transcribe_audio` returns for the given audio
    path; it is wired as the handler that fills the transcription field.
    """
    return transcribe_audio(audio)

def gradio_interface(image, transcribed_text, query):
    """Pick the effective query and generate a description for it.

    The manually typed `query` takes precedence; the speech transcription is
    used only when the typed query is blank.

    Args:
        image: A PIL image or ``None``, passed through to
            `generate_description`.
        transcribed_text: Text from the speech-to-text textbox (may be
            ``None`` or blank).
        query: Text from the manual question textbox (may be ``None`` or
            blank).

    Returns:
        The generated description, or a user-facing error string when both
        text inputs are empty.
    """
    # Normalise None to "" before stripping so a missing textbox value
    # produces the friendly error below rather than an AttributeError.
    typed = (query or "").strip()
    spoken = (transcribed_text or "").strip()
    final_query = typed or spoken  # typed text wins over the transcription

    if not final_query:
        return "错误：请输入文本或语音输入问题。"

    return generate_description(image, final_query)

# Gradio UI: image upload, audio input with an editable transcription box,
# a manual question box, a submit button, and an output box.
with gr.Blocks() as interface:
    gr.Markdown("## GLM-4V 语音 + 图片 + 文本 多模态描述生成")
    gr.Markdown("上传图片、输入问题或使用语音描述，让 AI 生成对应的描述。")

    with gr.Row():
        image_input = gr.Image(label="上传图片（可选）", type="pil")

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="语音输入（可选）")
        transcribed_text = gr.Textbox(label="语音转文本结果（可修改）", interactive=True)  # filled from the speech transcription

    with gr.Row():
        query_input = gr.Textbox(label="输入问题（可手动修改）", interactive=True)  # manually typed question
        submit_button = gr.Button("提交")

    output_text = gr.Textbox(label="生成的描述")

    # When the audio input changes, refresh the speech-to-text textbox.
    audio_input.change(update_query_from_audio, inputs=[audio_input], outputs=[transcribed_text])

    # On submit, combine the transcription and the typed question (typed text
    # takes precedence inside gradio_interface) and show the generated answer.
    submit_button.click(
        gradio_interface,
        inputs=[image_input, transcribed_text, query_input],
        outputs=[output_text]
    )

# share=True additionally requests a public share link for the app.
interface.launch(share=True)


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

  checkpoint = torch.load(fp, map_location=device)


Running on local URL:  http://127.0.0.1:7860

Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB
Running on public URL: https://4041d8f124070fcd45.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


