<a href="https://colab.research.google.com/github/yohoobot/works/blob/main/qwen2_5_hugMusicGen_Gradio_yesyes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ✅ 安装 Gradio（仅首次运行）
!pip install gradio
!pip install transformers torchaudio audiocraft accelerate bitsandbytes


Collecting gradio
  Using cached gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Using cached gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Using cached groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Using cached MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Using cached python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ru

In [1]:

import json
import os
import random

import requests
import torch
import torchaudio
import gradio as gr
from transformers import MusicgenProcessor, MusicgenForConditionalGeneration
from IPython.display import display, Audio

# Qwen (DashScope) API key.
# Read from the QWEN_API_KEY environment variable so the real key never has to
# be stored in the notebook; the placeholder is only a fallback and has to be
# replaced with a real key if the environment variable is not set.
QWEN_API_KEY = os.environ.get("QWEN_API_KEY", "sk- ")

# Qwen2.5 API endpoint
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# Load the JSON dataset (restaurant scene -> background-music description pairs)
with open("using.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Hugging Face MusicGen setup.
# MODEL_NAME may be switched to musicgen-large / musicgen-stereo-large.
MODEL_NAME = "facebook/musicgen-medium"
processor = MusicgenProcessor.from_pretrained(MODEL_NAME)
# Load the weights and move them to the GPU when one is available, else stay on CPU.
model = MusicgenForConditionalGeneration.from_pretrained(MODEL_NAME).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Pick the few-shot examples that are embedded in every Qwen prompt.
# The sample size is clamped to the dataset size so that a dataset with fewer
# than NUM_FEW_SHOT_EXAMPLES entries does not make random.sample raise ValueError.
NUM_FEW_SHOT_EXAMPLES = 5
few_shot_examples = random.sample(dataset, min(NUM_FEW_SHOT_EXAMPLES, len(dataset)))
# Each example is rendered as "Scene: ...\nMusic: ...\n"; joining with "\n"
# leaves one blank line between consecutive examples.
few_shot_prompt = "\n".join(
    f"Scene: {pair['scene']}\nMusic: {pair['music_desc']}\n" for pair in few_shot_examples
)

def generate_music_description(scene_desc, timeout=60):
    """Generate a background-music description with Qwen2.5-7B-Instruct.

    Sends the module-level few-shot prompt plus ``scene_desc`` to the endpoint
    in ``QWEN_API_URL``, authenticated with ``QWEN_API_KEY``.

    Args:
        scene_desc: Free-text description of the restaurant scene.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        str: The generated music description on success. On any failure
        (network error, non-200 status, unparsable or empty response) a
        string starting with ``"Error:"`` is returned instead of raising,
        so callers can detect failures by that prefix.
    """
    messages = [
        {"role": "system", "content": "You are an expert in restaurant music selection. Based on the given restaurant scene description, generate a suitable background music description."},
        {"role": "user", "content": f"Here are some examples:\n{few_shot_prompt}\n\nNow, based on the following scene, generate a matching music description.\n\nScene: {scene_desc}\nMusic:"}
    ]

    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "qwen2.5-7b-instruct",
        "input": {"messages": messages},
        "parameters": {
            "temperature": 0.7,
            "max_tokens": 100
        }
    }

    # Without a timeout a stalled connection would block the UI callback forever.
    try:
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return f"Error: {exc}"

    if response.status_code != 200:
        return f"Error: {response.text}"

    try:
        result = response.json()
    except ValueError:
        # Body was not valid JSON despite the 200 status.
        return f"Error: {response.text}"

    output = (result.get("output") if isinstance(result, dict) else None) or {}
    text = output.get("text")
    if not text:
        # NOTE(review): responses in "message" format are expected to carry the
        # text under output.choices[0].message.content - confirm against the API docs.
        choices = output.get("choices") or []
        if choices:
            text = (choices[0].get("message") or {}).get("content")

    if not text:
        # Flag the empty case as an error so it is never used as a music prompt.
        return "Error: No output received."
    return text

def generate_music_from_text(music_desc, duration=10, audio_path="generated_music.wav"):
    """Generate audio from a text description with Hugging Face MusicGen.

    Uses the module-level ``processor`` and ``model``.

    Args:
        music_desc: Text prompt describing the music to generate.
        duration: Target clip length in seconds (int or float).
        audio_path: Where to write the WAV file. The default keeps the
            original fixed file name, so concurrent calls overwrite each
            other unless distinct paths are passed.

    Returns:
        str: ``audio_path``, the path of the saved WAV file.
    """
    print(f"Generating music using MusicGen for: {music_desc}")

    # Put the inputs on whatever device the model already lives on.
    inputs = processor(text=[music_desc], padding=True, return_tensors="pt").to(model.device)

    # Read the codec parameters from the model config instead of hard-coding
    # 50 tokens/s and 32 kHz, so other checkpoints keep working.
    audio_config = model.config.audio_encoder
    # max_new_tokens counts only generated tokens (max_length also counts the
    # decoder start token); int() lets duration be a float.
    max_new_tokens = max(1, int(duration * audio_config.frame_rate))

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # First item of the batch; detach/cpu instead of torch.tensor(tensor),
    # which copy-constructs and emits a warning.
    # assumes outputs[0] is (channels, samples), as torchaudio.save expects - TODO confirm
    waveform = outputs[0].detach().cpu().float()
    sample_rate = audio_config.sampling_rate

    # Save to file
    torchaudio.save(audio_path, waveform, sample_rate)

    return audio_path

# Gradio front-end callback
def generate_music(scene_description):
    """Gradio callback: turn a restaurant scene into a music description and audio.

    Args:
        scene_description: Text entered by the user.

    Returns:
        tuple: ``(music_description, audio_path)``. ``audio_path`` is ``None``
        when the input is empty or the description step failed; in that case
        the first element carries the error message (if any).
    """
    if not scene_description or not scene_description.strip():
        return "Error: scene description is empty.", None

    music_desc = generate_music_description(scene_description)
    # Failures are reported as strings starting with "Error". A prefix check
    # (instead of a substring check) keeps valid descriptions that merely
    # contain the word "Error" from being rejected.
    if not music_desc or music_desc.startswith("Error"):
        return music_desc, None

    audio_path = generate_music_from_text(music_desc)
    return music_desc, audio_path

# Build the Gradio interface: a scene text box and a button on the first row,
# the generated description below it, then the audio player.
with gr.Blocks() as demo:
    gr.Markdown("## 🎵 AI 生成背景音乐")
    gr.Markdown("输入餐馆环境描述，Qwen2.5 生成音乐描述，MusicGen 生成音频")

    with gr.Row():
        scene_input = gr.Textbox(label="餐馆环境描述", placeholder="例如：温馨的意大利餐厅，低光照，浪漫氛围")
        generate_button = gr.Button("🎶 生成音乐")

    with gr.Row():
        music_output = gr.Textbox(label="生成的音乐描述")

    audio_output = gr.Audio(label="🎧 生成的音乐", type="filepath")

    # generate_music returns (description, audio file path), matching the two outputs.
    generate_button.click(generate_music, inputs=[scene_input], outputs=[music_output, audio_output])

# Launch the Gradio interface.
# Queuing is enabled first because music generation can run longer than the
# request timeout that applies to non-queued calls in older Gradio releases.
demo.queue()
demo.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://9868caae01f54dd368.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


