<a href="https://colab.research.google.com/github/yohoobot/works/blob/main/qwen25_hugMusicGenset_yes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ✅ 安装必要的库（仅首次运行）
!pip install transformers torchaudio audiocraft accelerate bitsandbytes

import json
import requests
import random
import torch
import torchaudio
from transformers import MusicgenProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio

# ✅ 你的 Qwen API Key（请替换）
QWEN_API_KEY = "sk- "  # 替换为你的 Qwen2.5 API Key

# ✅ Qwen2.5 API 配置
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ 读取 JSON 数据集（餐馆环境 - 背景音乐描述）
with open("using.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# ✅ Hugging Face MusicGen 设置
MODEL_NAME = "facebook/musicgen-medium"  # 可改为 musicgen-large / musicgen-stereo-large
processor = MusicgenProcessor.from_pretrained(MODEL_NAME)
model = MusicgenForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# ✅ 选取 Few-shot 示例
few_shot_examples = random.sample(dataset, 5)
few_shot_prompt = "\n".join(
    [f"Scene: {pair['scene']}\nMusic: {pair['music_desc']}\n" for pair in few_shot_examples]
)

def generate_music_description(scene_desc):
    """使用 Qwen2.5-7B-Instruct 生成背景音乐描述"""
    messages = [
        {"role": "system", "content": "You are an expert in restaurant music selection. Based on the given restaurant scene description, generate a suitable background music description."},
        {"role": "user", "content": f"Here are some examples:\n{few_shot_prompt}\n\nNow, based on the following scene, generate a matching music description.\n\nScene: {scene_desc}\nMusic:"}
    ]

    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "qwen2.5-7b-instruct",
        "input": {"messages": messages},
        "parameters": {
            "temperature": 0.7,
            "max_tokens": 100
        }
    }

    response = requests.post(QWEN_API_URL, headers=headers, json=payload)

    if response.status_code == 200:
        result = response.json()
        return result.get("output", {}).get("text", "No output received.")
    else:
        return f"Error: {response.text}"

def generate_music_from_text(music_desc, duration=10):
    """使用 Hugging Face MusicGen 本地推理生成音频"""
    print(f"Generating music using MusicGen for: {music_desc}")

    inputs = processor(text=[music_desc], padding=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=duration * 50)  # 计算 token 数量

    # 处理生成的音频数据
    waveform = torch.tensor(outputs[0].cpu())  # 转换为 PyTorch Tensor
    sample_rate = 32000  # MusicGen 生成的音频默认 32kHz 采样率

    # 保存到文件
    audio_path = "generated_music.wav"
    torchaudio.save(audio_path, waveform, sample_rate)

    return audio_path

# ✅ 示例测试
test_scene = "现代餐厅，午餐时段，人流量中等"
music_desc = generate_music_description(test_scene)
print("Generated Music Description:", music_desc)

if "Error" not in music_desc:
    audio_file = generate_music_from_text(music_desc)

    if "Error" not in audio_file:
        # 播放音频
        display(Audio(audio_file))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:  57%|#####7    | 4.60G/8.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/8.04G [00:00<?, ?B/s]

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Generated Music Description: Music: 温馨愉悦的轻音乐，融合了现代电子元素与吉他旋律，营造出轻松舒适的午餐时光。
Generating music using MusicGen for: Music: 温馨愉悦的轻音乐，融合了现代电子元素与吉他旋律，营造出轻松舒适的午餐时光。


  waveform = torch.tensor(outputs[0].cpu())  # 转换为 PyTorch Tensor
