<a href="https://colab.research.google.com/github/yohoobot/works/blob/main/experiment_yes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate bert_score rouge_score nltk --quiet


In [None]:
# 10
# 80+样本，带歌名未清洗
# musicgen_scene_music_pairs_train.jsonl
# musicgen_scene_music_pairs_test.jsonl
# few-shot k=2
# 1 epoch

# ✅ 安装一次即可
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入所需模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ✅ Load the train/test data (JSON Lines: one JSON object per line).
# Whitespace-only lines (e.g. a stray blank line at the end of the file) are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ API settings
# "sk-" is a placeholder: replace it with your own API key before running.
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt
def build_few_shot_messages(k=2):
    """Return a chat history made of one system message plus k sampled examples.

    Each example is drawn at random (without replacement) from ``train_data``
    and contributes two turns: a user turn ``"Scene: <scene>"`` followed by an
    assistant turn holding the example's ``music`` text.
    """
    shots = random.sample(train_data, k)
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert converting restaurant scene descriptions into music prompts suitable for MusicGen.",
    }
    messages = [system_turn]
    for shot in shots:
        messages.extend((
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        ))
    return messages

# ✅ Call the Qwen API
def generate_music_description(scene_desc):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh few-shot context is built for every call, the scene is appended
    as the final user turn, and the conversation is POSTed to ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages()
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Run inference over the test set
generated = []
for item in tqdm(test_data):
    scene = item["scene"]
    reference = item["music"]
    prediction = generate_music_description(scene)
    generated.append({"scene": scene, "reference": reference, "prediction": prediction})

# ✅ Extract the texts used for evaluation
references = [x["reference"] for x in generated]
predictions = [x["prediction"] for x in generated]

# generate_music_description returns the literal "ERROR" for a failed request.
# Those entries are still scored below, so make them visible instead of letting
# them silently drag the averages down.
failed = predictions.count("ERROR")
if failed:
    print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

# ✅ Evaluation: BLEU (NLTK, sentence level, whitespace tokenization, averaged)
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

# ✅ Evaluation: ROUGE / METEOR / BERTScore / BLEURT
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
bleurt = evaluate.load("bleurt", module_type="metric", checkpoint="bleurt-base-128")

rouge_result = rouge.compute(predictions=predictions, references=references)
meteor_result = meteor.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
bleurt_result = bleurt.compute(predictions=predictions, references=references)

# ✅ Print the evaluation results (BERTScore and BLEURT are averaged over samples)
print("🎯 Evaluation Results:")
print(f"BLEU: {avg_bleu:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")
print(f"BERTScore (F1): {sum(bertscore_result['f1'])/len(bertscore_result['f1']):.4f}")
print(f"BLEURT: {sum(bleurt_result['scores'])/len(bleurt_result['scores']):.4f}")


In [None]:
# 21 32
# 80+样本，已经替换为 this is a，已清洗
# musicgen_scene_music_pairs_train_v2.jsonl
# musicgen_scene_music_pairs_test_v2.jsonl
# few-shot k=4  (NOTE: the code in this cell actually samples k=2 examples per call)
# 1 epoch

# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ✅ Load the v2 dataset (original note: "is a" was replaced with "this is a").
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_train_v2.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_test_v2.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt
def build_few_shot_messages(k=2):
    """Assemble the chat context: a system instruction plus k random examples.

    Examples are sampled without replacement from ``train_data``; each one
    becomes a user turn (``"Scene: <scene>"``) and an assistant turn (the
    example's ``music`` text), in that order.
    """
    example_turns = [
        turn
        for shot in random.sample(train_data, k)
        for turn in (
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        )
    ]
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    return [system_turn] + example_turns

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh 2-example few-shot context is built for every call, the scene is
    appended as the final user turn, and the conversation is POSTed to
    ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k=2)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Generate a prediction for every test sample
results = []
for sample in tqdm(test_data):
    pred = generate_music_description(sample["scene"])
    results.append({
        "scene": sample["scene"],
        "reference": sample["music"],
        "prediction": pred
    })

# ✅ Save the generated outputs
output_path = "/content/qwen_fewshot_outputs_v2.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# ✅ Evaluation
references = [x["reference"] for x in results]
predictions = [x["prediction"] for x in results]

# generate_music_description returns the literal "ERROR" for a failed request.
# Those entries are still scored below, so make them visible instead of letting
# them silently drag the averages down.
failed = predictions.count("ERROR")
if failed:
    print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

# BLEU is computed per sample on whitespace tokens, then averaged.
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)
rouge_result = rouge.compute(predictions=predictions, references=references)
meteor_result = meteor.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

# ✅ Print the evaluation results
print("\n🎯 Evaluation Results (Qwen Few-shot → Music Description):")
print(f"BLEU: {avg_bleu:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")
print(f"BERTScore (F1): {avg_bertscore_f1:.4f}")


In [None]:
# 40
# 185样本，未清洗
# musicgen_scene_music_pairs_all_train.jsonl
# musicgen_scene_music_pairs_all_test.jsonl
# few-shot k=4
# 1 epoch

# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ✅ Load the full dataset (original note: 185 samples, normalized format).
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_all_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_all_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt (4 examples by default)
def build_few_shot_messages(k=4):
    """Return a chat history made of one system message plus k sampled examples.

    Each example is drawn at random (without replacement) from ``train_data``
    and contributes two turns: a user turn ``"Scene: <scene>"`` followed by an
    assistant turn holding the example's ``music`` text.
    """
    shots = random.sample(train_data, k)
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    messages = [system_turn]
    for shot in shots:
        messages.extend((
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        ))
    return messages

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh 4-example few-shot context is built for every call, the scene is
    appended as the final user turn, and the conversation is POSTed to
    ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k=4)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Generate a prediction for every test sample
results = []
for sample in tqdm(test_data):
    pred = generate_music_description(sample["scene"])
    results.append({
        "scene": sample["scene"],
        "reference": sample["music"],
        "prediction": pred
    })

# ✅ Save the generated outputs
output_path = "/content/qwen_fewshot_outputs_full.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# ✅ Evaluation
references = [x["reference"] for x in results]
predictions = [x["prediction"] for x in results]

# generate_music_description returns the literal "ERROR" for a failed request.
# Those entries are still scored below, so make them visible instead of letting
# them silently drag the averages down.
failed = predictions.count("ERROR")
if failed:
    print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

# BLEU is computed per sample on whitespace tokens, then averaged.
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)
rouge_result = rouge.compute(predictions=predictions, references=references)
meteor_result = meteor.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

# ✅ Print the evaluation results
print("\n🎯 Evaluation Results (Qwen Few-shot → Music Description):")
print(f"BLEU: {avg_bleu:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")
print(f"BERTScore (F1): {avg_bertscore_f1:.4f}")


In [None]:
# 50
# 185样本，已清洗
# musicgen_scene_music_pairs_all_cleaned_train.jsonl
# musicgen_scene_music_pairs_all_cleaned_test.jsonl
# few-shot k=4
# 1 epoch


# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ✅ Load the cleaned full dataset (original note: 185 samples, normalized format).
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_all_cleaned_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_all_cleaned_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt (4 examples by default)
def build_few_shot_messages(k=4):
    """Assemble the chat context: a system instruction plus k random examples.

    Examples are sampled without replacement from ``train_data``; each one
    becomes a user turn (``"Scene: <scene>"``) and an assistant turn (the
    example's ``music`` text), in that order.
    """
    example_turns = [
        turn
        for shot in random.sample(train_data, k)
        for turn in (
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        )
    ]
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    return [system_turn] + example_turns

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh 4-example few-shot context is built for every call, the scene is
    appended as the final user turn, and the conversation is POSTed to
    ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k=4)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Generate a prediction for every test sample
results = []
for sample in tqdm(test_data):
    pred = generate_music_description(sample["scene"])
    results.append({
        "scene": sample["scene"],
        "reference": sample["music"],
        "prediction": pred
    })

# ✅ Save the generated outputs
# This cell evaluates the *cleaned* dataset; it previously wrote to
# "/content/qwen_fewshot_outputs_full.json", the same path the uncleaned-data
# cell uses, so running both overwrote the earlier outputs. Use a distinct file.
output_path = "/content/qwen_fewshot_outputs_full_cleaned.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# ✅ Evaluation
references = [x["reference"] for x in results]
predictions = [x["prediction"] for x in results]

# generate_music_description returns the literal "ERROR" for a failed request.
# Those entries are still scored below, so make them visible instead of letting
# them silently drag the averages down.
failed = predictions.count("ERROR")
if failed:
    print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")

# BLEU is computed per sample on whitespace tokens, then averaged.
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)
rouge_result = rouge.compute(predictions=predictions, references=references)
meteor_result = meteor.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

# ✅ Print the evaluation results
print("\n🎯 Evaluation Results (Qwen Few-shot → Music Description):")
print(f"BLEU: {avg_bleu:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")
print(f"BERTScore (F1): {avg_bertscore_f1:.4f}")


In [None]:
# 63
# 185样本，已清洗
# musicgen_scene_music_pairs_all_cleaned_train.jsonl
# musicgen_scene_music_pairs_all_cleaned_test.jsonl
# few-shot k=6
# 5 epoch

# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import matplotlib.pyplot as plt

# ✅ Load the cleaned full dataset (original note: 185 samples, normalized format).
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_all_cleaned_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_all_cleaned_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt (6 examples by default)
def build_few_shot_messages(k=6):
    """Return a chat history made of one system message plus k sampled examples.

    Each example is drawn at random (without replacement) from ``train_data``
    and contributes two turns: a user turn ``"Scene: <scene>"`` followed by an
    assistant turn holding the example's ``music`` text.
    """
    shots = random.sample(train_data, k)
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    messages = [system_turn]
    for shot in shots:
        messages.extend((
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        ))
    return messages

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc, k=6):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh few-shot context is built for every call, the scene is appended
    as the final user turn, and the conversation is POSTed to ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.
        k: Number of few-shot examples, forwarded to ``build_few_shot_messages``.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Multi-round evaluation (5 rounds)
# NOTE: "epochs" counts repeated inference + scoring passes over the test set;
# nothing is trained here. Rounds can differ because generate_music_description
# re-samples its few-shot examples on every call.
epochs = 5
results_by_round = []

# The metric objects and the BLEU smoothing function do not depend on the
# round, so they are created once here instead of being reloaded every round.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
smoothie = SmoothingFunction().method4

for round_idx in range(epochs):
    print(f"\n🚀 Round {round_idx + 1}/{epochs}")
    round_results = []
    for sample in tqdm(test_data):
        pred = generate_music_description(sample["scene"], k=6)
        round_results.append({
            "scene": sample["scene"],
            "reference": sample["music"],
            "prediction": pred
        })

    references = [x["reference"] for x in round_results]
    predictions = [x["prediction"] for x in round_results]

    # generate_music_description returns the literal "ERROR" for a failed
    # request; those entries are still scored, so report how many there were.
    failed = predictions.count("ERROR")
    if failed:
        print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

    # BLEU is computed per sample on whitespace tokens, then averaged.
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    rouge_result = rouge.compute(predictions=predictions, references=references)
    meteor_result = meteor.compute(predictions=predictions, references=references)
    bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
    avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

    results_by_round.append({
        "BLEU": avg_bleu,
        "ROUGE-L": rouge_result["rougeL"],
        "METEOR": meteor_result["meteor"],
        "BERTScore_F1": avg_bertscore_f1
    })

# ✅ Plot each metric across rounds
rounds = list(range(1, epochs + 1))
plt.figure(figsize=(10, 6))
for metric in ["BLEU", "ROUGE-L", "METEOR", "BERTScore_F1"]:
    scores = [res[metric] for res in results_by_round]
    plt.plot(rounds, scores, label=metric)

plt.xlabel("Round")
plt.ylabel("Score")
plt.title("Qwen Few-shot Multi-round Evaluation")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# ✅ Print the per-round scores
# Distinct loop names are used so the global `k` is not rebound by this loop.
for idx, round_scores in enumerate(results_by_round):
    print(f"\n📊 Round {idx + 1} Scores:")
    for metric_name, value in round_scores.items():
        print(f"{metric_name}: {value:.4f}")

In [None]:
# 74
# 185样本，已清洗
# musicgen_scene_music_pairs_all_cleaned_train.jsonl
# musicgen_scene_music_pairs_all_cleaned_test.jsonl
# few-shot k=10
# 5 epoch

# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import matplotlib.pyplot as plt

# ✅ Load the cleaned full dataset (original note: 185 samples, normalized format).
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_all_cleaned_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_all_cleaned_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt (10 examples by default)
def build_few_shot_messages(k=10):
    """Assemble the chat context: a system instruction plus k random examples.

    Examples are sampled without replacement from ``train_data``; each one
    becomes a user turn (``"Scene: <scene>"``) and an assistant turn (the
    example's ``music`` text), in that order.
    """
    example_turns = [
        turn
        for shot in random.sample(train_data, k)
        for turn in (
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        )
    ]
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    return [system_turn] + example_turns

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc, k=10):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh few-shot context is built for every call, the scene is appended
    as the final user turn, and the conversation is POSTed to ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.
        k: Number of few-shot examples, forwarded to ``build_few_shot_messages``.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Multi-round evaluation (5 rounds)
# NOTE: "epochs" counts repeated inference + scoring passes over the test set;
# nothing is trained here. Rounds can differ because generate_music_description
# re-samples its few-shot examples on every call.
epochs = 5
results_by_round = []

# The metric objects and the BLEU smoothing function do not depend on the
# round, so they are created once here instead of being reloaded every round.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
smoothie = SmoothingFunction().method4

for round_idx in range(epochs):
    print(f"\n🚀 Round {round_idx + 1}/{epochs}")
    round_results = []
    for sample in tqdm(test_data):
        pred = generate_music_description(sample["scene"], k=10)
        round_results.append({
            "scene": sample["scene"],
            "reference": sample["music"],
            "prediction": pred
        })

    references = [x["reference"] for x in round_results]
    predictions = [x["prediction"] for x in round_results]

    # generate_music_description returns the literal "ERROR" for a failed
    # request; those entries are still scored, so report how many there were.
    failed = predictions.count("ERROR")
    if failed:
        print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

    # BLEU is computed per sample on whitespace tokens, then averaged.
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    rouge_result = rouge.compute(predictions=predictions, references=references)
    meteor_result = meteor.compute(predictions=predictions, references=references)
    bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
    avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

    results_by_round.append({
        "BLEU": avg_bleu,
        "ROUGE-L": rouge_result["rougeL"],
        "METEOR": meteor_result["meteor"],
        "BERTScore_F1": avg_bertscore_f1
    })

# ✅ Plot each metric across rounds
rounds = list(range(1, epochs + 1))
plt.figure(figsize=(10, 6))
for metric in ["BLEU", "ROUGE-L", "METEOR", "BERTScore_F1"]:
    scores = [res[metric] for res in results_by_round]
    plt.plot(rounds, scores, label=metric)

plt.xlabel("Round")
plt.ylabel("Score")
plt.title("Qwen Few-shot Multi-round Evaluation")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# ✅ Print the per-round scores
# Distinct loop names are used so the global `k` is not rebound by this loop.
for idx, round_scores in enumerate(results_by_round):
    print(f"\n📊 Round {idx + 1} Scores:")
    for metric_name, value in round_scores.items():
        print(f"{metric_name}: {value:.4f}")

In [None]:
# 85
# 185样本，已清洗
# musicgen_scene_music_pairs_all_cleaned_train.jsonl
# musicgen_scene_music_pairs_all_cleaned_test.jsonl
# few-shot k=10
# 10 epoch

# ✅ 安装依赖（首次运行）
# !pip install evaluate bert_score rouge_score nltk --quiet
# !pip install git+https://github.com/google-research/bleurt.git

# ✅ 导入模块
import json
import random
import requests
from tqdm import tqdm
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import matplotlib.pyplot as plt

# ✅ Load the cleaned full dataset (original note: 185 samples, normalized format).
# Files are JSON Lines (one JSON object per line); whitespace-only lines are
# skipped so that json.loads does not raise JSONDecodeError on them.
with open("musicgen_scene_music_pairs_all_cleaned_train.jsonl", "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]
with open("musicgen_scene_music_pairs_all_cleaned_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ✅ Qwen API settings ("sk-" is a placeholder: replace it with your own API key)
QWEN_API_KEY = "sk-"
QWEN_API_URL = "https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation"

# ✅ Build the few-shot prompt (10 examples by default)
def build_few_shot_messages(k=10):
    """Return a chat history made of one system message plus k sampled examples.

    Each example is drawn at random (without replacement) from ``train_data``
    and contributes two turns: a user turn ``"Scene: <scene>"`` followed by an
    assistant turn holding the example's ``music`` text.
    """
    shots = random.sample(train_data, k)
    system_turn = {
        "role": "system",
        "content": "You are a music cognition expert. Convert restaurant scene descriptions into MusicGen-style music prompts. Describe BPM, genre, mood, key, instrumentation.",
    }
    messages = [system_turn]
    for shot in shots:
        messages.extend((
            {"role": "user", "content": f"Scene: {shot['scene']}"},
            {"role": "assistant", "content": shot['music']},
        ))
    return messages

# ✅ Call the Qwen endpoint
def generate_music_description(scene_desc, k=10):
    """Ask the Qwen model for a music prompt matching one scene description.

    A fresh few-shot context is built for every call, the scene is appended
    as the final user turn, and the conversation is POSTed to ``QWEN_API_URL``.

    Args:
        scene_desc: Scene description text to convert.
        k: Number of few-shot examples, forwarded to ``build_few_shot_messages``.

    Returns:
        The ``output.text`` field of the response with surrounding whitespace
        stripped (empty string when that field is missing), or the literal
        string ``"ERROR"`` when the request fails: non-200 status, network
        error or timeout, or a response body that is not valid JSON.
    """
    messages = build_few_shot_messages(k)
    messages.append({"role": "user", "content": f"Scene: {scene_desc}"})
    headers = {
        "Authorization": f"Bearer {QWEN_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "qwen2.5-14b-instruct",
        "input": {"messages": messages},
        "parameters": {"temperature": 0.5, "max_tokens": 150}
    }
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the whole evaluation loop indefinitely.
        response = requests.post(QWEN_API_URL, headers=headers, json=payload, timeout=60)
    except requests.RequestException:
        return "ERROR"
    if response.status_code != 200:
        return "ERROR"
    try:
        body = response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        return "ERROR"
    # "or" guards against explicit nulls for "output" / "text" in the payload.
    return ((body.get("output") or {}).get("text") or "").strip()

# ✅ Multi-round evaluation (10 rounds; the earlier comment said 5, which did
# not match the value below)
# NOTE: "epochs" counts repeated inference + scoring passes over the test set;
# nothing is trained here. Rounds can differ because generate_music_description
# re-samples its few-shot examples on every call.
epochs = 10
results_by_round = []

# The metric objects and the BLEU smoothing function do not depend on the
# round, so they are created once here instead of being reloaded every round.
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
smoothie = SmoothingFunction().method4

for round_idx in range(epochs):
    print(f"\n🚀 Round {round_idx + 1}/{epochs}")
    round_results = []
    for sample in tqdm(test_data):
        pred = generate_music_description(sample["scene"], k=10)
        round_results.append({
            "scene": sample["scene"],
            "reference": sample["music"],
            "prediction": pred
        })

    references = [x["reference"] for x in round_results]
    predictions = [x["prediction"] for x in round_results]

    # generate_music_description returns the literal "ERROR" for a failed
    # request; those entries are still scored, so report how many there were.
    failed = predictions.count("ERROR")
    if failed:
        print(f"⚠️ {failed}/{len(predictions)} generations failed and are scored as the literal 'ERROR'")

    # BLEU is computed per sample on whitespace tokens, then averaged.
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    rouge_result = rouge.compute(predictions=predictions, references=references)
    meteor_result = meteor.compute(predictions=predictions, references=references)
    bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")
    avg_bertscore_f1 = sum(bertscore_result["f1"]) / len(bertscore_result["f1"])

    results_by_round.append({
        "BLEU": avg_bleu,
        "ROUGE-L": rouge_result["rougeL"],
        "METEOR": meteor_result["meteor"],
        "BERTScore_F1": avg_bertscore_f1
    })

# ✅ Plot each metric across rounds
rounds = list(range(1, epochs + 1))
plt.figure(figsize=(10, 6))
for metric in ["BLEU", "ROUGE-L", "METEOR", "BERTScore_F1"]:
    scores = [res[metric] for res in results_by_round]
    plt.plot(rounds, scores, label=metric)

plt.xlabel("Round")
plt.ylabel("Score")
plt.title("Qwen Few-shot Multi-round Evaluation")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# ✅ Print the per-round scores
# Distinct loop names are used so the global `k` is not rebound by this loop.
for idx, round_scores in enumerate(results_by_round):
    print(f"\n📊 Round {idx + 1} Scores:")
    for metric_name, value in round_scores.items():
        print(f"{metric_name}: {value:.4f}")