<a href="https://colab.research.google.com/github/yohoobot/works/blob/main/Deezer_abandon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -*- coding: utf-8 -*-
"""
Deezer餐馆场景音乐爬虫：构建"场景描述" - "背景音乐描述"训练样本
新方案：使用 Deezer API 搜索播放列表，下载预览音频并提取详细特征。
无需认证，兼容 Colab/Jupyter。
"""

import requests
import pandas as pd
import time
import os

# ========== 1. 构建关键词列表 ==========
KEYWORDS = [
    "restaurant", "cafe", "coffee shop", "dining",
    "brunch spot", "fine dining", "bistro", "bar",
    "japanese restaurant", "sushi bar", "izakaya",
    "chinese restaurant", "hotpot", "dim sum",
    "korean bbq", "french restaurant", "italian restaurant",
    "mexican restaurant", "indian restaurant", "vegan cafe"
]

# ========== 2. 初始化数据集 & 音频保存文件夹 ==========
data_pairs = []
os.makedirs("preview_audio", exist_ok=True)

# ========== 3. 预先抓取 genre 对照表 ==========
genre_map = {}
genre_resp = requests.get("https://api.deezer.com/genre")
if genre_resp.status_code == 200:
    genre_data = genre_resp.json().get("data", [])
    genre_map = {item["id"]: item["name"] for item in genre_data}

# ========== 4. 遍历关键词，使用 Deezer API 搜索播放列表并提取曲目 ==========
for keyword in KEYWORDS:
    print(f"🔍 正在搜索关键词：{keyword}")
    playlist_search_url = f"https://api.deezer.com/search/playlist?q={keyword}"
    resp = requests.get(playlist_search_url)
    if resp.status_code != 200:
        print(f"⚠️ 播放列表搜索失败：{playlist_search_url}")
        continue

    search_data = resp.json()
    playlists = search_data.get("data", [])
    print(f"📊 找到播放列表数：{len(playlists)}")

    if not playlists:
        print("❌ 没有找到相关播放列表。")
        continue

    for pl in playlists[:3]:  # 每个关键词取前3个播放列表
        playlist_id = pl.get("id")
        playlist_title = pl.get("title")
        playlist_desc = pl.get("description", "")
        print(f"🎵 正在抓取播放列表：{playlist_title}")

        playlist_url = f"https://api.deezer.com/playlist/{playlist_id}"
        playlist_resp = requests.get(playlist_url)
        if playlist_resp.status_code != 200:
            print(f"⚠️ 歌单获取失败：{playlist_url}")
            continue

        playlist_data = playlist_resp.json()
        tracks = playlist_data.get("tracks", {}).get("data", [])
        if not tracks:
            print(f"⚠️ 播放列表无曲目：{playlist_title}")
            continue

        for item in tracks[:5]:  # 每个播放列表取前5首歌
            track_id = item.get("id")
            track_title = item.get("title")
            artist_name = item.get("artist", {}).get("name")
            preview_url = item.get("preview")
            album_title = item.get("album", {}).get("title")
            track_link = item.get("link")

            # 请求 track 详情以获取 bpm、gain、genre 等
            detail_url = f"https://api.deezer.com/track/{track_id}"
            detail_resp = requests.get(detail_url)
            if detail_resp.status_code != 200:
                print(f"⚠️ 获取详情失败：{detail_url}")
                continue

            detail = detail_resp.json()
            bpm = detail.get("bpm")
            gain = detail.get("gain")
            genre_id = detail.get("genre_id")
            genre_name = genre_map.get(genre_id, "unknown")

            if not preview_url:
                print(f"⚠️ 无试听链接：{track_title}")
                continue

            # 下载预览音频
            try:
                audio_data = requests.get(preview_url).content
                filename = f"preview_audio/{track_id}.mp3"
                with open(filename, "wb") as f:
                    f.write(audio_data)
                print(f"🎧 预览音频已保存：{filename}")
            except Exception as e:
                print(f"⚠️ 下载失败：{preview_url}")
                continue

            scene_desc = f"Music from playlist '{playlist_title}' (keyword: '{keyword}'). Description: {playlist_desc}"
            music_info = {
                "track_id": track_id,
                "track_title": track_title,
                "artist": artist_name,
                "deezer_link": track_link,
                "bpm": bpm,
                "gain": gain,
                "genre": genre_name,
                "preview_path": filename
            }
            data_pairs.append({"scene": scene_desc.strip(), **music_info})
            time.sleep(1)  # 请休息 1 秒，避免请求过快

# ========== 5. 保存为CSV ==========
df = pd.DataFrame(data_pairs)
df.to_csv("scene_music_pairs.csv", index=False, encoding="utf-8-sig")
print("✅ 数据抓取完成，保存为 scene_music_pairs.csv")


🔍 正在搜索关键词：restaurant
📊 找到播放列表数：25
🎵 正在抓取播放列表：Restaurant Background Music 🍸 chill covers
🎧 预览音频已保存：preview_audio/3070104201.mp3
🎧 预览音频已保存：preview_audio/2893242971.mp3
🎧 预览音频已保存：preview_audio/128156069.mp3
🎧 预览音频已保存：preview_audio/3149220331.mp3
🎧 预览音频已保存：preview_audio/2898482391.mp3
🎵 正在抓取播放列表：Dinner Lounge 🍸 Cocktail Bar, Restaurant, Café
🎧 预览音频已保存：preview_audio/2893242971.mp3
🎧 预览音频已保存：preview_audio/3070104201.mp3
🎧 预览音频已保存：preview_audio/3149220331.mp3
🎧 预览音频已保存：preview_audio/2715671332.mp3
🎧 预览音频已保存：preview_audio/2934415641.mp3
🎵 正在抓取播放列表：Restaurant Bossa Nova : lounge, apéro bossanova
🎧 预览音频已保存：preview_audio/1161654902.mp3
🎧 预览音频已保存：preview_audio/716910742.mp3
🎧 预览音频已保存：preview_audio/2294518895.mp3
🎧 预览音频已保存：preview_audio/1228436532.mp3
🎧 预览音频已保存：preview_audio/964492122.mp3
🔍 正在搜索关键词：cafe
📊 找到播放列表数：25
🎵 正在抓取播放列表：Café Lounge ☕️ Coffee Chill Covers, Koffie, Kaffee
🎧 预览音频已保存：preview_audio/2873362382.mp3
🎧 预览音频已保存：preview_audio/3070104201.mp3
🎧 预览音频已保存：preview_audio/3130358391.mp3
🎧 预览音频