### Sample subset of the data

In [9]:
from datasets import load_dataset
import pandas as pd

# Load full dataset
ds = load_dataset("google/MusicCaps")["train"]

# Shuffle and sample
sampled = ds.shuffle(seed=42).select(range(120))  # pick 100 random entries

# Convert to DataFrame
df = pd.DataFrame(sampled)

# Save to CSV
df[["ytid", "caption"]].to_csv("sampled_musiccaps.csv", index=False)

### Fetch audio clips

In [None]:
# import pandas as pd
# import os
# import subprocess
# from tqdm import tqdm

# # Load the sample
# df = pd.read_csv("sampled_musiccaps.csv")
# output_dir = "clips"
# os.makedirs(output_dir, exist_ok=True)

# def download_clip(ytid, output_dir):
#     url = f"https://www.youtube.com/watch?v={ytid}"
#     output_path = os.path.join(output_dir, f"{ytid}.mp3")

#     if os.path.exists(output_path):
#         return

#     command = [
#         "yt-dlp",
#         "-x", "--audio-format", "mp3",
#         "--postprocessor-args", "-ss 00:00:00 -t 10",  # first 10 seconds
#         url,
#         "-o", os.path.join(output_dir, f"{ytid}.%(ext)s")
#     ]
    
#     try:
#         subprocess.run(command, check=True)
#     except subprocess.CalledProcessError as e:
#         print(f"Failed to download {ytid}: {e}")

# # Download all clips
# for ytid in tqdm(df["ytid"]):
#     download_clip(ytid, output_dir)

In [10]:
import os
import subprocess
import pandas as pd
from tqdm import tqdm

# ---- Config ----
input_csv = "sampled_musiccaps.csv"
output_csv = "metadata_100.csv"
output_dir = "clips"
target_count = 100
os.makedirs(output_dir, exist_ok=True)

# ---- Load full metadata ----
df = pd.read_csv(input_csv)
valid_rows = []

def download_10s_clip(ytid, out_dir):
    url = f"https://www.youtube.com/watch?v={ytid}"
    out_path = os.path.join(out_dir, f"{ytid}.mp3")
    if os.path.exists(out_path):
        return True

    command = [
        "yt-dlp",
        "-x", "--audio-format", "mp3",
        "--postprocessor-args", "-ss 00:00:00 -t 10",
        url,
        "-o", os.path.join(out_dir, f"{ytid}.%(ext)s")
    ]

    try:
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return os.path.exists(out_path)
    except subprocess.CalledProcessError:
        return False

# ---- Try downloading until 100 clips are available ----
print(f"🚀 Attempting to download 100 valid clips...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    if len(valid_rows) >= target_count:
        break
    ytid = row["ytid"]
    success = download_10s_clip(ytid, output_dir)
    if success:
        valid_rows.append(row)

# ---- Save updated metadata ----
print(f"✅ Successfully downloaded {len(valid_rows)} audio clips.")
pd.DataFrame(valid_rows).to_csv(output_csv, index=False)

🚀 Attempting to download 100 valid clips...


 88%|████████▊ | 105/120 [00:24<00:03,  4.23it/s]

✅ Successfully downloaded 100 audio clips.





### Compute CLAP Similarity

In [14]:
import os
import torchaudio
import torch
import numpy as np
from transformers import ClapProcessor, ClapModel
from scipy.spatial.distance import cosine
from tqdm import tqdm
import pandas as pd

# -------------------------
# Config
# -------------------------
CLIP_DIR = "clips"  # where your downloaded 10s audio clips are
SAMPLE_RATE = 48000  # CLAP expects 48kHz
OUTPUT_CSV = "clap_similarity_results.csv"

# -------------------------
# Load CLAP
# -------------------------
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

# -------------------------
# Load Metadata
# -------------------------
df = pd.read_csv("metadata_100.csv")  # should have columns: ytid, caption

# -------------------------
# CLAP Similarity Computation
# -------------------------
results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    ytid = row["ytid"]
    caption = row["caption"]
    audio_path = None

    # Try both .mp3 and .wav
    for ext in [".mp3", ".wav"]:
        possible_path = os.path.join(CLIP_DIR, f"{ytid}{ext}")
        if os.path.exists(possible_path):
            audio_path = possible_path
            break

    if audio_path is None:
        print(f"⚠️ Audio not found for {ytid}, skipping.")
        continue

    try:
        # Load and resample to 48kHz
        waveform, sr = torchaudio.load(audio_path)

        # Resample if needed
        if sr != SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=SAMPLE_RATE)

        # Convert to mono if stereo
        if waveform.ndim == 2 and waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Make sure it's shape (1, N)
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)

        # Process for CLAP
        waveform_np = waveform.squeeze().numpy() 
        audio_inputs = clap_processor(audios=waveform_np, return_tensors="pt", sampling_rate=SAMPLE_RATE)
        text_inputs = clap_processor(text=[caption], return_tensors="pt")

        with torch.no_grad():
            audio_embed = clap_model.get_audio_features(**audio_inputs).numpy()[0]
            text_embed = clap_model.get_text_features(**text_inputs).numpy()[0]

        similarity = 1 - cosine(audio_embed, text_embed)
        results.append({"ytid": ytid, "caption": caption, "clap_similarity": similarity})
    
    except Exception as e:
        print(f"❌ Error processing {ytid}: {e}")
        continue

# -------------------------
# Save Results
# -------------------------
if results:
    average_clap = np.mean([r['clap_similarity'] for r in results])
    print(f"📊 Average CLAP similarity across {len(results)} examples: {average_clap:.4f}")
    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print("✅ CLAP similarity computation complete. Results saved to:", OUTPUT_CSV)
else:
    print("⚠️ No valid audio processed.")

100%|██████████| 100/100 [00:29<00:00,  3.44it/s]

📊 Average CLAP similarity across 100 examples: 0.4410
✅ CLAP similarity computation complete. Results saved to: clap_similarity_results.csv





### Musicgen

In [2]:
import os
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import ClapProcessor, ClapModel
from scipy.spatial.distance import cosine

from audiocraft.models import MusicGen, MultiBandDiffusion

# -------------------------
# CONFIG
# -------------------------
MODEL_SIZE = 'facebook/musicgen-large'  # Options: musicgen-small, medium, large
USE_DIFFUSION_DECODER = False           # Set True if using MultiBandDiffusion
GEN_DURATION = 10                       # seconds
TOP_K = 250
SAMPLE_RATE = 48000                     # Required for CLAP
OUTPUT_DIR = 'generated_clap_eval'
CSV_INPUT = 'metadata_100.csv'          # Requires 'ytid' and 'caption' columns
CSV_OUTPUT = 'clap_similarity_results_2.csv'

os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------
# LOAD MODELS
# -------------------------
print(f"🔊 Loading MusicGen model: {MODEL_SIZE}")
model = MusicGen.get_pretrained(MODEL_SIZE)
model.set_generation_params(use_sampling=True, top_k=TOP_K, duration=GEN_DURATION)

mbd = None
if USE_DIFFUSION_DECODER:
    print("🎧 Loading MultiBandDiffusion...")
    mbd = MultiBandDiffusion.get_mbd_musicgen()

print("🎼 Loading CLAP model...")
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

# -------------------------
# LOAD DATASET
# -------------------------
df = pd.read_csv(CSV_INPUT)

# -------------------------
# GENERATE + CLAP LOOP
# -------------------------
results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    ytid = row["ytid"]
    prompt = row["caption"]

    try:
        # --- MusicGen Generation ---
        output = model.generate([prompt], progress=False, return_tokens=USE_DIFFUSION_DECODER)
        wav = output[0].cpu()

        if USE_DIFFUSION_DECODER:
            wav = mbd.tokens_to_wav(output[1])[0].cpu()

        # Save audio
        out_path = os.path.join(OUTPUT_DIR, f"{ytid}.wav")
        torchaudio.save(out_path, wav, SAMPLE_RATE)

        # --- CLAP Similarity ---
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        wav_np = wav.squeeze().numpy()

        audio_inputs = clap_processor(audios=wav_np, return_tensors="pt", sampling_rate=SAMPLE_RATE)
        text_inputs = clap_processor(text=[prompt], return_tensors="pt")

        with torch.no_grad():
            audio_embed = clap_model.get_audio_features(**audio_inputs).numpy()[0]
            text_embed = clap_model.get_text_features(**text_inputs).numpy()[0]

        similarity = 1 - cosine(audio_embed, text_embed)
        results.append({
            "ytid": ytid,
            "caption": prompt,
            "clap_similarity": similarity
        })

    except Exception as e:
        print(f"❌ Error with {ytid}: {e}")
        continue

# -------------------------
# SAVE RESULTS
# -------------------------
if results:
    avg_sim = np.mean([r['clap_similarity'] for r in results])
    print(f"📊 Avg CLAP similarity across {len(results)} examples: {avg_sim:.4f}")
    pd.DataFrame(results).to_csv(CSV_OUTPUT, index=False)
    print(f"✅ Done. Results saved to: {CSV_OUTPUT}")
else:
    print("⚠️ No results generated.")

🔊 Loading MusicGen model: facebook/musicgen-large




🎼 Loading CLAP model...


100%|██████████| 100/100 [33:40<00:00, 20.21s/it]

📊 Avg CLAP similarity across 100 examples: 0.4225
✅ Done. Results saved to: clap_similarity_results_2.csv





### Baseline for our curated dataset

In [3]:
import os
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import ClapProcessor, ClapModel
from scipy.spatial.distance import cosine

from audiocraft.models import MusicGen, MultiBandDiffusion

# -------------------------
# CONFIG
# -------------------------
MODEL_SIZE = 'facebook/musicgen-large'  # Options: musicgen-small, medium, large
USE_DIFFUSION_DECODER = False           # Set True if using MultiBandDiffusion
GEN_DURATION = 10                       # seconds
TOP_K = 250
SAMPLE_RATE = 48000                     # Required for CLAP
OUTPUT_DIR = 'generated_clap_eval_curated'
CSV_INPUT = 'prompt_template.csv'          # Requires 'ytid' and 'caption' columns
CSV_OUTPUT = 'clap_similarity_results_curated.csv'

os.makedirs(OUTPUT_DIR, exist_ok=True)

# -------------------------
# LOAD MODELS
# -------------------------
print(f"🔊 Loading MusicGen model: {MODEL_SIZE}")
model = MusicGen.get_pretrained(MODEL_SIZE)
model.set_generation_params(use_sampling=True, top_k=TOP_K, duration=GEN_DURATION)

mbd = None
if USE_DIFFUSION_DECODER:
    print("🎧 Loading MultiBandDiffusion...")
    mbd = MultiBandDiffusion.get_mbd_musicgen()

print("🎼 Loading CLAP model...")
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

# -------------------------
# LOAD DATASET
# -------------------------
df = pd.read_csv(CSV_INPUT)

# -------------------------
# GENERATE + CLAP LOOP
# -------------------------
results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    audio = row["audio"]
    prompt = row["prompt"]

    try:
        # --- MusicGen Generation ---
        output = model.generate([prompt], progress=False, return_tokens=USE_DIFFUSION_DECODER)
        wav = output[0].cpu()

        if USE_DIFFUSION_DECODER:
            wav = mbd.tokens_to_wav(output[1])[0].cpu()

        # Save audio
        out_path = os.path.join(OUTPUT_DIR, f"{audio}.wav")
        torchaudio.save(out_path, wav, SAMPLE_RATE)

        # --- CLAP Similarity ---
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        wav_np = wav.squeeze().numpy()

        audio_inputs = clap_processor(audios=wav_np, return_tensors="pt", sampling_rate=SAMPLE_RATE)
        text_inputs = clap_processor(text=[prompt], return_tensors="pt")

        with torch.no_grad():
            audio_embed = clap_model.get_audio_features(**audio_inputs).numpy()[0]
            text_embed = clap_model.get_text_features(**text_inputs).numpy()[0]

        similarity = 1 - cosine(audio_embed, text_embed)
        results.append({
            "audio": audio,
            "caption": prompt,
            "clap_similarity": similarity
        })

    except Exception as e:
        print(f"❌ Error with {audio}: {e}")
        continue

# -------------------------
# SAVE RESULTS
# -------------------------
if results:
    avg_sim = np.mean([r['clap_similarity'] for r in results])
    print(f"📊 Avg CLAP similarity across {len(results)} examples: {avg_sim:.4f}")
    pd.DataFrame(results).to_csv(CSV_OUTPUT, index=False)
    print(f"✅ Done. Results saved to: {CSV_OUTPUT}")
else:
    print("⚠️ No results generated.")

🔊 Loading MusicGen model: facebook/musicgen-large




🎼 Loading CLAP model...


100%|██████████| 38/38 [12:45<00:00, 20.14s/it]

📊 Avg CLAP similarity across 38 examples: 0.4404
✅ Done. Results saved to: clap_similarity_results_curated.csv





## Training with (MusicGen Frozen - Update the embedding layer)