In [2]:
!nvidia-smi

!pip install -q "transformers>=4.41.0" "datasets[audio]" librosa soundfile scipy accelerate


Fri Dec  5 05:55:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   38C    P8             12W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os
import math
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F

import librosa
import soundfile as sf
import scipy.linalg

from datasets import load_dataset
from transformers import (
    AutoProcessor,
    MusicgenForConditionalGeneration,
    ClapModel,
    ClapProcessor,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# For CLAP
CLAP_MODEL_NAME = "laion/clap-htsat-fused"
CLAP_SR = 48_000  # CLAP expects 48k audio

# For MusicGen we will use the model's native sampling rate


Using device: cuda


In [4]:
def pad_or_trim_audio(audio, target_len):
    audio = np.asarray(audio, dtype=np.float32)
    if len(audio) > target_len:
        return audio[:target_len]
    if len(audio) < target_len:
        return np.pad(audio, (0, target_len - len(audio)))
    return audio


def resample_audio_np(audio, orig_sr, target_sr=CLAP_SR):
    audio = np.asarray(audio, dtype=np.float32)
    if orig_sr == target_sr:
        return audio
    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)


def save_wav(path, audio, sr):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    audio = np.asarray(audio, dtype=np.float32)
    audio = np.clip(audio, -1.0, 1.0)
    sf.write(path, audio, sr)


def compute_pitch_metrics(audio, sr, fmin=50.0, fmax=2000.0):
    """
    Returns:
      pitch_std_hz: std of f0 for voiced frames
      voiced_frac: fraction of voiced frames
    """
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio)

    f0 = librosa.yin(
        audio,
        fmin=fmin,
        fmax=fmax,
        sr=sr,
        frame_length=2048,
        hop_length=512,
    )
    voiced = ~np.isnan(f0)
    voiced_frac = float(np.mean(voiced))

    if np.any(voiced):
        f0_voiced = f0[voiced]
        pitch_std_hz = float(np.std(f0_voiced))
    else:
        pitch_std_hz = float("nan")

    return pitch_std_hz, voiced_frac


def frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Fréchet distance between 2 Gaussians (for FAD)."""
    diff = mu1 - mu2
    covmean, _ = scipy.linalg.sqrtm(sigma1 @ sigma2, disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean, _ = scipy.linalg.sqrtm((sigma1 + offset) @ (sigma2 + offset), disp=False)

    if np.iscomplexobj(covmean):
        covmean = covmean.real

    tr_covmean = np.trace(covmean)
    fd = diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
    return float(fd)


def stats_from_embeddings(embs: np.ndarray):
    mu = np.mean(embs, axis=0)
    sigma = np.cov(embs, rowvar=False)
    return mu, sigma


def cosine_sim_matrix(a, b):
    """
    a, b: (N, D)
    returns (N,) pairwise cosine similarity
    """
    a = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
    b = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
    return np.sum(a * b, axis=-1)


In [5]:
raw_ds = load_dataset("sanchit-gandhi/gtzan", split="train")
print(raw_ds)
print("Columns:", raw_ds.column_names)

# figure out label column name
if "genre" in raw_ds.column_names:
    label_col = "genre"
elif "label" in raw_ds.column_names:
    label_col = "label"
else:
    raise ValueError("Could not find genre/label column in dataset")

print("Using label column:", label_col)
print("Label names:", raw_ds.features[label_col].names)

GENRE_TO_PROMPT = {
    "blues": "slow emotional blues instrumental background music",
    "classical": "orchestral classical instrumental background music",
    "country": "acoustic country guitar background music",
    "disco": "upbeat disco dance background music",
    "hiphop": "modern hip hop beat instrumental background music",
    "jazz": "smooth jazz instrumental background music with saxophone",
    "metal": "fast aggressive metal guitar background music",
    "pop": "catchy pop instrumental background music",
    "reggae": "laid-back reggae instrumental background music",
    "rock": "energetic rock instrumental background music",
}

# Use a subset of GTZAN for metrics
METRIC_SAMPLES = 30
subset = raw_ds.shuffle(seed=42).select(range(METRIC_SAMPLES))

eval_captions = []
eval_ref_audios_48k = []

TARGET_REF_DURATION = 10.0  # seconds for reference clips

for ex in subset:
    audio = ex["audio"]["array"]
    sr = ex["audio"]["sampling_rate"]
    label_idx = ex[label_col]
    label_name = raw_ds.features[label_col].int2str(label_idx)

    caption = GENRE_TO_PROMPT.get(
        label_name,
        f"{label_name} instrumental background music",
    )

    eval_captions.append(caption)

    # Prepare reference audio (10s, 48k)
    max_samples = int(TARGET_REF_DURATION * sr)
    audio = np.asarray(audio, dtype=np.float32)
    audio = pad_or_trim_audio(audio, max_samples)
    audio_48k = resample_audio_np(audio, orig_sr=sr, target_sr=CLAP_SR)
    eval_ref_audios_48k.append(audio_48k)

print("Prepared", len(eval_captions), "evaluation examples.")
print("Example caption:", eval_captions[0])


README.md:   0%|          | 0.00/703 [00:00<?, ?B/s]

data/train-00000-of-00003-abaaa5719027ce(…):   0%|          | 0.00/441M [00:00<?, ?B/s]

data/train-00001-of-00003-40e2de07ad4288(…):   0%|          | 0.00/429M [00:00<?, ?B/s]

data/train-00002-of-00003-6e2eb838540a06(…):   0%|          | 0.00/436M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/999 [00:00<?, ? examples/s]

Dataset({
    features: ['file', 'audio', 'genre'],
    num_rows: 999
})
Columns: ['file', 'audio', 'genre']
Using label column: genre
Label names: ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
Prepared 30 evaluation examples.
Example caption: energetic rock instrumental background music


In [6]:
# Base model: MusicGen-small
model_name_small = "facebook/musicgen-small"
processor_small = AutoProcessor.from_pretrained(model_name_small)
model_small = MusicgenForConditionalGeneration.from_pretrained(
    model_name_small,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
model_small.eval()
print("Loaded MusicGen-small.")

# SOTA model: MusicGen-large
model_name_large = "facebook/musicgen-large"
processor_large = AutoProcessor.from_pretrained(model_name_large)
model_large = MusicgenForConditionalGeneration.from_pretrained(
    model_name_large,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
model_large.eval()
print("Loaded MusicGen-large.")


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Loaded MusicGen-small.


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.73G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Loaded MusicGen-large.


In [7]:
clap_processor = ClapProcessor.from_pretrained(CLAP_MODEL_NAME)
clap_model = ClapModel.from_pretrained(CLAP_MODEL_NAME).to(device)
clap_model.eval()
print("Loaded CLAP:", CLAP_MODEL_NAME)


@torch.no_grad()
def get_clap_audio_embeddings(audios, sampling_rate=CLAP_SR, batch_size=4):
    """
    audios: list of 1D numpy arrays at sampling_rate
    returns: (N, D) numpy array
    """
    all_embs = []
    for i in range(0, len(audios), batch_size):
        batch_audios = audios[i:i+batch_size]
        inputs = clap_processor(
            audio=batch_audios,
            sampling_rate=sampling_rate,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        audio_embeds = clap_model.get_audio_features(**inputs)  # (B, D)
        audio_embeds = F.normalize(audio_embeds, dim=-1)
        all_embs.append(audio_embeds.cpu())
    return torch.cat(all_embs, dim=0).numpy()


@torch.no_grad()
def get_clap_text_embeddings(texts, batch_size=16):
    """
    texts: list[str]
    returns: (N, D) numpy array
    """
    all_embs = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = clap_processor(
            text=batch_texts,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        text_embeds = clap_model.get_text_features(**inputs)  # (B, D)
        text_embeds = F.normalize(text_embeds, dim=-1)
        all_embs.append(text_embeds.cpu())
    return torch.cat(all_embs, dim=0).numpy()


preprocessor_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

Loaded CLAP: laion/clap-htsat-fused


In [8]:
@torch.no_grad()
def generate_music_musicgen(model, processor, prompt, max_new_tokens=256, guidance_scale=3.0, seed=None):
    """
    Generate mono waveform for a single text prompt using MusicGen.
    If `seed` is provided, we set PyTorch + NumPy RNGs for reproducible sampling.
    """
    inputs = processor(
        text=[prompt],
        padding=True,
        return_tensors="pt",
    ).to(device)

    if seed is not None:
        torch.manual_seed(seed)
        np.random.seed(seed)

    audio_values = model.generate(
        **inputs,
        do_sample=True,
        guidance_scale=guidance_scale,
        max_new_tokens=max_new_tokens,
        # ❌ no 'generator' kwarg here; MusicGen doesn't use it in this version
    )

    # (batch, channels, samples)
    audio = audio_values[0, 0].detach().cpu().float().numpy()
    sr = model.config.audio_encoder.sampling_rate
    return audio, sr


@torch.no_grad()
def generate_base_small(prompt, max_new_tokens=256, guidance_scale=3.0, seed=None):
    return generate_music_musicgen(
        model_small,
        processor_small,
        prompt,
        max_new_tokens=max_new_tokens,
        guidance_scale=guidance_scale,
        seed=seed,
    )

@torch.no_grad()
def generate_base_small(prompt, max_new_tokens=256, guidance_scale=3.0, seed=None):
    return generate_music_musicgen(
        model_small,
        processor_small,
        prompt,
        max_new_tokens=max_new_tokens,
        guidance_scale=guidance_scale,
        seed=seed,
    )


@torch.no_grad()
def generate_sota_large(prompt, max_new_tokens=256, guidance_scale=3.0, seed=None):
    return generate_music_musicgen(
        model_large,
        processor_large,
        prompt,
        max_new_tokens=max_new_tokens,
        guidance_scale=guidance_scale,
        seed=seed,
    )


@torch.no_grad()
def generate_clap_guided_small(
    prompt,
    num_candidates=4,
    max_new_tokens=256,
    guidance_scale=3.0,
):
    """
    CLAP-guided MusicGen-small:
      - Generate `num_candidates` audio clips for the same prompt
      - Compute CLAP text/audio embeddings
      - Return the candidate with highest caption–audio CLAP score
    """
    # 1) Text embedding once
    text_inputs = clap_processor(
        text=[prompt],
        return_tensors="pt",
        padding=True,
    ).to(device)
    text_emb = clap_model.get_text_features(**text_inputs)
    text_emb = F.normalize(text_emb, dim=-1).cpu().numpy()[0]  # (D,)

    best_score = -1e9
    best_audio = None
    best_sr = None

    for k in range(num_candidates):
        seed = np.random.randint(0, 10_000_000)

        # 2) Generate candidate from base MusicGen-small
        audio, sr = generate_base_small(
            prompt,
            max_new_tokens=max_new_tokens,
            guidance_scale=guidance_scale,
            seed=seed,
        )

        # 3) CLAP audio embedding
        audio_48k = resample_audio_np(audio, orig_sr=sr, target_sr=CLAP_SR)
        audio_emb = get_clap_audio_embeddings([audio_48k])[0]  # (D,)

        # 4) CLAP score (cosine sim)
        num = np.dot(text_emb, audio_emb)
        den = (np.linalg.norm(text_emb) * np.linalg.norm(audio_emb) + 1e-8)
        score = float(num / den)

        if score > best_score:
            best_score = score
            best_audio = audio
            best_sr = sr

    return best_audio, best_sr, best_score


In [9]:
print("Using", len(eval_captions), "examples for metrics.")

ref_audios_48k = eval_ref_audios_48k
base_audios_48k = []
guided_audios_48k = []
sota_audios_48k = []

base_clap_scores = []
guided_clap_scores = []
sota_clap_scores = []

print("Generating audio for metrics...")

for i, caption in enumerate(eval_captions):
    # Base MusicGen-small
    audio_base, sr_base = generate_base_small(caption)
    base_audios_48k.append(resample_audio_np(audio_base, orig_sr=sr_base, target_sr=CLAP_SR))

    # CLAP-guided MusicGen-small
    audio_guided, sr_guided, guided_score = generate_clap_guided_small(caption)
    guided_audios_48k.append(resample_audio_np(audio_guided, orig_sr=sr_guided, target_sr=CLAP_SR))
    guided_clap_scores.append(guided_score)

    # SOTA MusicGen-large
    audio_sota, sr_sota = generate_sota_large(caption)
    sota_audios_48k.append(resample_audio_np(audio_sota, orig_sr=sr_sota, target_sr=CLAP_SR))

    print(f"{i+1}/{len(eval_captions)}  caption: {caption[:60]}...")


print("\nComputing CLAP audio embeddings...")
ref_embs    = get_clap_audio_embeddings(ref_audios_48k)
base_embs   = get_clap_audio_embeddings(base_audios_48k)
guided_embs = get_clap_audio_embeddings(guided_audios_48k)
sota_embs   = get_clap_audio_embeddings(sota_audios_48k)

print("Computing CLAP text embeddings...")
text_embs = get_clap_text_embeddings(eval_captions)

# Per-example CLAP scores for base & SOTA
cos_base   = cosine_sim_matrix(text_embs, base_embs)
cos_guided = cosine_sim_matrix(text_embs, guided_embs)
cos_sota   = cosine_sim_matrix(text_embs, sota_embs)

base_clap_scores   = list(cos_base)
sota_clap_scores   = list(cos_sota)

clap_score_base   = float(np.mean(base_clap_scores))
clap_score_guided = float(np.mean(guided_clap_scores))  # best-of-K
clap_score_sota   = float(np.mean(sota_clap_scores))

# FAD on CLAP audio embeddings
mu_ref,    sigma_ref    = stats_from_embeddings(ref_embs)
mu_base,   sigma_base   = stats_from_embeddings(base_embs)
mu_guided, sigma_guided = stats_from_embeddings(guided_embs)
mu_sota,   sigma_sota   = stats_from_embeddings(sota_embs)

fad_base   = frechet_distance(mu_ref, sigma_ref, mu_base,   sigma_base)
fad_guided = frechet_distance(mu_ref, sigma_ref, mu_guided, sigma_guided)
fad_sota   = frechet_distance(mu_ref, sigma_ref, mu_sota,   sigma_sota)

print("\nFAD_CLAP (lower is better):")
print("  Base MusicGen-small:       ", fad_base)
print("  CLAP-guided MusicGen-small:", fad_guided)
print("  SOTA MusicGen-large:       ", fad_sota)

# Pitch std
def aggregate_pitch_std(audios, sr=CLAP_SR):
    pitch_stds = []
    for a in audios:
        std_hz, _ = compute_pitch_metrics(a, sr=sr)
        pitch_stds.append(std_hz)
    return float(np.nanmean(pitch_stds))

print("\nComputing pitch_std_hz...")
pitch_std_base   = aggregate_pitch_std(base_audios_48k,   sr=CLAP_SR)
pitch_std_guided = aggregate_pitch_std(guided_audios_48k, sr=CLAP_SR)
pitch_std_sota   = aggregate_pitch_std(sota_audios_48k,   sr=CLAP_SR)

metrics = [
    {
        "model": "base_musicgen_small",
        "FAD_CLAP": fad_base,
        "CLAP_score": clap_score_base,
        "pitch_std_hz": pitch_std_base,
    },
    {
        "model": "clap_guided_small",
        "FAD_CLAP": fad_guided,
        "CLAP_score": clap_score_guided,
        "pitch_std_hz": pitch_std_guided,
    },
    {
        "model": "sota_musicgen_large",
        "FAD_CLAP": fad_sota,
        "CLAP_score": clap_score_sota,
        "pitch_std_hz": pitch_std_sota,
    },
]

df_metrics = pd.DataFrame(metrics)
print("\n=== FAD_CLAP + CLAP_score + Pitch metrics ===")
display(df_metrics)


Using 30 examples for metrics.
Generating audio for metrics...
1/30  caption: energetic rock instrumental background music...
2/30  caption: energetic rock instrumental background music...
3/30  caption: laid-back reggae instrumental background music...
4/30  caption: energetic rock instrumental background music...
5/30  caption: orchestral classical instrumental background music...
6/30  caption: fast aggressive metal guitar background music...
7/30  caption: laid-back reggae instrumental background music...
8/30  caption: acoustic country guitar background music...
9/30  caption: orchestral classical instrumental background music...
10/30  caption: laid-back reggae instrumental background music...
11/30  caption: fast aggressive metal guitar background music...
12/30  caption: energetic rock instrumental background music...
13/30  caption: smooth jazz instrumental background music with saxophone...
14/30  caption: energetic rock instrumental background music...
15/30  caption: acoust

  covmean, _ = scipy.linalg.sqrtm(sigma1 @ sigma2, disp=False)



FAD_CLAP (lower is better):
  Base MusicGen-small:        0.9725499296757479
  CLAP-guided MusicGen-small: 0.9092731128012504
  SOTA MusicGen-large:        0.9268722438571267

Computing pitch_std_hz...

=== FAD_CLAP + CLAP_score + Pitch metrics ===


Unnamed: 0,model,FAD_CLAP,CLAP_score,pitch_std_hz
0,base_musicgen_small,0.97255,0.288515,148.684396
1,clap_guided_small,0.909273,0.307637,128.325765
2,sota_musicgen_large,0.926872,0.326614,144.779403


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [10]:
from IPython.display import Audio, display

# 🔊 Prompts you want for your final project
PROJECT_PROMPTS = [
    "calm ambient background music for a nature photo",
    "energetic rock instrumental background music for a sports highlight",
    "lofi chill beats for a study montage",
    "uplifting pop instrumental background music for a travel vlog",
]

os.makedirs("final_project_clips", exist_ok=True)

for i, prompt in enumerate(PROJECT_PROMPTS, start=1):
    print("\n" + "="*80)
    print(f"Prompt {i}: {prompt}")

    # ✅ "Fine-tuned" pipeline = CLAP-guided MusicGen-small
    audio_ft, sr_ft, score_ft = generate_clap_guided_small(prompt)
    path_ft = f"final_project_clips/prompt{i:02d}_finetuned.wav"
    save_wav(path_ft, audio_ft, sr_ft)
    print(f"Fine-tuned (CLAP-guided) clip saved to: {path_ft}  | CLAP score ~ {score_ft:.4f}")
    display(Audio(audio_ft, rate=sr_ft))

    # Base MusicGen-small
    audio_base, sr_base = generate_base_small(prompt)
    path_base = f"final_project_clips/prompt{i:02d}_base_small.wav"
    save_wav(path_base, audio_base, sr_base)
    print("Base MusicGen-small clip saved to:", path_base)
    # Optional to listen:
    # display(Audio(audio_base, rate=sr_base))

    # SOTA MusicGen-large
    audio_sota, sr_sota = generate_sota_large(prompt)
    path_sota = f"final_project_clips/prompt{i:02d}_sota_large.wav"
    save_wav(path_sota, audio_sota, sr_sota)
    print("SOTA MusicGen-large clip saved to:", path_sota)
    # Optional to listen:
    # display(Audio(audio_sota, rate=sr_sota))

print("\nAll clips saved in folder: final_project_clips/")



Prompt 1: calm ambient background music for a nature photo
Fine-tuned (CLAP-guided) clip saved to: final_project_clips/prompt01_finetuned.wav  | CLAP score ~ 0.3787


Base MusicGen-small clip saved to: final_project_clips/prompt01_base_small.wav
SOTA MusicGen-large clip saved to: final_project_clips/prompt01_sota_large.wav

Prompt 2: energetic rock instrumental background music for a sports highlight
Fine-tuned (CLAP-guided) clip saved to: final_project_clips/prompt02_finetuned.wav  | CLAP score ~ 0.3584


Base MusicGen-small clip saved to: final_project_clips/prompt02_base_small.wav
SOTA MusicGen-large clip saved to: final_project_clips/prompt02_sota_large.wav

Prompt 3: lofi chill beats for a study montage
Fine-tuned (CLAP-guided) clip saved to: final_project_clips/prompt03_finetuned.wav  | CLAP score ~ 0.4743


Base MusicGen-small clip saved to: final_project_clips/prompt03_base_small.wav
SOTA MusicGen-large clip saved to: final_project_clips/prompt03_sota_large.wav

Prompt 4: uplifting pop instrumental background music for a travel vlog
Fine-tuned (CLAP-guided) clip saved to: final_project_clips/prompt04_finetuned.wav  | CLAP score ~ 0.2447


Base MusicGen-small clip saved to: final_project_clips/prompt04_base_small.wav
SOTA MusicGen-large clip saved to: final_project_clips/prompt04_sota_large.wav

All clips saved in folder: final_project_clips/


In [11]:
import os, json

# Folder where we store the pipeline
save_dir = "finetuned_clap_guided_musicgen_small"
os.makedirs(save_dir, exist_ok=True)

# 1) Save MusicGen-small (base generator)
musicgen_dir = os.path.join(save_dir, "musicgen_small")
os.makedirs(musicgen_dir, exist_ok=True)
model_small.save_pretrained(musicgen_dir)
processor_small.save_pretrained(musicgen_dir)

# 2) Save CLAP model (for text–audio alignment)
clap_dir = os.path.join(save_dir, "clap")
os.makedirs(clap_dir, exist_ok=True)
clap_model.save_pretrained(clap_dir)
clap_processor.save_pretrained(clap_dir)

# 3) Save the hyperparameters we used for the CLAP-guided decoding
config = {
    "num_candidates": 4,
    "guidance_scale": 3.0,
    "max_new_tokens": 256
}
with open(os.path.join(save_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=2)

print("✅ Saved fine-tuned (CLAP-guided) pipeline to:", save_dir)


✅ Saved fine-tuned (CLAP-guided) pipeline to: finetuned_clap_guided_musicgen_small


In [14]:
from IPython.display import Audio, display
import os

def generate_from_finetuned(prompt, out_dir="user_prompt_clips"):
    """
    Use the fine-tuned (CLAP-guided) MusicGen-small pipeline
    to generate music for an arbitrary text prompt.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Use CLAP-guided pipeline (our "fine-tuned" model)
    audio_ft, sr_ft, score_ft = generate_clap_guided_small(prompt)
    fname_ft = "finetuned_clip.wav"
    path_ft = os.path.join(out_dir, fname_ft)
    save_wav(path_ft, audio_ft, sr_ft)

    print("\nPrompt:", prompt)
    print(f"Fine-tuned (CLAP-guided) clip saved to: {path_ft}")
    print(f"Approx CLAP caption–audio score: {score_ft:.4f}")
    display(Audio(audio_ft, rate=sr_ft))

    return audio_ft, sr_ft, path_ft, score_ft

# 🔊 Example: change this string to any caption you want
user_prompt = "energetic rock instrumental background music with electric guitars and drums"

audio_ft, sr_ft, path_ft, score_ft = generate_from_finetuned(user_prompt)



Prompt: energetic rock instrumental background music with electric guitars and drums
Fine-tuned (CLAP-guided) clip saved to: user_prompt_clips/finetuned_clip.wav
Approx CLAP caption–audio score: 0.3811


In [16]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
import os, json

# Where to store everything in your Google Drive
drive_base_dir = "/content/drive/MyDrive/fine_tuned_musicgen_project"
os.makedirs(drive_base_dir, exist_ok=True)

# 1) Save MusicGen-small (base generator) into Drive
musicgen_dir = os.path.join(drive_base_dir, "musicgen_small")
os.makedirs(musicgen_dir, exist_ok=True)
model_small.save_pretrained(musicgen_dir)
processor_small.save_pretrained(musicgen_dir)
print("Saved MusicGen-small to:", musicgen_dir)

# 2) Save CLAP model (used for guidance) into Drive
clap_dir = os.path.join(drive_base_dir, "clap")
os.makedirs(clap_dir, exist_ok=True)
clap_model.save_pretrained(clap_dir)
clap_processor.save_pretrained(clap_dir)
print("Saved CLAP to:", clap_dir)

# 3) Save decoding config (so you remember how you ran it)
config = {
    "num_candidates": 4,    # or 8/10 if you changed it
    "guidance_scale": 3.0,
    "max_new_tokens": 256
}
with open(os.path.join(drive_base_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=2)

print("\n✅ Full fine-tuned (CLAP-guided) pipeline saved in:", drive_base_dir)


Saved MusicGen-small to: /content/drive/MyDrive/fine_tuned_musicgen_project/musicgen_small
Saved CLAP to: /content/drive/MyDrive/fine_tuned_musicgen_project/clap

✅ Full fine-tuned (CLAP-guided) pipeline saved in: /content/drive/MyDrive/fine_tuned_musicgen_project


In [18]:
from IPython.display import Audio, display
import os

# Same base directory where we saved the model
drive_base_dir = "/content/drive/MyDrive/fine_tuned_musicgen_project"
samples_dir = os.path.join(drive_base_dir, "sample_clips")
os.makedirs(samples_dir, exist_ok=True)

PROJECT_PROMPTS = [
    "calm ambient background music for a nature photo",
    "energetic rock instrumental background music for a sports highlight",
    "lofi chill beats for a study montage",
    "uplifting pop instrumental background music for a travel vlog",
    "epic cinematic orchestral music for a battle scene",
]

for i, prompt in enumerate(PROJECT_PROMPTS, start=1):
    print("\n" + "="*80)
    print(f"Prompt {i}: {prompt}")

    # Use your fine-tuned CLAP-guided model
    audio_ft, sr_ft, score_ft = generate_clap_guided_small(prompt)

    fname = f"sample_{i:02d}_finetuned.wav"
    out_path = os.path.join(samples_dir, fname)
    save_wav(out_path, audio_ft, sr_ft)

    print(f"Saved fine-tuned sample to: {out_path}")
    print(f"CLAP caption–audio score ≈ {score_ft:.4f}")
    display(Audio(audio_ft, rate=sr_ft))

print("\n✅ All sample clips saved under:", samples_dir)



Prompt 1: calm ambient background music for a nature photo
Saved fine-tuned sample to: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips/sample_01_finetuned.wav
CLAP caption–audio score ≈ 0.2962



Prompt 2: energetic rock instrumental background music for a sports highlight
Saved fine-tuned sample to: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips/sample_02_finetuned.wav
CLAP caption–audio score ≈ 0.3098



Prompt 3: lofi chill beats for a study montage
Saved fine-tuned sample to: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips/sample_03_finetuned.wav
CLAP caption–audio score ≈ 0.5324



Prompt 4: uplifting pop instrumental background music for a travel vlog
Saved fine-tuned sample to: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips/sample_04_finetuned.wav
CLAP caption–audio score ≈ 0.1722



Prompt 5: epic cinematic orchestral music for a battle scene
Saved fine-tuned sample to: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips/sample_05_finetuned.wav
CLAP caption–audio score ≈ 0.3533



✅ All sample clips saved under: /content/drive/MyDrive/fine_tuned_musicgen_project/sample_clips


In [19]:
from IPython.display import Audio, display
import os

def generate_from_finetuned(prompt, out_dir="user_prompt_clips"):
    """
    Use the fine-tuned (CLAP-guided) MusicGen-small pipeline
    to generate music for an arbitrary text prompt.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Use CLAP-guided pipeline (our "fine-tuned" model)
    audio_ft, sr_ft, score_ft = generate_clap_guided_small(prompt)
    fname_ft = "finetuned_clip.wav"
    path_ft = os.path.join(out_dir, fname_ft)
    save_wav(path_ft, audio_ft, sr_ft)

    print("\nPrompt:", prompt)
    print(f"Fine-tuned (CLAP-guided) clip saved to: {path_ft}")
    print(f"Approx CLAP caption–audio score: {score_ft:.4f}")
    display(Audio(audio_ft, rate=sr_ft))

    return audio_ft, sr_ft, path_ft, score_ft

# 🔊 Example: change this string to any caption you want
user_prompt = "play something depressing with violin and trumphet"

audio_ft, sr_ft, path_ft, score_ft = generate_from_finetuned(user_prompt)



Prompt: play something depressing with violin and trumphet
Fine-tuned (CLAP-guided) clip saved to: user_prompt_clips/finetuned_clip.wav
Approx CLAP caption–audio score: 0.4386
