# 01c — MTG Jamendo downloader
Download and preprocessing pipeline for the MTG Jamendo dataset.
Run the setup notebook first so system/Python deps and the AudioCraft repo are available.


In [3]:
from pathlib import Path
import os

# --- Workspace layout --------------------------------------------------------
BASE_DIR = Path("/root/workspace")
AUDIOCRAFT_REPO_DIR = BASE_DIR / "audiocraft"
EXPERIMENTS_DIR = BASE_DIR / "experiments" / "audiocraft"

# Everything this notebook reads or writes for the dataset lives under DATA_DIR.
DATA_DIR = BASE_DIR / "data" / "mtg_jamendo"
RAW_DIR = DATA_DIR / "raw"
WAV_DIR = DATA_DIR / "wav_32k_mono"
SEGMENTS_DIR = DATA_DIR / "segments_10s"
MANIFEST_DIR = DATA_DIR / "manifests"
EGS_TRAIN = DATA_DIR / "egs" / "train"
EGS_VALID = DATA_DIR / "egs" / "valid"

# --- Preprocessing parameters ------------------------------------------------
SEGMENT_SECONDS = 10
TARGET_SR = 32000
CHANNELS = 1
TRAIN_RATIO = 0.9
RANDOM_SEED = 42
NUM_SAMPLES_TOTAL = None  # adjust to control how many source tracks to keep

# --- Download sources --------------------------------------------------------
# Candidate archive URLs, tried in order; unset entries (None / "") are dropped.
_candidate_archive_urls = (
    os.environ.get("MTG_JAMENDO_ARCHIVE_URL"),
    # Add MTG Jamendo download URLs here if available
)
MTG_JAMENDO_ARCHIVE_URLS = list(filter(None, _candidate_archive_urls))

# Create every working directory up front so later cells can assume they exist.
_working_dirs = (
    DATA_DIR,
    RAW_DIR,
    WAV_DIR,
    SEGMENTS_DIR,
    MANIFEST_DIR,
    EGS_TRAIN,
    EGS_VALID,
    EXPERIMENTS_DIR,
)
for working_dir in _working_dirs:
    working_dir.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("Using URLs (in order):", MTG_JAMENDO_ARCHIVE_URLS)

BASE_DIR: /root/workspace
Using URLs (in order): []


## 6) Download MTG Jamendo dataset
Uses audio files already present under `data/mtg_jamendo` if there are any; otherwise downloads the MTG Jamendo archive and extracts its audio files.


In [5]:
import zipfile
import subprocess
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import os

# MTG Jamendo data is organized in subdirectories (00-99).
# Reuse audio that is already on disk under data/mtg_jamendo before downloading anything.
existing_audio_root = BASE_DIR / "data" / "mtg_jamendo"
audio_extensions = ["*.mp3", "*.wav", "*.flac", "*.ogg"]

# Directories this pipeline writes into (wav_32k_mono, segments_10s, etc.).
# Audio found there is pipeline output or a raw extraction, never a source track.
output_dirs = {WAV_DIR, SEGMENTS_DIR, MANIFEST_DIR, EGS_TRAIN, EGS_VALID, RAW_DIR}

# Walk the tree once and prune the output directories while descending.
# Globbing everything and filtering afterwards would traverse every generated
# WAV and segment once per extension on each re-run of this cell.
audio_suffixes = tuple(pattern.lstrip("*") for pattern in audio_extensions)
source_audio_files = []
for dirpath, dirnames, filenames in os.walk(existing_audio_root):
    current_dir = Path(dirpath)
    # In-place edit of dirnames tells os.walk not to descend into these directories.
    dirnames[:] = [d for d in dirnames if current_dir / d not in output_dirs]
    source_audio_files.extend(
        current_dir / filename
        for filename in filenames
        if filename.endswith(audio_suffixes)
    )

if source_audio_files:
    print(f"Found {len(source_audio_files)} existing audio files in subdirectories")
    print("Using existing files; skipping download.")
    source_audio_files = sorted(source_audio_files)

    # Apply NUM_SAMPLES_TOTAL limit if specified
    if NUM_SAMPLES_TOTAL:
        source_audio_files = source_audio_files[:NUM_SAMPLES_TOTAL]
        print(f"Limited to {len(source_audio_files)} files (NUM_SAMPLES_TOTAL={NUM_SAMPLES_TOTAL})")
else:
    # No usable audio on disk: fetch the archive (if missing) and extract it under RAW_DIR.
    archive_path = RAW_DIR / "mtg_jamendo.zip"
    archive_audio_suffixes = (".mp3", ".wav", ".flac", ".ogg")

    def aria2_download(url: str, out_path: Path):
        """Download ``url`` to ``out_path`` with aria2c.

        The data is written to ``<out_path>.part`` and renamed to ``out_path``
        only after aria2c exits successfully, so an interrupted or failed
        download can never be mistaken for a complete archive.

        Raises:
            subprocess.CalledProcessError: aria2c exited with a non-zero status.
            OSError: aria2c could not be started or the rename failed.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = out_path.with_name(out_path.name + ".part")
        cmd = [
            "aria2c",
            "-x", "16",          # connections per server
            "-s", "16",          # split count
            "-k", "1M",          # chunk size
            "--file-allocation=none",
            "--allow-overwrite=true",
            "-o", partial_path.name,
            "-d", str(out_path.parent),
            url,
        ]
        print("Running:", " ".join(cmd))
        subprocess.run(cmd, check=True)
        partial_path.replace(out_path)

    def extract_member(args):
        """Extract one archive member; return True if written, False if it already existed.

        Args:
            args: ``(archive_path, member, raw_dir)`` tuple, packed so the
                function can be used directly with ``executor.map``.

        Raises:
            ValueError: the member name would resolve outside ``raw_dir``.
        """
        import shutil  # local import: only this helper needs it

        archive_path, member, raw_dir = args
        dest = raw_dir / member
        # Guard against "zip slip": a crafted member name such as "../../x.mp3"
        # must not be allowed to write outside the extraction root.
        if Path(raw_dir).resolve() not in dest.resolve().parents:
            raise ValueError(f"Unsafe archive member path: {member}")
        if dest.exists():
            return False
        dest.parent.mkdir(parents=True, exist_ok=True)
        # Stream into a temporary name and rename at the end, so an interrupted
        # run does not leave a truncated file that the next run would skip.
        partial = dest.with_name(dest.name + ".part")
        # Each call opens its own ZipFile so worker threads never share a handle.
        with zipfile.ZipFile(archive_path) as zf:
            with zf.open(member) as src, open(partial, "wb") as dst:
                shutil.copyfileobj(src, dst)
        partial.replace(dest)
        return True

    if not archive_path.exists():
        if not MTG_JAMENDO_ARCHIVE_URLS:
            raise RuntimeError("Provide at least one MTG Jamendo archive URL (set MTG_JAMENDO_ARCHIVE_URL)")
        last_err = None
        for url in MTG_JAMENDO_ARCHIVE_URLS:
            try:
                print(f"Downloading {url} → {archive_path} (aria2c multi-conn)")
                aria2_download(url, archive_path)
                break
            except Exception as e:  # noqa: BLE001 - best effort: fall through to the next URL
                last_err = e
                print(f"Failed {url}: {e}")
        else:
            # Loop finished without `break`: every URL failed.
            raise RuntimeError(f"Download failed; last error: {last_err}") from last_err

    # Only the member list needs the archive open here; workers reopen it themselves.
    with zipfile.ZipFile(archive_path) as zf:
        members = sorted(m for m in zf.namelist() if m.endswith(archive_audio_suffixes))
    if NUM_SAMPLES_TOTAL:
        members = members[:NUM_SAMPLES_TOTAL]
    print(f"Extracting {len(members)} audio files from archive (parallel)…")

    # Prepare arguments for parallel extraction
    extract_args = [(archive_path, member, RAW_DIR) for member in members]

    # Use ThreadPoolExecutor for parallel extraction
    max_workers = min(os.cpu_count() or 4, 48)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(extract_member, extract_args),
            total=len(extract_args),
            desc="Extracting",
            unit=" files"
        ))

    extracted_count = sum(results)
    print(f"Extracted {extracted_count} new files, {len(results) - extracted_count} already existed")

    # Members are extracted to RAW_DIR / <member path>, so scan RAW_DIR itself
    # rather than assuming the archive has a top-level "mtg_jamendo/" folder.
    source_audio_files = []
    for ext in audio_extensions:
        source_audio_files.extend(RAW_DIR.rglob(ext))
    source_audio_files = sorted(source_audio_files)
    # Keep the limit consistent with the "existing files" branch, even when an
    # earlier run extracted more files than the current NUM_SAMPLES_TOTAL.
    if NUM_SAMPLES_TOTAL:
        source_audio_files = source_audio_files[:NUM_SAMPLES_TOTAL]

print("Audio files ready:", len(source_audio_files))
print("Sample files from subdirectories:")
for name in source_audio_files[:5]:
    print(" •", name)

Found 55701 existing audio files in subdirectories
Using existing files; skipping download.


KeyboardInterrupt: 

## 7) Convert to mono 32k wav
Uses ffmpeg; idempotent if WAVs already exist.


In [6]:
import subprocess
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

WAV_DIR.mkdir(parents=True, exist_ok=True)

def convert_to_wav(src):
    """Convert one source audio file to a WAV in ``WAV_DIR`` using ffmpeg.

    The output uses ``CHANNELS`` channels at ``TARGET_SR`` Hz and is named
    after the source file's stem.

    Returns:
        ``"skip"`` if the target WAV already exists, ``"success"`` after a
        successful conversion, or ``("error", src)`` if ffmpeg exited with a
        non-zero status.
    """
    dst = WAV_DIR / f"{src.stem}.wav"
    if dst.exists():
        return "skip"
    # Convert into a ".part" file and rename on success. A conversion that is
    # interrupted or fails therefore never leaves a truncated "<stem>.wav"
    # that a later run would treat as already converted.
    partial = WAV_DIR / f"{src.stem}.wav.part"
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-nostdin",          # never wait on / consume the notebook's stdin
        "-y",                # overwrite a stale .part left by an earlier run
        "-i", str(src),
        "-ac", str(CHANNELS),
        "-ar", str(TARGET_SR),
        "-map_metadata", "-1",
        "-vn",
        "-f", "wav",         # format must be explicit: ".part" has no known extension
        str(partial),
    ]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError:
        # If conversion fails, drop the partial output and report the file
        # so the caller can list it (likely a corrupted source).
        if partial.exists():
            partial.unlink()
        return ("error", src)
    partial.replace(dst)
    return "success"

# Fan the conversions out over a thread pool; the heavy lifting happens in ffmpeg subprocesses.
max_workers = min(os.cpu_count() or 4, 32)
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    progress = tqdm(
        pool.map(convert_to_wav, source_audio_files),
        total=len(source_audio_files),
        desc="Converting to WAV",
        unit=" files",
    )
    results = list(progress)

# Each result is "success", "skip", or an ("error", src) tuple.
converted_count = results.count("success")
skipped_count = results.count("skip")
errors = []
for outcome in results:
    if isinstance(outcome, tuple) and outcome[0] == "error":
        errors.append(outcome[1])

print(f"Converted {converted_count} new files, {skipped_count} already existed")
if errors:
    print(f"WARNING: {len(errors)} files failed to convert (likely corrupted):")
    shown_errors = errors[:10]  # Show first 10 errors
    for err_file in shown_errors:
        print(f"  ✗ {err_file}")
    hidden_count = len(errors) - len(shown_errors)
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more")

# Re-list from disk so the count reflects what is actually available downstream.
wav_files = sorted(WAV_DIR.glob("*.wav"))
print("WAV files ready:", len(wav_files))
for wav_path in wav_files[:5]:
    print(" •", wav_path)

Converting to WAV: 100%|██████████| 55701/55701 [33:11<00:00, 27.97 files/s]  


Converted 55701 new files, 0 already existed
WAV files ready: 55701
 • /root/workspace/data/mtg_jamendo/wav_32k_mono/1000082.low.wav
 • /root/workspace/data/mtg_jamendo/wav_32k_mono/1000083.low.wav
 • /root/workspace/data/mtg_jamendo/wav_32k_mono/1000084.low.wav
 • /root/workspace/data/mtg_jamendo/wav_32k_mono/1000085.low.wav
 • /root/workspace/data/mtg_jamendo/wav_32k_mono/1000086.low.wav


## 8) Segment into 10s chunks
Segments each WAV into clips of `SEGMENT_SECONDS` seconds; the last clip of a track may be shorter.


In [None]:
import subprocess
import random
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
wav_files = sorted(WAV_DIR.glob("*.wav"))

def segment_wav(wav):
    """Split one WAV into ``SEGMENT_SECONDS``-second clips in ``SEGMENTS_DIR``.

    Clips are named ``<stem>_seg_NNN.wav``.

    Returns:
        True if ffmpeg was run for this file, False if it was already segmented.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    # Check for the first clip directly instead of globbing SEGMENTS_DIR.
    # A glob rescans the entire (very large) segment directory for every
    # input file. Relies on ffmpeg's segment numbering starting at 000.
    first_segment = SEGMENTS_DIR / f"{wav.stem}_seg_000.wav"
    if first_segment.exists():
        return False
    pattern = SEGMENTS_DIR / f"{wav.stem}_seg_%03d.wav"
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-nostdin",          # never wait on / consume the notebook's stdin
        "-y",                # overwrite stray clips instead of prompting
        "-i", str(wav),
        "-f", "segment",
        "-segment_time", str(SEGMENT_SECONDS),
        "-reset_timestamps", "1",
        "-map_metadata", "-1",
        str(pattern),
    ]
    subprocess.run(cmd, check=True)
    return True

# Segment in parallel; each worker thread just waits on its own ffmpeg subprocess.
max_workers = min(os.cpu_count() or 4, 32)
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    progress = tqdm(
        pool.map(segment_wav, wav_files),
        total=len(wav_files),
        desc="Segmenting WAVs",
        unit=" files",
    )
    results = list(progress)

# segment_wav returns True for newly segmented files, so summing counts them.
segmented_count = sum(results)
already_segmented = len(results) - segmented_count
print(f"Segmented {segmented_count} new files, {already_segmented} already existed")

segments = sorted(SEGMENTS_DIR.glob("*.wav"))
print("Total segments:", len(segments))

def probe_duration(path: Path) -> float:
    """Return the duration of ``path`` in seconds as reported by ffprobe.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        ValueError: ffprobe's output could not be parsed as a float.
    """
    ffprobe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        # Print the bare value only (no section wrapper, no "duration=" key).
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(path),
    ]
    completed = subprocess.run(ffprobe_cmd, capture_output=True, text=True, check=True)
    duration_text = completed.stdout.strip()
    return float(duration_text)

# Spot-check a handful of randomly chosen clips (at most 5) by printing their durations.
sample_check = []
if segments:
    sample_check = random.sample(segments, k=min(5, len(segments)))
for clip in sample_check:
    print(clip.name, "→", round(probe_duration(clip), 3), "s")

## 9) Create train/valid manifests
Deterministic split using `RANDOM_SEED` and `TRAIN_RATIO`.


In [None]:
import json
import random
from concurrent.futures import ThreadPoolExecutor

segments = sorted(SEGMENTS_DIR.glob("*.wav"))

# Group clips by their source track ("<track>_seg_NNN.wav" -> "<track>") and
# split at track level. A clip-level split would put clips of the same track in
# both train and valid, which leaks validation material into training.
segments_by_track = {}
for segment_path in segments:
    track_id = segment_path.stem.rsplit("_seg_", 1)[0]
    segments_by_track.setdefault(track_id, []).append(segment_path)

# A dedicated RNG keeps the split reproducible without reseeding the global
# `random` state that other cells may rely on.
split_rng = random.Random(RANDOM_SEED)
track_ids = sorted(segments_by_track)
split_rng.shuffle(track_ids)

split_idx = int(len(track_ids) * TRAIN_RATIO)
# Always leave at least one track for validation when any track exists, rather
# than copying a training clip into the validation split.
split_idx = max(min(split_idx, len(track_ids) - 1), 0)

train_files = [seg for track in track_ids[:split_idx] for seg in segments_by_track[track]]
valid_files = [seg for track in track_ids[split_idx:] for seg in segments_by_track[track]]
# Shuffle within each split so clips of one track are not written contiguously.
split_rng.shuffle(train_files)
split_rng.shuffle(valid_files)

MANIFEST_DIR.mkdir(parents=True, exist_ok=True)

def write_manifest(split_name, files):
    """Write ``<split_name>.jsonl`` into ``MANIFEST_DIR`` with one ``{"path": ...}`` object per line.

    Returns:
        ``(split_name, number_of_files)``.
    """
    manifest_path = MANIFEST_DIR / f"{split_name}.jsonl"
    manifest_lines = (json.dumps({"path": str(p)}) + "\n" for p in files)
    with open(manifest_path, "w") as handle:
        handle.writelines(manifest_lines)
    return split_name, len(files)

# The two manifests are independent files, so write them concurrently.
manifest_jobs = (("train", train_files), ("valid", valid_files))
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [
        executor.submit(write_manifest, split_name, split_files)
        for split_name, split_files in manifest_jobs
    ]
    # .result() re-raises any exception that occurred inside a worker.
    results = [future.result() for future in futures]

print("Train/valid counts:", len(train_files), len(valid_files))
print("Sample manifest line:")
train_manifest_text = (MANIFEST_DIR / "train.jsonl").read_text()
print(train_manifest_text.splitlines()[:1])

## 10) Create `egs/train` and `egs/valid` symlinks


In [None]:
import os

# Mirror each split into its egs directory as symlinks pointing at the segment files.
for dest_dir, files in ((EGS_TRAIN, train_files), (EGS_VALID, valid_files)):
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Remove links left by an earlier run whose split differed. Otherwise a clip
    # can end up linked from both egs/train and egs/valid, and the directory
    # counts no longer match the current split.
    wanted_names = {path.name for path in files}
    for entry in dest_dir.glob("*.wav"):
        if entry.is_symlink() and entry.name not in wanted_names:
            entry.unlink()

    for path in files:
        link = dest_dir / path.name
        # is_symlink() also catches dangling links, for which exists() is False.
        if link.exists() or link.is_symlink():
            link.unlink()
        link.symlink_to(path)

print("egs/train count:", len(list(EGS_TRAIN.glob("*.wav"))))
print("egs/valid count:", len(list(EGS_VALID.glob("*.wav"))))

## 11) Generate AudioCraft-native data.jsonl
Adds `duration`, `sample_rate`, `channels` for each split.


In [None]:
import json
import subprocess
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

def probe(path: Path) -> dict:
    """Read duration, sample rate and channel count of an audio file with ffprobe.

    All three values come from the file itself (first audio stream plus
    container duration), so the metadata stays correct even if the files on
    disk were produced with different settings than the current config.

    Returns:
        ``{"duration": float seconds, "sample_rate": int Hz, "channels": int}``.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        IndexError: ffprobe reported no audio stream for the file.
    """
    res = subprocess.run(
        [
            "ffprobe", "-v", "error",
            "-select_streams", "a:0",
            # One specifier: sections separated by ":", entries by ",".
            "-show_entries", "stream=sample_rate,channels:format=duration",
            "-of", "json",
            str(path),
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    info = json.loads(res.stdout)
    stream = info["streams"][0]
    return {
        "duration": float(info["format"]["duration"]),
        "sample_rate": int(stream["sample_rate"]),
        "channels": int(stream["channels"]),
    }

def process_file(args):
    """Build one data.jsonl record for a ``(path, egs_dir)`` pair.

    The recorded ``path`` points at the entry inside ``egs_dir`` with the same
    file name, while the audio metadata is probed from ``path`` itself.
    """
    path, egs_dir = args
    egs_entry = egs_dir / path.name
    return {"path": str(egs_entry), **probe(path)}

split_specs = (
    ("train", train_files, EGS_TRAIN),
    ("valid", valid_files, EGS_VALID),
)
for split, src_files, egs_dir in split_specs:
    # Probe every file of the split in parallel (one ffprobe subprocess per file).
    max_workers = min(os.cpu_count() or 4, 32)
    probe_args = [(src_path, egs_dir) for src_path in src_files]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        progress = tqdm(
            pool.map(process_file, probe_args),
            total=len(probe_args),
            desc=f"Probing {split}",
            unit=" files",
        )
        payloads = list(progress)

    # One JSON object per line, in the same order as the split's file list.
    out_path = egs_dir / "data.jsonl"
    with open(out_path, "w") as f:
        f.writelines(json.dumps(payload) + "\n" for payload in payloads)

    print(f"Wrote {split} data.jsonl →", out_path)
    # Read the first record back as a quick sanity check.
    with open(out_path) as f:
        first_line = f.readline().strip()
    print("First entry:", first_line)

## 12) Create Hydra dataset config


In [None]:
%%bash
set -euo pipefail

# Write the dataset config into the AudioCraft checkout, then list it to confirm.
config_dir=/root/workspace/audiocraft/config/dset/audio
config_file="${config_dir}/mtg_jamendo.yaml"

mkdir -p "${config_dir}"
# The quoted heredoc delimiter keeps the YAML literal (no shell expansion).
cat > "${config_file}" <<'YAML'
# @package __global__

datasource:
  max_sample_rate: 32000
  max_channels: 1
  train: /root/workspace/data/mtg_jamendo/egs/train
  valid: /root/workspace/data/mtg_jamendo/egs/valid
  evaluate: /root/workspace/data/mtg_jamendo/egs/valid
  generate: /root/workspace/data/mtg_jamendo/egs/valid
YAML
ls -l "${config_file}"

## 13) Ready-to-train checklist


In [None]:
import json
import torch

# Summarize what a training run needs: runtime, data counts, and key paths.
train_wav_count = len(list(EGS_TRAIN.glob("*.wav")))
valid_wav_count = len(list(EGS_VALID.glob("*.wav")))

checklist = {
    "torch": torch.__version__,
    "cuda": torch.cuda.is_available(),
    "train_wavs": train_wav_count,
    "valid_wavs": valid_wav_count,
    "train_jsonl": EGS_TRAIN / "data.jsonl",
    "valid_jsonl": EGS_VALID / "data.jsonl",
    "config": Path('/root/workspace/audiocraft/config/dset/audio/mtg_jamendo.yaml'),
    "experiments_dir": EXPERIMENTS_DIR,
}
print(checklist)