# 01b — FMA large mini downloader
Sections 6–13 (download + preprocessing) split out from `01_fma_large_setup.ipynb`.
Run the setup notebook first so system/Python deps and the AudioCraft repo are available.


In [None]:
from pathlib import Path
import os

# --- Filesystem layout: everything lives under BASE_DIR ---------------------
BASE_DIR = Path("/workspace")
DATA_DIR = BASE_DIR.joinpath("data", "fma_large")
RAW_DIR = BASE_DIR.joinpath("data", "fma_raw")
AUDIOCRAFT_REPO_DIR = BASE_DIR.joinpath("audiocraft")
EXPERIMENTS_DIR = BASE_DIR.joinpath("experiments", "audiocraft")

# --- Preprocessing parameters -----------------------------------------------
SEGMENT_SECONDS = 10   # clip length passed to ffmpeg's segment muxer
TARGET_SR = 32000      # output sample rate in Hz
CHANNELS = 1           # mono
TRAIN_RATIO = 0.9      # fraction assigned to the training split
RANDOM_SEED = 42       # seed for the train/valid shuffle
NUM_SAMPLES_TOTAL = None  # adjust to control how many source tracks to keep

# Candidate archive locations, tried in order. The environment override comes
# first; when the variable is unset its `None` entry is filtered out.
FMA_ARCHIVE_URLS = [
    url
    for url in (
        os.environ.get("FMA_SAMPLE_ARCHIVE_URL"),
        "https://os.unil.cloud.switch.ch/fma/fma_large.zip",
        "https://mirror.math.princeton.edu/pub/fma/fma_large.zip",
        "https://huggingface.co/datasets/echonest/fma_large/resolve/main/fma_large.zip",
    )
    if url
]

# --- Derived output directories ---------------------------------------------
WAV_DIR = DATA_DIR / "wav_32k_mono"
SEGMENTS_DIR = DATA_DIR / "segments_10s"
MANIFEST_DIR = DATA_DIR / "manifests"
EGS_TRAIN = DATA_DIR / "egs" / "train"
EGS_VALID = DATA_DIR / "egs" / "valid"

# Create the whole tree up front so later cells can assume it exists.
required_dirs = [
    DATA_DIR,
    RAW_DIR,
    WAV_DIR,
    SEGMENTS_DIR,
    MANIFEST_DIR,
    EGS_TRAIN,
    EGS_VALID,
    EXPERIMENTS_DIR,
]
for directory in required_dirs:
    directory.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("Using URLs (in order):", FMA_ARCHIVE_URLS)


## 6) Download a large FMA subset
Downloads `fma_large.zip` (or a user-provided archive set via `FMA_SAMPLE_ARCHIVE_URL`) and extracts its MP3s, limited to the first `NUM_SAMPLES_TOTAL` tracks when that value is set.


In [None]:
import zipfile
import subprocess
from pathlib import Path
from tqdm import tqdm

# The extracted MP3 tree and the downloaded archive sit side by side in RAW_DIR.
mp3_root = RAW_DIR / "fma_large"
archive_path = mp3_root.with_name("fma_large.zip")

def aria2_download(url: str, out_path: Path):
    """Fetch *url* into *out_path* using the external ``aria2c`` downloader.

    The parent directory of *out_path* is created if needed. The command line
    is echoed before it runs.

    Raises:
        subprocess.CalledProcessError: if ``aria2c`` exits with a non-zero status.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    # 16 connections per server, 16 splits, 1 MiB minimum split size.
    parallelism = ["-x", "16", "-s", "16", "-k", "1M"]
    behaviour = ["--file-allocation=none", "--allow-overwrite=true"]
    destination = ["-o", out_path.name, "-d", str(target_dir)]
    cmd = ["aria2c", *parallelism, *behaviour, *destination, url]

    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)

import shutil  # cell-local: streamed copy during extraction

# aria2c keeps a "<name>.aria2" control file next to a download that has not
# finished; its presence means the archive on disk is incomplete.
aria2_control = archive_path.with_name(archive_path.name + ".aria2")

if not any(mp3_root.rglob("*.mp3")):
    # Download when the archive is missing OR a previous attempt was cut short.
    if not archive_path.exists() or aria2_control.exists():
        if not FMA_ARCHIVE_URLS:
            raise RuntimeError("Provide at least one FMA archive URL (set FMA_SAMPLE_ARCHIVE_URL)")
        last_err = None
        downloaded = False
        for url in FMA_ARCHIVE_URLS:
            try:
                print(f"Downloading {url} → {archive_path} (aria2c multi-conn)")
                aria2_download(url, archive_path)
            except (subprocess.CalledProcessError, OSError) as e:
                # Non-zero aria2c exit, or aria2c itself could not be launched.
                last_err = e
                print(f"Failed {url}: {e}")
            else:
                downloaded = True
                break
        # Track success explicitly: a failed attempt can leave a partial file
        # behind, so "the archive exists" is not proof that a download worked.
        if not downloaded:
            raise RuntimeError(f"Download failed; last error: {last_err}") from last_err

    raw_root = RAW_DIR.resolve()
    with zipfile.ZipFile(archive_path) as zf:
        members = sorted(m for m in zf.namelist() if m.endswith(".mp3"))
        if NUM_SAMPLES_TOTAL:
            members = members[:NUM_SAMPLES_TOTAL]
        print(f"Extracting {len(members)} MP3s from archive…")
        for member in tqdm(members, desc="Extracting", unit=" files"):
            dest = RAW_DIR / member
            # Reject member names (e.g. containing "..") that would land
            # outside RAW_DIR.
            if raw_root not in dest.resolve().parents:
                raise RuntimeError(f"Refusing to extract outside {RAW_DIR}: {member}")
            if dest.exists():
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            # Stream into a ".part" file and rename when complete, so an
            # interrupted run never leaves a truncated file under the final
            # name (which the `dest.exists()` check above would then skip).
            partial = dest.with_name(dest.name + ".part")
            with zf.open(member) as src, open(partial, "wb") as dst:
                shutil.copyfileobj(src, dst)
            partial.replace(dest)
else:
    print("MP3s already present; skipping download/extract.")

sampled_mp3s = sorted(mp3_root.rglob("*.mp3"))
print("MP3 files ready:", len(sampled_mp3s))
for name in sampled_mp3s[:5]:
    print(" •", name)

## 7) Convert to mono 32k wav
Converts each MP3 with ffmpeg. Tracks whose WAV already exists are skipped, so re-running the cell only converts new tracks.


In [None]:
import subprocess

WAV_DIR.mkdir(parents=True, exist_ok=True)
mp3_files = sorted((RAW_DIR / "fma_large").rglob("*.mp3"))

# Sources ffmpeg could not decode; reported after the loop.
failed_conversions = []

for src in mp3_files:
    dst = WAV_DIR / f"{src.stem}.wav"
    if dst.exists():
        continue
    # Convert into a ".part" file and rename on success. A crash or interrupt
    # then cannot leave a truncated WAV under the final name, which the
    # `dst.exists()` check would otherwise treat as finished on the next run.
    partial = dst.with_name(dst.name + ".part")
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-nostdin",          # do not read the notebook's stdin
        "-y",                # overwrite a stale .part from an earlier run
        "-i", str(src),
        "-ac", str(CHANNELS),
        "-ar", str(TARGET_SR),
        "-map_metadata", "-1",
        "-vn",
        "-f", "wav",         # explicit: the ".part" suffix hides the format
        str(partial),
    ]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as err:
        # One undecodable MP3 should not abort the whole batch: clean up,
        # record it, and move on.
        if partial.exists():
            partial.unlink()
        failed_conversions.append(src)
        print(f"Skipping {src}: ffmpeg exited with status {err.returncode}")
        continue
    partial.replace(dst)

if failed_conversions:
    print(f"Could not convert {len(failed_conversions)} file(s); they are left out of the dataset.")

wav_files = sorted(WAV_DIR.glob("*.wav"))
print("WAV files ready:", len(wav_files))
for name in wav_files[:5]:
    print(" •", name)


## 8) Segment into 10s chunks
Splits each WAV into `SEGMENT_SECONDS`-long clips with ffmpeg's segment muxer; the last clip of a track may be shorter.


In [None]:
import subprocess
import random

SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
wav_files = sorted(WAV_DIR.glob("*.wav"))

# Scan the segments directory once and remember which tracks already have
# clips. Globbing the directory again for every WAV is quadratic in the
# number of tracks.
segmented_stems = {
    seg.name.rsplit("_seg_", 1)[0]
    for seg in SEGMENTS_DIR.glob("*_seg_*.wav")
}

for wav in wav_files:
    if wav.stem in segmented_stems:
        continue
    # ffmpeg expands %03d to the running clip index.
    pattern = SEGMENTS_DIR / f"{wav.stem}_seg_%03d.wav"
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-i", str(wav),
        "-f", "segment",
        "-segment_time", str(SEGMENT_SECONDS),
        "-reset_timestamps", "1",
        "-map_metadata", "-1",
        str(pattern),
    ]
    subprocess.run(cmd, check=True)

segments = sorted(SEGMENTS_DIR.glob("*.wav"))
print("Total segments:", len(segments))

def probe_duration(path: Path) -> float:
    """Return the duration of *path* in seconds as reported by ``ffprobe``.

    Raises:
        subprocess.CalledProcessError: if ``ffprobe`` exits with a non-zero status.
        ValueError: if the ffprobe output is not a number.
    """
    ffprobe_cmd = ["ffprobe", "-v", "error"]
    # Ask only for the container duration, printed as a bare value.
    ffprobe_cmd += ["-show_entries", "format=duration"]
    ffprobe_cmd += ["-of", "default=noprint_wrappers=1:nokey=1"]
    ffprobe_cmd.append(str(path))

    completed = subprocess.run(ffprobe_cmd, capture_output=True, text=True, check=True)
    raw_seconds = completed.stdout.strip()
    return float(raw_seconds)

# Spot-check: print the probed duration of up to five randomly chosen clips.
if segments:
    sample_check = random.sample(segments, k=min(5, len(segments)))
else:
    sample_check = []

for clip in sample_check:
    seconds = round(probe_duration(clip), 3)
    print(clip.name, "→", seconds, "s")


## 9) Create train/valid manifests
Deterministic split using `RANDOM_SEED` and `TRAIN_RATIO`.


In [None]:
import json
import random

segments = sorted(SEGMENTS_DIR.glob("*.wav"))

# Group clips by source track ("<track>_seg_NNN.wav" -> "<track>"). The split
# is made per track, so clips cut from the same recording never appear in
# both train and valid.
segments_by_track = {}
for seg in segments:
    track_id = seg.name.rsplit("_seg_", 1)[0]
    segments_by_track.setdefault(track_id, []).append(seg)

# A private RNG keeps the split reproducible without reseeding the global
# `random` state used elsewhere in the notebook.
track_ids = sorted(segments_by_track)
random.Random(RANDOM_SEED).shuffle(track_ids)

split_idx = int(len(track_ids) * TRAIN_RATIO)
if len(track_ids) >= 2:
    # Keep at least one track on each side, whatever TRAIN_RATIO is, rather
    # than duplicating a training clip into the validation set.
    split_idx = min(max(split_idx, 1), len(track_ids) - 1)
elif len(track_ids) == 1:
    print("Only one source track available: all segments go to valid, train is empty.")

train_files = [seg for tid in track_ids[:split_idx] for seg in segments_by_track[tid]]
valid_files = [seg for tid in track_ids[split_idx:] for seg in segments_by_track[tid]]

# One JSON object per line: {"path": "<absolute clip path>"}.
MANIFEST_DIR.mkdir(parents=True, exist_ok=True)
with open(MANIFEST_DIR / "train.jsonl", "w") as f:
    for p in train_files:
        f.write(json.dumps({"path": str(p)}) + "\n")
with open(MANIFEST_DIR / "valid.jsonl", "w") as f:
    for p in valid_files:
        f.write(json.dumps({"path": str(p)}) + "\n")

print("Train/valid counts:", len(train_files), len(valid_files))
print("Sample manifest line:")
print((MANIFEST_DIR / "train.jsonl").read_text().splitlines()[:1])

## 10) Create `egs/train` and `egs/valid` symlinks


In [None]:
import os

# Mirror each split into its egs directory as symlinks to the segment files.
for split, files in [("train", train_files), ("valid", valid_files)]:
    dest_dir = EGS_TRAIN if split == "train" else EGS_VALID
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Remove links left by an earlier run whose split differed (other seed,
    # ratio or sample count). Otherwise a clip could stay linked in both
    # egs/train and egs/valid and the counts below would be wrong. Only
    # symlinks are removed; regular files are left alone.
    wanted_names = {path.name for path in files}
    for stale in dest_dir.glob("*.wav"):
        if stale.is_symlink() and stale.name not in wanted_names:
            stale.unlink()

    for path in files:
        link = dest_dir / path.name
        # `exists()` is False for a dangling link, hence the extra check.
        if link.exists() or link.is_symlink():
            link.unlink()
        link.symlink_to(path)

print("egs/train count:", len(list(EGS_TRAIN.glob("*.wav"))))
print("egs/valid count:", len(list(EGS_VALID.glob("*.wav"))))


## 11) Generate AudioCraft-native data.jsonl
Adds `duration`, `sample_rate`, `channels` for each split.


In [None]:
import json
import subprocess

def probe(path: Path) -> dict:
    """Read duration, sample rate and channel count of *path* with ``ffprobe``.

    All three values come from the file itself (first audio stream plus the
    container duration), so the result describes what is actually on disk.

    Returns:
        ``{"duration": float seconds, "sample_rate": int Hz, "channels": int}``

    Raises:
        subprocess.CalledProcessError: if ``ffprobe`` exits with a non-zero status.
        ValueError: if ffprobe reports no audio stream for *path*.
    """
    res = subprocess.run(
        [
            "ffprobe", "-v", "error",
            "-select_streams", "a:0",
            # One -show_entries argument; sections are separated by ':'.
            "-show_entries", "format=duration:stream=sample_rate,channels",
            "-of", "json",
            str(path),
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    info = json.loads(res.stdout)
    streams = info.get("streams") or []
    if not streams:
        raise ValueError(f"No audio stream found in {path}")
    stream = streams[0]
    return {
        "duration": float(info["format"]["duration"]),
        "sample_rate": int(stream["sample_rate"]),
        "channels": int(stream["channels"]),
    }

# Write one data.jsonl per split. Each line has the clip's path inside the
# egs directory plus the audio properties probed from the segment file.
egs_jobs = (
    ("train", train_files, EGS_TRAIN),
    ("valid", valid_files, EGS_VALID),
)
for split, src_files, egs_dir in egs_jobs:
    out_path = egs_dir / "data.jsonl"
    with open(out_path, "w") as f:
        for path in src_files:
            entry = {"path": str(egs_dir / path.name), **probe(path)}
            f.write(json.dumps(entry) + "\n")
    print(f"Wrote {split} data.jsonl →", out_path)

    # Echo the first record as a quick sanity check.
    with open(out_path) as f:
        first_line = f.readline().strip()
    print("First entry:", first_line)

## 12) Create Hydra dataset config


In [None]:
%%bash
set -euo pipefail

# Write the dataset config into the AudioCraft checkout, then list it.
cfg_dir=/workspace/audiocraft/config/dset/audio
cfg_file="${cfg_dir}/fma_large.yaml"

mkdir -p "${cfg_dir}"
cat > "${cfg_file}" <<'YAML'
# @package __global__

datasource:
  max_sample_rate: 32000
  max_channels: 1
  train: /workspace/data/fma_large/egs/train
  valid: /workspace/data/fma_large/egs/valid
  evaluate: /workspace/data/fma_large/egs/valid
  generate: /workspace/data/fma_large/egs/valid
YAML
ls -l "${cfg_file}"


## 13) Ready-to-train checklist


In [None]:
import json
import torch

# Summarise the environment and the generated artefacts in a single dict.
train_wav_count = len(list(EGS_TRAIN.glob("*.wav")))
valid_wav_count = len(list(EGS_VALID.glob("*.wav")))

checklist = {
    "torch": torch.__version__,
    "cuda": torch.cuda.is_available(),
    "train_wavs": train_wav_count,
    "valid_wavs": valid_wav_count,
    "train_jsonl": EGS_TRAIN / "data.jsonl",
    "valid_jsonl": EGS_VALID / "data.jsonl",
    "config": Path('/workspace/audiocraft/config/dset/audio/fma_large.yaml'),
    "experiments_dir": EXPERIMENTS_DIR,
}
print(checklist)
