# 01d — MTG Jamendo/FMA combo
Combine and split the data into train/test sets for MTG and FMA

In [None]:
import subprocess
import random
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from tqdm import tqdm
import os

# --- Input / output locations -------------------------------------------
# Both source directories are searched recursively for *.wav files further
# down; everything this notebook writes goes under OUT_DIR.
FMA_DIR = Path('/root/workspace/data/fma_large/wav_32k_mono')
MTG_JAMENDO_DIR = Path('/root/workspace/data/mtg_jamendo/wav_32k_mono')
OUT_DIR = Path('/root/workspace/data/all_data')

# --- Processing parameters ----------------------------------------------
SEGMENT_SECONDS = 60   # segment length, also the minimum track duration kept
TARGET_SR = 32000      # passed to ffmpeg as -ar
CHANNELS = 1           # passed to ffmpeg as -ac
# NOTE(review): the three settings below are not referenced by the cells
# visible here; presumably they drive a later sampling / split step.
TRAIN_RATIO = 0.9
RANDOM_SEED = 42
NUM_SAMPLES_TOTAL = None  # adjust to control how many source tracks to keep

# Segments live in a directory named after the segment length.
SEGMENTS_DIR = OUT_DIR / f'segments_{SEGMENT_SECONDS}s'

for _out_dir in (OUT_DIR, SEGMENTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)
# WAV_DIR = OUT_DIR / 'wav_32k_mono'
# WAV_DIR.mkdir(parents=True, exist_ok=True)

# Sanity check: number of directory entries directly inside each source dir.
for _src_dir in (FMA_DIR, MTG_JAMENDO_DIR):
    print(len(os.listdir(_src_dir)))


def segment_wav(input_path: Path, output_pattern: Path,
                segment_seconds=SEGMENT_SECONDS, target_sr=TARGET_SR,
                channels=CHANNELS) -> None:
    """Split one audio file into fixed-length chunks with ffmpeg.

    The input is resampled to ``target_sr`` Hz and mixed to ``channels``
    channels, then written through ffmpeg's ``segment`` muxer.

    Args:
        input_path: Audio file to read.
        output_pattern: Output path containing a printf-style counter
            (e.g. ``name_%05d.wav``), as required by the segment muxer.
        segment_seconds: Value passed to ``-segment_time``.
        target_sr: Output sample rate (``-ar``).
        channels: Output channel count (``-ac``).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    cmd = [
        "ffmpeg",
        "-hide_banner", "-loglevel", "error",
        # Disable interactive stdin handling: this function is invoked from
        # many worker threads at once, and ffmpeg would otherwise try to read
        # keyboard commands from the stdin it inherits from the parent.
        "-nostdin",
        "-i", str(input_path),
        "-ar", str(target_sr),
        "-ac", str(channels),
        "-f", "segment",
        "-segment_time", str(segment_seconds),
        # Start every segment's timestamps from zero.
        "-reset_timestamps", "1",
        str(output_pattern),
    ]
    subprocess.run(cmd, check=True)

# Helper Functions

In [None]:
from pathlib import Path
import subprocess



def _tag(src: Path) -> str:
    """Return a short dataset label for ``src`` based on its path prefix.

    Yields ``"fma"`` when the path string starts with ``FMA_DIR``, ``"mtg"``
    when it starts with ``MTG_JAMENDO_DIR``, and ``"unk"`` otherwise. The
    check is a plain string-prefix comparison, evaluated in that order.
    """
    path_text = str(src)
    labelled_roots = ((FMA_DIR, "fma"), (MTG_JAMENDO_DIR, "mtg"))
    for root, label in labelled_roots:
        if path_text.startswith(str(root)):
            return label
    return "unk"

def segment_one(src: Path):
    """Segment a single source WAV into ``SEGMENTS_DIR``.

    Output files are named ``<tag>__<stem>_<NNNNN>.wav``, where ``<tag>`` is
    the dataset label from ``_tag`` and ``<NNNNN>`` is the five-digit counter
    that ffmpeg's segment muxer substitutes for ``%05d``.
    """
    segment_name = f"{_tag(src)}__{src.stem}_%05d.wav"
    segment_wav(src, SEGMENTS_DIR / segment_name)

# Filter for selected durations

In [None]:
import contextlib
import wave
import subprocess
from pathlib import Path

def get_duration_seconds_fast(p: Path) -> float:
    """Return the duration of the audio file ``p`` in seconds.

    The duration is first computed from the WAV header (frame count divided
    by frame rate) using the stdlib ``wave`` module. If that fails for any
    reason, ``ffprobe`` is asked for the container duration instead.

    Raises:
        subprocess.CalledProcessError: If the ffprobe fallback exits non-zero.
        ValueError: If ffprobe's output cannot be parsed as a float.
    """
    try:
        # Header-only read: no audio decoding and no child process.
        with wave.open(str(p), "rb") as reader:
            frame_count = reader.getnframes()
            frame_rate = reader.getframerate()
            return frame_count / float(frame_rate)
    except Exception:
        # The wave module could not handle this file (e.g. an encoding it
        # does not support); let ffprobe report the duration instead.
        probe_cmd = [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            str(p),
        ]
        probe = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        )
        return float(probe.stdout.strip())

def should_keep(p: Path, min_seconds: float) -> bool:
    """Tell whether ``p`` lasts at least ``min_seconds`` seconds.

    Any error while measuring or comparing the duration counts as "do not
    keep", so unreadable files are dropped rather than aborting the scan.
    """
    try:
        duration = get_duration_seconds_fast(p)
        long_enough = duration >= min_seconds
    except Exception:
        return False
    return long_enough



# Gather every WAV from both datasets: FMA first, then MTG-Jamendo, each
# group sorted by path.
wav_files = [
    *sorted(FMA_DIR.rglob("*.wav")),
    *sorted(MTG_JAMENDO_DIR.rglob("*.wav")),
]
print(f"Total WAV: {len(wav_files)}")

# Drop tracks shorter than one full segment (and any file whose duration
# cannot be determined).
_progress = tqdm(wav_files, desc=f"Filtering < {SEGMENT_SECONDS}s", unit=" files")
kept = [candidate for candidate in _progress if should_keep(candidate, SEGMENT_SECONDS)]

wav_files = kept
print(f"WAV after duration filter: {len(wav_files)}")


In [None]:
import subprocess
import random
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import os

# Threads are sufficient here: each worker only waits on an ffmpeg child
# process, so the heavy lifting happens outside the Python interpreter.
max_workers = min(os.cpu_count() or 4, 32)
print(f"Max workers: {max_workers}")


def _segment_or_report(src):
    """Segment ``src``; return ``(src, error)`` if ffmpeg fails, else ``None``.

    Catching the failure inside the worker keeps ``executor.map`` from
    re-raising on the first bad file, which would cancel every file that is
    still queued.
    """
    try:
        segment_one(src)
    except subprocess.CalledProcessError as exc:
        return src, exc
    return None


failed = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    for outcome in tqdm(executor.map(_segment_or_report, wav_files),
                        total=len(wav_files),
                        desc="Segmenting WAVs",
                        unit=" files"):
        if outcome is not None:
            failed.append(outcome)

if failed:
    # Report after the whole batch has run, then fail loudly so the problem
    # is not overlooked.
    print(f"Segmentation failed for {len(failed)} of {len(wav_files)} files:")
    for failed_src, failed_exc in failed[:20]:
        print(f"  {failed_src} (ffmpeg exit code {failed_exc.returncode})")
    if len(failed) > 20:
        print(f"  ... and {len(failed) - 20} more")
    raise RuntimeError(
        f"ffmpeg failed on {len(failed)} file(s); see the list above"
    ) from failed[0][1]