In [2]:
#!/usr/bin/env python3
"""
Simple LibriSpeech manifest maker — no CLI args.
Creates one JSONL manifest per chosen subset folder (dev-clean, dev-other, etc.)
in the specified output directory.
"""

import json
from pathlib import Path

# -------------------------------------------------------------------------
# CONFIGURATION — just edit these variables
# -------------------------------------------------------------------------

# Path to your LibriSpeech root directory.
# Each subset name in SUBSETS is joined onto this path to locate its folder.
LIBRISPEECH_ROOT = Path("/home/puneets/datasets/librispeech/LibriSpeech")

# Output directory for manifest files.
# Created (including parents) if it does not exist; one "<subset>.jsonl"
# file is written here per processed subset.
OUT_DIR = Path("/home/puneets/datasets/librispeech/manifests")

# Which subset folders to process, in this order.
# Comment/uncomment entries as needed.
SUBSETS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
]

# -------------------------------------------------------------------------

# Audio extensions tried first, in priority order, when locating an utterance.
AUDIO_EXTS = [".flac", ".wav", ".mp3", ".m4a", ".ogg"]


def find_audio_file(base_dir: Path, uttid: str):
    """Locate the audio file for an utterance ID directly inside ``base_dir``.

    The known extensions in ``AUDIO_EXTS`` are tried first, in order.  If
    none of them exists, any regular file named ``"<uttid>.<anything>"`` in
    ``base_dir`` is accepted as a fallback.

    Args:
        base_dir: Directory to search (not searched recursively).
        uttid: Utterance ID, i.e. the file name without its extension.  It
            is matched literally, so characters such as ``[`` or ``*`` in
            the ID are not interpreted as wildcard syntax.

    Returns:
        The ``Path`` of the matching file, or ``None`` when nothing matches
        or ``base_dir`` is not a directory.  When several fallback
        candidates exist, the one that sorts first is returned so the
        result is deterministic.
    """
    for ext in AUDIO_EXTS:
        candidate = base_dir / f"{uttid}{ext}"
        # is_file() rather than exists(): a directory is never an audio file.
        if candidate.is_file():
            return candidate

    # Fallback: any regular file whose name starts with "<uttid>.".
    # A literal prefix comparison is used instead of a glob pattern so that
    # glob metacharacters in the ID cannot change what is matched.
    if not base_dir.is_dir():
        return None
    prefix = f"{uttid}."
    fallbacks = sorted(
        entry
        for entry in base_dir.iterdir()
        if entry.name.startswith(prefix) and entry.is_file()
    )
    return fallbacks[0] if fallbacks else None


def process_text(text: str) -> str:
    """Return the text to store as a record's ``processed_target``.

    This is the hook for cleaning or normalizing transcriptions.  At present
    no transformation is applied and the input is handed back unchanged.
    """
    processed = text  # identity: nothing is normalized yet
    return processed


def make_manifest_for_subset(subset_name: str):
    """Write a JSONL manifest for one LibriSpeech subset.

    Every ``*.trans.txt`` file found recursively under
    ``LIBRISPEECH_ROOT / subset_name`` is read line by line.  Each line is
    split at its first run of whitespace into an utterance ID and a
    transcription.  The audio file is looked up next to the transcription
    file and, failing that, one directory above it.  Utterances without
    audio are reported and skipped.

    One JSON object per utterance is written to
    ``OUT_DIR / "<subset_name>.jsonl"`` with the keys ``key``, ``source``
    (resolved audio path), ``target`` and ``processed_target``.

    Transcription files are visited in sorted path order so that repeated
    runs over the same data produce the same manifest.

    Args:
        subset_name: Name of the subset folder, e.g. ``"dev-clean"``.

    Returns:
        None.  If the subset folder does not exist, a warning is printed
        and no manifest is written.
    """
    subset_dir = LIBRISPEECH_ROOT / subset_name
    if not subset_dir.exists():
        print(f"⚠️  Subset folder not found: {subset_dir}")
        return

    out_path = OUT_DIR / f"{subset_name}.jsonl"
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # rglob yields entries in filesystem order; sort for reproducible output.
    trans_files = sorted(subset_dir.rglob("*.trans.txt"))
    print(f"📁 Processing {subset_name} — found {len(trans_files)} transcription files")

    records = []
    for tfile in trans_files:
        with tfile.open("r", encoding="utf-8") as fh:
            for raw_line in fh:
                # sep=None splits on any whitespace (space or tab) and
                # yields fewer than two parts for blank or ID-only lines.
                parts = raw_line.split(None, 1)
                if len(parts) < 2:
                    continue
                uttid, transcription = parts[0], parts[1].strip()

                audio_path = find_audio_file(tfile.parent, uttid)
                if not audio_path:
                    # try one level up if not found
                    audio_path = find_audio_file(tfile.parent.parent, uttid)
                if not audio_path:
                    print(f"⚠️  Missing audio for {uttid} in {tfile}")
                    continue

                records.append(
                    {
                        "key": f"libri-{uttid}_ASR",
                        "source": str(audio_path.resolve()),
                        "target": transcription,
                        "processed_target": process_text(transcription),
                    }
                )

    # ensure_ascii=False keeps non-ASCII transcription text readable.
    with out_path.open("w", encoding="utf-8") as outfh:
        outfh.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
        )

    print(f"✅ Wrote {len(records)} records → {out_path}")


# -------------------------------------------------------------------------
# MAIN EXECUTION — loops through selected subsets
# -------------------------------------------------------------------------

if __name__ == "__main__":
    # Build one manifest per entry in SUBSETS, in list order.
    for subset in SUBSETS:
        make_manifest_for_subset(subset)


📁 Processing dev-clean — found 97 transcription files
✅ Wrote 2703 records → /home/puneets/datasets/librispeech/manifests/dev-clean.jsonl
📁 Processing dev-other — found 91 transcription files
✅ Wrote 2864 records → /home/puneets/datasets/librispeech/manifests/dev-other.jsonl
📁 Processing test-clean — found 87 transcription files
✅ Wrote 2620 records → /home/puneets/datasets/librispeech/manifests/test-clean.jsonl
📁 Processing test-other — found 90 transcription files
✅ Wrote 2939 records → /home/puneets/datasets/librispeech/manifests/test-other.jsonl
📁 Processing train-clean-100 — found 585 transcription files
✅ Wrote 28539 records → /home/puneets/datasets/librispeech/manifests/train-clean-100.jsonl
