In [None]:
# 0) ENV + Imports
# =========================
from __future__ import annotations

import os
import csv
import time
from pathlib import Path
import re

import numpy as np
import torch
import soundfile as sf
from IPython.display import Audio

# Both variables are set before the `from TTS.api import TTS` import that follows.
# NOTE(review): presumably they pre-accept the Coqui license terms so that loading the
# model does not stop for an interactive prompt — confirm which of the two names the
# installed TTS version actually reads.
os.environ["COQUI_TOS_AGREED"] = "1"
os.environ["COQUI_TTS_AGREED"] = "1"

from TTS.api import TTS



In [None]:
# 1) Paths + Globals
# =========================
# --- Locations ---------------------------------------------------------------
DATA_DIR = Path("/home/SpeakerRec/BioVoice/data/")
WAVS_DIR = DATA_DIR.joinpath("wavs")            # reference recordings are read from here
TTS_DIR = DATA_DIR.joinpath("tts", "coqui")     # generated wavs and the manifest go here
PREP_DIR = TTS_DIR.joinpath("_prepared_refs")   # prepared reference clips

# Output folders are created up front: the parent first, then its child.
TTS_DIR.mkdir(parents=True, exist_ok=True)
PREP_DIR.mkdir(parents=True, exist_ok=True)

# --- Synthesis settings ------------------------------------------------------
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
LANGUAGE = "en"

# --- Reference-clip preparation ----------------------------------------------
TRIM_REF_SECONDS = 6.0  # reference clips are cut to their first N seconds
NORMALIZE_REF = True    # peak-normalize reference clips

# --- Device --------------------------------------------------------------------
USE_GPU = torch.cuda.is_available()

print("CUDA available:", USE_GPU)
print("TTS_DIR:", TTS_DIR)



In [None]:
# 2) Sentences (1..30)
# =========================
# The prompts are kept as a plain text block, one sentence per line.
# A sentence's 1-based position in the block is its sentence index.
# Surrounding whitespace and blank lines are dropped when the block is parsed.
SENTENCES: list[str] = [
    line.strip()
    for line in """
        My voice is my password.
        Verify me with my voice.
        Authenticate this speaker, please.
        Grant access to my account.
        Unlock the system for me.
        Confirm my identity by voice.
        This is my secret phrase.
        Secure login with my voice.
        Trust but verify my speech.
        Match this voice to me.
        Approve access for this speaker.
        Voice check for my login.
        Identity check, voice only.
        Compare my voiceprint now.
        Validate this voice as mine.
        I request secure entry.
        Open my profile securely.
        Allow login after verification.
        Voice key engaged, confirm.
        Check phrase against enrollment.
        This utterance authenticates me.
        Biometric login, voice sample.
        Authenticate this session, please.
        Approve sign-in by voice.
        Access gate, verify speaker.
        This is my access phrase.
        Confirm speaker equals account owner.
        Match phrase to enrolled sample.
        Security check, no typed password.
        Thank you for verification.
    """.splitlines()
    if line.strip()
]
assert len(SENTENCES) == 30



In [None]:
# 3) Speaker refs (2 each) + picking logic
# =========================
# Every speaker contributes two reference recordings from WAVS_DIR,
# named <speaker>_001.wav and <speaker>_002.wav.
SPEAKER_REFS: dict[str, list[Path]] = {
    name: [WAVS_DIR / f"{name}_{take:03d}.wav" for take in (1, 2)]
    for name in ("idan", "yoav", "eden")
}

# Stop right here if the mapping is malformed or a recording is not on disk,
# rather than failing later in the middle of synthesis.
for spk, refs in SPEAKER_REFS.items():
    assert len(refs) == 2, f"{spk} must have exactly 2 ref wavs"
    for r in refs:
        assert r.exists(), f"Missing ref wav: {r}"

def pick_ref_for_sentence(
    refs: list[Path],
    sentence_idx_1based: int,
    *,
    split_after: int = 15,
) -> Path:
    """
    Choose which of a speaker's two reference recordings is used for a sentence.

    Sentences numbered up to and including ``split_after`` use ``refs[0]``; all
    later sentences use ``refs[1]``. With the default ``split_after=15`` this is
    the 1..15 / 16..30 split.

    Args:
        refs: the speaker's reference wavs; only the first two entries are used.
        sentence_idx_1based: position of the sentence, counting from 1.
        split_after: last sentence index that still maps to the first reference.

    Returns:
        ``refs[0]`` or ``refs[1]``.

    Raises:
        IndexError: if the selected entry does not exist in ``refs``.
    """
    if sentence_idx_1based <= split_after:
        return refs[0]
    return refs[1]



In [None]:
# 4) Prepare ref wavs (soundfile; no torchaudio dependency)
# =========================
def prepare_ref_wav(
    in_wav: Path,
    out_wav: Path,
    *,
    trim_seconds: float | None = None,
    normalize: bool = True,
) -> Path:
    """
    Condition a reference recording and write it to ``out_wav``.

    Steps: read with soundfile as float32, average the channels to mono,
    optionally keep only the first ``trim_seconds``, optionally peak-normalize.

    ``sf.write`` is called without an explicit ``subtype``, so the sample format
    on disk is whatever soundfile uses by default for ``out_wav``'s extension;
    the float32 dtype only applies to the in-memory processing.

    Args:
        in_wav: source audio file.
        out_wav: destination path.
        trim_seconds: keep only the first ``trim_seconds`` of audio; ``None`` or a
            non-positive value keeps the whole clip.
        normalize: scale the clip so its absolute peak is ~0.95.

    Returns:
        ``out_wav``.

    Raises:
        ValueError: if no samples remain after reading and trimming (empty file,
            or a trim that rounds down to zero samples). Nothing is written in
            that case.
    """
    audio, sr = sf.read(in_wav.as_posix(), dtype="float32", always_2d=True)  # [T, C]
    mono = audio.mean(axis=1)  # [T]

    if trim_seconds is not None and trim_seconds > 0:
        mono = mono[: int(trim_seconds * sr)]

    # Reject an empty clip before touching the output file. Without this check,
    # np.max below fails with an unrelated "zero-size array" message, and with
    # normalize=False an empty file would be written without any error.
    if mono.size == 0:
        raise ValueError(f"No audio samples left in reference wav: {in_wav}")

    if normalize:
        # The epsilon keeps the division finite for an all-zero (silent) clip.
        peak = float(np.max(np.abs(mono)) + 1e-8)
        mono = (mono / peak) * 0.95

    sf.write(out_wav.as_posix(), mono, sr)
    return out_wav

# Build the lookup: raw reference wav -> prepared reference wav.
# A prepared file that is already on disk is reused untouched, so delete it from
# PREP_DIR if TRIM_REF_SECONDS or NORMALIZE_REF were changed and must take effect.
PREP_REFS: dict[Path, Path] = {}
for spk, refs in SPEAKER_REFS.items():
    for r in refs:
        out = PREP_DIR.joinpath(f"{r.stem}_prep.wav")
        if not out.exists():
            prepare_ref_wav(
                r,
                out,
                trim_seconds=TRIM_REF_SECONDS,
                normalize=NORMALIZE_REF,
            )
        # Registered only after preparation succeeded (or the file already existed).
        PREP_REFS[r] = out

# Summary of the mapping, file names only.
print("Prepared refs:")
for k, v in PREP_REFS.items():
    print(" ", k.name, "->", v.name)



In [None]:
# 5) Load XTTS model once
# =========================
# The model object is created a single time here and reused by the cells below.
tts = TTS(MODEL_NAME, gpu=USE_GPU)
print("Loaded:", MODEL_NAME)

# Best-effort lookup of the output sample rate: if `tts` has no `synthesizer`, or
# the synthesizer has no `output_sample_rate`, sr_out is simply None.
try:
    sr_out = tts.synthesizer.output_sample_rate
except AttributeError:
    sr_out = None
print("Output SR (if known):", sr_out)



In [None]:
# 6) Batch generate (same as F5)
# =========================
manifest_path = TTS_DIR / "manifest.csv"

# Manifest columns, in output order. Every row dict built below has exactly these keys.
MANIFEST_FIELDS = [
    "speaker",
    "sentence_idx",
    "gen_text",
    "ref_wav",
    "ref_prepared",
    "out_wav",
    "model_name",
    "language",
    "ok",
    "error",
]
rows: list[dict[str, str]] = []

total = 0  # successful syntheses only (reported as "ok=" in the summary line)
failed = 0
n_sentences = len(SENTENCES)  # denominator shown in the progress messages
t0 = time.time()

for speaker, refs in SPEAKER_REFS.items():
    for i, gen_text in enumerate(SENTENCES, start=1):
        ref_wav = pick_ref_for_sentence(refs, i)
        ref_prepared = PREP_REFS[ref_wav]

        out_wav = TTS_DIR / f"{speaker}_{i:02d}.wav"

        # One manifest row per (speaker, sentence). It starts out as a success
        # record and is downgraded in the except branch if synthesis fails.
        row = {
            "speaker": speaker,
            "sentence_idx": str(i),
            "gen_text": gen_text,
            "ref_wav": str(ref_wav),
            "ref_prepared": str(ref_prepared),
            "out_wav": str(out_wav),
            "model_name": MODEL_NAME,
            "language": LANGUAGE,
            "ok": "1",
            "error": "",
        }

        try:
            # XTTS-v2: reference voice is speaker_wav
            tts.tts_to_file(
                text=gen_text,
                speaker_wav=ref_prepared.as_posix(),
                language=LANGUAGE,
                file_path=out_wav.as_posix(),
            )
        except Exception as e:
            # Deliberately broad: a single failed utterance must not abort the
            # whole batch; the error is recorded in the manifest instead.
            failed += 1
            row["ok"] = "0"
            row["error"] = repr(e)
            print(f"[FAIL] {speaker} {i:02d}/{n_sentences} -> {out_wav.name} | {e}")
        else:
            total += 1
            print(f"[OK] {speaker} {i:02d}/{n_sentences} -> {out_wav.name}")

        rows.append(row)

# The manifest is written once, after all speakers and sentences were attempted.
with open(manifest_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=MANIFEST_FIELDS)
    writer.writeheader()
    writer.writerows(rows)

dt = time.time() - t0
print(f"\nDone. ok={total} failed={failed} time_sec={dt:.1f}")
print("Manifest:", manifest_path)



In [None]:
# 7) Quick preview
# =========================
# Inline player for one generated clip (speaker "yoav", sentence 01).
# Kept as the cell's last expression so the notebook renders it.
Audio(str(TTS_DIR.joinpath("yoav_01.wav")))
