In [None]:
from __future__ import annotations

from pathlib import Path
import csv
import time
import torch
from IPython.display import Audio

from f5_tts.api import F5TTS


In [None]:
# Filesystem layout: reference recordings are read from DATA_DIR/wavs and the
# synthesized clips plus the manifest are written under DATA_DIR/tts/f5.
DATA_DIR = Path("/home/SpeakerRec/BioVoice/data/")
WAVS_DIR = DATA_DIR.joinpath("wavs")
TTS_DIR = DATA_DIR.joinpath("tts", "f5")
# Create the output directory up front so later cells can write into it.
TTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when torch can see one, otherwise stay on the CPU.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("CUDA available:", USE_GPU)

In [None]:
# Prompts synthesized once per speaker. The 1-based position of a prompt in this
# list is used as the sentence index in the output filename and in the manifest.
# pick_ref_for_sentence pairs positions 1-15 with a speaker's first reference
# recording and positions 16-30 with the second, so order matters.
SENTENCES: list[str] = [
    # Positions 1-15.
    "My voice is my password.",
    "Verify me with my voice.",
    "Authenticate this speaker, please.",
    "Grant access to my account.",
    "Unlock the system for me.",
    "Confirm my identity by voice.",
    "This is my secret phrase.",
    "Secure login with my voice.",
    "Trust but verify my speech.",
    "Match this voice to me.",
    "Approve access for this speaker.",
    "Voice check for my login.",
    "Identity check, voice only.",
    "Compare my voiceprint now.",
    "Validate this voice as mine.",
    # Positions 16-30.
    "I request secure entry.",
    "Open my profile securely.",
    "Allow login after verification.",
    "Voice key engaged, confirm.",
    "Check phrase against enrollment.",
    "This utterance authenticates me.",
    "Biometric login, voice sample.",
    "Authenticate this session, please.",
    "Approve sign-in by voice.",
    "Access gate, verify speaker.",
    "This is my access phrase.",
    "Confirm speaker equals account owner.",
    "Match phrase to enrolled sample.",
    "Security check, no typed password.",
    "Thank you for verification.",
]
# Guard against accidental additions/removals: the prompt count is pinned to 30.
assert len(SENTENCES) == 30


In [None]:
# Two reference recordings per speaker, in the order they are consumed
# (index 0 first, index 1 second).
SPEAKER_REFS: dict[str, list[Path]] = {
    "idan": [WAVS_DIR.joinpath("idan_001.wav"), WAVS_DIR.joinpath("idan_002.wav")],
    "yoav": [WAVS_DIR.joinpath("yoav_001.wav"), WAVS_DIR.joinpath("yoav_002.wav")],
    "eden": [WAVS_DIR.joinpath("eden_001.wav"), WAVS_DIR.joinpath("eden_002.wav")],
}

# Transcript of each reference recording. Every speaker's first recording maps
# to the first phrase and the second recording to the second phrase.
REF_TEXT_BY_WAV: dict[Path, str] = {
    ref_path: transcript
    for speaker_paths in SPEAKER_REFS.values()
    for ref_path, transcript in zip(
        speaker_paths,
        ("My voice is my password.", "Verify me with my voice."),
    )
}

# Fail fast, before any model is loaded, if a speaker entry is malformed or a
# reference file is not on disk.
for spk, refs in SPEAKER_REFS.items():
    assert len(refs) == 2, f"{spk} must have exactly 2 ref wavs"
    for r in refs:
        assert r.exists(), f"Missing ref wav: {r}"



In [None]:
# Instantiate the F5-TTS wrapper on the device chosen in the config cell.
# The TypeError fallback is presumably there for f5_tts releases whose
# constructor does not accept a `device` keyword -- TODO confirm.
try:
    f5 = F5TTS(device=DEVICE)
except TypeError as exc:
    # Do not fall back silently: the default constructor ignores DEVICE, so the
    # user must be told that the model may not be running where they expect.
    print(
        f"[WARN] F5TTS(device={DEVICE!r}) raised TypeError ({exc}); "
        "retrying with default constructor arguments."
    )
    f5 = F5TTS()

print("F5TTS loaded.")


In [None]:
def pick_ref_for_sentence(
    refs: list[Path], sentence_idx_1based: int, split_after: int = 15
) -> Path:
    """Choose which reference recording conditions a given prompt.

    Prompts numbered up to and including ``split_after`` use the first
    reference recording; every later prompt uses the second one.

    Args:
        refs: Reference recordings for one speaker. A second entry is required
            as soon as ``sentence_idx_1based`` exceeds ``split_after``.
        sentence_idx_1based: 1-based position of the prompt being synthesized.
        split_after: Last prompt index (inclusive) served by ``refs[0]``.
            Defaults to 15, the previously hard-coded value.

    Returns:
        ``refs[0]`` when ``sentence_idx_1based <= split_after``, else ``refs[1]``.

    Raises:
        IndexError: If the selected position does not exist in ``refs``.
    """
    ref_slot = 0 if sentence_idx_1based <= split_after else 1
    return refs[ref_slot]



In [None]:
# Synthesize every prompt for every speaker and record one manifest row per
# attempt (successful or not) in TTS_DIR/manifest.csv.
manifest_path = TTS_DIR / "manifest.csv"
# Column order of the manifest; every row dict below carries exactly these keys.
MANIFEST_FIELDS = [
    "speaker",
    "sentence_idx",
    "gen_text",
    "ref_wav",
    "ref_text",
    "out_wav",
    "sample_rate",
    "ok",
    "error",
]
rows: list[dict[str, str]] = []

remove_silence = False

total = 0  # number of successful syntheses (reported as "ok" in the summary)
failed = 0
n_sentences = len(SENTENCES)  # used in progress messages instead of a literal 30
t0 = time.time()

try:
    for speaker, refs in SPEAKER_REFS.items():
        for i, gen_text in enumerate(SENTENCES, start=1):
            ref_wav = pick_ref_for_sentence(refs, i)
            # NOTE(review): a reference without a transcript yields ref_text="";
            # F5-TTS presumably then transcribes the reference itself -- confirm.
            ref_text = REF_TEXT_BY_WAV.get(ref_wav, "")

            out_wav = TTS_DIR / f"{speaker}_{i:02d}.wav"

            # Start from the failure shape; the success path fills in the rest.
            row = {
                "speaker": speaker,
                "sentence_idx": str(i),
                "gen_text": gen_text,
                "ref_wav": str(ref_wav),
                "ref_text": ref_text,
                "out_wav": str(out_wav),
                "sample_rate": "",
                "ok": "0",
                "error": "",
            }

            try:
                wav, sr, spec = f5.infer(
                    ref_file=str(ref_wav),
                    ref_text=ref_text,
                    gen_text=gen_text,
                    file_wave=str(out_wav),
                    remove_silence=remove_silence,
                )
            except Exception as e:
                # Best effort: log the failure in the manifest and keep going
                # with the remaining prompts.
                failed += 1
                row["error"] = repr(e)
                print(
                    f"[FAIL] {speaker} {i:02d}/{n_sentences:02d} -> {out_wav.name} | {e}"
                )
            else:
                total += 1
                row["sample_rate"] = str(sr)
                row["ok"] = "1"
                print(f"[OK] {speaker} {i:02d}/{n_sentences:02d} -> {out_wav.name}")

            rows.append(row)
finally:
    # Write the manifest even when the run is interrupted (e.g. the kernel is
    # stopped mid-loop), so the attempts already made are not lost.
    with open(manifest_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=MANIFEST_FIELDS)
        writer.writeheader()
        writer.writerows(rows)

dt = time.time() - t0
print(f"\nDone. ok={total} failed={failed} time_sec={dt:.1f}")
print("Manifest:", manifest_path)


