In [17]:
from __future__ import annotations

import re
from pathlib import Path

import torch
import torchcodec

import torchaudio
import soundfile as sf
from IPython.display import Audio

from TTS.api import TTS



In [18]:
import os

# Set both agreement flags in the process environment in a single call.
# NOTE(review): presumably these pre-accept the Coqui licence prompt so the
# model download does not wait for interactive input - confirm which of the
# two names the installed TTS version actually reads.
os.environ.update(
    {
        "COQUI_TOS_AGREED": "1",
        "COQUI_TTS_AGREED": "1",
    }
)


In [None]:
# Notebook configuration: every path and option a reader might want to tune.
# NOTE(review): DATA_DIR is an absolute, machine-specific path - adjust it when
# running this notebook on another machine.
DATA_DIR = Path("/home/SpeakerRec/BioVoice/data/") 
TTS_DIR = DATA_DIR / "tts/coqui"  # directory that holds the synthesized output (see OUT_WAV)
REF_WAV = DATA_DIR / "wavs/yoav_001.wav"          # path to reference speaker wav
TEXT = "Hello World, I'm Yoav."  # what you want to synthesize
OUT_WAV = TTS_DIR / "yoav_001_cloned.wav"   # output path

LANGUAGE = "en"                    # for XTTS-v2, e.g. "en"
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
TRIM_REF_SECONDS = 6.0             # seconds of reference audio to keep; XTTS often works well with 3-6 seconds
NORMALIZE_REF = True               # peak-normalize the prepared reference wav
SPLIT_LONG_TEXT = True             # safer for long paragraphs
USE_GPU = torch.cuda.is_available()  # True when torch reports that CUDA is available

In [20]:
def _prepare_ref_wav(
    in_wav: Path,
    out_wav: Path,
    *,
    trim_seconds: float | None = None,
    normalize: bool = True,
) -> Path:
    """
    Loads, converts to mono, optionally trims, normalizes, and saves a cleaned ref wav.
    Keeping it short/clean makes cloning more stable.
    """
    assert in_wav.exists(), f"Missing REF_WAV: {in_wav}"
    wav, sr = torchaudio.load(in_wav.as_posix())  # (ch, n)

    # mono
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # trim
    if trim_seconds is not None and trim_seconds > 0:
        n = int(trim_seconds * sr)
        wav = wav[:, :n]

    # normalize
    if normalize:
        peak = wav.abs().max().clamp_min(1e-8)
        wav = wav / peak * 0.95

    out_wav.parent.mkdir(parents=True, exist_ok=True)
    sf.write(out_wav.as_posix(), wav.squeeze(0).cpu().numpy(), sr)
    return out_wav


def _split_sentences(text: str) -> list[str]:
    # lightweight sentence split (no extra deps)
    parts = re.split(r"(?<=[.!?])\s+", text.strip())
    return [p for p in parts if p]




In [None]:
# Build the cleaned reference clip that is later passed as the speaker sample.
# The prepared file is placed inside TTS_DIR. Path.with_name() would replace
# TTS_DIR's last component, putting the file in TTS_DIR's parent instead.
ref_prepared = _prepare_ref_wav(
    REF_WAV,
    TTS_DIR / f"{REF_WAV.stem}_prepared.wav",
    trim_seconds=TRIM_REF_SECONDS,
    normalize=NORMALIZE_REF,
)
print("Prepared ref:", ref_prepared)




Prepared ref: /home/SpeakerRec/BioVoice/data/wavs/yoav_001_prepared.wav


In [22]:
print("CUDA available:", torch.cuda.is_available())
# Load XTTS-v2 via the Coqui TTS API, then move the model to the chosen device.
# `.to(device)` is used instead of the constructor's `gpu=` flag, which Coqui
# has marked for deprecation.
tts = TTS(MODEL_NAME).to("cuda" if USE_GPU else "cpu")
print("Loaded:", MODEL_NAME)


CUDA available: True
Loaded: tts_models/multilingual/multi-dataset/xtts_v2


In [23]:
# Synthesize TEXT in the reference speaker's voice and write it to OUT_WAV.
OUT_WAV.parent.mkdir(parents=True, exist_ok=True)

if not SPLIT_LONG_TEXT:
    # Simplest path: a single call; the library writes the file itself.
    tts.tts_to_file(
        text=TEXT,
        speaker_wav=ref_prepared.as_posix(),
        language=LANGUAGE,
        file_path=OUT_WAV.as_posix(),
    )
else:
    # Safer for longer text: generate sentence-by-sentence and concatenate.
    chunks = _split_sentences(TEXT)
    if not chunks:
        # Fail with a clear message instead of an opaque torch.cat([]) error.
        raise ValueError("TEXT is empty; nothing to synthesize.")

    # Use the sample rate reported by the loaded synthesizer so the file is
    # written at the rate the audio was generated at; fall back to 24 kHz
    # only when the attribute is unavailable.
    synthesizer = getattr(tts, "synthesizer", None)
    sr_out = int(getattr(synthesizer, "output_sample_rate", None) or 24000)

    pause_seconds = 0.15  # short silence inserted between consecutive sentences
    pause = torch.zeros(int(pause_seconds * sr_out), dtype=torch.float32)

    all_audio = []
    for i, chunk in enumerate(chunks, 1):
        wav = tts.tts(
            text=chunk,
            speaker_wav=ref_prepared.as_posix(),
            language=LANGUAGE,
        )

        # Pause goes *between* sentences only, so the output has no trailing silence.
        if all_audio:
            all_audio.append(pause)
        all_audio.append(torch.as_tensor(wav, dtype=torch.float32).flatten().cpu())

        print(f"Done chunk {i}/{len(chunks)}")

    audio = torch.cat(all_audio).numpy()
    sf.write(OUT_WAV.as_posix(), audio, sr_out)

print("Wrote:", OUT_WAV.resolve())
# Last expression of the cell: renders an inline audio player for the result.
Audio(OUT_WAV.as_posix())


Done chunk 1/1
Wrote: /home/SpeakerRec/BioVoice/data/tts/yoav_001_cloned.wav
