In [1]:
# Notebook setup: environment, imports, inference config and compute device.
# NOTE(review): AUDIOCRAFT_CLUSTER is set before the remaining imports; presumably
# audiocraft (likely pulled in through `helper`) reads it at import time, so keep
# this assignment above those imports -- confirm before reordering.
import os
os.environ["AUDIOCRAFT_CLUSTER"] = "default"
from omegaconf import OmegaConf

import torch
import lightning as L
from IPython.display import Audio

from helper import load_tokenizer, load_voicecraftx, load_speaker_model, generate

# Inference settings, loaded from a path relative to the current working directory.
config = OmegaConf.load("config/inference/tts.yaml")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
"""
VoiceCraft-X supports 11 languages: "english", "chinese", "japanese", "korean", "spanish",
"french", "german", "italian", "portuguese", "dutch", and "polish".
"""
# Languages this notebook accepts for LANGUAGE (lower-case names).
SUPPORTED_LANGUAGES = (
    "english", "chinese", "japanese", "korean", "spanish", "french",
    "german", "italian", "portuguese", "dutch", "polish",
)

LANGUAGE = "english"
# Fail fast on typos or wrong casing: an unrecognised value (e.g. "Chinese")
# would otherwise silently take the non-Chinese branch below.
if LANGUAGE not in SUPPORTED_LANGUAGES:
    raise ValueError(
        f"Unsupported LANGUAGE {LANGUAGE!r}; "
        f"expected one of: {', '.join(SUPPORTED_LANGUAGES)}"
    )

config.MAX_LENGTH = 10  # 10 seconds, you can increase/decrease it

# The Chinese text-normalization model is only kept (not skipped) for Chinese input.
config.skip_zh_tn_model = LANGUAGE != "chinese"

# Speaker model; passed to generate() as `speaker_model` in the generation cell.
campplus_session = load_speaker_model(config)

# Text and audio tokenizers; the audio tokenizer is moved to the compute device.
text_tokenizer, audio_tokenizer = load_tokenizer(config)
audio_tokenizer = audio_tokenizer.to(device)

# VoiceCraft-X model, moved to the same device as the audio tokenizer.
model = load_voicecraftx(config)
model = model.to(device)

2025-07-17 22:38:55,136 WETEXT INFO found existing fst: /home/ubuntu/miniconda3/envs/voicecraftx/lib/python3.10/site-packages/tn/en_tn_tagger.fst
2025-07-17 22:38:55,137 WETEXT INFO                     /home/ubuntu/miniconda3/envs/voicecraftx/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst
2025-07-17 22:38:55,138 WETEXT INFO skip building fst for en_normalizer ...
Dora directory: /tmp/audiocraft_ubuntu


In [3]:
# Inputs for generation (all three are passed to generate() in the next cell):
#   prompt_audio - path to the reference recording, relative to the working directory
#   prompt_text  - text paired with the prompt audio
#                  (presumably its exact transcript -- verify against the sample)
#   target_text  - the sentence to synthesize
prompt_audio = "../data/samples/84_121123_000008_000000.wav"
prompt_text = "Villefort rose, half ashamed of being surprised in such a paroxysm of grief."
target_text = "As the doctors entered the street, they saw a man in a cassock standing on the threshold of the next door."

# Alternative Chinese example. To use it, uncomment the three lines below AND set
# LANGUAGE = "chinese" in the model-loading cell, then re-run that cell.
# NOTE: the path below is an absolute path from the author's machine; point it at
# your local copy of the sample.
# prompt_audio = "/home/ubuntu/VoiceCraft-X/data/samples/X0000000021_240514196_S00041.wav"
# prompt_text = "跟随的炮船向上方发出的雷达振波也传回了完全不可理解的回波。"
# target_text = "中国社会都教育自己的孩子要少说话，敏于思而辣于言。"

In [4]:
# Seed the random number generators so repeated runs give the same samples.
L.seed_everything(seed=0)
# Maximum number of generated samples to decode and play back.
n_samples = 5

print(f"Prompt text: {prompt_text}")
display(Audio(prompt_audio, rate=config.SAMPLE_RATE))

# Generate audio token sequences for `target_text`, conditioned on the prompt.
outputs = generate(
    config=config,
    device=device,
    language=LANGUAGE,
    prompt_audio=prompt_audio,
    prompt_text=prompt_text,
    target_text=target_text,
    model=model,
    speaker_model=campplus_session,
    text_tokenizer=text_tokenizer,
    audio_tokenizer=audio_tokenizer,
)

print(f"Target text: {target_text}")

# `n_samples` is not passed to generate(), so the number of returned samples is
# decided elsewhere. Slice rather than index with range(n_samples) so the cell
# does not raise IndexError when fewer samples come back.
n_available = len(outputs)
if n_available < n_samples:
    print(f"Only {n_available} of the requested {n_samples} samples were generated.")

# Decoding is inference only; skip autograd bookkeeping.
with torch.no_grad():
    for codes in outputs[:n_samples]:
        gen_audio = audio_tokenizer.decode(codes)
        # Take the first item of the decoded result and convert it for playback.
        gen_audio = gen_audio[0].detach().cpu().numpy()
        display(Audio(gen_audio, rate=config.SAMPLE_RATE))

Seed set to 0


Prompt text: Villefort rose, half ashamed of being surprised in such a paroxysm of grief.


Target text: As the doctors entered the street, they saw a man in a cassock standing on the threshold of the next door.
