In [1]:
import os
# Set before the remaining imports execute; keep this line above them.
# NOTE(review): presumably read by audiocraft/Dora when `helper` is imported
# (the next cell's output prints a "Dora directory" line) -- confirm.
os.environ["AUDIOCRAFT_CLUSTER"] = "default"
from omegaconf import OmegaConf

import torch
import lightning as L
from IPython.display import Audio

# Project-local helpers: tokenizer/model/speaker-model loaders and the generation entry point.
from helper import load_tokenizer, load_voicecraftx, load_speaker_model, generate

# Inference settings for the editing task, read from the YAML file.
config = OmegaConf.load("config/inference/edit.yaml")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
"""
VoiceCraft-X supports 11 languages: "english", "chinese", "japanese", "korean", "spanish",
"french", "german", "italian", "portuguese", "dutch", and "polish".
"""
# Language of the text being edited.
LANGUAGE = "english"

# Cap on the generated length: 10 seconds; raise or lower it as needed.
config.MAX_LENGTH = 10

# The flag must be False for Chinese input text and True for every other language.
config.skip_zh_tn_model = LANGUAGE != "chinese"

# Load the speaker model first, then the tokenizers, then the VoiceCraft-X model.
campplus_session = load_speaker_model(config)

text_tokenizer, audio_tokenizer = load_tokenizer(config)
audio_tokenizer = audio_tokenizer.to(device)

# Load the model and move it to the selected device in one step.
model = load_voicecraftx(config).to(device)

2025-07-17 22:40:47,302 WETEXT INFO found existing fst: /home/ubuntu/miniconda3/envs/voicecraftx/lib/python3.10/site-packages/tn/en_tn_tagger.fst
2025-07-17 22:40:47,303 WETEXT INFO                     /home/ubuntu/miniconda3/envs/voicecraftx/lib/python3.10/site-packages/tn/en_tn_verbalizer.fst
2025-07-17 22:40:47,303 WETEXT INFO skip building fst for en_normalizer ...
Dora directory: /tmp/audiocraft_ubuntu


In [3]:
# Sample to edit: the recording, its transcript, and the transcript we want instead.
original_audio = "../data/samples/4446_2275_000003_000000.wav"
original_text = "She pushed him toward the big chair by the fire, and sat down on a stool at the opposite side of the hearth, her knees drawn up to her chin, laughing like a happy little girl."
target_text = "She pushed him toward the big chair by the fire, and swiftly arranged herself upon a low hassock near the flickering flames, her knees drawn up to her chin, laughing like a happy little girl."

# Derive sibling paths from the real file extension rather than str.replace(".wav", ...).
# replace() rewrites every ".wav" occurrence in the path and, for audio that is not a
# lowercase ".wav" file, returns the path unchanged -- the open(..., "w") below would
# then truncate the audio file itself.
audio_root, _audio_ext = os.path.splitext(original_audio)
audio_dir, audio_stem = os.path.split(audio_root)

# Write the transcript next to the audio, sharing its base name, for the aligner to pick up.
transcript_path = audio_root + ".txt"
with open(transcript_path, "w", encoding="utf-8") as wf:
    print(original_text, file=wf)


# Run Montreal Forced Aligner once per corpus directory to produce the alignment CSVs.
# NOTE(review): the English example below points at ../../data/samples while
# original_audio lives under ../data/samples -- adjust to the directory that actually
# holds the audio/transcript pair before running.
# os.system("mfa model download dictionary english_us_arpa")
# os.system("mfa model download acoustic english_us_arpa")
# os.system("mfa align -j 1 --output_format csv ../../data/samples english_us_arpa english_us_arpa ../../data/samples/mfa_alignments -t ../../data/samples/temp_cache")
# os.system("mfa model download dictionary mandarin_china_mfa")
# os.system("mfa model download acoustic mandarin_mfa")
# os.system("mfa align -j 1 --output_format csv ./demo mandarin_china_mfa mandarin_mfa ./demo/mfa_alignments -t ./demo/temp_cache")

# The alignment CSV is expected at <audio dir>/mfa_alignments/<audio base name>.csv,
# matching the output directory used by the `mfa align` examples above. Built with
# os.path so it also works for directories not named "samples" (e.g. ./demo).
alignment_path = os.path.join(audio_dir, "mfa_alignments", audio_stem + ".csv")

In [4]:
# Fix the random seed so repeated runs of this cell start from the same state.
L.seed_everything(seed=0)

# Number of generated samples to decode and play back.
# NOTE(review): n_samples is not passed to generate(); the loop below assumes
# `outputs` holds at least this many items -- confirm against helper.generate.
n_samples = 5

# Show the reference: the original transcript and the untouched recording.
print(f"Original text: {original_text}")
display(Audio(original_audio, rate=config.SAMPLE_RATE))

# Collect the arguments for the editing run in one place, then invoke generation.
generation_kwargs = dict(
    config=config,
    device=device,
    language=LANGUAGE,
    prompt_audio=original_audio,
    prompt_text=original_text,
    target_text=target_text,
    model=model,
    speaker_model=campplus_session,
    text_tokenizer=text_tokenizer,
    audio_tokenizer=audio_tokenizer,
    task="editing",
    alignment_path=alignment_path,
)
outputs = generate(**generation_kwargs)

# Decode each generated sample back to a waveform and render an audio player for it.
print(f"Target text: {target_text}")
for sample_idx in range(n_samples):
    decoded = audio_tokenizer.decode(outputs[sample_idx])
    # Take the first element of the decoder output and hand it to the player as a NumPy array.
    gen_audio = decoded[0].detach().cpu().numpy()
    display(Audio(gen_audio, rate=config.SAMPLE_RATE))

Seed set to 0


Original text: She pushed him toward the big chair by the fire, and sat down on a stool at the opposite side of the hearth, her knees drawn up to her chin, laughing like a happy little girl.


Target text: She pushed him toward the big chair by the fire, and swiftly arranged herself upon a low hassock near the flickering flames, her knees drawn up to her chin, laughing like a happy little girl.
