In [1]:
from IPython.display import Audio

import torch
import torchaudio
from tqdm import tqdm

from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
)
from models import voicecraft
from inference_tts_scale import inference_one_sample

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch's global RNG so that sampling which draws from it repeats across
# "Restart Kernel -> Run All" (as far as the backend itself is deterministic).
# Change SEED to get a different take of the same text.
SEED = 1
torch.manual_seed(SEED)

# Hyperparameters for inference
CODEC_AUDIO_SR = 16000  # also used as the playback rate of the generated audio
CODEC_SR = 50
TOP_K = 0
TOP_P = 0.8
TEMPERATURE = 0.8
KVCACHE = 1
SILENCE_TOKENS = [1388, 1898, 131]

# Adjust the following params if the generation is not good.
## If there are long silences in the generated audio, reduce STOP_REPETITION to 3, 2 or even 1.
STOP_REPETITION = 1
## If there are long silences or unnaturally stretched words, increase SAMPLE_BATCH_SIZE to 2, 3 or even 4.
## The model then runs SAMPLE_BATCH_SIZE examples of the same audio and picks the shortest one.
SAMPLE_BATCH_SIZE = 3

# Decoding configuration handed to inference_one_sample as a single dict.
DECODE_CONFIG = {
    "codec_audio_sr": CODEC_AUDIO_SR,
    "codec_sr": CODEC_SR,
    "top_k": TOP_K,
    "top_p": TOP_P,
    "temperature": TEMPERATURE,
    "kvcache": KVCACHE,
    "silence_tokens": SILENCE_TOKENS,
    "stop_repetition": STOP_REPETITION,
    "sample_batch_size": SAMPLE_BATCH_SIZE
}

# Model paths (relative to the notebook's working directory)
ENCODEC_CKPTPATH = "./pretrained_models/encodec_4cb2048_giga.th"
VOICECRAFT_CKPTPATH = "./pretrained_models/giga830M.pth"
# Alternative checkpoint:
# VOICECRAFT_CKPTPATH = "./pretrained_models/giga330M.pth"

In [3]:
# Initialize VC model
# The checkpoint is read onto the CPU first; the model is moved to DEVICE only
# after its weights have been loaded. Three checkpoint entries are used:
# "config" (model constructor argument), "model" (state dict) and "phn2num".
# NOTE(review): depending on the torch version / weights_only setting,
# torch.load may unpickle arbitrary objects — only load checkpoints from a
# trusted source.
ckpt = torch.load(VOICECRAFT_CKPTPATH, map_location="cpu")
model = voicecraft.VoiceCraft(ckpt["config"])
model.load_state_dict(ckpt["model"])
model.to(DEVICE)
# presumably the standard torch.nn.Module evaluation-mode switch — confirm in models/voicecraft
model.eval()

# Phoneme mapper (stored inside the checkpoint, passed on to inference later)
phn2num = ckpt['phn2num']

# Tokenizers
text_tokenizer = TextTokenizer(backend="espeak")
# Original note: constructing the tokenizer will also put the neural codec model on gpu.
# NOTE(review): no device is passed here, so placement is decided inside AudioTokenizer —
# TODO confirm what happens on CPU-only hosts.
audio_tokenizer = AudioTokenizer(signature=ENCODEC_CKPTPATH) # will also put the neural codec model on gpu

Dora directory: /tmp/audiocraft_root


In [4]:
# Prompt setup: the audio up to CUTOFF_SEC must line up with the transcript
# given in TEXT_PROMPT, which is the start of what is spoken in the file.
AUDIO_PROMPT_PATH = "./examples/84_121550_000074_000000.wav"
TEXT_PROMPT = "But when I had approached so near to them the common"
CUTOFF_SEC = 3.02

# Read only the file's metadata and derive its length in seconds.
audio_info = torchaudio.info(AUDIO_PROMPT_PATH)
audio_duration = audio_info.num_frames / audio_info.sample_rate

# The cut-off has to fall strictly inside the clip.
assert audio_duration > CUTOFF_SEC, f"CUTOFF_SEC {CUTOFF_SEC} is longer than the audio duration {audio_duration}"

# Convert the cut-off from seconds to a frame index (int() truncates toward zero).
prompt_end_frame = int(audio_info.sample_rate * CUTOFF_SEC)

In [5]:
# Text to synthesize: one utterance per non-blank line. Each line is generated
# on its own, always as a continuation of TEXT_PROMPT.
APPEND_TRANSCRIPTS_STR = """
Do you believe that this was completely generated? 

From just 3 seconds of audio as prompt? 

I don't believe it either. But it's the complete truth.

Be afraid.
"""

# Strip each line *before* testing for emptiness, so that lines containing
# only spaces/tabs are dropped instead of turning into empty transcripts
# (which would trigger a generation for the bare prompt).
append_transcripts = [
    line.strip()
    for line in APPEND_TRANSCRIPTS_STR.split("\n")
    if line.strip()
]

# Full text for each generation: the prompt transcript followed by the new utterance.
input_texts = [f"{TEXT_PROMPT} {at}" for at in append_transcripts]

In [6]:
# Synthesize one clip per input text, reusing the same audio prompt every time.
# The list is rebuilt from scratch, so re-running this cell does not accumulate clips.
generated_audios = []
for input_text in tqdm(input_texts):
    # inference_one_sample returns a pair; only the second element is kept.
    _, gen_audio = inference_one_sample(
        model,
        ckpt["config"],
        phn2num,
        text_tokenizer,
        audio_tokenizer,
        AUDIO_PROMPT_PATH,
        input_text,
        DEVICE,
        DECODE_CONFIG,
        prompt_end_frame,
    )
    # Keep the first item of the result and move it to the CPU before storing it.
    gen_audio = gen_audio[0].cpu()
    generated_audios.append(gen_audio)

100% 4/4 [01:21<00:00, 20.30s/it]


In [7]:
# Join all generated clips end to end along dim 1 and render an audio player.
# assumes each clip is 2-D with samples on dim 1 — TODO confirm against inference_one_sample
concatenated_gen_audio = torch.cat(generated_audios, dim=1)
display(
    Audio(
        data=concatenated_gen_audio,
        rate=CODEC_AUDIO_SR,
    )
)