<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
# default_exp e2e

In [None]:
# export
import torch

from uberduck_ml_dev.text.symbols import NVIDIA_TACO2_SYMBOLS
from uberduck_ml_dev.text.util import text_to_sequence
from uberduck_ml_dev.data_loader import prepare_input_sequence

In [None]:
# Table-driven checks for `prepare_input_sequence`.
# Each case: (input lines, extra keyword arguments, expected padded ids, expected lengths).
_cases = [
    (
        ["One", "Two three"],
        {},
        [[52, 51, 42, 0, 0, 0, 0, 0, 0], [57, 60, 52, 11, 57, 45, 55, 42, 42]],
        [3, 9],
    ),
    (
        ["Two three", "one"],
        {"arpabet": 1},
        [[133, 141, 11, 134, 130, 113], [144, 74, 119, 0, 0, 0]],
        [6, 3],
    ),
]
for _lines, _kwargs, _expected_ids, _expected_lengths in _cases:
    text, input_lengths = prepare_input_sequence(_lines, cpu_run=True, **_kwargs)
    # Shorter rows are right-padded with 0 up to the longest row in the batch.
    assert text.equal(torch.LongTensor(_expected_ids))
    # Lengths are reported in the same order the lines were passed in.
    assert input_lengths.equal(torch.LongTensor(_expected_lengths))

In [None]:
# export

from typing import List

from uberduck_ml_dev.models.tacotron2 import Tacotron2
from uberduck_ml_dev.vocoders.hifigan import HiFiGanGenerator


@torch.no_grad()
def tts(
    lines: List[str],
    model,
    device: str,
    vocoder,
    arpabet=False,
    symbol_set=NVIDIA_TACO2_SYMBOLS,
    max_wav_value=32768.0,
    speaker_ids=None,
):
    """Synthesize audio for a batch of text lines and return it as one waveform.

    Each line is run through the text-to-mel ``model``; the predicted mels are
    trimmed to their predicted lengths, concatenated along the last (time)
    axis in input order, and vocoded in a single call.

    Runs under ``torch.no_grad()`` because this is inference only.

    Args:
        lines: Text lines to synthesize. Must not be empty.
        model: A ``Tacotron2`` instance.
        device: Device to run on, e.g. ``"cpu"`` or ``"cuda"``. A
            ``torch.device`` is accepted as well.
        vocoder: A ``HiFiGanGenerator`` instance.
        arpabet: Forwarded to ``prepare_input_sequence``.
        symbol_set: Forwarded to ``prepare_input_sequence``.
        max_wav_value: Factor the vocoder output is multiplied by.
        speaker_ids: Optional speaker ids, one per line (tensor or sequence of
            ints). Defaults to speaker ``0`` for every line.

    Returns:
        The vocoder output reshaped to ``(1, -1)`` and scaled by
        ``max_wav_value``.

    Raises:
        AssertionError: If ``model`` or ``vocoder`` has an unsupported type.
        ValueError: If ``lines`` is empty.
    """
    assert isinstance(
        model, Tacotron2
    ), "Only Tacotron2 text-to-mel models are supported"
    assert isinstance(vocoder, HiFiGanGenerator), "Only Hifi GAN vocoders are supported"
    if len(lines) == 0:
        raise ValueError("`lines` must contain at least one line of text")

    # Normalising through torch.device lets both "cpu" and torch.device("cpu") work.
    cpu_run = torch.device(device).type == "cpu"
    sequences, input_lengths = prepare_input_sequence(
        lines, cpu_run=cpu_run, arpabet=arpabet, symbol_set=symbol_set
    )
    # No-op when the tensors already live on `device`; otherwise keeps them on
    # the same device as `speaker_ids` below.
    sequences = sequences.to(device)
    input_lengths = input_lengths.to(device)

    if speaker_ids is None:
        speaker_ids = torch.zeros(len(lines), dtype=torch.long, device=device)
    else:
        # Caller-supplied ids may be a list or live on another device.
        speaker_ids = torch.as_tensor(speaker_ids, dtype=torch.long, device=device)

    input_ = sequences, input_lengths, speaker_ids
    _, mel_outputs_postnet, _, _, lengths = model.inference(input_)

    # Trim every item to its own predicted length, then join them in one
    # concatenation instead of growing the tensor inside a loop.
    trimmed = [
        mel_outputs_postnet[idx, :, : lengths[idx].item()]
        for idx in range(mel_outputs_postnet.size(0))
    ]
    mel = torch.cat(trimmed, dim=-1)[None, :]  # restore a leading batch axis of 1

    y_g_hat = vocoder(mel.to(device=device, dtype=torch.float))
    audio = y_g_hat.reshape(1, -1)
    return audio * max_wav_value

In [None]:
# skip
from IPython import display as ipd

from uberduck_ml_dev.data_loader import MelSTFT
from uberduck_ml_dev.models.tacotron2 import DEFAULTS as TACOTRON2_DEFAULTS
from uberduck_ml_dev.text.symbols import NVIDIA_TACO2_SYMBOLS
from uberduck_ml_dev.vocoders.hifigan import HiFiGanGenerator

from IPython.display import display, Audio

# Build the text-to-mel model from the default hyperparameters and restore
# trained weights from a local checkpoint (not shipped with the repository).
model = Tacotron2(TACOTRON2_DEFAULTS)
# map_location="cpu": this demo runs on the CPU, so tensors saved from a GPU
# must be remapped or loading fails on hosts without CUDA.
loaded = torch.load(
    "../models/tacotron2-eminem-arpabet-400-2021-12-14.pt", map_location="cpu"
)
model.load_state_dict(loaded)
hg = HiFiGanGenerator("../models/config_v1.json", "../models/g_02590000_8spk")
audio = tts(
    ["The quick brown fox jumped over the lazy dog."], model, "cpu", hg, arpabet=True
)
# Hand IPython a plain NumPy array: a tensor that still tracks gradients
# cannot be converted implicitly.
ipd.display(ipd.Audio(audio.detach().cpu().numpy(), rate=22050))

FileNotFoundError: [Errno 2] No such file or directory: '../models/tacotron2-eminem-arpabet-400-2021-12-14.pt'

In [None]:
# export

from typing import Optional

from uberduck_ml_dev.models.common import MelSTFT


@torch.no_grad()
def rhythm_transfer(
    original_audio: torch.Tensor,
    original_text: str,
    model,
    vocoder,
    device: str,
    symbol_set=NVIDIA_TACO2_SYMBOLS,
    arpabet=False,
    max_wav_value=32768.0,
    speaker_id=0,
):
    """Re-synthesize ``original_text`` using the alignment of ``original_audio``.

    The mel spectrogram of ``original_audio`` is fed to ``model.get_alignment``
    together with the encoded text; the resulting attention is then passed to
    ``model.inference_noattention`` and the predicted mel is vocoded.

    Args:
        original_audio: 1-D waveform tensor.
            NOTE(review): presumably float samples in [-1, 1] at the STFT's
            default sampling rate -- confirm against ``MelSTFT``.
        original_text: Transcript of ``original_audio``.
        model: Text-to-mel model exposing ``get_alignment`` and
            ``inference_noattention``.
        vocoder: Callable mapping a mel spectrogram to a waveform.
        device: Device to run on, e.g. ``"cpu"`` or ``"cuda"``. A
            ``torch.device`` is accepted as well.
        symbol_set: Forwarded to ``prepare_input_sequence``.
        arpabet: Forwarded to ``prepare_input_sequence``.
        max_wav_value: Factor the vocoder output is multiplied by.
        speaker_id: Speaker id used for the single-item batch.

    Returns:
        The vocoder output reshaped to ``(1, -1)`` and scaled by
        ``max_wav_value``.

    Raises:
        AssertionError: If ``original_audio`` is not 1-dimensional.
    """
    assert len(original_audio.shape) == 1
    # Normalising through torch.device lets both "cpu" and torch.device("cpu") work.
    cpu_run = torch.device(device).type == "cpu"
    # TODO(zach): Support non-default STFT parameters.
    stft = MelSTFT()
    # Only the first two return values are used. The star-unpack tolerates
    # both the 2-tuple the other call sites in this notebook rely on and a
    # longer tuple with trailing extras.
    sequence, input_lengths, *_ = prepare_input_sequence(
        [original_text], arpabet=arpabet, cpu_run=cpu_run, symbol_set=symbol_set
    )
    # Add a leading batch axis of 1 before computing the mel spectrogram.
    original_target_mel = stft.mel_spectrogram(original_audio[None]).to(device)
    max_len = original_target_mel.size(2)
    speaker_ids = torch.tensor([speaker_id], dtype=torch.long, device=device)
    inputs = (
        sequence,
        input_lengths,
        original_target_mel,
        max_len,
        torch.tensor([max_len], dtype=torch.long, device=device),
        speaker_ids,
    )
    attn = model.get_alignment(inputs)
    _, mel_postnet, _, _ = model.inference_noattention(
        (sequence, input_lengths, speaker_ids, attn.transpose(0, 1))
    )
    # `mel_postnet` is already a tensor: convert with `.to` instead of
    # copy-constructing through `torch.tensor`, which emits a UserWarning.
    y_g_hat = vocoder(mel_postnet.to(device=device, dtype=torch.float))
    audio = y_g_hat.reshape(1, -1)
    return audio * max_wav_value

In [None]:
# skip
import IPython.display as ipd
import numpy as np
from scipy.io.wavfile import read

# Transcript of the reference recording whose rhythm is transferred.
transcription = "Well you know as you know the web's a pretty miraculous thing and it was a very simple paradigm that was invented which was."

# Load the fixture and check that it has the format the demo expects:
# 22.05 kHz, one channel, 16-bit integer samples in a NumPy array.
sr, data = read("./test/fixtures/wavs/stevejobs-1.wav")
assert sr == 22050
assert data.ndim == 1
assert isinstance(data, np.ndarray)
assert data.dtype == np.int16

# Rescale the int16 samples by 32768 before handing them to the model.
data = torch.FloatTensor(data).div(32768.0)

# Reuses `model` and `hg` created by the earlier demo cell.
audio = rhythm_transfer(
    original_audio=data,
    original_text=transcription,
    model=model,
    vocoder=hg,
    device="cpu",
    arpabet=True,
)
player = ipd.Audio(audio, rate=22050)
ipd.display(player)

int16
int16


  y_g_hat = vocoder(torch.tensor(mel_postnet, dtype=torch.float, device=device))
