# YourTTS Converter

## TTS Model setup

### Download and install Coqui TTS

In [None]:
# !git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS
# %pip install -q -e TTS/

### Download TTS Checkpoint

In [None]:
# TTS checkpoints: fetch the config, language json, speakers json and model
# checkpoint with gdown. The `!` lines are IPython shell escapes, so this cell
# only runs inside a notebook/IPython session.
import os

# NOTE(review): only the checkpoint file is tested here, so a missing config or
# json file is not re-downloaded once best_model.pth.tar exists.
if not os.path.exists('best_model.pth.tar'):
    # download config (no -O given, so gdown chooses the output name;
    # presumably config.json, which a later cell loads - verify)
    ! gdown --id 1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP
    # download language json (no -O given; presumably language_ids.json,
    # which a later cell loads - verify)
    ! gdown --id 1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg
    # download speakers json
    ! gdown --id 1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC -O speakers.json
    # download checkpoint
    # (alternative checkpoint id, currently disabled)
    # ! gdown --id 1j1TuaCGTizpuHtKPNclkxtWamfpTT1Su -O best_model.pth.tar
    ! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar

### Imports

In [None]:
import sys
# Directory of the Coqui TTS checkout, relative to the working directory.
TTS_PATH = "TTS/"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally

import os

import IPython
import torch
import torch.nn.functional as F

from TTS.config import load_config
from TTS.tts.models import setup_model
# NOTE(review): wildcard import; presumably where `sequence_mask` and
# `generate_path`, used by the conversion and test cells, come from - confirm.
from TTS.tts.models.vits import *
from TTS.utils.audio import AudioProcessor

### Paths definition

In [None]:
# Use the GPU whenever torch reports that CUDA is available.
USE_CUDA = torch.cuda.is_available()

# Model artifacts, resolved relative to the working directory.
CONFIG_PATH = 'config.json'
MODEL_PATH = 'best_model.pth.tar'
TTS_SPEAKERS = "speakers.json"
TTS_LANGUAGES = "language_ids.json"

### Restore model

In [None]:
# Parse the model configuration, then build the audio processor from its
# `audio` section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

# Module-level placeholder; this cell only assigns it.
speaker_embedding = None

# Adjust the model arguments before the model is built: use the speakers file
# as the d-vector file and switch the speaker-encoder loss off.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a source you trust.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# Load every weight except the speaker-encoder ones: work on a copy of the
# state dict and drop each key that mentions "speaker_encoder".
model_weights = cp['model'].copy()
for key in [k for k in model_weights if "speaker_encoder" in k]:
    del model_weights[key]
model.load_state_dict(model_weights)

# Evaluation mode, on the GPU when one is available.
model.eval()
if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

## Speaker encoder setup

### Install helper libraries

In [None]:
# %pip install -q pydub ffmpeg-normalize

### Paths definition

In [None]:
# Local file names for the speaker-encoder config and checkpoint.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# Download both files with gdown. The `!` lines are IPython shell escapes and
# `$NAME` expands the Python variable of that name.
# NOTE(review): only the checkpoint is tested, so a missing config file is not
# re-downloaded once the checkpoint exists.
if not os.path.exists(CHECKPOINT_SE_PATH):
    # download config 
    ! gdown --id  19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
    # download checkpoint  
    ! gdown --id   17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH

### Imports

In [None]:
from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

### Load the Speaker encoder

In [None]:
# Build the speaker manager from the speaker-encoder checkpoint and config;
# later cells use it to compute the reference embedding and to export the
# encoder.
SE_speaker_manager = SpeakerManager(
    use_cuda=USE_CUDA,
    encoder_config_path=CONFIG_SE_PATH,
    encoder_model_path=CHECKPOINT_SE_PATH,
)

### Define helper function

In [None]:
def compute_spec(ref_file):
    """Return the spectrogram of ``ref_file`` as a float tensor.

    The audio is loaded with librosa at the audio processor's sample rate,
    passed through ``ap.spectrogram`` and given a new leading dimension.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)

## TTS

### Upload, normalize and resample your reference wav files

Please upload wav files

In [None]:
# Eleven reference clips, named 000000.wav through 000010.wav (zero-padded to
# six digits), from the Korean male speaker sample directory.
reference_files = [
    f'../samples/speaker_man_korean/{clip_no:>06d}.wav'
    for clip_no in range(11)
]

### Compute embedding

In [None]:
# Speaker embedding computed by the speaker manager from the reference clips.
# NOTE(review): later cells wrap this in torch.tensor([...]), so it is
# presumably a flat list of floats - confirm.
reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_files)

### Define inference variables

In [None]:
# Scaler for the duration predictor. The larger it is, the slower the speech.
model.length_scale = 1.0
# Noise variance applied to the random z vector at inference.
model.inference_noise_scale = 0.0
# Noise variance applied to the duration predictor z vector at inference.
model.inference_noise_scale_dp = 0.0
# Alternative, non-zero noise settings (currently disabled):
# model.inference_noise_scale = 0.3
# model.inference_noise_scale_dp = 0.3

### Choose language id

In [None]:
# Display the mapping held by the language manager so a language id can be
# picked in the next cell.
model.language_manager.language_id_mapping

In [None]:
# Id fed to the model's language embedding in the conversion and test cells
# (presumably one of the values of the mapping displayed above - confirm).
language_id = 0

### Convert the model

In [None]:
# Trace the speaker encoder with a random dummy input of shape (1, 129150) and
# save the resulting TorchScript module.
#
# The dummy input is created on the encoder's own device. The speaker manager
# is built with use_cuda=USE_CUDA, so on a CUDA machine the encoder presumably
# lives on the GPU and a CPU dummy would fail with a device mismatch. On a
# CPU-only machine this is identical to the previous behaviour.
# NOTE(review): a module traced on the GPU stores GPU tensors; load it with
# map_location="cpu" on machines without CUDA.
_se_device = next(SE_speaker_manager.speaker_encoder.parameters()).device
model_speaker_encoder = torch.jit.trace(
    SE_speaker_manager.speaker_encoder,
    torch.randn(1, 129150, device=_se_device),
)
model_speaker_encoder.save("../speaker_encoder.pt")

In [None]:

# Trace the VITS sub-modules one by one with torch.jit.trace and save each as
# its own TorchScript file. Every stage is run once on dummy inputs so that its
# outputs can serve as the example inputs of the next stage.
#
# The restore cell moves `model` to the GPU when CUDA is available, while all
# dummy inputs below are CPU tensors; tracing would then fail on the first
# module with a device mismatch. Move the model to the CPU first (a no-op when
# it is already there) so the export works on any machine.
model = model.cpu()

# Example token-id sequence used only to drive the traces.
dummy_text_inputs = torch.LongTensor(
    [[164,  58, 164,  56, 164, 163, 164,  53, 164,  55, 164,  42, 164,  56,
         164,  46, 164,  41, 164,  42, 164,  51, 164,  57, 164, 163, 164,  47,
         164,  52, 164,  42, 164, 163, 164,  39, 164,  46, 164,  41, 164,  42,
         164,  51, 164, 163, 164,  45, 164,  38, 164,  56, 164, 163, 164,  38,
         164,  51, 164,  51, 164,  52, 164,  58, 164,  51, 164,  40, 164,  42,
         164,  41, 164, 163, 164,  38, 164, 163, 164,  58, 164,  56, 164, 163,
         164,  39, 164,  38, 164,  51, 164, 163, 164,  52, 164,  51, 164, 163,
         164,  55, 164,  58, 164,  56, 164,  56, 164,  46, 164,  38, 164,  51,
         164, 163, 164,  52, 164,  46, 164,  49, 164, 163, 164,  38, 164,  51,
         164,  41, 164, 163, 164,  52, 164,  57, 164,  45, 164,  42, 164,  55,
         164, 163, 164,  42, 164,  51, 164,  42, 164,  55, 164,  44, 164,  62,
         164, 163, 164,  46, 164,  50, 164,  53, 164,  52, 164,  55, 164,  57,
         164,  56, 164, 157, 164, 163, 164,  55, 164,  38, 164,  50, 164,  53,
         164,  46, 164,  51, 164,  44, 164, 163, 164,  58, 164,  53, 164, 163,
         164,  38, 164, 163, 164,  53, 164,  55, 164,  42, 164,  56, 164,  56,
         164,  58, 164,  55, 164,  42, 164, 163, 164,  40, 164,  38, 164,  50,
         164,  53, 164,  38, 164,  46, 164,  44, 164,  51, 164, 163, 164,  52,
         164,  51, 164, 163, 164,  50, 164,  52, 164,  56, 164,  40, 164,  52,
         164,  60, 164, 163, 164,  46, 164,  51, 164, 163, 164,  55, 164,  42,
         164,  57, 164,  38, 164,  49, 164,  46, 164,  38, 164,  57, 164,  46,
         164,  52, 164,  51, 164, 163, 164,  43, 164,  52, 164,  55, 164, 163,
         164,  57, 164,  45, 164,  42, 164, 163, 164,  46, 164,  51, 164,  59,
         164,  38, 164,  56, 164,  46, 164,  52, 164,  51, 164, 163, 164,  52,
         164,  43, 164, 163, 164,  58, 164,  48, 164,  55, 164,  38, 164,  46,
         164,  51, 164,  42, 164, 159, 164]]
)
# One-element tensor holding the sequence length (second dimension).
dummy_text_inputs_lengths = torch.tensor(dummy_text_inputs.shape[1:2])

dummy_sid = None
# Speaker conditioning: the reference embedding with a batch dimension in
# front and a trailing singleton dimension.
dummy_g = torch.tensor([reference_emb]).unsqueeze(-1)
dummy_lid = torch.tensor([language_id])

# Language embedding.
args = (dummy_lid,)
model_jit_emb_l = torch.jit.trace(
    model.emb_l, args,
)
model_jit_emb_l.save('emb_l.pt')
dummy_lang_emb = model_jit_emb_l(*args).unsqueeze(-1)

# Text encoder.
args = (dummy_text_inputs, dummy_text_inputs_lengths, dummy_lang_emb,)
model_jit_text_encoder = torch.jit.trace(
    model.text_encoder, args,
)
model_jit_text_encoder.save('text_encoder.pt')
dummy_x, dummy_m_p, dummy_logs_p, dummy_x_mask = model_jit_text_encoder(*args)

# Duration predictor, traced with a True flag and the configured
# duration-predictor noise scale (saved as the "reversed" variant).
# NOTE(review): torch.zeros(0) looks like a placeholder for an argument that
# is unused in this mode - confirm against the duration predictor's signature.
args = (dummy_x, dummy_text_inputs_lengths, dummy_x_mask, torch.zeros(0), dummy_g, dummy_lang_emb, torch.tensor(True), torch.tensor(model.inference_noise_scale_dp))
model_jit_duration_predictor_reversed = torch.jit.trace(
    model.duration_predictor, args,
)
model_jit_duration_predictor_reversed.save('duration_predictor_reversed.pt')
dummy_logw = model_jit_duration_predictor_reversed(*args)

# Plain (untraced) glue: turn the predicted log-durations into per-token
# durations, derive the output length and masks, and build the alignment path.
dummy_w = torch.exp(dummy_logw) * dummy_x_mask * model.length_scale
dummy_w_ceil = torch.ceil(dummy_w)
dummy_y_lengths = torch.clamp_min(torch.sum(dummy_w_ceil, [1, 2]), 1).long()
dummy_y_mask = sequence_mask(dummy_y_lengths, dummy_y_lengths.data.max()).to(dummy_x_mask.dtype)
dummy_attn_mask = torch.unsqueeze(dummy_x_mask, 2) * torch.unsqueeze(dummy_y_mask, -1)
dummy_attn = generate_path(dummy_w_ceil.squeeze(1), dummy_attn_mask.squeeze(1).transpose(1, 2))

# Expand the text-encoder statistics along the alignment path.
dummy_m_p = torch.matmul(dummy_attn.transpose(1, 2), dummy_m_p.transpose(1, 2)).transpose(1, 2)
dummy_logs_p = torch.matmul(dummy_attn.transpose(1, 2), dummy_logs_p.transpose(1, 2)).transpose(1, 2)

# Sample the latent; with inference_noise_scale == 0 this is just the mean.
dummy_z_p = dummy_m_p + torch.randn_like(dummy_m_p) * torch.exp(dummy_logs_p) * model.inference_noise_scale

# The flow is traced twice, once with a False flag (flow.pt) and once with a
# True flag (flow_reversed.pt). A trace records a single execution path, so
# presumably each value of the flag needs its own exported module.
args = (dummy_z_p, dummy_y_mask, dummy_g, torch.tensor(False),)
model_jit_flow = torch.jit.trace(
    model.flow, args,
)
model_jit_flow.save('flow.pt')

args = (dummy_z_p, dummy_y_mask, dummy_g, torch.tensor(True),)
model_jit_flow_reversed = torch.jit.trace(
    model.flow, args,
)
model_jit_flow_reversed.save('flow_reversed.pt')
dummy_z = model_jit_flow_reversed(*args)

# Waveform decoder, fed the masked latent cut to model.max_inference_len.
args = ((dummy_z * dummy_y_mask)[:, :, : model.max_inference_len], dummy_g,)
model_jit_waveform_decoder = torch.jit.trace(
    model.waveform_decoder, args,
)
model_jit_waveform_decoder.save('waveform_decoder.pt')
dummy_o = model_jit_waveform_decoder(*args)

In [None]:
# Trace and save the posterior encoder.
# Source and target speaker conditions are both built from the same reference
# embedding; only the source one is used by the trace in this cell.
dummy_speaker_cond_src = torch.tensor([reference_emb])
dummy_speaker_cond_tgt = torch.tensor([reference_emb])
# Random stand-in for the encoder input, shape (1, 513, 241).
# NOTE(review): presumably (batch, spectrogram bins, frames) - confirm.
dummy_y = torch.randn(1, 513, 241)
dummy_y_lengths = torch.tensor([dummy_y.size(2)])

# Normalise the embeddings and add a trailing singleton dimension.
# NOTE(review): the main conversion cell builds `dummy_g` without F.normalize;
# confirm which form the model expects.
dummy_g_src = F.normalize(dummy_speaker_cond_src).unsqueeze(-1)
dummy_g_tgt = F.normalize(dummy_speaker_cond_tgt).unsqueeze(-1)

args = (dummy_y, dummy_y_lengths, dummy_g_src,)
model_jit_posterior_encoder = torch.jit.trace(
    model.posterior_encoder, args,
)
model_jit_posterior_encoder.save('posterior_encoder.pt')
# Run the traced module once; the second and third outputs are discarded.
dummy_z, _, _, dummy_y_mask = model_jit_posterior_encoder(*args)

### Test

In [None]:

# Smoke test: run the traced modules end to end on a shorter token sequence
# than the one used for tracing, and write the result to a wav file.
dummy_text_inputs = torch.LongTensor(
    [[164,  58, 164,  56, 164, 163, 164,  53, 164,  55, 164,  42, 164,  56,
         164,  46, 164,  41, 164,  42, 164,  51, 164,  57, 164]]
)
# One-element tensor holding the sequence length (second dimension).
dummy_text_inputs_lengths = torch.tensor(dummy_text_inputs.shape[1:2])

dummy_sid = None
# NOTE(review): the posterior-encoder cell applies F.normalize to the
# embedding before use, this cell does not - confirm which the model expects.
dummy_g = torch.tensor([reference_emb]).unsqueeze(-1)
dummy_lid = torch.tensor([language_id])

# Language embedding (traced module).
args = (dummy_lid,)
dummy_lang_emb = model_jit_emb_l(*args).unsqueeze(-1)

# Text encoder (traced module).
args = (dummy_text_inputs, dummy_text_inputs_lengths, dummy_lang_emb,)
dummy_x, dummy_m_p, dummy_logs_p, dummy_x_mask = model_jit_text_encoder(*args)

# Duration predictor (traced module), called with the same argument layout
# that was used when it was traced.
args = (dummy_x, dummy_text_inputs_lengths, dummy_x_mask, torch.zeros(0), dummy_g, dummy_lang_emb, torch.tensor(True), torch.tensor(model.inference_noise_scale_dp))
dummy_logw = model_jit_duration_predictor_reversed(*args)

# Plain glue: log-durations -> durations -> output length, masks and
# alignment path.
dummy_w = torch.exp(dummy_logw) * dummy_x_mask * model.length_scale
dummy_w_ceil = torch.ceil(dummy_w)
dummy_y_lengths = torch.clamp_min(torch.sum(dummy_w_ceil, [1, 2]), 1).long()
dummy_y_mask = sequence_mask(dummy_y_lengths, dummy_y_lengths.data.max()).to(dummy_x_mask.dtype)
dummy_attn_mask = torch.unsqueeze(dummy_x_mask, 2) * torch.unsqueeze(dummy_y_mask, -1)
dummy_attn = generate_path(dummy_w_ceil.squeeze(1), dummy_attn_mask.squeeze(1).transpose(1, 2))

# Expand the text-encoder statistics along the alignment path.
dummy_m_p = torch.matmul(dummy_attn.transpose(1, 2), dummy_m_p.transpose(1, 2)).transpose(1, 2)
dummy_logs_p = torch.matmul(dummy_attn.transpose(1, 2), dummy_logs_p.transpose(1, 2)).transpose(1, 2)

# Sample the latent; with inference_noise_scale == 0 this is just the mean.
dummy_z_p = dummy_m_p + torch.randn_like(dummy_m_p) * torch.exp(dummy_logs_p) * model.inference_noise_scale

# Flow traced with the True flag.
args = (dummy_z_p, dummy_y_mask, dummy_g, torch.tensor(True),)
dummy_z = model_jit_flow_reversed(*args)

# Waveform decoder on the masked latent, cut to model.max_inference_len.
args = ((dummy_z * dummy_y_mask)[:, :, : model.max_inference_len], dummy_g,)
dummy_o = model_jit_waveform_decoder(*args)

# Take the first item / first channel as a float64 numpy array and save it.
# NOTE(review): .numpy() needs a CPU tensor, so this assumes the traced
# modules run on the CPU.
wav = dummy_o.detach().double().numpy()[0, 0]
ap.save_wav(wav, '../../output-tts.wav')

# Displayed as the cell output: the shape of the generated waveform array.
wav.shape