In [None]:
# !apt install aria2
# !pip3 install youtube-dl

In [None]:
# !youtube-dl -o "tmp.m4a" -f "bestaudio[ext=m4a]" --external-downloader aria2c --external-downloader-args "-j 16 -x 16 -s 16 -k 1M"  https://www.youtube.com/watch?v=G0d8nrMYMro

In [None]:
# !ffmpeg -ss 00:40:10.00 -i tmp.m4a -t 00:01:00.00 -c copy $TARGET_VOICE_FILE_PATH

In [11]:
# Root directory; every other path in this cell is derived from it.
DATA_PATH = '/content'

# Directories: output folder and the TTS checkout that is added to sys.path.
OUT_PATH = DATA_PATH + '/out/'
TTS_PATH = DATA_PATH + '/TTS/'

# TTS model: checkpoint, config, and the speaker / language tables.
MODEL_PATH = DATA_PATH + '/best_model.pth.tar'
CONFIG_PATH = DATA_PATH + '/config.json'
TTS_SPEAKERS_PATH = DATA_PATH + '/speakers.json'
TTS_LANGUAGES_PATH = DATA_PATH + '/language_ids.json'

# Encoder checkpoint and config handed to the speaker manager.
CHECKPOINT_SE_PATH = DATA_PATH + '/SE_checkpoint.pth.tar'
CONFIG_SE_PATH = DATA_PATH + '/config_se.json'

In [12]:
# Input recordings. No active cell in this notebook creates them, so both
# files must already exist before the normalization step runs.
TARGET_VOICE_FILE_PATH = DATA_PATH + '/target.ogg'  # voice used as the conversion / synthesis target
SOURCE_VOICE_FILE_PATH = DATA_PATH + '/source.ogg'  # recording whose speech gets converted

# Normalized copies of the two recordings, written by norm().
NORM_TARGET_VOICE_FILE_PATH = DATA_PATH + '/norm_target_voice.wav'
NORM_SOURCE_VOICE_FILE_PATH = DATA_PATH + '/norm_source_voice.wav'

In [13]:
# Google Drive file IDs, one per artifact fetched with gdown.
# TTS model: checkpoint, config, speaker table, language table.
MODEL_FILE_ID = '1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR'
CONFIG_FILE_ID = '1-PfXD66l1ZpsZmJiC-vhL055CDSugLyP'
TTS_SPEAKERS_FILE_ID = '1SZ9GE0CBM-xGstiXH2-O2QWdmSXsBKdC'
TTS_LANGUAGES_FILE_ID = '1_Vb2_XHqcC0OcvRF82F883MTxfTRmerg'
# Encoder used by the speaker manager: checkpoint and config.
CHECKPOINT_SE_FILE_ID = '17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X'
CONFIG_SE_FILE_ID = '19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1'

In [None]:
!gdown --id $MODEL_FILE_ID -O $MODEL_PATH
!gdown --id $CONFIG_FILE_ID -O $CONFIG_PATH
!gdown --id $CONFIG_SE_FILE_ID -O $CONFIG_SE_PATH
!gdown --id $TTS_SPEAKERS_FILE_ID -O $TTS_SPEAKERS_PATH
!gdown --id $TTS_LANGUAGES_FILE_ID -O $TTS_LANGUAGES_PATH
!gdown --id $CHECKPOINT_SE_FILE_ID -O $CHECKPOINT_SE_PATH

In [None]:
# Clone the `multilingual-torchaudio-SE` branch of the Edresson/Coqui-TTS
# repository into a `TTS` directory under the current working directory.
!git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS

In [19]:
!pip3 install -q -e TTS/
!pip3 install -q torchaudio==0.9.0
!pip3 install -q ffmpeg-normalize==1.21.0

In [5]:
import os
import gc
import sys
import json
import torch
import soundfile
import subprocess

sys.path.append(TTS_PATH)

import TTS
import TTS.tts
import TTS.utils
import TTS.config

In [None]:
# Inference only: switch autograd off globally and make sure the output
# directory exists.
torch.set_grad_enabled(False)
os.makedirs(OUT_PATH, exist_ok=True)

# Query CUDA once; `device` is the string form of the same decision.
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

# Keyword arguments for the speaker manager (encoder checkpoint + config).
sm_params = dict(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=use_cuda,
)

# Load the model config and override two model args before anything is
# built from it.
cfg = TTS.config.load_config(CONFIG_PATH)
cfg.model_args['d_vector_file'] = TTS_SPEAKERS_PATH
cfg.model_args['use_speaker_encoder_as_loss'] = False

# Speaker manager (computes d-vectors) and audio processor (spectrograms, wav I/O).
sm = TTS.tts.utils.speakers.SpeakerManager(**sm_params)
ap = TTS.utils.audio.AudioProcessor(**cfg.audio)

# Build the model from the config and register the language IDs.
model = TTS.tts.models.setup_model(cfg)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES_PATH)

# Load the checkpoint and discard every weight whose key mentions
# 'speaker_encoder' before handing the state dict to the model.
checkpoint = torch.load(MODEL_PATH, map_location=device)
model_weights = checkpoint['model'].copy()
for key in [name for name in model_weights if 'speaker_encoder' in name]:
    model_weights.pop(key)

model.load_state_dict(model_weights)
model = model.to(device)
model.eval()

In [14]:
def norm(path, out_path):
    """Normalize an audio file with the ``ffmpeg-normalize`` command-line tool.

    The tool is invoked with RMS normalization (``-nt rms``) to a target
    level of -27 (``-t -27``), an output sample rate of 16000 Hz
    (``-ar 16000``) and ``-f`` so an existing ``out_path`` is overwritten.

    Args:
        path: Path of the input audio file.
        out_path: Path the normalized audio is written to.

    Raises:
        subprocess.CalledProcessError: If ``ffmpeg-normalize`` exits with a
            non-zero status.
    """
    cmd = ('ffmpeg-normalize', path,
           '-f', '-nt', 'rms', '-t', '-27',
           '-ar', '16000', '-o', out_path)
    # check_call (not call): a failed normalization must surface here instead
    # of later, when a downstream cell cannot find the output file.
    subprocess.check_call(cmd)

# Normalize both recordings: target first, then source.
for raw_path, norm_path in (
    (TARGET_VOICE_FILE_PATH, NORM_TARGET_VOICE_FILE_PATH),
    (SOURCE_VOICE_FILE_PATH, NORM_SOURCE_VOICE_FILE_PATH),
):
    norm(raw_path, norm_path)

In [None]:
# Voice conversion: pass the source recording through the model together with
# the source and target speaker embeddings.

# d-vectors for both normalized clips, plus the source spectrogram.
source_emb = sm.compute_d_vector_from_clip(NORM_SOURCE_VOICE_FILE_PATH)
target_emb = sm.compute_d_vector_from_clip(NORM_TARGET_VOICE_FILE_PATH)
source_audio, _ = soundfile.read(NORM_SOURCE_VOICE_FILE_PATH)
source_spec = ap.spectrogram(source_audio)

# Add a leading dimension of size 1 and move every input to the model's device.
target_emb = torch.FloatTensor(target_emb).unsqueeze(0).to(device)
source_emb = torch.FloatTensor(source_emb).unsqueeze(0).to(device)
source_spec = torch.FloatTensor(source_spec).unsqueeze(0).to(device)
# Length of the spectrogram along its last axis.
y_lengths = torch.tensor([source_spec.size(-1)]).to(device)

wav, _, _ = model.voice_conversion(source_spec, y_lengths, source_emb,  target_emb)
# Tensor.numpy() only works for CPU tensors, so bring the result back from
# the GPU first; without .cpu() this line raises whenever device == 'cuda'.
# NOTE: the text-to-speech cell further down also writes 'out.wav' and will
# overwrite this file.
ap.save_wav(wav.detach().cpu().numpy(), 'out.wav')
gc.collect()

In [None]:
# Text-to-speech conditioned on the target speaker's d-vector.

# Inference settings on the model.
model.length_scale = 2.3
model.inference_noise_scale = 0.3
model.inference_noise_scale_dp = 0.3

language_id = model.language_manager.language_id_mapping['en']

# Recompute the d-vectors from the normalized clips; the voice-conversion cell
# rebinds these names to device tensors.
source_emb = sm.compute_d_vector_from_clip(NORM_SOURCE_VOICE_FILE_PATH)
target_emb = sm.compute_d_vector_from_clip(NORM_TARGET_VOICE_FILE_PATH)

# Keyword arguments forwarded to synthesis().
params = dict(
    style_wav=None,
    speaker_id=None,
    d_vector=target_emb,
    use_griffin_lim=True,
    do_trim_silence=False,
    language_id=language_id,
    enable_eos_bos_chars=cfg.enable_eos_bos_chars,
)

text = "It took me quite a long time to develop a voice, and now, that I have it, I am not going to be silent."

# synthesis() returns a mapping; its four values are unpacked positionally.
synthesis_result = TTS.tts.utils.synthesis.synthesis(
    model, text, cfg, use_cuda, ap, **params
)
wav, alignment, _, _ = synthesis_result.values()
ap.save_wav(wav, 'out.wav')
gc.collect()