In [None]:
import os
import tempfile

import torch
from dotenv import load_dotenv
from IPython.display import Audio, display
from melo.api import TTS
from path import Path

from openvoice import se_extractor
from openvoice.api import ToneColorConverter

In [None]:
# Load variables from a local .env file (if there is one) into the process
# environment; a later cell reads REFERENCE_SPEAKER from it.
load_dotenv()

# Folder that holds the tone color converter's config.json and checkpoint.pth.
ckpt_converter = 'checkpoints_v2/converter'

# Use the first CUDA device when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Generated audio files are written below the system temporary directory.
output_dir = Path(tempfile.gettempdir()) / 'outputs_v2'
print(device)

In [None]:
# Build the converter from its JSON config, then load its checkpoint file.
converter_config = ckpt_converter + '/config.json'
converter_checkpoint = ckpt_converter + '/checkpoint.pth'
tone_color_converter = ToneColorConverter(converter_config, device=device)
tone_color_converter.load_ckpt(converter_checkpoint)

# Make sure the output folder exists; an already existing folder is not an error.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Path of the audio sample whose voice should be cloned. It is read from the
# REFERENCE_SPEAKER environment variable, which load_dotenv() fills from .env.
reference_speaker = os.getenv('REFERENCE_SPEAKER')
if not reference_speaker:
    raise ValueError('Specify reference speaker in .env file')
# Fail early with a message that names the offending path, instead of letting a
# wrong path surface as an unrelated error inside the extractor.
if not os.path.isfile(reference_speaker):
    raise FileNotFoundError(
        f'Reference speaker audio not found: {reference_speaker!r} '
        '(check REFERENCE_SPEAKER in the .env file)'
    )

# Extract the target embedding from the reference audio (presumably the speaker's
# tone color embedding -- confirm against se_extractor). The second return value
# is not used in this notebook.
target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)

# Intermediate file: the TTS cell writes the base-voice audio here and the
# converter reads it back as its source.
src_path = output_dir / 'tmp.wav'

In [None]:
# Language code handed to the TTS model. Values noted by the author: EN_NEWEST, JP.
language = 'JP'
model = TTS(language=language, device=device)

# The synthesis cell always passes speaker_id=0, so a model that exposes more
# than one speaker id is rejected here rather than silently picking one.
speaker_ids = model.hps.data.spk2id
if len(speaker_ids) > 1:
    raise ValueError(f'There are several speaker ids: {speaker_ids}')

In [None]:
# Text to synthesize.
text = "が"

# The base speaker embedding file is named after the language code, lower-cased
# and with '_' replaced by '-'.
speaker_key = '-'.join(language.lower().split('_'))
source_se_file = 'checkpoints_v2/base_speakers/ses/' + speaker_key + '.pth'
# NOTE(review): depending on the torch version, torch.load may unpickle arbitrary
# objects -- only load checkpoint files that come from a trusted source.
source_se = torch.load(source_se_file, map_location=device)

# Final output file with the converted voice.
save_path = output_dir / 'audio_with_cloned_voice.wav'

# Step 1: synthesize the text with the base voice into the intermediate file.
model.tts_to_file(
    text,
    speaker_id=0,
    output_path=src_path,
    speed=1,
    sdp_ratio=0.2,
    quiet=True,
)

# Step 2: convert the intermediate audio from the base embedding to the target one.
tone_color_converter.convert(audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, tau=0.5)

# Step 3: render an audio player for the result in the notebook.
audio = Audio(save_path)
display(audio)