In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import torch

import commons
import utils
from models import SynthesizerTrn
from text import symbols, accent_symbols, text_to_sequence, accent_to_sequence
from include.preprocess_japanese.synthesize import preprocess_japanese

  from .autonotebook import tqdm as notebook_tqdm


## MB-iSTFT-VITS

In [2]:
# Run directory: config.json and the generator checkpoint are read from here.
output_dir = "output/JVS"

In [3]:
# Load the hyperparameters stored in the run directory.
hps = utils.get_hparams_from_file(
    os.path.join(output_dir, "config.json")
)
# Map each speaker name to its position in hps.data.speakers.
speaker_dict = dict(
    (name, index) for index, name in enumerate(hps.data.speakers)
)
# Record the speaker count in the model config, which is later unpacked
# into the SynthesizerTrn constructor.
hps.model.n_speakers = len(speaker_dict)

In [4]:
# Build the generator from the symbol inventories and the loaded hyperparameters.
net_g = SynthesizerTrn(
    len(symbols),
    len(accent_symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model
)
# Move the model to the GPU and switch it to evaluation mode for inference.
net_g = net_g.cuda()
net_g.eval()
# Restore the trained weights. The third argument is left as None
# (presumably the optimizer slot -- confirm against utils.load_checkpoint).
# The trailing ';' keeps the notebook from echoing the return value.
utils.load_checkpoint(os.path.join(output_dir, "G_1000000.pth"), net_g, None);

Multi-stream iSTFT VITS


In [5]:
# Text-to-speech: synthesize `raw_text` in the voice of `speaker`.
raw_text = "素晴らしい声の人"
speaker = "jvs001"

with torch.no_grad():
    # Front end: split the raw sentence into a text stream and an accent stream.
    text, accent = preprocess_japanese(raw_text)

    # Encode each stream as an id tensor, add a leading batch axis, move to GPU.
    text = torch.LongTensor(text_to_sequence(text)).unsqueeze(0).cuda()
    accent = torch.LongTensor(accent_to_sequence(accent)).unsqueeze(0).cuda()
    x = (text, accent)

    # Speaker id and per-item sequence lengths for the single-item batch.
    sid = torch.LongTensor([speaker_dict[speaker]]).cuda()
    lengths = torch.LongTensor([len(row) for row in text]).cuda()

    # Take the first returned item and index [0, 0] into it
    # (presumably batch 0, channel 0 -- confirm against SynthesizerTrn.infer),
    # then bring the waveform back to the CPU as a float NumPy array.
    audio = net_g.infer(
        x,
        lengths,
        sid=sid,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

# normalize=False: play the samples as generated, without peak rescaling.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

In [6]:
import torchaudio
from mel_processing import spectrogram_torch

# Voice conversion: re-synthesize a recorded utterance as another speaker.
# NOTE(review): the input file lives under jvs001/, yet src_spk is "jvs003" and
# tgt_spk is "jvs001" -- confirm that src_spk names the speaker heard in ifname.
ifname = "/data/hdd/kentaro_seki/corpus/jvs_ver1/jvs001/parallel100/wav24kHz16bit/VOICEACTRESS100_001.wav"
src_spk = "jvs003"
tgt_spk = "jvs001"

x, fs = torchaudio.load(ifname)
# Downmix multi-channel input so the squeeze/unsqueeze pair below always yields
# a single batch item; this branch is skipped for mono recordings.
if x.size(0) > 1:
    x = x.mean(dim=0, keepdim=True)
# Match the sampling rate the model was configured with.
x = torchaudio.transforms.Resample(fs, hps.data.sampling_rate)(x)
# NOTE(review): torchaudio.load returns float samples already scaled to [-1, 1]
# by default; dividing by max_wav_value is only right if the training pipeline
# scaled its input the same way -- confirm against the data loader.
audio_norm = x / hps.data.max_wav_value
spec = spectrogram_torch(audio_norm, hps.data.filter_length,
    hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
    center=False)
spec = torch.squeeze(spec, 0)

with torch.no_grad():
    y = spec.unsqueeze(0).cuda()
    # The length must be the number of frames, i.e. the last axis of y.
    # Assuming spectrogram_torch returns (batch, freq_bins, frames) as in
    # upstream VITS, axis 1 is the frequency-bin count, and using it here would
    # mask off the tail of any utterance with more frames than bins.
    lengths = torch.LongTensor([y.size(-1)]).cuda()
    sid = torch.LongTensor([speaker_dict[src_spk]]).cuda()
    tid = torch.LongTensor([speaker_dict[tgt_spk]]).cuda()
    # First returned item, indexed [0, 0], moved to the CPU as a float NumPy array.
    audio = net_g.voice_conversion(y, lengths, sid, tid)[0][0,0].data.cpu().float().numpy()
# normalize=False: play the samples as generated, without peak rescaling.
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))