In [1]:
import IPython.display as ipd
import os
import torch
import utils

from models import SynthesizerTrn

In [2]:
# Expose only the GPU with index 1 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

hps = utils.get_hparams_from_file("./configs/vctk_e2e_32.json")

# Positional constructor arguments derived from the loaded hyper-parameters.
spec_bins = hps.data.filter_length // 2 + 1
segment_frames = hps.train.segment_size // hps.data.hop_length

net_g = SynthesizerTrn(
    spec_bins,
    hps.data.n_mel_channels,
    segment_frames,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.cuda()
_ = net_g.eval()  # inference mode; bind to `_` so the module repr is not echoed

# Third argument is passed as None (presumably the optimizer slot — confirm
# against utils.load_checkpoint).
_ = utils.load_checkpoint("logs/gst/G_45000.pth", net_g, None)

INFO:root:Loaded checkpoint 'logs/gst/G_45000.pth' (iteration 36)


In [3]:
from utils import load_wav_to_torch
from mel_processing import spectrogram_torch, spec_to_mel_torch

def get_spec(path):
    """Load a wav file and compute its linear and mel spectrograms on the GPU.

    All STFT / mel settings are read from the module-level ``hps``.

    Args:
        path: Path of the wav file to load.

    Returns:
        A ``(spec, mel, length)`` tuple: the linear spectrogram, the mel
        spectrogram derived from it, and a one-element ``LongTensor`` holding
        the size of ``spec``'s last dimension. All three are CUDA tensors.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``hps.data.sampling_rate``.
    """
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != hps.data.sampling_rate:
        # The template has three placeholders, so three values are required;
        # with only two, str.format raises IndexError and masks this error.
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            path, sampling_rate, hps.data.sampling_rate))

    # Presumably rescales 16-bit PCM sample values to roughly [-1, 1) —
    # TODO confirm the value range returned by load_wav_to_torch.
    audio_norm = (audio / 32768).unsqueeze(0)  # add a leading dimension

    spec = spectrogram_torch(audio_norm, hps.data.filter_length,
                             hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                             center=False).cuda()
    length = torch.LongTensor([spec.size(-1)]).cuda()
    print(spec.shape)
    mel = spec_to_mel_torch(spec, hps.data.filter_length, hps.data.n_mel_channels,
                            hps.data.sampling_rate, hps.data.mel_fmin, hps.data.mel_fmax).cuda()
    return spec, mel, length

In [4]:
import soundfile as sf

# Read the three reference clips; `sr` ends up holding the rate of the last one read.
wav1, sr = sf.read("data/DUMMY16K/p294_229.wav")
wav2, sr = sf.read("data/DUMMY16K/p258_004.wav")  # unseen
wav3, sr = sf.read("data/DUMMY16K/p257_008.wav")  # unseen

# Render one audio player per clip, in the order they were read.
for clip in (wav1, wav2, wav3):
    ipd.display(ipd.Audio(clip, rate=sr))

In [5]:
# Compute (spec, mel, length) for each clip; get_spec prints each spec shape
# as it goes, in the order the paths are listed.
(spec1, mel1, length1), (spec2, mel2, length2), (spec3, mel3, length3) = map(
    get_spec,
    (
        "data/DUMMY16K/p294_229.wav",
        "data/DUMMY16K/p258_004.wav",
        "data/DUMMY16K/p257_008.wav",
    ),
)

torch.Size([1, 321, 170])
torch.Size([1, 321, 273])
torch.Size([1, 321, 294])


In [33]:
print("resynthesize")
# Content spectrogram and reference mel both come from clip 1.
# Inference only: disable autograd so no graph is built or retained on the GPU.
with torch.no_grad():
    audio1 = net_g.infer(
        spec1, length1, mel1,
        noise_scale=.667, noise_scale_w=0.8, length_scale=1.5,
    )[0][0, 0].detach().cpu().float().numpy()
ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False))

resynthesize


In [30]:
print("voice conversion")
# Content spectrogram comes from clip 2, reference mel from clip 3.
# Inference only: disable autograd so no graph is built or retained on the GPU.
with torch.no_grad():
    audio2 = net_g.infer(
        spec2, length2, mel3,
        noise_scale=.667, noise_scale_w=0.8, length_scale=1.5,
    )[0][0, 0].detach().cpu().float().numpy()
ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False))

voice conversion


In [26]:
import random
print("random token")
# Build a (1, 1, 256) conditioning vector from eight 32-wide slices, each a
# randomly chosen row of the style-token table scaled by a random factor in [0, 1).
# NOTE(review): 8 x 32 = 256 and the index range 0..9 are hard-coded; presumably
# they match the head count / token count of net_g.gst — confirm against the config.
g_token = torch.zeros(1, 1, 256).cuda()
for i in range(8):
    random_int = random.randint(0, 9)  # inclusive on both ends
    g_token[:, :, i*32:(i+1)*32] = net_g.gst.stl.embed.data[random_int] * random.random()
# Inference only: disable autograd so no graph is built or retained on the GPU.
with torch.no_grad():
    audio3 = net_g.voice_conversion(
        spec2, length2, g_token,
        noise_scale=.667, length_scale=1.5,
    )[0][0, 0].detach().cpu().float().numpy()
ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False))

random token


In [25]:
print("random tensor")
# Condition on a standard-normal (1, 1, 256) vector instead of style tokens.
g_token = torch.randn(1, 1, 256).cuda()
# Inference only: disable autograd so no graph is built or retained on the GPU.
with torch.no_grad():
    audio4 = net_g.voice_conversion(
        spec2, length2, g_token,
        noise_scale=.667, length_scale=1.5,
    )[0][0, 0].detach().cpu().float().numpy()
ipd.display(ipd.Audio(audio4, rate=hps.data.sampling_rate, normalize=False))

random tensor
