In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Initialize GPU

Set the index of the GPU you will use for inference (exactly one GPU is supported).

In [1]:
import os
# Index of the single GPU to use for inference -- edit this value for your
# machine (it must be a string, e.g. "0" or "1").
GPU_ID = "0"

# Restrict CUDA to that GPU. This has to happen before CUDA is initialised
# (i.e. before any tensor or model is moved to the GPU) to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_ID

### Init auxiliary functions

In [3]:
from audio_utils import AudioFeaturesParams, mel_spectrogram, load_and_preprocess_audio
from f0_utils import get_lf0_from_wav
import torch
import typing as tp
from pathlib import Path
from IPython.display import Audio, display
import random

def sample_audios(audios_dir: Path, audios_suffix=".wav") -> tp.Tuple[Path, Path]:
    """Pick two distinct random audio files from a directory.

    Args:
        audios_dir: Directory searched (non-recursively) for audio files.
        audios_suffix: File-name suffix used to select candidates; files are
            matched with the glob pattern ``*<audios_suffix>``.

    Returns:
        A ``(source, target)`` tuple of two different paths from ``audios_dir``.

    Raises:
        ValueError: If fewer than two matching files are found.
    """
    # glob() yields entries in filesystem-dependent order; sorting makes the
    # outcome reproducible for a given `random` seed.
    all_wavs = sorted(audios_dir.glob(f"*{audios_suffix}"))

    # Fail with an actionable message instead of random.sample's generic
    # "Sample larger than population" error.
    if len(all_wavs) < 2:
        raise ValueError(
            f"Need at least two '*{audios_suffix}' files in {audios_dir}, "
            f"found {len(all_wavs)}"
        )

    source, target = random.sample(all_wavs, k=2)
    return source, target

### Initialize model

You can find a link to the model in the README.

In [4]:
# Device string that the inputs prepared later in the notebook are moved to.
device = "cuda"

# Path of the TorchScript model file, loaded later with torch.jit.load.
model_jit_path = "/hifi_vc/model.pt"

In [5]:
# Audio feature settings, constructed with no arguments (i.e. the class
# defaults); passed to mel_spectrogram when the reference mel is computed.
params = AudioFeaturesParams()

### Prepare audio paths

You can use the audio files from the `/data` folder. It contains seen samples (taken from the validation set) and unseen samples (from speakers that were not used in training). `sample_audios` picks two random audio files from the given folder.

In [18]:
# Directories holding the demo audio files: one for the "seen" set and one
# for the "unseen" set. Either can be passed to sample_audios.
seen_dir = Path("/hifi_vc/data/seen")
unseen_dir = Path("/hifi_vc/data/unseen")

In [27]:
# Randomly pick two files from the seen directory: the first is used as the
# source utterance, the second as the target voice. Pass unseen_dir instead
# to draw from the other set.
source_wav_path, target_wav_path = sample_audios(seen_dir)

### Convert audio

In [28]:
# Source utterance, loaded with 16000 as the rate argument and moved to the
# inference device.
wav_source = load_and_preprocess_audio(source_wav_path, 16000).to(device)

# Target-voice reference, loaded with 24000 and trim=True. The waveform itself
# is not moved to the device (it is only played back later); its mel
# spectrogram is.
wav_ref = load_and_preprocess_audio(target_wav_path, 24000, trim=True)
mel_ref = mel_spectrogram(wav_ref, params).to(device)

# Pitch feature extracted from the source file (presumably log-F0, going by
# the helper's name), as a float tensor on the inference device.
pitch = get_lf0_from_wav(str(source_wav_path)).to(device).float()

In [30]:
# Load the TorchScript model straight onto the inference device so its weights
# live on the same device as the inputs, regardless of the device it was
# saved from.
traced = torch.jit.load(model_jit_path, map_location=device).eval()

# Inference only -- no autograd graph is needed.
with torch.no_grad():
    converted = traced(wav_source, mel_ref, pitch)

# All clips below are played back at this rate; the source is reloaded at the
# same rate so the three players are comparable.
playback_rate = 24000
wav_source_display = load_and_preprocess_audio(source_wav_path, playback_rate)

clips = (
    ("Source sentence", wav_source_display),
    ("Target Voice", wav_ref),
    ("Converted", converted),
)
for title, wav in clips:
    print(title)
    # detach()/cpu() are no-ops for tensors that are already detached / on the
    # CPU, so every clip can go through the same conversion to NumPy.
    display(Audio(wav.detach().cpu().squeeze().numpy(), rate=playback_rate))

Source sentence


Target Voice


Converted
