# Results

In [None]:
# Shell escape: print the NVIDIA driver's GPU status table so a free device can be picked below
!nvidia-smi

## Load Dependencies

In [204]:
import IPython.display as ipd
import os, sys
from pathlib import Path
import torch
import yaml
import soundfile as sf
from parallel_wavegan.utils import load_model

os.environ["CUDA_VISIBLE_DEVICES"] = '1'  # Specify your cuda device here
PROJECT_ROOT = "/project/fdreyer/projects/vqvae-vc"
sys.path.append(PROJECT_ROOT)
%cd {PROJECT_ROOT}

from src.models.hle_vqvae_vc import HleVqVaeVc
from src.data.datamodules import VCDataModule

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Use {device} device")

sr = 24_000

/project/fdreyer/projects/vqvae-vc
Use cuda device


Load the pretrained HiFi-GAN vocoder (mel-spectrogram → waveform):

In [205]:
# Checkpoint of the pretrained vocoder shipped under <PROJECT_ROOT>/external.
VOCODER_CHECKPOINT_PATH = os.path.join(PROJECT_ROOT, "external", "hifigan_vocoder", "checkpoint-2500000steps.pkl")
# The vocoder is only used for synthesis in this notebook, so put it in
# evaluation mode right after loading (nn.Module.eval() returns the module itself).
vocoder = load_model(VOCODER_CHECKPOINT_PATH).to(device).eval()

Training and validation datasets (VCTK):

In [206]:
# Location of the YAML file describing the VCTK data setup.
VCTK_DATASET_CONFIG_PATH = os.path.join(
    PROJECT_ROOT, "config", "data", "vctk20", "vctk20mel-24kHZ.yml"
)

# Only the "data" section of the file is forwarded to the data module.
with open(VCTK_DATASET_CONFIG_PATH) as config_file:
    vctk_config = yaml.safe_load(config_file)["data"]

# Build the data module, let it prepare its data, then fetch both loaders
# (validation first, training second — same order as before).
vctk_data_module = VCDataModule(**vctk_config)
vctk_data_module.prepare_data()
vctk_val_data_loader = vctk_data_module.val_dataloader()
vctk_train_data_loader = vctk_data_module.train_dataloader()

Create Train Dataset:
Load audio info
Load audio datasets
Create Val Dataset:
Load audio info
Load audio datasets


## HLE-VQVAE-VC

In [207]:
HLE_VQVAE_CHECKPOINT_DIR = os.path.join(PROJECT_ROOT, "lightning_logs", "version_101", "checkpoints")

# os.listdir() returns entries in arbitrary order and may include non-checkpoint
# files, so keep only *.ckpt files and sort them to make the choice deterministic.
checkpoint_files = sorted(
    name for name in os.listdir(HLE_VQVAE_CHECKPOINT_DIR) if name.endswith(".ckpt")
)
if not checkpoint_files:
    raise FileNotFoundError(f"No .ckpt file found in {HLE_VQVAE_CHECKPOINT_DIR}")

checkpoint_path = os.path.join(HLE_VQVAE_CHECKPOINT_DIR, checkpoint_files[0])
print(checkpoint_path)

# The model is only used for inference below, so switch it to evaluation mode
# (eval() returns the module, which keeps this an assignment with no cell output).
vqvae_vc = HleVqVaeVc.load_from_checkpoint(checkpoint_path).eval()

/project/fdreyer/projects/vqvae-vc/lightning_logs/version_101/checkpoints/epoch=353-step=15199.ckpt


  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


In [208]:
NUM_SPEAKERS = 20  # exclusive upper bound of the randomly drawn target speaker ids
SPEAKER_SEED = 0   # fixed seed so the same target speakers are drawn on every run

# Take the first batch of the validation loader: mel-spectrograms and speaker ids.
batch = next(iter(vctk_val_data_loader))
originals, speakers = batch

# One random target speaker per utterance, drawn from a dedicated seeded
# generator so the result does not depend on the global RNG state.
speaker_rng = torch.Generator().manual_seed(SPEAKER_SEED)
convert_speakers = torch.randint(
    low=0, high=NUM_SPEAKERS, size=(speakers.size(0),), generator=speaker_rng
)

# Inference only: disable autograd so no graph is kept alive for the whole batch.
with torch.no_grad():
    # Same speaker ids as the input -> reconstruction.
    reconstructions, _, _ = vqvae_vc(originals, speakers)
    # Different speaker ids -> voice conversion.
    conversions, _, _ = vqvae_vc(originals, convert_speakers)
print(f"Audios shape: {originals.shape}\nSpeakers shape: {speakers.shape}\nReconstructions shape: {reconstructions.shape}")

Audios shape: torch.Size([64, 80, 696])
Speakers shape: torch.Size([64])
Reconstructions shape: torch.Size([64, 80, 696])


Listen to the original, reconstructed, and voice-converted audio:

In [226]:
def vocode(mel_spectrogram):
    """Synthesise a waveform from one mel-spectrogram with the loaded vocoder.

    The last two axes are swapped and singleton dimensions dropped before the
    tensor is moved to ``device`` and handed to ``vocoder.inference``.

    Returns:
        A ``(vocoder_input, waveform)`` tuple: the tensor that was fed to the
        vocoder and the synthesised waveform, squeezed, detached and on the CPU.
    """
    vocoder_input = mel_spectrogram.transpose(-1, -2).squeeze().to(device)
    with torch.no_grad():  # synthesis only, no gradients needed
        waveform = vocoder.inference(vocoder_input)
    return vocoder_input, waveform.squeeze().detach().cpu()


# Index (within the batch) of the utterance to listen to.
utterance = 21
mel_original, wav_original = vocode(originals[utterance])
mel_reconstruction, wav_reconstruction = vocode(reconstructions[utterance])
mel_vc, wav_vc = vocode(conversions[utterance])

In [227]:
# Audio player for the vocoded original utterance
ipd.display(ipd.Audio(data=wav_original, rate=sr))

In [228]:
# Audio player for the reconstruction (same speaker as the input)
ipd.display(ipd.Audio(data=wav_reconstruction, rate=sr))

In [229]:
# Audio player for the voice-converted utterance (randomly drawn target speaker)
ipd.display(ipd.Audio(data=wav_vc, rate=sr))

Save the original, reconstructed, and voice-converted audio for every utterance in the batch:

In [225]:
model = "hle-vqvae-vc"
# Relative path: resolved against the current working directory.
out_dir = os.path.join(".", "reports", "audios", model)
Path(out_dir).mkdir(parents=True, exist_ok=True)

# File-name tags, in the same order as the tensors zipped together below.
SAVE_LABELS = ("original", "reconstructed", "converted")

# Loop-private names are used on purpose: the loop must not overwrite
# `utterance` or the `wav_*` / `mel_*` variables chosen in the listening cell,
# otherwise re-running a display cell would silently play the last batch item.
for save_index, save_mels in enumerate(zip(originals, reconstructions, conversions)):
    for save_label, save_mel in zip(SAVE_LABELS, save_mels):
        save_input = save_mel.transpose(-1, -2).squeeze().to(device)
        with torch.no_grad():  # synthesis only, no gradients needed
            save_wav = vocoder.inference(save_input).squeeze().detach().cpu()
        # Hand soundfile a NumPy array explicitly instead of relying on implicit conversion.
        sf.write(os.path.join(out_dir, f"{save_index}.{save_label}.wav"), save_wav.numpy(), sr, format="wav")