# Results

In [122]:
# Snapshot of GPU utilisation and memory via the NVIDIA CLI (shell escape).
# In the saved output GPU 0 is busy and GPU 1 is idle; presumably this is why
# CUDA_VISIBLE_DEVICES is set to '1' in the setup cell — confirm before re-running.
!nvidia-smi

Thu Mar 16 09:49:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   58C    P0   237W / 300W |  28351MiB / 32768MiB |     84%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:62:00.0 Off |                    0 |
| N/A   31C    P0    56W / 300W |   3645MiB / 32768MiB |      0%      Defaul

## Load Dependencies

In [106]:
import IPython.display as ipd
import os, sys
from pathlib import Path
import torch
import yaml
import soundfile as sf
from parallel_wavegan.utils import load_model

os.environ["CUDA_VISIBLE_DEVICES"] = '1'
PROJECT_ROOT = "/project/fdreyer/projects/vqvae-vc"
sys.path.append(PROJECT_ROOT)
%cd {PROJECT_ROOT}

from src.models.hle_vqvae_vc import HleVqVaeVc
from src.data.datamodules import VCDataModule

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Use {device} device")

sr = 24_000

/project/fdreyer/projects/vqvae-vc
Use cuda device


Vocoder:

In [None]:
VOCODER_CHECKPOINT_PATH = os.path.join(PROJECT_ROOT, "external", "hifigan_vocoder", "checkpoint-2500000steps.pkl")
# The vocoder is only used for synthesis in this notebook: put it in eval mode
# and strip weight normalisation, which is only needed during training.
vocoder = load_model(VOCODER_CHECKPOINT_PATH).to(device).eval()
vocoder.remove_weight_norm()

Validation and training datasets:

In [80]:
# Location of the dataset configuration inside the repository.
VCTK_DATASET_CONFIG_PATH = os.path.join(
    PROJECT_ROOT, "config", "data", "vctk20", "vctk20mel-24kHZ.yml"
)

# Only the "data" section of the YAML file is passed on to the data module.
with open(VCTK_DATASET_CONFIG_PATH) as config_file:
    full_config = yaml.safe_load(config_file)
vctk_config = full_config["data"]

# Build the data module, let it prepare its data, then fetch both loaders.
vctk_data_module = VCDataModule(**vctk_config)
vctk_data_module.prepare_data()
vctk_val_data_loader = vctk_data_module.val_dataloader()
vctk_train_data_loader = vctk_data_module.train_dataloader()

Create Train Dataset:
Load audio info
Load audio datasets
Create Val Dataset:
Load audio info
Load audio datasets


## HLE-VQVAE-VC

In [181]:
HLE_VQVAE_CHECKPOINT_DIR = os.path.join(PROJECT_ROOT, "lightning_logs", "version_98", "checkpoints")

# os.listdir() returns entries in arbitrary order and may include files that are
# not checkpoints. Pick deterministically instead: consider only *.ckpt files
# and take the most recently modified one (ties resolved by sorted name order).
checkpoint_candidates = sorted(Path(HLE_VQVAE_CHECKPOINT_DIR).glob("*.ckpt"))
if not checkpoint_candidates:
    raise FileNotFoundError(f"No .ckpt file found in {HLE_VQVAE_CHECKPOINT_DIR}")
checkpoint_path = str(max(checkpoint_candidates, key=lambda ckpt: ckpt.stat().st_mtime))
print(checkpoint_path)

# The model is only used for inference below, so switch it to eval mode.
# Chained onto the assignment so the cell does not echo the module repr.
vqvae_vc = HleVqVaeVc.load_from_checkpoint(checkpoint_path).eval()

/project/fdreyer/projects/vqvae-vc/lightning_logs/version_98/checkpoints/epoch=292-step=12586.ckpt


In [182]:
# Number of speaker identities to sample conversion targets from (was a bare 20).
N_SPEAKERS = 20
# Fixed seed so the same target speakers are drawn on every run.
CONVERSION_SEED = 0

batch = next(iter(vctk_val_data_loader))
originals, speakers = batch

# Draw a non-zero offset per utterance and add it modulo N_SPEAKERS. The result
# is a valid id in [0, N_SPEAKERS) that never equals the source speaker, so every
# "conversion" really targets a different voice.
rng = torch.Generator().manual_seed(CONVERSION_SEED)
speaker_offsets = torch.randint(low=1, high=N_SPEAKERS, size=(speakers.size(0),), generator=rng)
convert_speakers = (speakers + speaker_offsets.to(speakers.device)) % N_SPEAKERS

# Inference only: skip building an autograd graph for the two forward passes.
with torch.no_grad():
    reconstructions, _, _ = vqvae_vc(originals, speakers)
    conversions, _, _ = vqvae_vc(originals, convert_speakers)
print(f"Audios shape: {originals.shape}\nSpeakers shape: {speakers.shape}\nReconstructions shape: {reconstructions.shape}")

Audios shape: torch.Size([64, 80, 696])
Speakers shape: torch.Size([64])
Reconstructions shape: torch.Size([64, 80, 696])


Listen to the original, reconstructed, and voice-converted audio:

In [198]:
# Index (within the batch) of the utterance to listen to.
utterance = 17


def _mel_for_vocoder(spectrogram):
    """Swap the last two axes of one spectrogram, squeeze it and move it to `device`.

    The batch printed earlier has shape (64, 80, 696), so a single item goes
    from (80, 696) to (696, 80) here.
    """
    return spectrogram.transpose(-1, -2).squeeze().to(device)


def _vocode(mel):
    """Run the vocoder on a prepared mel and return the waveform on the CPU.

    Wrapped in `torch.no_grad()` because the result is only played back or
    saved, so no autograd graph is needed.
    """
    with torch.no_grad():
        return vocoder.inference(mel).squeeze().detach().cpu()


mel_original = _mel_for_vocoder(originals[utterance])
wav_original = _vocode(mel_original)
mel_reconstruction = _mel_for_vocoder(reconstructions[utterance])
wav_reconstruction = _vocode(mel_reconstruction)
mel_vc = _mel_for_vocoder(conversions[utterance])
wav_vc = _vocode(mel_vc)

In [199]:
# Audio player for the vocoded ground-truth mel.
ipd.display(ipd.Audio(data=wav_original, rate=sr))

In [200]:
# Audio player for the model's reconstruction with the source speaker id.
ipd.display(ipd.Audio(data=wav_reconstruction, rate=sr))

In [201]:
# Audio player for the model's output with the conversion speaker id.
ipd.display(ipd.Audio(data=wav_vc, rate=sr))

Save the original, reconstructed, and voice-converted audio:

In [202]:
model = "hle-vqvae-vc"
# Relative to the current working directory.
out_dir = os.path.join(".", "reports", "audios", model)
Path(out_dir).mkdir(parents=True, exist_ok=True)

# File-name tags, in the same order as the tensors zipped together below.
variant_names = ("original", "reconstructed", "converted")

# Loop-local names are used on purpose: the notebook-level `utterance`,
# `mel_*` and `wav_*` variables from the listening cell stay untouched, so
# re-running the playback cells afterwards still plays the chosen clip.
with torch.no_grad():  # synthesis only, no autograd graph needed
    for clip_index, clip_mels in enumerate(zip(originals, reconstructions, conversions)):
        for variant, clip_mel in zip(variant_names, clip_mels):
            vocoder_input = clip_mel.transpose(-1, -2).squeeze().to(device)
            waveform = vocoder.inference(vocoder_input).squeeze().cpu()
            # Hand soundfile an explicit NumPy array instead of a torch tensor.
            sf.write(os.path.join(out_dir, f"{clip_index}.{variant}.wav"), waveform.numpy(), sr, format="wav")