# Results

In [1]:
# Print the driver/CUDA versions and the current load of each GPU on this machine.
!nvidia-smi

Thu Apr 20 06:24:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   35C    P0    41W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  Off  | 00000000:62:00.0 Off |                    0 |
| N/A   35C    P0    43W / 300W |      0MiB / 32768MiB |      0%      Defaul

## Load Dependencies

In [2]:
from collections import defaultdict
import IPython.display as ipd
import os, sys
from pathlib import Path
import torch
import yaml
import soundfile as sf
from torchaudio.transforms import MuLawDecoding
from parallel_wavegan.utils import load_model
from tqdm import tqdm

os.environ["CUDA_VISIBLE_DEVICES"] = '2'  # Specify your cuda device here
PROJECT_ROOT = "/project/fdreyer/projects/vqvae-vc"
sys.path.append(PROJECT_ROOT)
%cd {PROJECT_ROOT}

from src.models.hle_vqvae_vc import HleVqVaeVc
from src.data.datamodules import VCDataModule
from src.data.utils import EMOTIONS
from src.params import global_params

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Use {device} device")

# Sample rate in Hz, passed to the audio player and to soundfile when writing files.
sr = 24_000

/project/fdreyer/projects/vqvae-vc
Use cuda device


Vocoder:

In [3]:
# Vocoder that turns the converted mel spectrograms into waveforms
# (called through `vocoder.inference(...)` in the conversion loop).
vocoder = load_model(global_params.PATH_HIFIGAN_PARAMS)
# The vocoder is only used for inference here, so take it out of training mode.
vocoder = vocoder.to(device).eval()

Training and validation datasets:

In [4]:
# YAML file holding the data-module settings under its top-level "data" key.
VCTK_DATASET_CONFIG_PATH = os.path.join(
    PROJECT_ROOT,
    "config",
    "data",
    "comb_vctk20_esd_eng",
    "comb_vctk20_esd_eng_24kHZ.yml",
)
with open(VCTK_DATASET_CONFIG_PATH) as config_file:
    vctk_config = yaml.safe_load(config_file)["data"]

# Build the data module from the parsed settings, then create both loaders
# (validation first, then training).
vctk_data_module = VCDataModule(**vctk_config)
vctk_data_module.prepare_data()
vctk_val_data_loader = vctk_data_module.val_dataloader()
vctk_train_data_loader = vctk_data_module.train_dataloader()

Create Val Dataset
Create Train Dataset


In [5]:
# Number of mu-law quantisation levels.
# NOTE(review): presumably matches how the data module encodes waveforms; confirm.
MU_LAW_QUANTIZATION_CHANNELS = 256

# Applied to the waveforms coming out of the data loader in the conversion loop.
mu_law_decoder = MuLawDecoding(quantization_channels=MU_LAW_QUANTIZATION_CHANNELS)

## HLE-VQVAE-VC

In [6]:
HLE_VQVAE_CHECKPOINT_DIR = os.path.join(PROJECT_ROOT, "lightning_logs", "version_143", "checkpoints")

# os.listdir() returns entries in arbitrary order and may include stray files,
# so keep only checkpoint files and sort them to make the choice deterministic.
checkpoint_files = sorted(
    file_name for file_name in os.listdir(HLE_VQVAE_CHECKPOINT_DIR) if file_name.endswith(".ckpt")
)
if not checkpoint_files:
    raise FileNotFoundError(f"No .ckpt file found in {HLE_VQVAE_CHECKPOINT_DIR}")
checkpoint_path = os.path.join(HLE_VQVAE_CHECKPOINT_DIR, checkpoint_files[0])

vqvae_vc = HleVqVaeVc.load_from_checkpoint(checkpoint_path)
vqvae_vc.to(device)
# The model is only used for inference below, so switch it out of training mode.
# eval() returns the module, so the architecture summary is still displayed.
vqvae_vc.eval()

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


HleVqVaeVc(
  (speaker_embedding): SpeakerEmbedding(
    (embedding): Embedding(30, 16)
  )
  (encoder_bot): HleEncoder(
    (input_layer): Sequential(
      (0): HleConv1d(13, 256, kernel_size=(15,), stride=(1,), padding=(7,))
      (1): GLU(dim=1)
    )
    (downsampling_layer): Sequential(
      (0): HleConv1d(128, 256, kernel_size=(8,), stride=(2,), padding=(3,))
      (1): InstanceNorm1d(256, eps=1e-05, momentum=0.8, affine=False, track_running_stats=False)
      (2): GLU(dim=1)
    )
    (dilation_stack): WaveNetLikeStack(
      (stack): ModuleList(
        (0): WaveNetLikeCell(
          (in_seq_layer): Sequential(
            (0): HleConv1d(128, 256, kernel_size=(5,), stride=(1,), padding=(2,))
            (1): InstanceNorm1d(256, eps=1e-05, momentum=0.25, affine=False, track_running_stats=False)
          )
          (residual_layer): Sequential(
            (0): HleConv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
            (1): InstanceNorm1d(128, eps=1e-05, mo

Voice Conversion:

In [13]:
# One target identity per row of the model's speaker-embedding table, so the
# conversion loop always matches the loaded checkpoint.
n_speakers = vqvae_vc.speaker_embedding.embedding.num_embeddings

# Maps source speaker id -> list of records, one per validation utterance.
# Each record holds the original mel/MFCC/waveform, the emotion label, and the
# conversions to every target speaker (list index == target speaker id).
speaker_utterances = defaultdict(list)
with torch.no_grad():
    for batch in tqdm(vctk_val_data_loader):
        original_mels, original_mfccs, original_wavs, original_speakers, original_emotions = batch
        original_mels = original_mels.to(device)
        original_mfccs = original_mfccs.to(device)
        # Waveforms are not moved to the device; they are only decoded and stored.
        original_wavs = mu_law_decoder(original_wavs)

        batch_size = original_mels.size(0)
        all_converted_mels = [[] for _ in range(batch_size)]
        all_converted_wavs = [[] for _ in range(batch_size)]
        for converted_speaker in range(n_speakers):
            # Same target speaker id for every utterance in the batch.
            converted_speakers = torch.full(
                (batch_size,), converted_speaker, dtype=torch.long, device=device
            )
            # The model is conditioned on the MFCCs of the source utterance.
            converted_mels, _, _ = vqvae_vc(original_mfccs, converted_speakers)
            for idx, converted_mel in enumerate(converted_mels):
                all_converted_mels[idx].append(converted_mel.cpu().numpy())
                # The mel is transposed and squeezed before being passed to the vocoder.
                converted_wav = vocoder.inference(converted_mel.transpose(-1, -2).squeeze())
                all_converted_wavs[idx].append(converted_wav.squeeze().detach().cpu().numpy())

        for original_mel, original_mfcc, original_wav, original_speaker, converted_mels, converted_wavs, emotion in zip(
            original_mels,
            original_mfccs,
            original_wavs,
            original_speakers,
            all_converted_mels,
            all_converted_wavs,
            original_emotions,
        ):
            speaker_utterances[original_speaker.item()].append({
                "original_mel": original_mel.cpu().numpy(),
                "original_mfcc": original_mfcc.cpu().numpy(),
                "original_wav": original_wav.numpy(),
                "converted_mels": converted_mels,
                "converted_wavs": converted_wavs,
                "emotion": emotion
            })

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [28:38<00:00, 59.26s/it]


Listen to the original and converted audio:

In [14]:
# Source speaker id and utterance index (within that speaker) to audition.
speaker, utterance = 0, 45

utterance_dict = speaker_utterances[speaker][utterance]

# Original recording, labelled with its emotion.
print(f"Original ({EMOTIONS[utterance_dict['emotion']]}): ")
ipd.display(ipd.Audio(utterance_dict["original_wav"], rate=sr))

# One player per target speaker; the list index is the target speaker id.
print("Conversions: ")
conversions = utterance_dict["converted_wavs"]
for i, conversion in enumerate(conversions):
    print(f"Conversion Speaker {i}")
    ipd.display(ipd.Audio(conversion, rate=sr))

Original (angry): 


Conversions: 
Conversion Speaker 0


Conversion Speaker 1


Conversion Speaker 2


Conversion Speaker 3


Conversion Speaker 4


Conversion Speaker 5


Conversion Speaker 6


Conversion Speaker 7


Conversion Speaker 8


Conversion Speaker 9


Conversion Speaker 10


Conversion Speaker 11


Conversion Speaker 12


Conversion Speaker 13


Conversion Speaker 14


Conversion Speaker 15


Conversion Speaker 16


Conversion Speaker 17


Conversion Speaker 18


Conversion Speaker 19


Conversion Speaker 20


Conversion Speaker 21


Conversion Speaker 22


Conversion Speaker 23


Conversion Speaker 24


Conversion Speaker 25


Conversion Speaker 26


Conversion Speaker 27


Conversion Speaker 28


Conversion Speaker 29


Save the original and voice-converted audio:

In [15]:
# Name of the output sub-directory for this model/dataset combination.
model = "mfcc-hle-vqvae-vc - comb_vctk_20_esd_eng_24kHZ"

# Layout: <out_dir>/speaker<S>/utterance<I>-<emotion>/<wav files>
out_dir = os.path.join(".", "reports", "audios", model)
# The loop variables are deliberately not called `speaker` / `utterance`, so the
# selectors of the listening cell are not overwritten by running this export.
for source_speaker, source_utterances in tqdm(speaker_utterances.items()):
    out_dir_speaker = os.path.join(out_dir, f"speaker{source_speaker}")
    for utterance_idx, utterance_record in enumerate(source_utterances):
        emotion = EMOTIONS[utterance_record["emotion"]]
        out_dir_speaker_utterance = os.path.join(out_dir_speaker, f"utterance{utterance_idx}-{emotion}")
        Path(out_dir_speaker_utterance).mkdir(parents=True, exist_ok=True)

        # Shared file-name prefix for the original and all of its conversions.
        file_prefix = f"utterance{utterance_idx}.speaker{source_speaker}"

        original_path = os.path.join(out_dir_speaker_utterance, f"{file_prefix}-{emotion}.original.wav")
        sf.write(original_path, utterance_record["original_wav"], sr, format="wav")

        # The list index of each conversion is its target speaker id.
        for target_speaker, converted_wav in enumerate(utterance_record["converted_wavs"]):
            converted_path = os.path.join(
                out_dir_speaker_utterance,
                f"{file_prefix}-to-speaker{target_speaker}-{emotion}.wav",
            )
            sf.write(converted_path, converted_wav, sr, format="wav")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [04:40<00:00,  9.34s/it]
