# Audio Playground

## Setup

In [None]:
# Show the state of the machine's NVIDIA GPUs (presumably to choose the index that the
# setup cell assigns to CUDA_VISIBLE_DEVICES — confirm).
!nvidia-smi

In [17]:
import IPython.display as ipd
from pydub import AudioSegment
from pydub.silence import split_on_silence
import numpy as np
import os, sys
import torch
import torchaudio
from torchaudio.transforms import MuLawDecoding, MuLawEncoding
from torch.utils.data import DataLoader
import librosa
import yaml
import soundfile as sf
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
PROJECT_ROOT = "/project/fdreyer/projects/vqvae-vc"
sys.path.append(PROJECT_ROOT)
%cd {PROJECT_ROOT}
from src.data.datamodules import VCDataModule
from src.params import global_params
from src.models.vqvae_vc import VQVAEVC

mu_law_encoding = MuLawEncoding(global_params.MU_QUANTIZATION_CHANNELS)
mu_law_decoding = MuLawDecoding(global_params.MU_QUANTIZATION_CHANNELS)
np.random.seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Use {device} device")

/project/fdreyer/projects/vqvae-vc
Use cuda device


## Load Validation Dataset

In [3]:
# Location of the YAML file that configures the data module.
DATA_MODULE_CONFIC_PATH = os.path.join(
    PROJECT_ROOT,
    "config", "data", "vctk20", "vctk20-16.38kHZ.yml",
)

with open(DATA_MODULE_CONFIC_PATH) as config_file:
    config = yaml.safe_load(config_file)

# The "data" section of the file is passed to VCDataModule as keyword arguments.
data_kwargs = config["data"]
data_module = VCDataModule(**data_kwargs)

# prepare_data() is the call that logs the train/val dataset construction shown in the
# cell output; the validation split is read from the module afterwards.
data_module.prepare_data()
val_dataset = data_module.val_dataset

Create Train Dataset:
Load audio info
Load audio datasets
Create audio segment index


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2406/2406 [00:00<00:00, 31356.12it/s]


Create Val Dataset:
Load audio info
Load audio datasets
Create audio segment index


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 268/268 [00:00<00:00, 29424.47it/s]


In [4]:
raw_audios = val_dataset.get_raw_audios()

# Entries are unpacked later as (audio, sample_rate, speaker); the rate of the first
# entry is used for every clip.
sr = raw_audios[0][1]

# Draw four random positions (with replacement) and collect the matching entries.
sample_indices = np.random.randint(0, len(raw_audios), 4).tolist()
raw_audio_samples = [raw_audios[idx] for idx in sample_indices]

In [5]:
# Keep the first two seconds of every clip so all clips share one length and can be
# stacked into a (clips, samples) batch. A clip shorter than that would make the stack fail.
len_audio = 2 * sr

# Entries of raw_audio_samples unpack as (audio, sample_rate, speaker).
# Crop before decoding: mu-law decoding is element-wise, so the values are unchanged but
# only the retained samples get decoded.
audios = torch.stack(
    [mu_law_decoding(torch.tensor(audio)[:len_audio]) for audio, _, _ in raw_audio_samples]
)
speakers = [speaker for _, _, speaker in raw_audio_samples]

# One random target speaker id in [0, 20) per source clip (20 presumably being the speaker
# count of the vctk20 setup — confirm). Sized from the batch instead of a second
# hard-coded 4 so it cannot drift from the number of sampled clips.
target_speakers = torch.tensor(np.random.randint(0, 20, len(raw_audio_samples)))

In [14]:
# Audio player for the third source clip, before conversion.
ipd.Audio(audios[2], rate=sr)

## Load VQVAE-VC

In [7]:
from src.models.encoders import LearnedDownsamplingEncoder1d
from src.models.decoders import UpsamplingDecoder1d
from src.models.quantizers import GroupVectorQuantizer
from src.models.wavenet import WaveNet
from src.models.speakers import SpeakerEmbedding

# Model hyper-parameters: every sub-module is built from its "init_args" mapping in the YAML file.
with open(os.path.join(PROJECT_ROOT, "config", "model", "group-vqvae-vc.yml")) as f:
    model_config = yaml.safe_load(f)["model"]
encoder = LearnedDownsamplingEncoder1d(**model_config["encoder"]["init_args"])
quantizer = GroupVectorQuantizer(**model_config["vector_quantizer"]["init_args"])
decoder = UpsamplingDecoder1d(**model_config["decoder"]["init_args"])
speaker_embedding = SpeakerEmbedding(**model_config["speaker_embedding"]["init_args"])
wavenet = WaveNet(**model_config["wavenet"]["init_args"])
learning_rate = model_config["learning_rate"]

checkpoint_path = os.path.join(PROJECT_ROOT, "lightning_logs", "version_1", "checkpoints", "epoch=17-step=60245.ckpt")
# map_location remaps the stored tensors onto the device chosen in the setup cell, so a
# checkpoint written during GPU training also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=device)

vqvae_vc = VQVAEVC(encoder, quantizer, decoder, speaker_embedding, wavenet, learning_rate).to(device)
# The notebook only runs inference: put the module in evaluation mode so that layers which
# behave differently during training (dropout, normalisation, if present) are fixed.
vqvae_vc.eval()
# Kept as the last expression so the cell still displays the key-matching report.
vqvae_vc.load_state_dict(checkpoint["state_dict"])

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


<All keys matched successfully>

In [8]:
# Voice conversion of the whole batch. Generation is slow: the saved run needed about
# 26 minutes for 32768 steps.
# This is pure inference, so autograd is disabled to avoid recording a graph over all steps.
with torch.no_grad():
    converted_audios = vqvae_vc.convert(audios.to(device), target_speakers.to(device))

Create Local and Global Conditions
Generate audio


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32768/32768 [25:56<00:00, 21.06it/s]


In [13]:
# Audio player for the converted version of the third clip (moved to the CPU for playback).
ipd.Audio(converted_audios[2].cpu(), rate=sr)

In [19]:
# Export the source clips as WAV files (path is relative to the working directory set
# by %cd in the setup cell).
originals_dir = "./reports/audios"
# sf.write does not create missing directories; make sure the target exists first.
os.makedirs(originals_dir, exist_ok=True)
for i, audio in enumerate(audios):
    # File name corrected from the misspelled "oritinal".
    sf.write(f"{originals_dir}/{i}-original.wav", audio.numpy(), sr, format="wav")

In [21]:
# Export the converted clips as WAV files (path is relative to the working directory set
# by %cd in the setup cell).
converted_dir = "./reports/audios"
# sf.write does not create missing directories; make sure the target exists first.
os.makedirs(converted_dir, exist_ok=True)
for i, audio in enumerate(converted_audios):
    # Detach and move to host memory so the tensor can be handed over as a NumPy array.
    sf.write(f"{converted_dir}/{i}-converted.wav", audio.detach().cpu().numpy(), sr, format="wav")

In [9]:
# Display the target speaker ids used for the conversion.
# NOTE(review): the saved output shows a 32-element NumPy array, while the cell that defines
# target_speakers builds a 4-element tensor — the output looks stale; re-run to refresh.
target_speakers

array([17,  3, 13, 17,  8,  1, 19, 14,  6, 11,  7, 14,  2, 13, 16,  3, 17,
        7,  3,  1,  5,  9,  3, 17, 11,  1,  9,  3, 13, 15, 14,  7])

In [10]:
# Plain-text print of the same target speaker ids; redundant with the display cell before it.
# NOTE(review): the saved output (32 values) does not match the 4-element tensor built when
# target_speakers is defined — re-run or drop this cell.
print(target_speakers)

[17  3 13 17  8  1 19 14  6 11  7 14  2 13 16  3 17  7  3  1  5  9  3 17
 11  1  9  3 13 15 14  7]
