# Import libraries & parse arguments

In [1]:
import os
import glob
import tqdm
import torch
import random
import librosa
import soundfile
import argparse
import numpy as np
from multiprocessing import Pool, cpu_count

from utils.audio import Audio
from utils.hparams import HParam

In [2]:
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, required=True,
                    help="yaml file for configuration")
parser.add_argument('-d', '--libri_dir', type=str, default=None,
                    help="Directory of LibriSpeech dataset, containing folders of train-clean-100, train-clean-360, dev-clean.")
parser.add_argument('-v', '--voxceleb_dir', type=str, default=None,
                    help="Directory of VoxCeleb2 dataset, ends with 'aac'")
parser.add_argument('-o', '--out_dir', type=str, required=True,
                    help="Directory of output training triplet")
parser.add_argument('-p', '--process_num', type=int, default=None,
                    help='number of processes to run. default: cpu_count')
parser.add_argument('--vad', type=int, default=0,
                    help='apply vad to wav file. yes(1) or no(0, default)')
parser.add_argument('--train_amt', type=int, default=4,
                    help='specify the amount of mixed train data (default is 4, equal to 10**4)')
args = parser.parse_args(["-c", "config.yaml", "-o", "tmp_gen", "-d", "datasets/LibriSpeech"])
hp = HParam(args.config)

  for doc in docs:


# Prepare

Make output directory

In [3]:
os.makedirs(args.out_dir, exist_ok=True)
os.makedirs(os.path.join(args.out_dir, 'train'), exist_ok=True)
os.makedirs(os.path.join(args.out_dir, 'test'), exist_ok=True)

cpu_num = cpu_count() if args.process_num is None else args.process_num

if args.libri_dir is None and args.voxceleb_dir is None:
    raise Exception("Please provide directory of data")

Get all folder paths (speaker based). Format will be a single list of folder paths

In [4]:
if args.libri_dir is not None:
    train_folders = [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-100', '*'))
                        if os.path.isdir(x)] + \
                    [x for x in glob.glob(os.path.join(args.libri_dir, 'train-clean-360', '*'))
                        if os.path.isdir(x)]
                    # we recommned to exclude train-other-500
                    # See https://github.com/mindslab-ai/voicefilter/issues/5#issuecomment-497746793
                    # + \
                    #[x for x in glob.glob(os.path.join(args.libri_dir, 'train-other-500', '*'))
                    #    if os.path.isdir(x)]
    test_folders = [x for x in glob.glob(os.path.join(args.libri_dir, 'dev-clean', '*'))]

elif args.voxceleb_dir is not None:
    all_folders = [x for x in glob.glob(os.path.join(args.voxceleb_dir, '*'))
                        if os.path.isdir(x)]
    train_folders = all_folders[:-20]
    test_folders = all_folders[-20:]

Get all audio file for each speaker. Then remove all speakers who have less than 2 audio files. Format will be [speaker0, speaker1,...] where speakerx = [audiopath0, audiopath1,...]

In [5]:
train_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
                for spk in train_folders]
train_spk = [x for x in train_spk if len(x) >= 2]

test_spk = [glob.glob(os.path.join(spk, '**', hp.form.input), recursive=True)
                for spk in test_folders]
test_spk = [x for x in test_spk if len(x) >= 2]

Audio is an abstract class that help simplify many operation on a single audio file like convert to mel, waveform to mel or mel to waveform,...

In [6]:
audio = Audio(hp)

# Main

Function that generate path of new file with format: `dir_/num-target.wav`
<br>(form is specified in yaml as "*-target.wav", this function will replace * with num)

In [7]:
def formatter(dir_, form, num):
    return os.path.join(dir_, form.replace('*', '%06d' % num))

This function will cut off all segment that considered as silence (db <= 20) from an audio

In [8]:
def vad_merge(w):
    intervals = librosa.effects.split(w, top_db=20)
    temp = list()
    for s, e in intervals:
        temp.append(w[s:e])
    return np.concatenate(temp, axis=None)

This function will perform:
- Take 3 mono audio, trim leading and trailing silence
- Skip when getting short audio reference for d-vector
- Cut-off all silence segment from these audios, then take the first 3 second from these audio -> the repo author said that to get the same sdr, dont do this, I also think of the same thing too. But he also state that librispeech have many silence interval. Hmm...
- Normalize them (divide by max(abs)*1.1)
- Save normalized audio (target and mixed), then save spectrogram (torch.save for target and mixed), and save d-vector audio path as text file. All these saved file will have the same number on them.

Params:
- hp, args, audio: config and audio class
- num: output file number
- s1_dvec: audio for d-vector
- s1_target: audio used as target
- s2: audio is used to mix with s1_target, result in mixed audio 
- train: used to specify if output is test or train set

In [9]:
def mix(hp, args, audio, num, s1_dvec, s1_target, s2, train):
    srate = hp.audio.sample_rate
    dir_ = os.path.join(args.out_dir, 'train' if train else 'test')

    d, _ = librosa.load(s1_dvec, sr=srate)
    w1, _ = librosa.load(s1_target, sr=srate)
    w2, _ = librosa.load(s2, sr=srate)
    assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
        'wav files must be mono, not stereo'

    d, _ = librosa.effects.trim(d, top_db=20)
    w1, _ = librosa.effects.trim(w1, top_db =20)
    w2, _ = librosa.effects.trim(w2, top_db=20)

    # if reference for d-vector is too short, discard it
    if d.shape[0] < 1.1 * hp.embedder.window * hp.audio.hop_length:
        return

    # LibriSpeech dataset have many silent interval, so let's vad-merge them
    # VoiceFilter paper didn't do that. To test SDR in same way, don't vad-merge.
    if args.vad == 1:
        w1, w2 = vad_merge(w1), vad_merge(w2)

    # I think random segment length will be better, but let's follow the paper first
    # fit audio to `hp.data.audio_len` seconds.
    # if merged audio is shorter than `L`, discard it
    L = int(srate * hp.data.audio_len)
    if w1.shape[0] < L or w2.shape[0] < L:
        return
    w1, w2 = w1[:L], w2[:L]

    mixed = w1 + w2

    norm = np.max(np.abs(mixed)) * 1.1
    w1, w2, mixed = w1/norm, w2/norm, mixed/norm

    # save vad & normalized wav files
    target_wav_path = formatter(dir_, hp.form.target.wav, num)
    mixed_wav_path = formatter(dir_, hp.form.mixed.wav, num)
    soundfile.write(target_wav_path, w1, srate)
    soundfile.write(mixed_wav_path, mixed, srate)

    # save magnitude spectrograms
    target_mag, _ = audio.wav2spec(w1)
    mixed_mag, _ = audio.wav2spec(mixed)
    target_mag_path = formatter(dir_, hp.form.target.mag, num)
    mixed_mag_path = formatter(dir_, hp.form.mixed.mag, num)
    torch.save(torch.from_numpy(target_mag), target_mag_path)
    torch.save(torch.from_numpy(mixed_mag), mixed_mag_path)

    # save selected sample as text file. d-vec will be calculated soon
    dvec_text_path = formatter(dir_, hp.form.dvec, num)
    with open(dvec_text_path, 'w') as f:
        f.write(s1_dvec)

This is wrapper function before perform multi-processing, get 2 random speaker. For speaker 1, get 2 random audio, 1 for d-vector, and 1 for target. For speaker 2 get a random audio to mix with target audio. Then perform mix.

In [10]:
def train_wrapper(num):
    spk1, spk2 = random.sample(train_spk, 2)
    s1_dvec, s1_target = random.sample(spk1, 2)
    s2 = random.choice(spk2)
    mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=True)

def test_wrapper(num):
    spk1, spk2 = random.sample(test_spk, 2)
    s1_dvec, s1_target = random.sample(spk1, 2)
    s2 = random.choice(spk2)
    mix(hp, args, audio, num, s1_dvec, s1_target, s2, train=False)

Generate 10k train data, and 100 test data. In the original repo, author generate 100k train data instead of 10k

In [11]:
arr = list(range(10**2))
with Pool(cpu_num) as p:
   r = list(tqdm.tqdm(p.imap(train_wrapper, arr), total=len(arr)))

arr = list(range(10**2))
with Pool(cpu_num) as p:
    r = list(tqdm.tqdm(p.imap(test_wrapper, arr), total=len(arr)))

100%|██████████| 100/100 [00:00<00:00, 211.62it/s]
100%|██████████| 100/100 [00:00<00:00, 369.45it/s]


Profiling time for getting each sample

In [None]:
import time

In [15]:
%%timeit
train_wrapper(int(time.time()))

24.5 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Without saving (true time)

In [22]:
def mix(hp, args, audio, num, s1_dvec, s1_target, s2, train):
    srate = hp.audio.sample_rate
    dir_ = os.path.join(args.out_dir, 'train' if train else 'test')

    d, _ = librosa.load(s1_dvec, sr=srate)
    w1, _ = librosa.load(s1_target, sr=srate)
    w2, _ = librosa.load(s2, sr=srate)
    assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
        'wav files must be mono, not stereo'

    d, _ = librosa.effects.trim(d, top_db=20)
    w1, _ = librosa.effects.trim(w1, top_db =20)
    w2, _ = librosa.effects.trim(w2, top_db=20)

    # if reference for d-vector is too short, discard it
    if d.shape[0] < 1.1 * hp.embedder.window * hp.audio.hop_length:
        return

    # LibriSpeech dataset have many silent interval, so let's vad-merge them
    # VoiceFilter paper didn't do that. To test SDR in same way, don't vad-merge.
    if args.vad == 1:
        w1, w2 = vad_merge(w1), vad_merge(w2)

    # I think random segment length will be better, but let's follow the paper first
    # fit audio to `hp.data.audio_len` seconds.
    # if merged audio is shorter than `L`, discard it
    L = int(srate * hp.data.audio_len)
    if w1.shape[0] < L or w2.shape[0] < L:
        return
    w1, w2 = w1[:L], w2[:L]

    mixed = w1 + w2

    norm = np.max(np.abs(mixed)) * 1.1
    w1, w2, mixed = w1/norm, w2/norm, mixed/norm

    # save vad & normalized wav files
    target_wav_path = formatter(dir_, hp.form.target.wav, num)
    mixed_wav_path = formatter(dir_, hp.form.mixed.wav, num)
    soundfile.write(target_wav_path, w1, srate)
    soundfile.write(mixed_wav_path, mixed, srate)

    # save magnitude spectrograms
    target_mag, _ = audio.wav2spec(w1)
    mixed_mag, _ = audio.wav2spec(mixed)

    dvec_mel = audio.get_mel(d)
    dvec_mel = torch.from_numpy(dvec_mel).float()

In [23]:
%%timeit
train_wrapper(int(time.time()))

41.2 ms ± 4.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
