# Import libraries & parse arguments

In [56]:
import os
import glob
import torch
import random
import librosa
import argparse
import numpy as np
import IPython.display
import torch
import torch.nn as nn
from utils.audio import Audio
from utils.hparams import HParam

In [14]:
# Command-line options, declared as (flags, keyword options) pairs and
# registered on the parser in exactly this order.
_ARG_SPECS = [
    (('-c', '--config'),
     dict(type=str, required=True,
          help="yaml file for configuration")),
    (('-d', '--libri_dir'),
     dict(type=str, default=None,
          help="Directory of LibriSpeech dataset, containing folders of train-clean-100, train-clean-360, dev-clean.")),
    (('-v', '--voxceleb_dir'),
     dict(type=str, default=None,
          help="Directory of VoxCeleb2 dataset, ends with 'aac'")),
    (('-o', '--out_dir'),
     dict(type=str, required=True,
          help="Directory of output training triplet")),
    (('-p', '--process_num'),
     dict(type=int, default=None,
          help='number of processes to run. default: cpu_count')),
    (('--vad',),
     dict(type=int, default=0,
          help='apply vad to wav file. yes(1) or no(0, default)')),
    (('--train_amt',),
     dict(type=int, default=4,
          help='specify the amount of mixed train data (default is 4, equal to 10**4)')),
]

parser = argparse.ArgumentParser()
for _flags, _options in _ARG_SPECS:
    parser.add_argument(*_flags, **_options)

# There is no real command line inside a notebook, so an explicit argument
# list is parsed instead of sys.argv.
args = parser.parse_args(["-c", "config.yaml", "-o", "tmp_gen", "-d", "datasets/LibriSpeech"])

# Hyper-parameters are read from the YAML file named by --config.
hp = HParam(args.config)

# Prepare

Collect the speaker folders (one folder per speaker). The result is a flat list of folder paths for training and another for testing.

In [15]:
def _speaker_dirs(*parts):
    """Return the sub-directories directly below ``os.path.join(*parts)``.

    Plain files are skipped. The result is sorted because ``glob`` returns
    entries in arbitrary, filesystem-dependent order, and the VoxCeleb
    train/test split below is taken by position in this list.
    """
    return sorted(x for x in glob.glob(os.path.join(*parts, '*'))
                  if os.path.isdir(x))


if args.libri_dir is not None:
    # We recommend excluding train-other-500, see
    # https://github.com/mindslab-ai/voicefilter/issues/5#issuecomment-497746793
    # To include it anyway, append
    # _speaker_dirs(args.libri_dir, 'train-other-500') to train_folders.
    train_folders = (_speaker_dirs(args.libri_dir, 'train-clean-100')
                     + _speaker_dirs(args.libri_dir, 'train-clean-360'))
    # Same directory-only filter as for the training folders.
    test_folders = _speaker_dirs(args.libri_dir, 'dev-clean')

elif args.voxceleb_dir is not None:
    all_folders = _speaker_dirs(args.voxceleb_dir)
    # The last 20 speaker folders are held out for testing.
    train_folders = all_folders[:-20]
    test_folders = all_folders[-20:]

else:
    # Without this branch the failure would only show up in a later cell as
    # a NameError on train_folders.
    raise ValueError("either --libri_dir or --voxceleb_dir must be given")

Get all audio files for each speaker, then drop every speaker who has fewer than 2 audio files. The result has the form [speaker0, speaker1, ...], where speakerX = [audiopath0, audiopath1, ...].

In [16]:
def _utterances_per_speaker(speaker_dirs):
    """Return one list of audio paths per speaker folder.

    Each folder is searched recursively for files matching the pattern
    ``hp.form.input``. Speakers with fewer than two matching files are left
    out of the result.
    """
    kept = []
    for spk_dir in speaker_dirs:
        pattern = os.path.join(spk_dir, '**', hp.form.input)
        files = glob.glob(pattern, recursive=True)
        if len(files) >= 2:
            kept.append(files)
    return kept


train_spk = _utterances_per_speaker(train_folders)
test_spk = _utterances_per_speaker(test_folders)

`Audio` is a helper class that simplifies many operations on a single audio file, such as converting a waveform to a mel spectrogram or converting a spectrogram back to a waveform.

In [17]:
# Build the project's Audio helper from the loaded hyper-parameters; later
# cells call audio.wav2spec(...) and audio.get_mel(...) on it.
audio = Audio(hp)

# Main

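This function removes every segment that is considered silence (more than 20 dB below the reference level, which by default is the signal's peak) from an audio waveform and concatenates the remaining segments.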

In [19]:
def vad_merge(w):
    """Remove the silent parts of a waveform and join what is left.

    ``librosa.effects.split`` is called with ``top_db=20`` and yields the
    ``(start, end)`` sample ranges it regards as non-silent. The samples of
    those ranges are concatenated in order.

    Args:
        w: waveform as a NumPy array.

    Returns:
        A flattened NumPy array holding only the non-silent samples. When no
        non-silent interval is found an empty array with the dtype of ``w``
        is returned (``np.concatenate`` would raise ``ValueError`` on an
        empty list), so the caller's minimum-length check can discard it.
    """
    intervals = librosa.effects.split(w, top_db=20)
    if len(intervals) == 0:
        return np.array([], dtype=w.dtype)
    return np.concatenate([w[s:e] for s, e in intervals], axis=None)

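Sample three audio files: two different utterances from one speaker (the d-vector reference and the target) and one utterance from a second speaker.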

In [20]:
# Draw two different speakers from the training set.
# NOTE(review): no random seed is set in the visible cells, so these picks
# differ from run to run.
spk1, spk2 = random.sample(train_spk, 2)
# Two different utterances of speaker 1: one is loaded as the d-vector
# reference, the other as the target speech.
s1_dvec, s1_target = random.sample(spk1, 2)
# One utterance of speaker 2; it is added to the target to form the mixture.
s2 = random.choice(spk2)

In [37]:
srate = hp.audio.sample_rate

# Load the three sampled files at the configured sample rate:
# d  - reference utterance for the d-vector (speaker 1)
# w1 - target utterance (speaker 1)
# w2 - utterance of speaker 2 that is mixed into the target
d, _ = librosa.load(s1_dvec, sr=srate)
w1, _ = librosa.load(s1_target, sr=srate)
w2, _ = librosa.load(s2, sr=srate)
assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
    'wav files must be mono, not stereo'

# Strip leading and trailing silence from each waveform.
d, _ = librosa.effects.trim(d, top_db=20)
w1, _ = librosa.effects.trim(w1, top_db=20)
w2, _ = librosa.effects.trim(w2, top_db=20)

# if reference for d-vector is too short, discard it
# (a bare `raise` here has no active exception and only produces
# "RuntimeError: No active exception to reraise"; the same exception type
# is kept, now with a message that names the cause)
min_dvec_len = 1.1 * hp.embedder.window * hp.audio.hop_length
if d.shape[0] < min_dvec_len:
    raise RuntimeError(
        'd-vector reference is too short: %d samples, need at least %d'
        % (d.shape[0], min_dvec_len))

# LibriSpeech dataset have many silent interval, so let's vad-merge them
# VoiceFilter paper didn't do that. To test SDR in same way, don't vad-merge.
if args.vad == 1:
    w1, w2 = vad_merge(w1), vad_merge(w2)

# I think random segment length will be better, but let's follow the paper first
# fit audio to `hp.data.audio_len` seconds.
# if merged audio is shorter than `L`, discard it
L = int(srate * hp.data.audio_len)
if w1.shape[0] < L or w2.shape[0] < L:
    raise RuntimeError(
        'utterance is shorter than the required %d samples (w1: %d, w2: %d)'
        % (L, w1.shape[0], w2.shape[0]))
w1, w2 = w1[:L], w2[:L]

# Mix target and interfering speech sample by sample.
mixed = w1 + w2

# Scale all three signals by the same factor so the mixture peaks at 1/1.1.
norm = np.max(np.abs(mixed)) * 1.1
w1, w2, mixed = w1/norm, w2/norm, mixed/norm

# Magnitude and phase of the clean target and of the mixture.
target_mag, target_phase = audio.wav2spec(w1)
mixed_mag, mixed_phase = audio.wav2spec(mixed)

In [36]:
# Load the saved embedder weights onto the CPU, build the model from the
# hyper-parameters, copy the weights in and switch the model to eval mode.
# NOTE(review): SpeechEmbedder is not imported in the visible import cell, so
# on a fresh kernel this cell would fail with NameError - confirm which
# project module provides it and import it at the top of the notebook.
# NOTE(review): torch.load unpickles the file; only load checkpoints that
# come from a trusted source.
embedder_pt = torch.load('embedder.pt',map_location="cpu")
embedder = SpeechEmbedder(hp)
embedder.load_state_dict(embedder_pt)
embedder.eval()

SpeechEmbedder(
  (lstm): LSTM(40, 768, num_layers=3, batch_first=True)
  (proj): LinearNorm(
    (linear_layer): Linear(in_features=768, out_features=256, bias=True)
  )
)

In [50]:
def _mel_tensor(wav):
    """Return the mel features of `wav` (from audio.get_mel) as a float tensor."""
    return torch.from_numpy(audio.get_mel(wav)).float()


# Embedding of the d-vector reference utterance (speaker 1).
dvec_mel = _mel_tensor(d)
a1 = embedder(dvec_mel)

# Embedding of the normalised target utterance (speaker 1).
w1_mel = _mel_tensor(w1)
a2 = embedder(w1_mel)

# Embedding of the normalised utterance of speaker 2.
w2_mel = _mel_tensor(w2)
b1 = embedder(w2_mel)

In [61]:
# Recompute the embedding of the d-vector reference utterance `d` and keep
# the result under a separate name.
dvec_mel = torch.from_numpy(audio.get_mel(d)).float()
a11 = embedder(dvec_mel)

In [60]:
# Cosine similarity along dim 0 between the embedding of the d-vector
# reference (a1) and the embedding of the target utterance (a2); both
# utterances were sampled from the same speaker.
# NOTE(review): the embedder is called without torch.no_grad(), so the
# result carries a grad_fn.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
cos(a1, a2)

tensor(0.8309, grad_fn=<DivBackward0>)

In [65]:
# Number of samples in the trimmed d-vector reference waveform.
d.shape

(210432,)

In [66]:
# Number of samples in the target waveform after it was cut to
# L = int(srate * hp.data.audio_len) samples.
w1.shape

(48000,)