# Import libraries & parse arguments

In [22]:
import os
import glob
import torch
import random
import librosa
import argparse
import numpy as np
import IPython.display
import torch

from utils.audio import Audio
from utils.hparams import HParam

In [2]:
# Declarative table of the command-line options: (flag names, add_argument settings).
_cli_options = [
    (('-c', '--config'),
     dict(type=str, required=True,
          help="yaml file for configuration")),
    (('-d', '--libri_dir'),
     dict(type=str, default=None,
          help="Directory of LibriSpeech dataset, containing folders of train-clean-100, train-clean-360, dev-clean.")),
    (('-v', '--voxceleb_dir'),
     dict(type=str, default=None,
          help="Directory of VoxCeleb2 dataset, ends with 'aac'")),
    (('-o', '--out_dir'),
     dict(type=str, required=True,
          help="Directory of output training triplet")),
    (('-p', '--process_num'),
     dict(type=int, default=None,
          help='number of processes to run. default: cpu_count')),
    (('--vad',),
     dict(type=int, default=0,
          help='apply vad to wav file. yes(1) or no(0, default)')),
    (('--train_amt',),
     dict(type=int, default=4,
          help='specify the amount of mixed train data (default is 4, equal to 10**4)')),
]

parser = argparse.ArgumentParser()
for _flags, _settings in _cli_options:
    parser.add_argument(*_flags, **_settings)

# An explicit argument list is parsed here instead of the process command line.
_notebook_argv = ["-c", "config.yaml", "-o", "tmp_gen", "-d", "datasets/LibriSpeech"]
args = parser.parse_args(_notebook_argv)

# Hyperparameters are loaded from the yaml file named by --config.
hp = HParam(args.config)

  for doc in docs:


# Prepare

Get all folder paths (speaker based). Format will be a single list of folder paths

In [3]:
# Collect one folder path per speaker for the train and test splits.
if args.libri_dir is not None:
    # Training pool: train-clean-100 followed by train-clean-360.
    # We recommend excluding train-other-500; see
    # https://github.com/mindslab-ai/voicefilter/issues/5#issuecomment-497746793
    # (add 'train-other-500' to the tuple below to include it anyway).
    train_folders = [
        x
        for subset in ('train-clean-100', 'train-clean-360')
        for x in glob.glob(os.path.join(args.libri_dir, subset, '*'))
        if os.path.isdir(x)
    ]
    # Same directory-only filter as the training lists, so stray files next
    # to the speaker folders are not treated as speakers.
    test_folders = [x for x in glob.glob(os.path.join(args.libri_dir, 'dev-clean', '*'))
                        if os.path.isdir(x)]

elif args.voxceleb_dir is not None:
    all_folders = [x for x in glob.glob(os.path.join(args.voxceleb_dir, '*'))
                        if os.path.isdir(x)]
    # The last 20 speaker folders are held out for testing.
    # NOTE(review): glob does not guarantee an ordering, so which 20 speakers
    # are held out may differ between machines - consider sorting.
    train_folders = all_folders[:-20]
    test_folders = all_folders[-20:]

else:
    # Without this branch the names above stay undefined and a later cell
    # fails with a NameError that hides the real cause.
    raise ValueError("Either --libri_dir or --voxceleb_dir must be provided")

Get all audio files for each speaker, then drop every speaker who has fewer than 2 audio files. The result has the form [speaker0, speaker1, ...] where speakerX = [audiopath0, audiopath1, ...].

In [4]:
def _speaker_utterances(folders):
    """Return one list of audio paths per speaker folder.

    Each folder is searched recursively for files matching ``hp.form.input``;
    speakers with fewer than two matching files are left out.
    """
    kept = []
    for folder in folders:
        pattern = os.path.join(folder, '**', hp.form.input)
        files = glob.glob(pattern, recursive=True)
        if len(files) >= 2:
            kept.append(files)
    return kept


train_spk = _speaker_utterances(train_folders)
test_spk = _speaker_utterances(test_folders)

`Audio` is a helper class that simplifies common operations on a single audio file, such as converting a waveform to mel or mel back to a waveform.

In [5]:
# Build the project's Audio helper from the loaded hyperparameters; its
# wav2spec / spec2wav methods are used by the cells below.
audio = Audio(hp)

# Main

This function removes every segment considered silent (more than 20 dB below the signal's peak, i.e. `top_db=20`) from an audio signal and joins the remaining segments.

In [6]:
def vad_merge(w, top_db=20):
    """Drop silent stretches of ``w`` and splice the remaining audio together.

    Args:
        w: audio signal as a NumPy array (this notebook passes mono waveforms).
        top_db: silence threshold in dB, forwarded unchanged to
            ``librosa.effects.split``. Defaults to 20, the value that was
            previously hard-coded.

    Returns:
        A flattened array holding the non-silent intervals of ``w`` back to
        back. An empty array is returned when no non-silent interval is found.
    """
    intervals = librosa.effects.split(w, top_db=top_db)
    segments = [w[start:end] for start, end in intervals]
    if not segments:
        # np.concatenate raises ValueError on an empty sequence; an empty
        # signal lets the caller's length check discard the sample instead.
        return np.asarray(w)[:0].ravel()
    return np.concatenate(segments, axis=None)

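Sample three audio files: two utterances from one speaker (d-vector reference and target) and one utterance from a second speaker.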

In [9]:
# Draw two distinct speakers: spk1 supplies the reference and target
# utterances, spk2 supplies the interfering utterance.
# NOTE(review): no random seed is set in the visible cells, so every run
# picks different files.
spk1, spk2 = random.sample(train_spk, 2)
# Two different files of the same speaker: one for the d-vector reference,
# one as the clean target.
s1_dvec, s1_target = random.sample(spk1, 2)
s2 = random.choice(spk2)

In [10]:
srate = hp.audio.sample_rate

# Load the three utterances at the configured sample rate: the d-vector
# reference (d), the clean target (w1) and the interfering speaker (w2).
d, _ = librosa.load(s1_dvec, sr=srate)
w1, _ = librosa.load(s1_target, sr=srate)
w2, _ = librosa.load(s2, sr=srate)
assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
    'wav files must be mono, not stereo'

# Trim silence from both ends of each signal.
d, _ = librosa.effects.trim(d, top_db=20)
w1, _ = librosa.effects.trim(w1, top_db=20)
w2, _ = librosa.effects.trim(w2, top_db=20)

# if reference for d-vector is too short, discard it
# A bare `raise` outside an `except` block only produces
# "RuntimeError: No active exception to reraise"; the same exception type is
# raised here with a message that says what went wrong.
min_ref_len = 1.1 * hp.embedder.window * hp.audio.hop_length
if d.shape[0] < min_ref_len:
    raise RuntimeError(
        'd-vector reference is too short after trimming: '
        '{} samples, need at least {:.0f}'.format(d.shape[0], min_ref_len))

# LibriSpeech dataset have many silent interval, so let's vad-merge them
# VoiceFilter paper didn't do that. To test SDR in same way, don't vad-merge.
if args.vad == 1:
    w1, w2 = vad_merge(w1), vad_merge(w2)

# I think random segment length will be better, but let's follow the paper first
# fit audio to `hp.data.audio_len` seconds.
# if merged audio is shorter than `L`, discard it
L = int(srate * hp.data.audio_len)
if w1.shape[0] < L or w2.shape[0] < L:
    raise RuntimeError(
        'target/interference audio is shorter than the required {} samples '
        '(got {} and {})'.format(L, w1.shape[0], w2.shape[0]))
w1, w2 = w1[:L], w2[:L]

mixed = w1 + w2

# Scale all three signals by the same factor so the mixture peaks at 1/1.1.
norm = np.max(np.abs(mixed)) * 1.1
w1, w2, mixed = w1/norm, w2/norm, mixed/norm

# Magnitude/phase pairs for the clean target and for the mixture.
target_mag, target_phase = audio.wav2spec(w1)
mixed_mag, mixed_phase = audio.wav2spec(mixed)

In [17]:
# Listen to the trimmed d-vector reference utterance.
IPython.display.Audio(d, rate=srate)

In [18]:
# Listen to the clean target after cropping and normalisation.
IPython.display.Audio(w1, rate=srate)

In [19]:
# Listen to the interfering speaker, cropped and scaled by the same factor as w1.
IPython.display.Audio(w2, rate=srate)

In [20]:
# Listen to the normalised mixture of w1 and w2.
IPython.display.Audio(mixed, rate=srate)

In [16]:
# Round-trip check: rebuild a waveform from the target's own magnitude and phase.
# NOTE(review): the complex array shown under this cell is presumably printed
# from inside spec2wav - confirm and remove the debug print if so.
re_wav = audio.spec2wav(target_mag, target_phase)

[[-1.        -8.7422777e-08j -1.        -8.7422777e-08j
   1.        +0.0000000e+00j ...  1.        +0.0000000e+00j
   1.        +0.0000000e+00j -1.        -8.7422777e-08j]
 [ 1.        +4.5410956e-16j  0.8726085 +4.8842031e-01j
  -0.96766067-2.5225559e-01j ... -0.98665446-1.6282801e-01j
  -0.8204281 +5.7174975e-01j  0.9999205 +1.2609982e-02j]
 [-1.        +8.7422777e-08j -0.75160956-6.5960830e-01j
   0.9563663 +2.9217041e-01j ...  0.9696532 +2.4448450e-01j
   0.7893474 -6.1394680e-01j -0.9998205 -1.8949240e-02j]
 ...
 [-1.        +8.7422777e-08j  0.9664593 +2.5681984e-01j
   0.30093977-9.5364314e-01j ...  0.9444915 -3.2853585e-01j
  -0.7895465 -6.1369079e-01j -0.9999989 +1.4559344e-03j]
 [ 1.        +4.7934547e-15j -0.991218  -1.3223797e-01j
  -0.83333594+5.5276692e-01j ... -0.9872682 +1.5906423e-01j
   0.9156683 +4.0193477e-01j  0.9999996 +9.2590402e-04j]
 [ 1.        +0.0000000e+00j  1.        +0.0000000e+00j
   1.        +0.0000000e+00j ...  1.        +0.0000000e+00j
  -1.        -

In [18]:
# Signed sum of the round-trip error between the reconstruction and w1.
# NOTE(review): positive and negative sample errors cancel in a signed sum, so
# a small value here does not bound the per-sample error; np.abs(...).max()
# or a mean squared error would.
(re_wav-w1).sum()

0.017367572

In [15]:
# Baseline: round-trip w1 through librosa's own STFT/ISTFT, for comparison
# with the audio.spec2wav reconstruction.
n = len(w1)
n_fft = 2048
# NOTE(review): y_pad is not used by any visible cell - the STFT below is taken
# of w1 itself, not of the padded signal. Confirm which was intended.
y_pad = librosa.util.fix_length(w1, size=n + n_fft // 2)
# length=n makes the ISTFT output exactly len(w1) samples so the subtraction lines up.
(librosa.istft(librosa.stft(w1, n_fft=n_fft), length=n)-w1).sum()

1.0425218e-07

In [22]:
# Listen to the waveform rebuilt from the target magnitude and phase.
IPython.display.Audio(re_wav, rate=srate)

In [24]:
# Resynthesise the target magnitude using the mixture's phase, doubled.
# NOTE(review): the *2 applied to the phase looks experimental - confirm it is intended.
swap_wav = audio.spec2wav(target_mag, mixed_phase*2)

In [25]:
# Listen to the target magnitude resynthesised with the altered mixture phase.
IPython.display.Audio(swap_wav, rate=srate)

# Test constructing complex numbers in torch

In [9]:
# Inspect the magnitude and phase arrays returned by audio.wav2spec for the target.
target_mag, target_phase

(array([[0.64367247, 0.64531344, 0.64805824, ..., 0.2757423 , 0.1278969 ,
         0.23575324],
        [0.6572695 , 0.6608989 , 0.66725713, ..., 0.45753455, 0.44444716,
         0.43819493],
        [0.5997771 , 0.6006956 , 0.5971607 , ..., 0.49789244, 0.46649015,
         0.43882018],
        ...,
        [0.6687833 , 0.6877354 , 0.71401584, ..., 0.37057114, 0.38917947,
         0.39504975],
        [0.7224491 , 0.72312623, 0.720978  , ..., 0.34440672, 0.35760415,
         0.362274  ],
        [0.6785933 , 0.6844675 , 0.6972221 , ..., 0.2952472 , 0.30929297,
         0.31437773]], dtype=float32),
 array([[ 3.1415927e+00, -2.6905995e-17,  3.1415927e+00, ...,
         -3.5131824e-16, -5.4396864e-16,  3.1415927e+00],
        [ 0.0000000e+00,  2.8680248e+00, -4.5919636e-01, ...,
          1.1745552e+00, -2.4939022e+00,  0.0000000e+00],
        [ 0.0000000e+00, -2.6666253e+00,  8.6895269e-01, ...,
         -1.0608065e+00,  2.3770738e+00,  0.0000000e+00],
        ...,
        [ 3.1415927e+

In [21]:
%%timeit
# Time packing the two arrays into one complex tensor: np.stack adds a trailing
# axis of size 2, which view_as_complex reads as (real, imag).
# NOTE(review): this yields target_mag + 1j*target_phase, not
# target_mag * exp(1j*target_phase); torch.polar builds the latter - confirm
# which one is intended.
torch.view_as_complex(torch.from_numpy(np.stack((target_mag, target_phase), axis=-1)))

130 µs ± 6.24 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
# Pack magnitude into the real part and phase into the imaginary part of one
# complex tensor (np.stack adds the trailing size-2 axis view_as_complex needs).
# NOTE(review): this is not the polar form magnitude * exp(1j*phase); use
# torch.polar if a true complex spectrogram is wanted - confirm intent.
t = torch.view_as_complex(torch.from_numpy(np.stack((target_mag, target_phase), axis=-1)))

In [10]:
# The trailing size-2 axis added by np.stack is consumed by view_as_complex.
t.shape

torch.Size([301, 601])

In [42]:
# Element-wise modulus sqrt(real**2 + imag**2); with the phase stored in the
# imaginary part this is not equal to target_mag.
t.abs()

tensor([[3.2069, 0.6453, 3.2077,  ..., 0.2757, 0.1279, 3.1504],
        [0.6573, 2.9432, 0.8100,  ..., 1.2605, 2.5332, 0.4382],
        [0.5998, 2.7334, 1.0544,  ..., 1.1718, 2.4224, 0.4388],
        ...,
        [3.2120, 0.9952, 2.1579,  ..., 3.1537, 0.3892, 3.1663],
        [3.2236, 0.8022, 2.6172,  ..., 2.7825, 0.3984, 3.1624],
        [0.6786, 3.2054, 0.6974,  ..., 3.1540, 0.3093, 3.1573]])

In [43]:
# The imaginary part has the same shape as the complex tensor itself.
t.imag.shape

torch.Size([301, 601])

In [44]:
# Square the real part, leaving the imaginary part untouched.
# NOTE(review): this overwrites t, so re-running the cell squares it again.
t.real = t.real*t.real

In [45]:
# Show t after its real part was squared.
t

tensor([[0.4143+3.1416e+00j, 0.4164-2.6906e-17j, 0.4200+3.1416e+00j,
          ..., 0.0760-3.5132e-16j, 0.0164-5.4397e-16j,
         0.0556+3.1416e+00j],
        [0.4320+0.0000e+00j, 0.4368+2.8680e+00j, 0.4452-4.5920e-01j,
          ..., 0.2093+1.1746e+00j, 0.1975-2.4939e+00j,
         0.1920+0.0000e+00j],
        [0.3597+0.0000e+00j, 0.3608-2.6666e+00j, 0.3566+8.6895e-01j,
          ..., 0.2479-1.0608e+00j, 0.2176+2.3771e+00j,
         0.1926+0.0000e+00j],
        ...,
        [0.4473+3.1416e+00j, 0.4730-7.1929e-01j, 0.5098+2.0363e+00j,
          ..., 0.1373+3.1318e+00j, 0.1515+2.8055e-03j,
         0.1561+3.1416e+00j],
        [0.5219+3.1416e+00j, 0.5229+3.4734e-01j, 0.5198-2.5160e+00j,
          ..., 0.1186+2.7611e+00j, 0.1279-1.7553e-01j,
         0.1312+3.1416e+00j],
        [0.4605+0.0000e+00j, 0.4685-3.1315e+00j, 0.4861+1.6842e-02j,
          ..., 0.0872+3.1401e+00j, 0.0957+7.4834e-05j,
         0.0988+3.1416e+00j]])

In [47]:
# Independent copy of t, so that modifying t_ does not change t.
t_ = t.clone()

In [48]:
# In-place multiply of the copy's real part by t's real part.
# NOTE(review): like any in-place update, re-running this cell multiplies again.
t_.real *= t.real

In [61]:
%%timeit
torch.nn.MSELoss()(t.abs(), torch.tensor(0))

  return F.mse_loss(input, target, reduction=self.reduction)


110 µs ± 2.25 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [60]:
%%timeit
# Same quantity as the MSELoss-against-zero cell (mean squared modulus),
# computed directly for a timing comparison.
(t.abs()**2).mean()

61.4 µs ± 2.94 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [64]:
# abs() of the complex tensor is real-valued; check its dtype.
t.abs().dtype

torch.float32