In [126]:
#  Import a bunch of stuff
import os,sys,inspect
# Directory of the file backing the currently executing frame.
# NOTE(review): inside a notebook the frame is not backed by a real file on disk, so this
# presumably resolves to the kernel's working directory - confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
# Put the parent directory first on sys.path so that modules located there can be
# imported by the statements that follow.
sys.path.insert(0,parentdir)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['image.cmap'] = 'hot'
import IPython.display as ipd

import audio_functions as af
import model_functions as mf
import dataset
import librosa
import soundfile as sf
import tensorflow as tf
from keras import layers
from SegCaps import capsule_layers
from functools import partial
import datetime

In [18]:
#  Audio / STFT settings
sample_rate = 16384         # Hz; rate the audio is resampled to
n_fft = 1024                # STFT window size passed to librosa.stft
fft_hop = 256               # STFT hop length passed to librosa.stft

#  Patch extraction settings (forwarded to af.extract_audio_patches_map)
patch_window = 256
patch_hop = 128

#  Input-pipeline settings
n_parallel_readers = 4      # num_parallel_calls of the dataset map stages
batch_size = 5
normalise = True
shuffle = False             # not referenced by the cells visible in this notebook
n_shuffle = 1               # shuffle buffer size handed to Dataset.shuffle

In [109]:
def get_paired_dataset(zipped_files,
                       sample_rate,
                       n_fft,
                       fft_hop,
                       patch_window,
                       patch_hop,
                       n_parallel_readers,
                       batch_size,
                       n_shuffle,
                       normalise):
    """
    Assemble the tf.data pipeline that turns pairs of audio files into batches.

    Parameters
    ----------
    zipped_files : two-column array; column 0 and column 1 are fed in lock-step
        to af.read_audio_pair
    sample_rate : forwarded to af.read_audio_pair
    n_fft, fft_hop, normalise : forwarded to compute_spectrogram_map
    patch_window, patch_hop : forwarded (with fft_hop) to af.extract_audio_patches_map
    n_parallel_readers : num_parallel_calls used by every map stage
    batch_size : number of elements per batch
    n_shuffle : shuffle buffer size

    Returns
    -------
    A dataset that is shuffled, batched and prefetched (3 batches); its elements
    are whatever compute_spectrogram_map returns, i.e. (spec_a, spec_b, audio_a, audio_b).
    """
    # Per-stage callables with their fixed settings bound up front
    read_pair = partial(af.read_audio_pair,
                        sample_rate=sample_rate)
    cut_patches = partial(af.extract_audio_patches_map,
                          fft_hop=fft_hop,
                          patch_window=patch_window,
                          patch_hop=patch_hop)
    to_spectrograms = partial(compute_spectrogram_map,
                              n_fft=n_fft,
                              fft_hop=fft_hop,
                              normalise=normalise)

    file_pairs = tf.data.Dataset.from_tensor_slices((zipped_files[:, 0], zipped_files[:, 1]))
    audio_pairs = file_pairs.map(read_pair, num_parallel_calls=n_parallel_readers)
    patch_stacks = audio_pairs.map(cut_patches, num_parallel_calls=n_parallel_readers)
    # One dataset element per patch instead of one per file
    patches = patch_stacks.flat_map(af.zip_tensor_slices)
    examples = patches.map(to_spectrograms, num_parallel_calls=n_parallel_readers)

    return examples.shuffle(n_shuffle).batch(batch_size).prefetch(3)

def compute_spectrogram_map(audio_a, audio_b, n_fft, fft_hop, normalise=False):
    """
    Dataset map function: compute the spectrogram of both signals of a pair.

    Both spectrograms are requested with mag_phase=False. The untouched input
    signals are passed through so they stay available downstream.

    Returns
    -------
    (spec_a, spec_b, audio_a, audio_b)
    """
    spec_a, spec_b = (
        compute_spectrogram(signal, n_fft, fft_hop, normalise, mag_phase=False)
        for signal in (audio_a, audio_b)
    )
    return spec_a, spec_b, audio_a, audio_b

def compute_spectrogram(audio, n_fft, fft_hop, normalise=False, mag_phase=True):
    '''
    Compute the STFT of the first channel of `audio` as a two-channel float32 tensor.

    Parameters
    ----------
    audio : tensor shaped (n_samples, n_channels); only channel 0 is transformed
    n_fft : FFT window size passed to librosa.stft
    fft_hop : hop length passed to librosa.stft
    normalise : if True, min-max scale the magnitude to [0, 1].
        NOTE: only applied when mag_phase is True; the (real, imaginary)
        output is never normalised.
    mag_phase : if True the last dimension is (magnitude, phase),
        otherwise it is (real part, imaginary part)

    Returns
    -------
    float32 tensor of shape (n_frames, 1 + n_fft // 2, 2) with
    n_frames = n_samples // fft_hop + 1, or an unknown first dimension when
    the static length of `audio` is not known.
    '''

    def stft(x, normalise, mag_phase):
        spec = librosa.stft(
            x, n_fft=n_fft, hop_length=fft_hop, window='hann')

        if not mag_phase:
            return spec.real, spec.imag

        mag = np.abs(spec)
        if normalise:
            lowest = mag.min()
            span = mag.max() - lowest
            if span == 0:
                # Constant magnitude (e.g. digital silence): 0/0 would fill the
                # spectrogram with NaN, so map it to all zeros instead.
                mag = np.zeros_like(mag)
            else:
                mag = (mag - lowest) / span
        return mag, np.angle(spec)

    def mono_func(py_audio, normalise, mag_phase):
        first, second = stft(py_audio[:, 0], normalise, mag_phase)
        # (2, n_bins, n_frames) -> (n_frames, n_bins, 2)
        ret = np.array([first, second]).T
        return ret.astype(np.float32)

    with tf.name_scope('read_spectrogram'):
        ret = tf.py_func(mono_func, [audio, normalise, mag_phase], tf.float32, stateful=False)
        # py_func loses the static shape, so restore it. Integer division keeps
        # the dimensions ints and stays valid when n_samples is not a multiple
        # of fft_hop.
        n_samples = audio.get_shape()[0].value
        n_frames = None if n_samples is None else n_samples // fft_hop + 1
        ret.set_shape([n_frames, 1 + n_fft // 2, 2])
    return ret


In [110]:
#  Read one example recording, bring it to the working sample rate and take its complex STFT
file_a = 'C:/Users/Toby/Speech_Data/LibriSpeechMini/Mixed/train-clean-100/19/198/19-198-0001.flac'
(audio, native_sample_rate) = sf.read(file_a)
audio = librosa.core.resample(audio,
                              orig_sr=native_sample_rate,
                              target_sr=sample_rate)
spec = librosa.stft(
    audio,
    n_fft=n_fft,
    hop_length=fft_hop,
    window='hann',
)

In [106]:
#  Split the complex STFT into its real and imaginary planes and stack them on a new first axis
real, imag = spec.real, spec.imag
channels = np.array((real, imag))


In [107]:
#  Recombine the two stacked planes into a single complex array.
#  `channels.view(dtype=np.complex64)` cannot do this: view() reinterprets adjacent
#  elements along the LAST axis as (real, imag) pairs, whereas here the planes are
#  stacked along the FIRST axis, so the call failed with
#  "ValueError: When changing to a larger dtype, its size must be a divisor of the
#  total size in bytes of the last axis of the array."
(channels[0] + 1j * channels[1]).astype(np.complex64)

In [108]:
#  Check the stacked layout: axis 0 indexes the (real, imag) planes, the remaining axes are spec's own
channels.shape

(2, 513, 753)

In [170]:
#  Locations of the paired recordings
directory_a = 'C:/Users/Toby/MSc_Project/Test_Audio/CHiME/test/Voice'
directory_b = 'C:/Users/Toby/MSc_Project/Test_Audio/CHiME/test/Mixed'

#  Start from an empty graph, then build the input pipeline on top of the file pairs
tf.reset_default_graph()
data = dataset.zip_files(directory_a, directory_b)
data = get_paired_dataset(zipped_files=data,
                          sample_rate=sample_rate,
                          n_fft=n_fft,
                          fft_hop=fft_hop,
                          patch_window=patch_window,
                          patch_hop=patch_hop,
                          n_parallel_readers=n_parallel_readers,
                          batch_size=batch_size,
                          n_shuffle=n_shuffle,
                          normalise=normalise)

#  Iterator over the batches; its initializer has to be run in a session before use
pipe = data.make_initializable_iterator()
#  NOTE(review): directory_a is .../Voice and directory_b is .../Mixed, and the dataset
#  yields the `a` items first. If dataset.zip_files and af.read_audio_pair keep that
#  order, the first tensor unpacked here is the voice (not the mixed) spectrogram -
#  confirm that the names below match the data.
mixed_spec, voice_spec, mixed_audio, voice_audio = pipe.get_next()

#  Placeholder for the training flag
is_training = tf.placeholder(bool, shape=())
#  Drop the last entry along axis 2 of both spectrogram batches
mixed_spec_trim = mixed_spec[:, :, :-1]
voice_spec_trim = voice_spec[:, :, :-1]

In [163]:
#  Inspect the element shapes and dtypes of the batched dataset
data

<PrefetchDataset shapes: ((?, 256, 513, 2), (?, 256, 513, 2), (?, 65280, 1), (?, 65280, 1)), types: (tf.float32, tf.float32, tf.float32, tf.float32)>

In [164]:
#  Static shape of the trimmed spectrogram batch (last entry of axis 2 removed)
mixed_spec_trim.shape

TensorShape([Dimension(None), Dimension(256), Dimension(512), Dimension(2)])

In [165]:
class ComplexNumberCapsNet(object):
    """
    Convolutional-capsule network that maps a two-channel spectrogram batch
    (mixed_spec) to a reconstruction of the same shape, trained with Adam on
    mf.l1_loss between that reconstruction and voice_spec.
    """

    def __init__(self, mixed_spec, voice_spec, is_training, reuse=True, name='complex_number_capsnet'):
        """
        mixed_spec:   Tensor with shape [batch_size, height, width, 2], where the two channels are the real
                      and imaginary parts of the spectrogram; this is the network input
        voice_spec:   Tensor the reconstruction is compared against in the loss
        is_training:  Accepted for interface compatibility; not used by this constructor
        reuse:        Accepted for interface compatibility; not used by this constructor
        name:         Model instance name (used as the enclosing variable scope)
        """
        with tf.variable_scope(name):
            self.mixed_spec = mixed_spec
            self.voice_spec = voice_spec

            # Static spatial/channel sizes of the input (the batch size is ignored)
            _, height, width, channels = mixed_spec.get_shape().as_list()

            with tf.variable_scope('Primary_Caps'):
                # Reshape so that every position holds 1 capsule x [channels] atoms
                self.input_caps = layers.Reshape((height, width, 1, channels))(mixed_spec)

            with tf.variable_scope('Conv_Caps'):
                conv_layer = capsule_layers.ConvCapsuleLayer(kernel_size=5,
                                                             num_capsule=8,
                                                             num_atoms=2,
                                                             strides=1,
                                                             padding='same',
                                                             routings=1,
                                                             name='primarycaps')
                self.conv_caps = conv_layer(self.input_caps)

#            with tf.variable_scope('Seg_Caps'):
#                seg_caps = capsule_layers.ConvCapsuleLayer(kernel_size=1, num_capsule=16, num_atoms=2, strides=1, padding='same',
#                                                           routings=3, name='seg_caps')(conv_caps)
#                self.seg_caps = seg_caps

            with tf.variable_scope('Reconstruction'):
                output_layer = capsule_layers.ConvCapsuleLayer(kernel_size=1,
                                                               num_capsule=1,
                                                               num_atoms=2,
                                                               strides=1,
                                                               padding='same',
                                                               routings=3,
                                                               name='seg_caps')
                single_capsule = output_layer(self.conv_caps)
                # Fold the single remaining capsule away to get back to the input layout
                self.reconstruction = layers.Reshape((height, width, channels))(single_capsule)

            self.cost = mf.l1_loss(self.reconstruction, voice_spec)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
            self.train_op = self.optimizer.minimize(self.cost)

In [171]:
#  Build the network on the trimmed spectrograms.
#  A constant True is passed as the training flag here, not the `is_training` placeholder.
model = ComplexNumberCapsNet(mixed_spec=mixed_spec_trim,
                             voice_spec=voice_spec_trim,
                             is_training=True)

In [172]:
#  Report the static shape at each stage of the network.
#  Note that the 'Input' entry shows `mixed_spec` (the untrimmed batch), while the
#  model was built on `mixed_spec_trim`.
shape_report = (
    ('Layers\' Shapes:\n' '\nInput: ', mixed_spec),
    ('\nPrimary Caps: ', model.input_caps),
    ('\nConv Caps: ', model.conv_caps),
    # ('\nSeg Caps: ', model.seg_caps),
    ('\nRecontruction: ', model.reconstruction),
)
print(*[piece
        for label, tensor in shape_report
        for piece in (label, tensor.get_shape().as_list())])

Layers' Shapes:

Input:  [None, 256, 513, 2] 
Primary Caps:  [None, 256, 512, 1, 2] 
Conv Caps:  [None, 256, 512, 8, 2] 
Recontruction:  [None, 256, 512, 2]


In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#  `pipe` is an initialisable iterator: it must be initialised in THIS session
#  before the first batch can be pulled.
sess.run(pipe.initializer)

#  Build the fetch ops once. Creating tf.complex / slice ops inside the loop would add
#  new nodes to the graph on every iteration.
fetches = [
    model.train_op,
    tf.complex(model.mixed_spec[:, :, :, 0], model.mixed_spec[:, :, :, 1]),
    tf.complex(model.voice_spec[:, :, :, 0], model.voice_spec[:, :, :, 1]),
    # channel 0 is the real part, channel 1 the imaginary part
    tf.complex(model.reconstruction[:, :, :, 0], model.reconstruction[:, :, :, 1]),
    model.cost,
]

print('start: {}'.format(datetime.datetime.now()))
for i in range(5):
    try:
        _, mix, voice, est, cost = sess.run(fetches)
    except tf.errors.OutOfRangeError:
        # Dataset exhausted before 5 steps were done: rewind and carry on
        sess.run(pipe.initializer)
        _, mix, voice, est, cost = sess.run(fetches)

    print('Iteration {i}: {dt}\nCost: {c}'.format(i=i, dt=datetime.datetime.now(), c=cost))

In [185]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(pipe.initializer)

#  Build the fetch ops once. Creating tf.complex / slice ops inside the loop would add
#  new nodes to the graph on every iteration.
fetches = [
    model.train_op,
    tf.complex(model.mixed_spec[:, :, :, 0], model.mixed_spec[:, :, :, 1]),
    tf.complex(model.voice_spec[:, :, :, 0], model.voice_spec[:, :, :, 1]),
    # channel 0 is the real part, channel 1 the imaginary part
    tf.complex(model.reconstruction[:, :, :, 0], model.reconstruction[:, :, :, 1]),
    model.cost,
]

print('start: {}'.format(datetime.datetime.now()))
epochs = 5
epoch = 0
i = 0
while epoch < epochs:
    try:
        _, mix, voice, est, cost = sess.run(fetches)

        print('Iteration {i}: {dt}\nCost: {c}'.format(i=i, dt=datetime.datetime.now(), c=cost))
        i += 1
    except tf.errors.OutOfRangeError:
        # The iterator ran out of batches: that is the end of one epoch
        print('Epoch {e} finished.'.format(e=epoch))
        epoch += 1
        sess.run(pipe.initializer)
        

start: 2018-09-25 15:20:04.450231
Iteration 0: 2018-09-25 15:20:19.044815
Cost: 0.21479789912700653
Iteration 1: 2018-09-25 15:20:33.138691
Cost: 0.19866344332695007
Iteration 2: 2018-09-25 15:20:39.178209
Cost: 0.2425561100244522
Epoch 0 finished.
Iteration 3: 2018-09-25 15:20:54.615719
Cost: 0.21473035216331482
Iteration 4: 2018-09-25 15:21:08.455740
Cost: 0.19859637320041656
Iteration 5: 2018-09-25 15:21:14.624754
Cost: 0.24250520765781403
Epoch 1 finished.
Iteration 6: 2018-09-25 15:21:30.237279
Cost: 0.21468813717365265
Iteration 7: 2018-09-25 15:21:52.734900
Cost: 0.19855459034442902
Iteration 8: 2018-09-25 15:22:02.135098
Cost: 0.2424653321504593
Epoch 2 finished.
Iteration 9: 2018-09-25 15:22:17.318723
Cost: 0.214644193649292
Iteration 10: 2018-09-25 15:22:33.026662
Cost: 0.19849102199077606
Iteration 11: 2018-09-25 15:22:43.276752
Cost: 0.24242104589939117
Epoch 3 finished.
Iteration 12: 2018-09-25 15:23:01.973208
Cost: 0.21458812057971954
Iteration 13: 2018-09-25 15:23:18.469

In [186]:
def _spectrograms_to_waves(spec_batch):
    """
    Invert a batch of complex spectrograms, shaped (batch, frames, bins), to a list of waveforms.

    The network runs on spectrograms whose last frequency bin was trimmed off
    (513 -> 512 bins). librosa.istft derives the FFT size from the number of
    bins, so the trimmed bin is put back as zeros first; otherwise the inverse
    transform would use a different FFT size than the forward one (n_fft).
    """
    missing_bins = (1 + n_fft // 2) - spec_batch.shape[-1]
    if missing_bins > 0:
        pad_widths = [(0, 0)] * (spec_batch.ndim - 1) + [(0, missing_bins)]
        spec_batch = np.pad(spec_batch, pad_widths, mode='constant')
    # istft expects (bins, frames), hence the transpose of every item
    return [librosa.istft(frames.T, hop_length=fft_hop) for frames in spec_batch]


voice_est_wave = _spectrograms_to_waves(est)
voice_wave = _spectrograms_to_waves(voice)
mix_wave = _spectrograms_to_waves(mix)

In [187]:
#  Listen to the network's voice estimate for the first item of the batch
ipd.Audio(voice_est_wave[0].T, rate=sample_rate)

In [181]:
#  Listen to the target voice signal for the first item of the batch
ipd.Audio(voice_wave[0].T, rate=sample_rate)

In [182]:
#  Listen to the mixed input signal for the first item of the batch
ipd.Audio(mix_wave[0], rate=sample_rate)

In [183]:
#  Listen to the voice estimate again (same call as the earlier estimate cell)
ipd.Audio(voice_est_wave[0].T, rate=sample_rate)

In [75]:
#  Reinterpret trailing (real, imag) value pairs as single complex numbers, then drop
#  the length-1 axis that the reinterpretation leaves behind.
#  NOTE(review): this assumes `mix` is a real-valued array whose last axis has length 2
#  (presumably float32, as in the (5, 256, 512, 2) shape shown further down). The training
#  cells fetch `mix` as an already-complex array instead - confirm which `mix` this
#  cell is meant to run on.
mix_reconstr_spec = np.squeeze(mix.view(dtype=np.complex64))

In [122]:
#  Invert every complex spectrogram of the batch back to a waveform.
#  The spectrograms had their last frequency bin trimmed off (513 -> 512 bins).
#  librosa.istft derives the FFT size from the number of bins, so the missing bin is
#  restored as zeros first; otherwise the inverse would not use n_fft.
n_missing_bins = (1 + n_fft // 2) - mix_reconstr_spec.shape[-1]
if n_missing_bins > 0:
    mix_padded_spec = np.pad(
        mix_reconstr_spec,
        [(0, 0)] * (mix_reconstr_spec.ndim - 1) + [(0, n_missing_bins)],
        mode='constant')
else:
    mix_padded_spec = mix_reconstr_spec

#  istft expects (bins, frames), hence the transpose of every item
mix_wave = [librosa.istft(frames.T, hop_length=fft_hop) for frames in mix_padded_spec]

In [103]:
#  Check the layout of the fetched `mix` batch
mix.shape

(5, 256, 512, 2)