In [6]:
#  Import a bunch of stuff
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['image.cmap'] = 'hot'
import IPython.display as ipd

import audio_functions as af
import model_functions as mf
import librosa
import tensorflow as tf
from keras import layers
from SegCaps import capsule_layers

In [7]:
#  Set some variables
sample_rate=16384
n_fft=1024
fft_hop=256
patch_window=256
patch_hop=128
n_parallel_readers=4
normalise=True
batch_size = 5
shuffle=False
n_shuffle = 1

In [None]:
# Load a pair of files and convert to spectrogram
mix_file = 'C:/Users/Toby/MSc_Project/Test_Audio/CHiME/test/Mixed/F01_423C020U_BUS.CH1.wav'
voice_file = 'C:/Users/Toby/MSc_Project/Test_Audio/CHiME/test/Voice/F01_423C020U_BTH.CH1.wav'

In [8]:


mix_audio, _ = librosa.core.load(mix_file, sample_rate)
voice_audio, _ = librosa.core.load(voice_file, sample_rate)

mix_spec = librosa.stft(mix_audio, n_fft=n_fft, hop_length=fft_hop, window='hann')
voice_spec = librosa.stft(voice_audio, n_fft=n_fft, hop_length=fft_hop, window='hann')

# Split real and imaginary parts into separate channels
mixed_spec_channels = np.concatenate((np.expand_dims(mix_spec.real, axis=2), np.expand_dims(mix_spec.imag, axis=2)), axis=2)
voice_spec_channels = np.concatenate((np.expand_dims(voice_spec.real, axis=2), np.expand_dims(voice_spec.imag, axis=2)), axis=2)

mixed_spec_tensor = tf.convert_to_tensor(mixed_spec_channels)
voice_spec_tensor = tf.convert_to_tensor(voice_spec_channels)

In [11]:
mixed_spec_tensor.shape

TensorShape([Dimension(513), Dimension(492), Dimension(2)])

In [15]:
class ComplexNumberCapsNet(object):
    
    def __init__(self, mixed_spec, voice_spec, is_training, reuse=True, name='complex_number_capsnet'):
        """
        input_tensor: Tensor with shape [batch_size, height, width, 2], where the two channels are the real 
                      and imaginary parts of the spectrogram
        is_training:  Boolean - should the model be trained on the current input or not
        name:         Model instance name
        """
        with tf.variable_scope(name):
            self.mixed_spec = mixed_spec
            self.voice_spec = voice_spec
            
            with tf.variable_scope('Primary_Caps'):
                
                # Reshape layer to be 1 capsule x [filters] atoms
                _, H, W, C = mixed_spec.get_shape()
                input_caps = layers.Reshape((H.value, W.value, 1, C.value))(mixed_spec)
                self.input_caps = input_caps
            
            with tf.variable_scope('Conv_Caps'):
                conv_caps = capsule_layers.ConvCapsuleLayer(kernel_size=5, num_capsule=8, num_atoms=32, strides=1, padding='same',
                                                               routings=1, name='primarycaps')(input_caps)
                self.conv_caps = conv_caps
                
            with tf.variable_scope('Seg_Caps'):
                seg_caps = capsule_layers.ConvCapsuleLayer(kernel_size=1, num_capsule=1, num_atoms=16, strides=1, padding='same',
                                                           routings=3, name='seg_caps')(conv_caps)
                self.seg_caps = seg_caps
            
#            with tf.variable_scope('Reconstruction'):
#                reconstruction = capsule_layers.ConvCapsuleLayer(kernel_size=1, num_capsule=1, num_atoms=1, strides=1, padding='same',
#                                                           routings=3, name='seg_caps')(seg_caps)
#                reconstruction = tf.squeeze(reconstruction,-1)
#                self.reconstruction = reconstruction
            
#            self.cost = mf.l1_loss(self.reconstruction, voice_spec)

#            self.optimizer = tf.train.AdamOptimizer(
#                learning_rate=0.0002,
#                beta1=0.5,
#            )
#            self.train_op = self.optimizer.minimize(self.cost)

In [16]:
model = ComplexNumberCapsNet(mixed_spec_tensor, voice_spec_tensor, is_training=True)

ValueError: Cannot reshape a tensor with 504792 elements to shape [513,513,492,1,2] (258958296 elements) for 'complex_number_capsnet_1/Primary_Caps/reshape_1/Reshape' (op: 'Reshape') with input shapes: [513,492,2], [5] and with input tensors computed as partial shapes: input[1] = [513,513,492,1,2].

In [57]:
print('Layers\' Shapes:\n'
      '\nInput: ', mixed_spec.get_shape().as_list(),
      '\nPrimary Caps: ',model.input_caps.get_shape().as_list(),
      '\nConv Caps: ',model.conv_caps.get_shape().as_list(),
      '\nSeg Caps: ',model.seg_caps.get_shape().as_list(),
#      '\nRecontruction: ',model.reconstruction.get_shape().as_list()
     )

Layers' Shapes:

Input:  [513, 492, 2, 2] 
Primary Caps:  [513, 492, 2, 1, 2] 
Conv Caps:  [None, 492, 2, 8, 32] 
Seg Caps:  [None, 492, 2, 1, 16]
