# Objectives:

- [x] Import hdf5 file as np.array

[ ] Build rough draft of U-Net model

[ ] Make code more reusable/object oriented

# HDF5 to Numpy Array

In [1]:
import h5py
import numpy as np
import os
import tensorflow as tf
from tensorflow.nn import leaky_relu, relu, sigmoid
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, concatenate

Note that `librosa` shapes the output arrays slightly differently than we want. `128` represents the number of mel coefficients we have $\implies$ that should be in `array.shape[2]` rather than where it is now in `array.shape[1]`.

In [2]:
x = np.load('x.npy')
y = np.load('y.npy')

We will be "training" on a subset of the training to verify that the model actually works. Once complete, we will run it on a GPU.

# SINGING VOICE SEPARATION WITH DEEP U-NET CONVOLUTIONAL NETWORKS

>Our implementation of U-Net is similar to that of [11].
Each encoder layer consists of a strided 2D convolution
of stride 2 and kernel size 5x5, batch normalization, and
leaky rectified linear units (ReLU) with leakiness 0.2. In
the decoder we use strided deconvolution (sometimes re-
ferred to as transposed convolution) with stride 2 and ker-
nel size 5x5, batch normalization, plain ReLU, and use
50% dropout to the first three layers, as in [11]. In the final
layer we use a sigmoid activation function

In [3]:
# Train on 5 to ensure code scales.
print(f'Input Shape:{x.shape} \n Output Shape: {y.shape}')

Input Shape:(5, 128, 4102) 
 Output Shape: (5, 128, 4102)


```tf.keras.layers.Conv2D(
    filters, kernel_size, strides=(1, 1), padding='valid', data_format=None,
    dilation_rate=(1, 1), groups=1, activation=None, use_bias=True,
    kernel_initializer='glorot_uniform', bias_initializer='zeros',
    kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
    kernel_constraint=None, bias_constraint=None, **kwargs
)```

- kernel_size
- strides
- padding
- activation

# U-Net 

In [8]:
class VocalUNet(tf.keras.Model):
    def __init__(self, kernel_size = (5,5), strides = (2,2)):
        # Initialize Model properties
        super(VocalUNet, self).__init__()
        '''
        VocalUNet is the linked paper above translated into Keras Model classes.

        Parameters:
        -----------
        - kernel_size: (tuple/list)
        Size of kernel for model. Default set to (5,5).

        - strides: (tuple/list)
        Size of stride of kernel. Default set to (2,2).
        
        A Note on Batch Normalization and Dropout layers:
        ------------------------------------------------
        Batch Normalization and Dropouts are used throughout the
        convolution and convolution transpose. Syntax among the 
        TensorFlow Model Classes dictates that these layers - even
        if the same - are initalized numerous times. However, in 
        the Vocal U-Net, these layers are the same. I will initalize
        them once and reuse them throughout the call() function.
        
        NOTES:
        - need to know what to do with training paramter for dropout.
        '''
        # Batch Normalization layer:
        self.bn = BatchNormalization()
        # Dropout layer:
        self.dropout = Dropout(0.5)
        # First Convolution
        self.conv1 = Conv2D(filters = 16, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        # Second Convolution
        self.conv2 = Conv2D(filters = 32, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        # Third Convolution
        self.conv3 = Conv2D(filters = 64, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        # Fourth Convolution
        self.conv4 = Conv2D(filters = 128, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        # Fifth Convolotion
        self.conv5 = Conv2D(filters = 256, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        # Sixth Convolution
        self.conv6 = Conv2D(filters = 512, kernel_size = kernel_size,
        strides = strides, activation = leaky_relu)
        '''
        Deconvolve/Convolution Transpose layers:
        As we deconvolve the layers, we dropout half for the first three
        deconvolution layers. The rest follows typical procedure of
        deconvoloution with sigmoid activation function.
        '''
        # Convolution Transpose 1:
        self.convt1 = Conv2DTranspose(filters = 256, kernel_size = kernel_size,
        strides = strides, activation = relu)
        # Convolution Transpose 2:
        self.convt2 = Conv2DTranspose(filters = 128,kernel_size = kernel_size,
        strides = strides, activation = relu)
        # Convolution Transpose 3:
        self.convt3 = Conv2DTranspose(filters = 64,kernel_size = kernel_size,
        strides = strides, activation = relu)
        # Convolution Transpose 4:
        self.convt4 = Conv2DTranspose(filters = 32,kernel_size = kernel_size,
        strides = strides, activation = relu)
        # Convolution Transpose 5:
        self.convt5 = Conv2DTranspose(filters = 16,kernel_size = kernel_size,
        strides = strides, activation = relu)
        # Convolution Transpose 6:
        self.convt6 = Conv2DTranspose(filters = 1, kernel_size = kernel_size,
        strides = strides, activation = sigmoid)
    def call(self, inputs):
        '''
        Again, typical syntax on TensorFlow would prefer x as the variable
        name. However, if I keep this syntax below, concatenating specific
        layers will be a mess. To avoid this, I will name each layer in the
        forward pass as l1, l2, etc.
        '''
        l1 = self.bn(self.conv1(inputs))
        l2 = self.bn(self.conv2(l1))
        l3 = self.bn(self.conv3(l2))
        l4 = self.bn(self.conv4(l3))
        l5 = self.bn(self.conv5(l4))
        l6 = self.bn(self.conv6(l5))
        l7 = self.bn(self.convt1(l6))
        l8 = self.dropout(l7)
        l9 = self.bn(self.convt2(concatenate([l8, l5])))
        l10 = self.dropout(l9)
        l11 = self.bn(self.convt3(concatenate([l10, l4])))
        l12 = self.dropout(l11)
        l13 = self.bn(self.convt4(concatenate([l12, l3])))
        l14 = self.bn(self.convt5(concatenate([l13, l2])))
        l15 = self.convt6(concatenate([l14,l1]))

In [9]:
unet = VocalUNet()

In [7]:
# Awesome error.
# Learning the structure of these models is useful.
unet.train_on_batch(x, y)

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.