# Objectives:

- [x] Import hdf5 file as np.array

- [ ] Build rough draft of U-Net model

- [ ] Make code more reusable/object oriented

# HDF5 to Numpy Array

In [1]:
import h5py
import numpy as np
import os
import tensorflow as tf
from tensorflow.nn import leaky_relu, relu, sigmoid
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, concatenate

Note that `librosa` lays out its output arrays slightly differently from what we want. `128` is the number of mel coefficients we have $\implies$ it should be in `array.shape[2]` rather than in `array.shape[1]`, where it currently sits.

In [2]:
# Read the model input (x) and target output (y) arrays from the .npy
# files that sit next to this notebook.
x = np.load(file='x.npy')
y = np.load(file='y.npy')

We will be "training" on a small subset of the training data to verify that the model actually works. Once that is confirmed, we will run the full training on a GPU.

# SINGING VOICE SEPARATION WITH DEEP U-NET CONVOLUTIONAL NETWORKS

>Our implementation of U-Net is similar to that of [11].
Each encoder layer consists of a strided 2D convolution
of stride 2 and kernel size 5x5, batch normalization, and
leaky rectified linear units (ReLU) with leakiness 0.2. In
the decoder we use strided deconvolution (sometimes
referred to as transposed convolution) with stride 2 and
kernel size 5x5, batch normalization, plain ReLU, and use
50% dropout to the first three layers, as in [11]. In the final
layer we use a sigmoid activation function.

In [3]:
# Sanity-check the shapes of the loaded arrays. Only a small subset of
# 5 examples is used for now, to make sure the code runs end to end
# before scaling up.
print(f'Input Shape:{x.shape} \n Output Shape: {y.shape}')

Input Shape:(5, 128, 4102) 
 Output Shape: (5, 128, 4102)


```python
tf.keras.layers.Conv2D(
    filters, kernel_size, strides=(1, 1), padding='valid', data_format=None,
    dilation_rate=(1, 1), groups=1, activation=None, use_bias=True,
    kernel_initializer='glorot_uniform', bias_initializer='zeros',
    kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
    kernel_constraint=None, bias_constraint=None, **kwargs
)
```

- kernel_size
- strides
- padding
- activation

# U-Net 

In [4]:
# TODO: may need to convert the NumPy arrays to tensors (or declare a `tf.keras.Input`) before feeding them to the model.

In [11]:
class VocalUNet(tf.keras.Model):
    '''
    VocalUNet is the linked paper above translated into a Keras Model class.

    Encoder: six strided convolutions, each followed by batch normalization
    and a leaky ReLU (leakiness 0.2).

    Decoder: six strided transposed convolutions. The first five are each
    followed by batch normalization and a plain ReLU, with 50% dropout after
    the first three, and their output is concatenated with the encoder
    output of matching resolution (skip connection). The sixth and final
    layer has a single filter and a sigmoid activation.

    Parameters:
    -----------
    - kernel_size: (tuple/list)
    Size of kernel for model. Default set to (5,5).

    - strides: (tuple/list)
    Size of stride of kernel. Default set to (2,2).

    NOTES:
    - Every (transposed) convolution uses padding='same'. With the Keras
      default of 'valid', a 5x5 kernel with stride 2 shrinks an axis of
      length 128 as 128 -> 62 -> 29 -> 13 -> 5 -> 1, so the sixth
      convolution could not be applied, and the decoder outputs would not
      line up with the encoder outputs for the skip connections.
    - `call` expects a 4-D, channels-last input
      (batch, height, width, channels). With the default stride of 2,
      height and width must be multiples of 64 (2**6) so that each decoder
      output has the same spatial size as the encoder output it is
      concatenated with.
    - The `training` flag is an argument of `call` (not of `__init__`); it
      is forwarded to the batch normalization and dropout layers.
    '''

    def __init__(self, kernel_size = (5,5), strides = (2,2)):
        # Initialize Model properties
        super(VocalUNet, self).__init__()

        # Settings shared by every convolution / transposed convolution.
        # Activations are NOT attached to the layers (except the last one)
        # because batch normalization has to run between the convolution
        # and its activation; they are applied in `call` instead.
        conv_kwargs = dict(kernel_size = kernel_size, strides = strides,
                           padding = 'same')

        # First Convolution
        self.conv1 = Conv2D(filters = 16, **conv_kwargs)
        self.bn1 = BatchNormalization()
        # Second Convolution
        self.conv2 = Conv2D(filters = 32, **conv_kwargs)
        self.bn2 = BatchNormalization()
        # Third Convolution
        self.conv3 = Conv2D(filters = 64, **conv_kwargs)
        self.bn3 = BatchNormalization()
        # Fourth Convolution
        self.conv4 = Conv2D(filters = 128, **conv_kwargs)
        self.bn4 = BatchNormalization()
        # Fifth Convolution
        self.conv5 = Conv2D(filters = 256, **conv_kwargs)
        self.bn5 = BatchNormalization()
        # Sixth Convolution
        self.conv6 = Conv2D(filters = 512, **conv_kwargs)
        self.bn6 = BatchNormalization()

        # Deconvolve/Convolution Transpose layers:
        # As we deconvolve the layers, we dropout half for the first three
        # deconvolution layers. The last layer uses a sigmoid activation
        # function and is not batch-normalized.

        # Convolution Transpose 1:
        self.convt1 = Conv2DTranspose(filters = 256, **conv_kwargs)
        self.bnt1 = BatchNormalization()
        # Dropout 1:
        self.dropout1 = Dropout(rate = 0.5)
        # Convolution Transpose 2:
        self.convt2 = Conv2DTranspose(filters = 128, **conv_kwargs)
        self.bnt2 = BatchNormalization()
        # Dropout 2:
        self.dropout2 = Dropout(rate = 0.5)
        # Convolution Transpose 3:
        self.convt3 = Conv2DTranspose(filters = 64, **conv_kwargs)
        self.bnt3 = BatchNormalization()
        # Dropout 3:
        self.dropout3 = Dropout(rate = 0.5)
        # Convolution Transpose 4:
        self.convt4 = Conv2DTranspose(filters = 32, **conv_kwargs)
        self.bnt4 = BatchNormalization()
        # Convolution Transpose 5:
        self.convt5 = Conv2DTranspose(filters = 16, **conv_kwargs)
        self.bnt5 = BatchNormalization()
        # Convolution Transpose 6:
        self.convt6 = Conv2DTranspose(filters = 1, activation = sigmoid,
                                      **conv_kwargs)

    def call(self, inputs, training = None):
        '''
        Forward pass of the U-Net.

        Parameters:
        -----------
        - inputs: (tensor)
        4-D, channels-last tensor (batch, height, width, channels). See the
        class NOTES for the size constraint on height and width.

        - training: (bool)
        True or false for differentiating between training/testing.
        Forwarded to the batch normalization and dropout layers.

        Returns:
        --------
        Output of the final sigmoid layer: a single-channel tensor with the
        same batch size, height and width as `inputs` and values in [0, 1].
        '''
        encoder = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
            (self.conv6, self.bn6),
        )
        # Only the first three decoder stages have a dropout layer.
        decoder = (
            (self.convt1, self.bnt1, self.dropout1),
            (self.convt2, self.bnt2, self.dropout2),
            (self.convt3, self.bnt3, self.dropout3),
            (self.convt4, self.bnt4, None),
            (self.convt5, self.bnt5, None),
        )

        # Encoder: convolution -> batch norm -> leaky ReLU. Every output is
        # remembered so the decoder can concatenate it back in.
        features = inputs
        skips = []
        for conv, norm in encoder:
            features = norm(conv(features), training = training)
            features = leaky_relu(features, alpha = 0.2)
            skips.append(features)

        # The deepest encoder output is the decoder's input, not a skip.
        skips.pop()

        # Decoder: transposed convolution -> batch norm -> ReLU
        # (-> dropout), then concatenate the encoder output of the same
        # resolution along the channel axis.
        for deconv, norm, dropout in decoder:
            features = norm(deconv(features), training = training)
            features = relu(features)
            if dropout is not None:
                features = dropout(features, training = training)
            features = tf.concat([features, skips.pop()], axis = -1)

        return self.convt6(features)

In [12]:
# Instantiate the U-Net with its default arguments.
unet = VocalUNet()