# Objectives:

- [x] Import hdf5 file as np.array

- [ ] Build rough draft of U-Net model

- [ ] Make code more reusable/object-oriented

# HDF5 to Numpy Array

In [1]:
import os

import h5py
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization, LeakyReLU, Dropout
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, ReLU, Activation
from tensorflow.keras.models import Sequential

Note that `librosa` lays out its output arrays differently from what we want: `128` is the number of mel coefficients, so it should be in `array.shape[2]` rather than in `array.shape[1]`, where it is now.

In [2]:
def loader(path, key):
    """Read one dataset from an HDF5 file into memory.

    Parameters
    ----------
    path
        Location of the HDF5 file; it is opened read-only.
    key
        Name of the dataset inside the file.

    Returns
    -------
    numpy.ndarray
        The dataset's contents converted to ``float64``.
    """
    with h5py.File(path, mode='r') as hdf:
        dataset = hdf[key]
        # Materialize while the file is still open; the handle is closed on exit.
        return np.array(dataset, dtype=np.float64)

In [3]:
path = '/home/asabra/GitHub/Audio-Source-Separation-Undergraduate-Thesis/data/Test.hdf5'
# Read both datasets from the HDF5 file into NumPy arrays, mixture first.
mixture, target = (loader(path, dataset_name)
                   for dataset_name in ('mixture', 'target'))

print(
    f'Mixture Shape: {mixture.shape} \n'
    f' Target Shape: {target.shape}'
)

OSError: Unable to open file (unable to open file: name = '/home/asabra/GitHub/Audio-Source-Separation-Undergraduate-Thesis/data/Test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

We will be "training" on a small subset of the training data to verify that the model actually works. Once that is done, we will run it on a GPU.

# SINGING VOICE SEPARATION WITH DEEP U-NET CONVOLUTIONAL NETWORKS

>Our implementation of U-Net is similar to that of [11].
Each encoder layer consists of a strided 2D convolution
of stride 2 and kernel size 5x5, batch normalization, and
leaky rectified linear units (ReLU) with leakiness 0.2. In
the decoder we use strided deconvolution (sometimes
referred to as transposed convolution) with stride 2 and
kernel size 5x5, batch normalization, plain ReLU, and use
50% dropout to the first three layers, as in [11]. In the final
layer we use a sigmoid activation function.

In [None]:
# Small 5-example subset used to sanity-check the model before a full run.
testmix = mixture[:5, :, :]
testvoc = target[:5, :, :]
# Move the mel axis from axis 1 to axis 2 (see the note on librosa's layout in
# the "HDF5 to Numpy Array" section). This has to be an axis swap:
# `reshape` keeps the underlying element order and only relabels the
# dimensions, which would scramble every spectrogram. The trailing new axis
# makes the arrays 4D, adding a single channel dimension.
testmix = np.swapaxes(testmix, 1, 2)[..., np.newaxis]
testvoc = np.swapaxes(testvoc, 1, 2)[..., np.newaxis]

In [None]:
# Sanity check: show the shapes of the 5-example subset before training on it.
print(
    f'Mixture Shape:{testmix.shape} \n'
    f' Target Shape: {testvoc.shape}'
)

```python
tf.keras.layers.Conv2D(
    filters, kernel_size, strides=(1, 1), padding='valid', data_format=None,
    dilation_rate=(1, 1), groups=1, activation=None, use_bias=True,
    kernel_initializer='glorot_uniform', bias_initializer='zeros',
    kernel_regularizer=None, bias_regularizer=None, activity_regularizer=None,
    kernel_constraint=None, bias_constraint=None, **kwargs
)
```

- kernel_size
- strides
- padding
- activation

## Good Example of How Bad Code Helps You Think.

In [None]:
# kernel_size = (5,5)
# strides = (2,2)
# leaky_alpha = 0.2

# # Initialize Model
# model = Sequential()
# # First Convolution
# model.add(Conv2D(16, kernel_size, strides))
# model.add(BatchNormalization())
# model.add(LeakyReLU(leaky_alpha))
# # Second Convolution
# model.add(Conv2D(32))
# model.add(BatchNormalization())
# model.add(LeakyReLU(leaky_alpha))
# # Third Convolution
# model.add(Conv2D(64))
# model.add(BatchNormalization())
# model.add(LeakyReLU(leaky_alpha))
# # Fourth Convolution
# model.add(Conv2D(128))
# # Fifth Convolution
# model.add(Conv2D(256))
# # Final Convolution
# model.add(Conv2D(512))
# model.add(Conv2DTranspose(256))
# model.add(Conv2DTranspose(128))
# model.add(Conv2DTranspose(64))
# model.add(Conv2DTranspose(32))
# model.add(Conv2DTranspose(16))
# # "flatten" or conv2dtranspose(1) ?

# Load Data

In [None]:
# Get hdf5 in data folder
# NOTE(review): this folder differs from the path used in the earlier loading
# cell ('Github' vs 'GitHub', and a trailing '-' on the repo name) -- confirm
# which one exists on disk.
data_folder_path = '/home/asabra/Github/Audio-Source-Separation-Undergraduate-Thesis-/data'
for dev_test in ['Dev', 'Test']:
    true_path = os.path.join(data_folder_path, dev_test + '.hdf5')
    with h5py.File(true_path, 'r') as f:
        # An h5py.File is indexed by dataset name, not by slice: pick the
        # dataset first, then slice it so only the first 10 examples are read.
        # NOTE(review): 'mixture' is an assumption; the earlier cell also reads
        # a 'target' dataset from Test.hdf5 -- confirm which one is wanted here
        # and that Dev.hdf5 uses the same keys.
        subset = tf.convert_to_tensor(f['mixture'][0:10], dtype = 'float64')
    # NOTE(review): the Dev -> `test` / Test -> `train` mapping is kept from the
    # original cell but looks inverted -- confirm before training.
    if dev_test == 'Dev':
        test = subset
    else:
        train = subset

In [None]:
# nicer looking code
class VocalUNet(tf.keras.Model):
    """Encoder layers of a U-Net for vocal separation (rough draft).

    Builds six encoder stages, ``conv1`` to ``conv6``, with 16, 32, 64, 128,
    256 and 512 filters. Each stage is a ``Sequential`` of a strided ``Conv2D``,
    a ``BatchNormalization`` and a ``LeakyReLU`` with slope 0.2, following the
    encoder description quoted in this notebook. Only layer construction is
    done here; no forward pass is defined in the part of the class shown.

    Parameters
    ----------
    inputs
        Kept so existing calls keep working, but not used: Keras layers receive
        their input tensor when they are called, not when they are constructed.
    kernel_size
        Kernel size forwarded to every ``Conv2D``.
    strides
        Strides forwarded to every ``Conv2D``.
    """

    def __init__(self, inputs, kernel_size, strides):
        super(VocalUNet, self).__init__()

        def encoder_stage(filters):
            # Normalization and activation are separate layers applied after
            # the convolution; they cannot be nested as constructor arguments.
            return Sequential([
                Conv2D(filters = filters, kernel_size = kernel_size,
                       strides = strides),
                BatchNormalization(),
                LeakyReLU(0.2),
            ])

        self.conv1 = encoder_stage(16)
        self.conv2 = encoder_stage(32)
        self.conv3 = encoder_stage(64)
        self.conv4 = encoder_stage(128)
        self.conv5 = encoder_stage(256)
        self.conv6 = encoder_stage(512)