# Objectives:

- [x] Import hdf5 file as np.array

- [ ] Build rough draft of U-Net model

- [ ] Make code more reusable/object oriented

# HDF5 to Numpy Array

In [1]:
import h5py
import numpy as np
import os

Note that `librosa` orders the axes of its output arrays differently from what we want. `128` is the number of mel coefficients, so it should end up in `array.shape[2]` rather than in `array.shape[1]`, where it is now.

In [2]:
def loader(path, key):
    """Read one dataset from an HDF5 file into memory.

    Parameters
    ----------
    path : location of the HDF5 file; it is opened read-only.
    key : name of the dataset to fetch from the file.

    Returns
    -------
    numpy.ndarray containing the dataset's values converted to float64.
    """
    with h5py.File(path, 'r') as hdf_file:
        dataset = hdf_file[key]
        # Copy the values into a NumPy array while the file is still open.
        return np.array(dataset, dtype=np.float64)

In [3]:
# Location of the HDF5 file. The default is the original absolute path; set the
# TEST_HDF5_PATH environment variable to run this notebook on another machine
# without editing the cell.
path = os.environ.get(
    'TEST_HDF5_PATH',
    '/home/asabra/GitHub/Audio-Source-Separation-Undergraduate-Thesis/data/Test.hdf5',
)
# Load in hdf5 datasets as np.array
mixture = loader(path, 'mixture')
target = loader(path, 'target')

print(f'Mixture Shape: {mixture.shape} \n Target Shape: {target.shape}')

Mixture Shape: (233, 128, 4102) 
 Target Shape: (233, 128, 4102)


We will be "training" on a small subset of the data to verify that the model actually works. Once that is verified, we will run it on a GPU.

In [4]:
# Keep only the first 10 examples of each array for a quick sanity run.
testmix, testvoc = mixture[:10], target[:10]
print(f'Mixture Shape:{testmix.shape} \n Target Shape: {testvoc.shape}')

Mixture Shape:(10, 128, 4102) 
 Target Shape: (10, 128, 4102)


# SINGING VOICE SEPARATION WITH DEEP U-NET CONVOLUTIONAL NETWORKS

> Our implementation of U-Net is similar to that of [11].
Each encoder layer consists of a strided 2D convolution
of stride 2 and kernel size 5x5, batch normalization, and
leaky rectified linear units (ReLU) with leakiness 0.2. In
the decoder we use strided deconvolution (sometimes
referred to as transposed convolution) with stride 2 and
kernel size 5x5, batch normalization, plain ReLU, and use
50% dropout to the first three layers, as in [11]. In the final
layer we use a sigmoid activation function.

In [16]:
# Swap axes 1 and 2 so each example goes from (128, 4102) to (4102, 128), then
# append a trailing channel axis of size 1 for Conv2D.
# A transpose is required here: `reshape` to the same target shape would only
# reinterpret the flat buffer and mix values across the two axes.
# Re-running this cell on the already 4-D arrays raises a ValueError instead of
# silently rearranging the data a second time.
testmix = np.transpose(testmix, (0, 2, 1))[..., np.newaxis]
testvoc = np.transpose(testvoc, (0, 2, 1))[..., np.newaxis]

In [17]:
# Show both array shapes on one line to confirm the new layout.
print(*(arr.shape for arr in (testmix, testvoc)))

(10, 4102, 128, 1) (10, 4102, 128, 1)


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import LeakyReLU

In [9]:
# Per-example input size passed to the first layer: 4102 rows by 128 columns.
img_rows = 4102
img_cols = 128

In [None]:
# Only `Sequential` is imported (not the `keras` module), so build the model
# from that name directly.
model = Sequential()
# First convolution: 16 filters with a 5x5 kernel over single-channel inputs of
# size (img_rows, img_cols).
model.add(Conv2D(16, kernel_size=(5, 5),
                 input_shape=(img_rows, img_cols, 1)))
# 'leakyReLU' is not a valid activation string, so the leaky ReLU is added as
# its own layer. The 0.2 slope is the leakiness given in the paper excerpt above;
# it is passed positionally because the keyword name differs between Keras versions.
model.add(LeakyReLU(0.2))