In [1]:
#  Import a bunch of stuff
import os
import re
from functools import partial
import numpy as np
import IPython.display as ipd
import tensorflow as tf
import Audio_functions as af
import UNet
import Dataset
import Utils
import librosa

  from ._conv import register_converters as _register_converters


In [9]:
#  Experiment configuration. Every setting, including the filesystem locations,
#  lives in this one literal so the whole configuration can be read in one place.
model_config = {
    # --- Checkpointing / environment ---
    'saving': True,  # Whether to take checkpoints
    'loading': False,  # Whether to load an existing checkpoint
    'local_run': False,  # Whether experiment is running on laptop or server
    'checkpoint_to_load': "18/18-23",

    # --- Audio and STFT ---
    'SAMPLE_RATE': 16384,  # Desired sample rate of audio. Input will be resampled to this
    'N_FFT': 1024,  # Number of samples in each fourier transform
    'FFT_HOP': 256,  # Number of samples between the start of each fourier transform

    # --- Input pipeline ---
    'N_PARALLEL_READERS': 4,
    'PATCH_WINDOW': 256,
    'PATCH_HOP': 128,
    'BATCH_SIZE': 5,
    'N_SHUFFLE': 10,

    # --- Training schedule ---
    'EPOCHS': 5,  # Number of full passes through the dataset to train for
    'EARLY_STOPPING': False,  # Should validation data checks be used for early stopping?
    'VAL_ITERS': 200,  # Number of training iterations between validation checks
    'NUM_WORSE_VAL_CHECKS': 3,  # Number of successively worse validation checks before early stopping
    'NORMALISE_MAG': True,

    # --- Filesystem locations (machine specific) ---
    'data_root': 'C:/Users/Toby/CHiME3/data/audio/16kHz/isolated/',
    'model_base_dir': 'C:/Users/Toby/MSc_Project/MScFinalProjectCheckpoints',
    'log_dir': 'logs/ssh',
}

In [13]:
#  Set other variables
#  These short names are read straight from model_config rather than repeated as
#  literals, so the hand-built pipeline cells below can never drift from the
#  settings used by Dataset.get_paired_dataset / the model.
sample_rate = model_config['SAMPLE_RATE']
n_fft = model_config['N_FFT']
fft_hop = model_config['FFT_HOP']
patch_window = model_config['PATCH_WINDOW']
patch_hop = model_config['PATCH_HOP']
n_parallel_readers = model_config['N_PARALLEL_READERS']
normalise = model_config['NORMALISE_MAG']
batch_size = model_config['BATCH_SIZE']
shuffle = False  # has no model_config counterpart
n_shuffle = model_config['N_SHUFFLE']

In [3]:
#  Build the three datasets from model_config. Dataset.prepare_datasets is project
#  code that is not shown in this notebook.
train_data, val_data, test_data = Dataset.prepare_datasets(model_config)

In [7]:
#  Clear the default graph and open the session that the checkpoint is restored
#  into and that runs inference further down.
tf.reset_default_graph()
sess = tf.Session()

#  One element of train_data unpacks into four tensors: mixture spectrogram,
#  voice spectrogram, mixture audio, voice audio.
mixed, voice, mixed_audio, voice_audio = train_data.make_one_shot_iterator().get_next()

# Create variable placeholders
is_training = tf.placeholder(shape=(), dtype=bool)
#  The last axis is indexed 0 for magnitude and 1 for phase (assuming the
#  (magnitude, phase) layout documented for compute_spectrogram in this notebook).
#  The `1:` slice drops index 0 of the third axis (presumably the DC frequency
#  bin, TODO confirm), and expand_dims re-adds a trailing axis of size 1.
mixed_mag = tf.expand_dims(mixed[:, :, 1:, 0], 3)  # Yet more hacking to get around this tuple problem
mixed_phase = tf.expand_dims(mixed[:, :, 1:, 1], 3)
voice_mag = tf.expand_dims(voice[:, :, 1:, 0], 3)

# Build U-Net model
print('Creating model')
model = UNet.UNetModel(mixed_mag, voice_mag, mixed_phase, mixed_audio, voice_audio, 'unet', is_training, name='U_Net_Model')

Creating model


In [10]:
#  Restore all saveable variables of the current graph into `sess` from the
#  checkpoint named by model_config['checkpoint_to_load'].
#  NOTE(review): this cell restores unconditionally; it does not consult
#  model_config['loading'] (which is False in the config cell) - confirm intended.
print('Loading checkpoint')
checkpoint_path = os.path.join(model_config['model_base_dir'], model_config['checkpoint_to_load'])
restorer = tf.train.Saver()
restorer.restore(sess, checkpoint_path)

Loading checkpoint
INFO:tensorflow:Restoring parameters from C:/Users/Toby/MSc_Project/MScFinalProjectCheckpoints\18/18-23


In [14]:
#  Pull one batch through the model with is_training switched off and fetch the
#  mixture magnitude, mixture phase, reference voice magnitude and estimated voice.
mixture_mag, mix_phase_mag, voice_mag, voice_est_mag = sess.run(
    [model.mixed_mag, model.mixed_phase, model.voice_mag, model.gen_voice],
    {model.is_training: False})

voice_est_wave, voice_wave, mix_wave = [], [], []

#  Convert every patch of the batch back to audio. All three magnitudes are
#  combined with the *mixture* phase.
#  Should we use voice or the original audio? (Might be hard to split into matching patches)
for i in range(voice_mag.shape[0]):
    phase_i = np.squeeze(mix_phase_mag[i, :, :, :]).T
    for magnitudes, waves in ((voice_est_mag, voice_est_wave),
                              (voice_mag, voice_wave),
                              (mixture_mag, mix_wave)):
        waves.append(af.spectrogramToAudioFile(np.squeeze(magnitudes[i, :, :, :]).T, n_fft,
                                               fft_hop, phase=phase_i))

In [15]:
#  The original mixture: first patch of the batch, resynthesised from the mixture
#  magnitude and mixture phase
ipd.Audio(mix_wave[0], rate=sample_rate)

In [16]:
#  The original isolated voice: first patch of the batch, resynthesised from the
#  voice magnitude combined with the mixture phase (not the voice's own phase)
ipd.Audio(voice_wave[0], rate=sample_rate)

In [17]:
#  The model's voice estimate (model.gen_voice) for the same first patch,
#  resynthesised with the mixture phase
ipd.Audio(voice_est_wave[0], rate=sample_rate)

In [None]:
#  Number of resynthesised patches: one per element of the evaluated batch
len(voice_est_wave)

In [None]:
#  Absolute CHiME3 folders for mixture and voice recordings.
#  NOTE(review): neither name is read by any other cell visible in this notebook.
mix_folder = '/data/CHiME3/data/audio/16kHz/isolated/et05_caf_simu'
voice_folder = '/data/CHiME3/data/audio/16kHz/isolated/et05_bth'

In [7]:
#  Local folders holding the mixed (a) and voice-only (b) test files; these are
#  the two directories handed to zip_files below. Machine-specific absolute paths.
directory_a = 'C:/Users/Toby/MSc_Project/Test_Audio/GANdatasetsMini/test/Mixed'
directory_b = 'C:/Users/Toby/MSc_Project/Test_Audio/GANdatasetsMini/test/Voice'

In [8]:
def zip_files(directory_a, directory_b):
    """
    Pair up matching files from two directories.

    Only regular files whose name does not contain 'CH0' are considered. A file
    from directory a matches a file from directory b when their first 13
    characters are equal and either
      * the names have the same length and are also equal from character 17
        onwards (only characters 13-16 may differ), or
      * the names have different lengths.
    Each file from a is paired with the first matching file from b. A b file
    that matched with equal name length is consumed and cannot be paired again;
    a b file matched with differing length stays available for later a files.

    Parameters
    ----------
    directory_a, directory_b : str
        Directories to scan (non-recursive).

    Returns
    -------
    numpy.ndarray of str, shape (n_pairs, 2)
        Column 0 holds '<directory_a>/<file>' and column 1 '<directory_b>/<file>'.
        The array is always 2-D, so `result[:, 0]` is valid even when no pair
        was found (shape (0, 2)).
    """

    def _candidates(directory):
        # Regular files only, skipping any name that contains 'CH0'.
        return [f for f in os.listdir(directory)
                if os.path.isfile(os.path.join(directory, f)) and 'CH0' not in f]

    filelist_a = _candidates(directory_a)
    filelist_b = _candidates(directory_b)

    zipped_list = []

    for file_a in filelist_a:
        for file_b in filelist_b:
            same_length = len(file_a) == len(file_b)
            if file_a[:13] == file_b[:13] and (not same_length or file_a[17:] == file_b[17:]):
                zipped_list.append((directory_a + '/' + file_a, directory_b + '/' + file_b))
                if same_length:
                    # Safe to mutate the list being iterated: we leave the loop right away.
                    filelist_b.remove(file_b)
                break

    # An empty list would otherwise become a 1-D float array; force the
    # documented (n_pairs, 2) string layout in every case.
    return np.array(zipped_list, dtype=str).reshape(-1, 2)

In [9]:
#  Array of (mixed path, voice path) rows for the local test folders
zipped_files  = zip_files(directory_a, directory_b)

In [None]:
def get_paired_dataset(zipped_files,
                       sample_rate,
                       n_fft,
                       fft_hop,
                       patch_window,
                       patch_hop,
                       n_parallel_readers,
                       batch_size,
                       n_shuffle,
                       normalise):
    """
    Build the tf.data pipeline that turns pairs of audio file paths into
    batches of spectrogram and audio patches.

    Stages, in order: slice `zipped_files` column 0 / column 1 into path pairs,
    load both files (read_audio_pair), compute their spectrograms
    (compute_spectrogram_map), cut spectrograms and audio into patches
    (extract_patches_map), flatten the per-file patches with
    Utils.zip_tensor_slices, batch, then shuffle.

    Note that shuffling is applied after batching, so `n_shuffle` is a buffer
    of whole batches, not of individual patches.

    Parameters
    ----------
    zipped_files : 2-D array indexable as [:, 0] and [:, 1] (one path pair per row)
    sample_rate : forwarded to read_audio_pair
    n_fft, fft_hop : forwarded to the spectrogram and patch stages
    patch_window, patch_hop : forwarded to extract_patches_map
    n_parallel_readers : num_parallel_calls for the three map stages
    batch_size : batch size
    n_shuffle : shuffle buffer size
    normalise : forwarded to compute_spectrogram_map

    Returns
    -------
    The resulting tf.data dataset.
    """
    # Bind the per-stage settings up front so the pipeline below reads linearly.
    load_pair = partial(read_audio_pair, sample_rate=sample_rate)
    to_spectrograms = partial(compute_spectrogram_map,
                              n_fft=n_fft,
                              fft_hop=fft_hop,
                              normalise=normalise)
    to_patches = partial(extract_patches_map,
                         n_fft=n_fft,
                         fft_hop=fft_hop,
                         patch_window=patch_window,
                         patch_hop=patch_hop)

    path_pairs = tf.data.Dataset.from_tensor_slices((zipped_files[:, 0], zipped_files[:, 1]))
    waveforms = path_pairs.map(load_pair, num_parallel_calls=n_parallel_readers)
    spectrograms = waveforms.map(to_spectrograms, num_parallel_calls=n_parallel_readers)
    per_file_patches = spectrograms.map(to_patches, num_parallel_calls=n_parallel_readers)

    patches = per_file_patches.flat_map(Utils.zip_tensor_slices)
    return patches.batch(batch_size).shuffle(n_shuffle)

In [None]:
def read_audio_pair(path_a, path_b, sample_rate):
    """
    Load two audio files as mono waveforms inside the TensorFlow graph.

    Takes the paths of two audio files and the required output sample rate and
    returns a 2-tuple of float32 tensors, one per file, each holding the
    waveform resampled to `sample_rate` with a trailing axis of size 1 added.
    Only the waveforms are returned; the paths are not passed through.
    """
    def _load_mono(raw_path):
        # librosa resamples to `sample_rate` and mixes down to one channel.
        waveform, _ = librosa.load(raw_path, sr=sample_rate, mono=True)
        return waveform[:, np.newaxis]

    audio_a = tf.py_func(_load_mono, [path_a], tf.float32, stateful=False)
    audio_b = tf.py_func(_load_mono, [path_b], tf.float32, stateful=False)
    return (audio_a, audio_b)

In [None]:
def compute_spectrogram_map(audio_a, audio_b, n_fft, fft_hop, normalise=False):
    """
    Dataset map step: compute the spectrogram of each waveform of a pair.

    Returns a 4-tuple (spec_a, spec_b, audio_a, audio_b): the two spectrograms
    from compute_spectrogram followed by the untouched input waveforms.
    """
    spectrograms = tuple(compute_spectrogram(waveform, n_fft, fft_hop, normalise)
                         for waveform in (audio_a, audio_b))
    return spectrograms + (audio_a, audio_b)


def extract_patches_map(spec_a, spec_b, audio_a, audio_b, n_fft, fft_hop, patch_window, patch_hop):
    """
    Dataset map step: cut both spectrograms and both waveforms into patches.

    Returns a 4-tuple (patches_a, patches_b, audio_patches_a, audio_patches_b),
    produced by extract_spectrogram_patches and extract_audio_patches.
    """
    spectrogram_patches = tuple(
        extract_spectrogram_patches(spec, n_fft, patch_window, patch_hop)
        for spec in (spec_a, spec_b))
    waveform_patches = tuple(
        extract_audio_patches(waveform, fft_hop, patch_window, patch_hop)
        for waveform in (audio_a, audio_b))

    return spectrogram_patches + waveform_patches



In [None]:
def compute_spectrogram(audio, n_fft, fft_hop, normalise=False):
    '''
    Compute the magnitude/phase spectrogram of the first channel of `audio`.

    Parameters
    ----------
    audio : audio tensor shaped (n_samples, n_channels); only channel 0 is used
    n_fft : number of samples in each fourier transform
    fft_hop : number of samples between the start of successive transforms
    normalise : if True, min-max scale the magnitude of this clip to [0, 1].
        A clip whose magnitude is constant (e.g. digital silence) is mapped to
        all zeros instead of dividing by a zero range.

    Returns
    -------
    Tensor of shape (n_frames, 1 + n_fft // 2, 2), where the last dimension is (magnitude, phase)
    '''
    # Integer bin count: `1 + n_fft / 2` is a float under Python 3 and must not
    # be used as a static dimension.
    n_bins = 1 + int(n_fft) // 2

    def stft(x, normalise):
        spec = librosa.stft(
            x, n_fft=n_fft, hop_length=fft_hop, window='hann')
        mag = np.abs(spec)
        if normalise:
            lowest = mag.min()
            span = mag.max() - lowest
            mag = mag - lowest
            # Guard the division: a zero span would turn every value into NaN.
            if span > 0:
                mag = mag / span
        return mag, np.angle(spec)


    def mono_func(py_audio, normalise):
        mag, phase = stft(py_audio[:, 0], normalise)
        # Stack to (2, bins, frames) and reverse the axes -> (frames, bins, 2).
        ret = np.array([mag, phase]).T
        return ret.astype(np.float32)


    with tf.name_scope('read_spectrogram'):
        ret = tf.py_func(mono_func, [audio, normalise], tf.float32, stateful=False)
        # py_func output has no static shape; restore what is known.
        ret.set_shape([None, n_bins, 2])
    return ret


def extract_spectrogram_patches(
        spec, n_fft, patch_window, patch_hop):
    '''
    Cut a spectrogram into fixed-length, possibly overlapping patches along
    the frame axis.

    Parameters
    ----------
    spec : Spectrogram of shape (n_frames, 1 + n_fft // 2, 2)
    n_fft : number of samples in each fourier transform (fixes the bin count)
    patch_window : number of frames per patch
    patch_hop : number of frames between the starts of successive patches

    Returns
    -------
    Tensor of shape (n_patches, patch_window, 1 + n_fft // 2, 2)
        containing patches from spec. Padding is 'VALID', so trailing frames
        that do not fill a whole patch are dropped.
    '''
    with tf.name_scope('extract_spectrogram_patches'):
        # One integer bin count used everywhere; `1 + n_fft / 2` is a float
        # under Python 3 and ksizes/strides are integer lists.
        n_bins = 1 + int(n_fft) // 2

        # Add a batch axis so the spectrogram looks like one image of
        # height n_frames, width n_bins and 2 channels.
        spec4d = tf.expand_dims(spec, 0)

        # Each patch spans the full width, so the window only slides along frames.
        patches = tf.extract_image_patches(
            spec4d, ksizes=[1, patch_window, n_bins, 1],
            strides=[1, patch_hop, n_bins, 1],
            rates=[1, 1, 1, 1],
            padding='VALID'
        )

        num_patches = tf.shape(patches)[1]

        return tf.reshape(patches, [num_patches, patch_window,
                                    n_bins, 2])

    
def extract_audio_patches(audio, fft_hop, patch_window, patch_hop):
    '''
    Cut a waveform into the audio segments that correspond to the spectrogram
    patches produced with the same patch_window / patch_hop.

    Parameters
    ----------
    audio : Waveform audio of shape (n_samples, 1)
    fft_hop : number of samples between successive spectrogram frames
    patch_window : number of spectrogram frames per patch
    patch_hop : number of spectrogram frames between successive patch starts

    Returns
    -------
    Tensor of shape (n_patches, (patch_window - 1) * fft_hop, 1) containing
    patches from audio.
    '''
    with tf.name_scope('extract_audio_patches'):
        # Shape (1, 1, n_samples, 1): a one-row "image" whose width is time.
        audio4d = tf.expand_dims(tf.expand_dims(audio, 0), 0)
        patch_length = (patch_window - 1) * fft_hop
        # Successive spectrogram patches start patch_hop frames apart and
        # successive frames are fft_hop samples apart, so audio patches must
        # advance by patch_hop * fft_hop samples. Using (patch_hop - 1) here
        # makes each audio patch start fft_hop samples earlier than the last
        # relative to its spectrogram patch, and can change the patch count.
        patch_hop_length = patch_hop * fft_hop

        patches = tf.extract_image_patches(
            audio4d, ksizes=[1, 1, patch_length, 1],
            strides=[1, 1, patch_hop_length, 1],
            rates=[1, 1, 1, 1],
            padding='VALID'
        )

        num_patches = tf.shape(patches)[2]

        return tf.squeeze(tf.reshape(patches, [num_patches, 1, patch_length, 1]), 1)

In [None]:
#  Step 1 of building the pipeline by hand: a dataset of (path_a, path_b) pairs
#  taken from column 0 and column 1 of zipped_files
test_pipeline = tf.data.Dataset.from_tensor_slices((zipped_files[:,0],zipped_files[:,1]))
test_pipeline

In [None]:
#  Step 2: load each pair of files as waveforms at sample_rate
test_pipeline = test_pipeline.map(partial(read_audio_pair,sample_rate=sample_rate), num_parallel_calls=n_parallel_readers)
test_pipeline

In [None]:
#  Step 3: add the spectrogram of each waveform; elements become
#  (spec_a, spec_b, audio_a, audio_b)
test_pipeline = test_pipeline.map(partial(compute_spectrogram_map,
                                          n_fft=n_fft,
                                          fft_hop=fft_hop,
                                          normalise=normalise),
                                  num_parallel_calls=n_parallel_readers)
test_pipeline

In [None]:
#  Step 4: cut spectrograms and waveforms into patches.
#  fft_hop is a required argument of extract_patches_map (it sizes the audio
#  patches), so it has to be bound here as well.
test_pipeline = test_pipeline.map(partial(extract_patches_map,
                                          n_fft=n_fft,
                                          fft_hop=fft_hop,
                                          patch_window=patch_window,
                                          patch_hop=patch_hop))
test_pipeline

In [None]:
#  Step 5: flatten the per-file patch tensors with Utils.zip_tensor_slices
#  (project helper, not shown here)
test_pipeline = test_pipeline.flat_map(Utils.zip_tensor_slices)
test_pipeline

In [None]:
#  Rebuild the whole pipeline in one call with the notebook's own
#  get_paired_dataset; this replaces the step-by-step test_pipeline.
test_pipeline = get_paired_dataset(
    zipped_files=zipped_files,
    sample_rate=sample_rate,
    n_fft=n_fft,
    fft_hop=fft_hop,
    patch_window=patch_window,
    patch_hop=patch_hop,
    n_parallel_readers=n_parallel_readers,
    batch_size=batch_size,
    n_shuffle=n_shuffle,
    normalise=normalise,
)
test_pipeline

In [None]:
#  Tensor tuple that yields the next batch of test_pipeline each time it is run
value = test_pipeline.make_one_shot_iterator().get_next()

In [None]:
#  NOTE(review): this rebinds `sess`, replacing the session created earlier in
#  the notebook that the checkpoint was restored into.
sess = tf.Session()

In [None]:
#  Evaluate one batch of the hand-built pipeline as numpy arrays
val = sess.run(value)

In [None]:
#  Shape of the fourth element of the batch (the audio patches of file b)
val[3].shape

In [None]:
#  Same array with every size-1 axis removed
np.squeeze(val[3]).shape

In [10]:
#  Same pipeline, but built by the project's Dataset module and configured
#  entirely from model_config
train = Dataset.get_paired_dataset(zipped_files,
                                   model_config['SAMPLE_RATE'],
                                   model_config['N_FFT'],
                                   model_config['FFT_HOP'],
                                   model_config['PATCH_WINDOW'],
                                   model_config['PATCH_HOP'],
                                   model_config['N_PARALLEL_READERS'],
                                   model_config['BATCH_SIZE'],
                                   model_config['N_SHUFFLE'],
                                   model_config['NORMALISE_MAG'])
train

<ShuffleDataset shapes: ((?, 256, 513, 2), (?, 256, 513, 2), (?, 65280, 1), (?, 65280, 1)), types: (tf.float32, tf.float32, tf.float32, tf.float32)>

In [17]:
#  Scratch check: .T reverses the axis order, so (1, 65280, 5) becomes (5, 65280, 1)
arr = np.zeros((1, 65280, 5))
arr.T.shape

(5, 65280, 1)