In [1]:
#  Import a bunch of stuff
import os
import re
import numpy as np
import IPython.display as ipd
import tensorflow as tf
import Audio_functions as af
import UNet
import Dataset

In [2]:
model_config = {"saving": True,  # Whether to take checkpoints
                    "loading": False,  # Whether to load an existing checkpoint
                    "local_run": False,  # Whether experiment is running on laptop or server
                    "checkpoint_to_load": "196363/196363-1801",
                    'SAMPLE_RATE': 16000,  # Desired sample rate of audio. Input will be resampled to this
                    'N_FFT': 1024,  # Number of samples in each fourier transform
                    'FFT_HOP': 256,  # Number of samples between the start of each fourier transform
                    'N_CHANNELS' : 1,  # May be removed - all data is single channel
                    'N_PARALLEL_READERS': 4,
                    'PATCH_WINDOW': 256,
                    'PATCH_HOP': 128,
                    'BATCH_SIZE': 50,
                    'N_SHUFFLE': 50,
                    'EPOCHS': 5,  # Number of full passes through the dataset to train for
                    'EARLY_STOPPING': False,  # Should validation data checks be used for early stopping?
                    'VAL_ITERS': 200,  # Number of training iterations between validation checks,
                    'NUM_WORSE_VAL_CHECKS': 3,  # Number of successively worse validation checks before early stopping,
                    'NORMALISE_MAG': False
                    }
model_config['data_root'] = '/data/CHiME3/data/audio/16kHz/isolated/'
model_config['model_base_dir'] = '/home/enterprise.internal.city.ac.uk/acvn728/checkpoints'
model_config['log_dir'] = 'logs/ssh'

In [3]:
#  Set other variables
sample_rate=16000
n_fft=1024
fft_hop=256
n_channels=1
patch_window=256
patch_hop=128
n_parallel_readers=4
normalise=False
batch_size = 10
shuffle=True

In [4]:
train_data, val_data, test_data = Dataset.prepare_datasets(model_config)

In [5]:
tf.reset_default_graph()
sess = tf.Session()

mixed, voice = train_data.make_one_shot_iterator().get_next()

# Create variable placeholders
is_training = tf.placeholder(shape=(), dtype=bool)
mixed_mag = tf.expand_dims(mixed[0][:, :, 1:, 0], 3)  # Yet more hacking to get around this tuple problem
mixed_phase = tf.expand_dims(mixed[0][:, :, 1:, 1], 3)
voice_mag = tf.expand_dims(voice[0][:, :, 1:, 0], 3)

# Build U-Net model
print('Creating model')
model = UNet.UNetModel(mixed_mag, voice_mag, mixed_phase, 'unet', is_training, name='U_Net_Model')

Creating model


In [6]:
print('Loading checkpoint')
checkpoint_path = os.path.join(model_config['model_base_dir'], model_config['checkpoint_to_load'])
restorer = tf.train.Saver()
restorer.restore(sess, checkpoint_path)

Loading checkpoint
INFO:tensorflow:Restoring parameters from /home/enterprise.internal.city.ac.uk/acvn728/checkpoints/196363/196363-1801


In [7]:
mixture_mag, mix_phase_mag, voice_mag, voice_est_mag = sess.run([model.mixed, model.mixed_phase, model.voice,
                                                                  model.gen_voice], {model.is_training:False})
voice_est_wave = list()
voice_wave = list()
mix_wave = list()

for i in range(voice_mag.shape[0]):
    voice_est_wave.append(af.spectrogramToAudioFile(np.squeeze(voice_est_mag[i, :, :, :]).T, n_fft,
                                                    fft_hop, phase=np.squeeze(mix_phase_mag[i, :, :, :]).T))
    # Should we use voice or the original audio? (Might be hard to split into matching patches)
    voice_wave.append(af.spectrogramToAudioFile(np.squeeze(voice_mag[i, :, :, :]).T, n_fft,
                                                fft_hop, phase=np.squeeze(mix_phase_mag[i, :, :, :]).T))
    
    mix_wave.append(af.spectrogramToAudioFile(np.squeeze(mixture_mag[i, :, :, :]).T, n_fft,
                                              fft_hop, phase=np.squeeze(mix_phase_mag[i, :, :, :]).T))

In [23]:
#  The original mixture
ipd.Audio(mix_wave[0], rate=sample_rate)

In [24]:
#  The original isolated voice
ipd.Audio(voice_wave[0], rate=sample_rate)

In [25]:
ipd.Audio(voice_est_wave[0], rate=sample_rate)

In [27]:
len(voice_est_wave)

50

In [8]:
mix_folder = '/data/CHiME3/data/audio/16kHz/isolated/et05_caf_simu'
voice_folder = '/data/CHiME3/data/audio/16kHz/isolated/et05_bth'

In [14]:
mix_files = tf.data.Dataset.list_files(mix_folder + '/*.wav', shuffle=False).filter(lambda x: re.search('CH0', str(x)) is None)
voice_files = tf.data.Dataset.list_files(voice_folder + '/*.wav', shuffle=False).filter(lambda x: re.search('CH0', str(x)) is None)

TypeError: list_files() got an unexpected keyword argument 'shuffle'

In [10]:
next_mix_file = mix_files.make_one_shot_iterator().get_next()
next_voice_file = voice_files.make_one_shot_iterator().get_next()

In [11]:
sess.run(next_mix_file)

b'/data/CHiME3/data/audio/16kHz/isolated/et05_caf_simu/M06_440C020N_CAF.CH2.wav'

In [12]:
sess.run(next_voice_file)

b'/data/CHiME3/data/audio/16kHz/isolated/et05_bth/F06_445C0203_BTH.CH0.wav'

In [16]:
x = '/data/CHiME3/data/audio/16kHz/isolated/et05_caf_simu/M06_440C020N_CAF.CH2.wav'

In [17]:
re.search('CH0', str(x)) is None

True