# Assess the Output of a Saved Model

This notebook allows you to load the weights of a trained U-Net model, pass data to it, and turn the model's output back into audio, for subjective assessment against the original mixed and isolated signals.

In [12]:
#  Import a bunch of stuff
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import tensorflow as tf
import Audio_functions as af
import UNet
import Dataset

### Get the variables and data ready

In [13]:
#  Where to find the audio data and the model checkpoint.
#  `local` switches between the local machine and the remote server paths.
local = True
#  Checkpoint to load, given relative to `model_base_dir`
checkpoint = '8/8-13'

if not local:
    #  Remote server: only the checkpoint directory is configured here.
    model_base_dir = '/home/enterprise.internal.city.ac.uk/acvn728/checkpoints'
    #  TODO: no data folders are set for the remote case yet
    mix_folder = None
    voice_folder = None
else:
    #  Local machine
    model_base_dir = 'C:/Users/Toby/MSc_Project/MScFinalProjectCheckpoints'
    mix_folder = 'C:/Users/Toby/MSc_Project/Test_Audio/GANdatasetsMini/train_sup/Mixed'
    voice_folder = 'C:/Users/Toby/MSc_Project/Test_Audio/GANdatasetsMini/train_sup/Voice'

In [14]:
#  Settings passed to the data pipeline

#  Audio / spectrogram settings
sample_rate = 16000
n_fft = 1024
fft_hop = 256
n_channels = 1

#  Patch extraction settings
patch_window = 256
patch_hop = 128

#  Dataset reading, normalisation and batching settings
n_parallel_readers = 4
normalise = False
batch_size = 10
shuffle = True

In [15]:
# Prepare the data pipeline
# The mixture and voice datasets are built with identical settings; only the
# source folder differs, so the shared keyword arguments are collected once.
common_dataset_args = dict(sample_rate=sample_rate,
                           n_fft=n_fft,
                           fft_hop=fft_hop,
                           n_channels=n_channels,
                           patch_window=patch_window,
                           patch_hop=patch_hop,
                           n_parallel_readers=n_parallel_readers,
                           normalise=normalise)

mix_data = Dataset.get_dataset(data_folder=mix_folder, **common_dataset_args)
voice_data = Dataset.get_dataset(data_folder=voice_folder, **common_dataset_args)

# Pair the two datasets up and batch them
data = Dataset.zip_datasets(mix_data, voice_data, batch_size, shuffle=shuffle)

### Define the model structure

In [16]:
# Start from a fresh default graph and open the session used by every later cell.
# NOTE(review): `data` was built in an earlier cell, i.e. before this reset -
# confirm the iterator below ends up in the graph the session runs.
tf.reset_default_graph()
sess = tf.Session()

# Tensors that yield the next (mixed, voice) pair of batches from the zipped dataset
mixed, voice = data.make_one_shot_iterator().get_next()

# Create variable placeholders
# Scalar boolean placeholder; the batch-running cell feeds False through
# `model.is_training` (presumably this same placeholder - confirm in UNet).
is_training = tf.placeholder(shape=(), dtype=bool)
# Each input takes element 0 of the dataset tuple, drops index 0 along axis 2,
# selects one index of the last axis and then re-adds a trailing axis of size 1.
# Judging by the variable names, last-axis index 0 is magnitude and index 1 is
# phase - TODO confirm against Dataset.
mixed_mag = tf.expand_dims(mixed[0][:, :, 1:, 0], 3)  # Yet more hacking to get around this tuple problem
mixed_phase = tf.expand_dims(mixed[0][:, :, 1:, 1], 3)
voice_mag = tf.expand_dims(voice[0][:, :, 1:, 0], 3)

# Build U-Net model
# Later cells read `model.mixed`, `model.mixed_phase`, `model.voice`,
# `model.gen_voice` and `model.is_training` from this object.
print('Creating model')
model = UNet.UNetModel(mixed_mag, voice_mag, mixed_phase, 'unet', is_training, name='U_Net_Model')

Creating model


### Load the saved weights

In [17]:
#  Restore the saved weights from the chosen checkpoint into the open session
print('Loading checkpoint')
restorer = tf.train.Saver()
checkpoint_path = os.path.join(model_base_dir, checkpoint)
restorer.restore(sess, save_path=checkpoint_path)

Loading checkpoint
INFO:tensorflow:Restoring parameters from C:/Users/Toby/MSc_Project/MScFinalProjectCheckpoints\8/8-13


### Run a batch

In [18]:
def _patch_to_wave(spectrograms, phases, index):
    """Convert one patch of a fetched batch back into a waveform.

    Takes patch `index` from `spectrograms` and from `phases`, squeezes out the
    singleton axes, transposes both and passes them to
    `af.spectrogramToAudioFile` together with the notebook-level `n_fft` and
    `fft_hop` settings.
    """
    magnitude = np.squeeze(spectrograms[index, :, :, :]).T
    phase = np.squeeze(phases[index, :, :, :]).T
    return af.spectrogramToAudioFile(magnitude, n_fft, fft_hop, phase=phase)


#  Fetch one batch from the model, with `is_training` fed as False.
#  Note that `mix_phase_mag` holds the output of `model.mixed_phase`.
fetches = [model.mixed, model.mixed_phase, model.voice, model.gen_voice]
mixture_mag, mix_phase_mag, voice_mag, voice_est_mag = sess.run(fetches,
                                                                feed_dict={model.is_training: False})

voice_est_wave, voice_wave, mix_wave = [], [], []

#  All three signals are resynthesised with the phase of the mixture.
for i in range(voice_mag.shape[0]):
    voice_est_wave.append(_patch_to_wave(voice_est_mag, mix_phase_mag, i))
    # Should we use voice or the original audio? (Might be hard to split into matching patches)
    voice_wave.append(_patch_to_wave(voice_mag, mix_phase_mag, i))
    mix_wave.append(_patch_to_wave(mixture_mag, mix_phase_mag, i))

### Listen to the output

The output will consist of up to [batch_size] clips from a single recording.

In [25]:
#  Number of clips converted to audio (one per patch in the fetched batch)
len(voice_est_wave)

10

In [22]:
#  The original mixture: first clip of the batch, resynthesised from the
#  fetched mixture spectrogram and the mixture phase
ipd.Audio(mix_wave[0], rate=sample_rate)

In [23]:
#  The original isolated voice: first clip of the batch.
#  Note this is resynthesised from the fetched voice spectrogram using the
#  mixture's phase, not the voice recording's own phase.
ipd.Audio(voice_wave[0], rate=sample_rate)

In [24]:
#  The estimated isolated voice produced by the network: first clip of the
#  batch, resynthesised from the network output and the mixture phase
ipd.Audio(voice_est_wave[0], rate=sample_rate)

In [None]:
#  Image of the network's estimate (`model.gen_voice`) for the first patch in the batch
plt.imshow(voice_est_mag[0,:,:,0])

In [None]:
#  Image of the target voice (`model.voice`) for the first patch in the batch
plt.imshow(voice_mag[0,:,:,0])

In [None]:
#  Image of the mixture (`model.mixed`) for the first patch in the batch
plt.imshow(mixture_mag[0,:,:,0])

In [None]:
#  Shape of the array fetched from `model.gen_voice`
voice_est_mag.shape

In [None]:
#  Evaluate element 0 of the encoder's `l1` attribute in the restored session.
#  NOTE(review): an identical cell follows this one. If this tensor depends on
#  the input pipeline, each evaluation presumably pulls a new batch - confirm.
sess.run(model.voice_mask_unet.encoder.l1[0])

In [None]:
#  Evaluate element 0 of the encoder's `l1` attribute again.
#  NOTE(review): duplicate of the preceding cell - possibly kept to compare
#  two successive evaluations; confirm whether both are needed.
sess.run(model.voice_mask_unet.encoder.l1[0])

In [None]:
def get_weights():
    """Return the trainable variables whose name ends with 'kernel:0'.

    The variables are read from the TRAINABLE_VARIABLES graph collection and
    returned as a list, in collection order.
    """
    kernel_suffix = 'kernel:0'
    kernels = []
    for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        if variable.name.endswith(kernel_suffix):
            kernels.append(variable)
    return kernels

In [None]:
#  List the trainable variables whose name ends in 'kernel:0'
get_weights()

In [None]:
#  List every variable in the TRAINABLE_VARIABLES collection
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

In [None]:
#  `tf.initialize_all_variables` is deprecated; `tf.global_variables_initializer`
#  is its replacement and returns the same kind of op.
#  WARNING: this cell only builds the initialiser op. Do not pass it to
#  `sess.run` in this notebook - that would re-initialise every variable and
#  discard the weights restored from the checkpoint.
tf.global_variables_initializer()