In [1]:
# Loading Google Drive

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Problem 2: Speech Denoising Using Deep Learning [5 points]

## Load the required libraries and data

In [0]:
import librosa
import numpy as np
import tensorflow as tf

s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/train_dirty_male.wav', sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)

y_train = np.transpose(np.abs(S))
X_train = np.transpose(np.abs(X))

In [3]:
print(X_train.shape)
print(y_train.shape)

(2459, 513)
(2459, 513)


In [0]:
def leaky_relu(z, name=None):
  return tf.maximum(0.01 * z, z, name=name)

def fully_connected_layer(X_input, no_of_units, layer_name,  batch_norm, training, activation_function):
  with tf.name_scope(layer_name):
    no_of_inputs = int(X_input.shape[1])
    '''Using a truncated normal distribution rather than a regular normal
       distribution ensures that there won’t be any large weights, which
       could slow down training.
    '''
    # Initializing weights using Xavier initialization strategy to avoid the issue of vanishing gradients
    # Reference page 278 of Hands-on Machine Learning with scikit-learn and TensorFlow book
    variance = 2/(no_of_inputs+no_of_units)
    random_values = tf.truncated_normal((no_of_inputs, no_of_units), 
                                        stddev=np.sqrt(variance))
    print("stddev used is:",np.sqrt(variance))
    weights = tf.Variable(random_values, name = "weights")
    biases = tf.Variable(tf.ones([no_of_units])*0.01, name = "biases")
    output = biases + tf.matmul(X_input,weights) 

    # Impemented batch_nomralization using the reference https://medium.com/@jaynilbvb/implementing-batch-normalization-in-tensorflow-db3784f61693
    if batch_norm:
      batchNorm = tf.layers.batch_normalization(output, training=training, momentum=0.99)
    else:
      batchNorm = output
          
    if activation_function == "relu":
      return tf.nn.relu(batchNorm)
    elif activation_function == "elu":
      return tf.nn.elu(batchNorm)
    elif activation_function == "leaky_relu":
      return leaky_relu(batchNorm)
    else:
      return output

In [0]:
# Defining the parameters to be used for defining the neural network


n_inputs = X_train.shape[1] # The number of frequency bins in stft calculated are 513 
n_hidden_units1 = 1024
n_hidden_units2 = 1024
n_hidden_units3 = 512
n_hidden_units4 = 512
n_hidden_units5 = 256
n_output_dimension = y_train.shape[1]

In [0]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None, n_output_dimension), name="y")

In [7]:
# Generating the fully connected layers in the network

fully_connected_layer_1 = fully_connected_layer(X, n_hidden_units1, "fcc_1", batch_norm = False, training = True, activation_function = "relu")

fully_connected_layer_2 = fully_connected_layer(fully_connected_layer_1, n_hidden_units2, "fcc_2", batch_norm = False, training = True, activation_function = "elu")
fully_connected_layer_3 = fully_connected_layer(fully_connected_layer_2, n_hidden_units3, "fcc_3", batch_norm = False, training = True, activation_function = "elu")
# fully_connected_layer_4 = fully_connected_layer(fully_connected_layer_3, n_hidden_units4, "fcc_4", batch_norm = False, training = True, activation_function = "elu")
# fully_connected_layer_5 = fully_connected_layer(fully_connected_layer_4, n_hidden_units5, "fcc_5", batch_norm = False, training = True, activation_function = "elu")

# Generating the output layer of the network
output_layer = fully_connected_layer(fully_connected_layer_3, n_output_dimension, "output_layer", batch_norm = False, training = True, activation_function = "relu")  

stddev used is: 0.03607265133540433
stddev used is: 0.03125
stddev used is: 0.03608439182435161
stddev used is: 0.04417261042993862


In [8]:
# Setting the learning rate to 0.01 for the optimizer
learning_rate = 0.0001

loss = tf.losses.mean_squared_error(labels=y,predictions=output_layer)
loss_function = tf.reduce_mean(loss, name = 'loss_function') # use square error for cost function

# Defining the Adam Optimizer we will use

optimizer = tf.train.AdamOptimizer(learning_rate, name = "Adam-Opt").minimize(loss_function)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Initializing all variables

init = tf.global_variables_initializer()

In [10]:
saver = tf.train.Saver()

n_epochs = 100
batch_size = 100

min_loss = 0.005

extra_graphkeys_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# with tf.Session() as sess:
sess = tf.InteractiveSession()
# init.run()
sess.run(init)
for epoch in range(n_epochs):
  for iteration in range(X_train.shape[0] // batch_size):
    # X_batch, y_batch = mnist.train.next_batch(batch_size)
    # X_batch, y_batch = shuffle(X_batch, y_batch)

    rand_index = np.random.choice(n_inputs, size=batch_size)
    X_batch = X_train[rand_index,:] # Transpose to the correct shape
    y_batch = y_train[rand_index,:]


    sess.run(optimizer, feed_dict={X: X_batch, y: y_batch})
  loss_train = sess.run(loss_function, feed_dict={X: X_batch, y: y_batch})
  # loss_test = sess.run(loss_function, feed_dict={X: X_test, y: y_test})
  if loss_train < min_loss:
    min_loss = loss_train
    print("Lowest train loss achieved till now!")
    save_path = saver.save(sess, "/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt")
    print("Epoch:",epoch, "Train loss:", loss_train)
 
  else:
    print("Epoch:",epoch, "Train loss:", loss_train)

sess.close()

Epoch: 0 Train loss: 0.044893812
Epoch: 1 Train loss: 0.028849479
Epoch: 2 Train loss: 0.014645899
Epoch: 3 Train loss: 0.011603832
Epoch: 4 Train loss: 0.010497262
Epoch: 5 Train loss: 0.007775266
Epoch: 6 Train loss: 0.006510826
Epoch: 7 Train loss: 0.0062585385
Epoch: 8 Train loss: 0.005394962
Epoch: 9 Train loss: 0.0050484333
Epoch: 10 Train loss: 0.005049563
Lowest train loss achieved till now!
Epoch: 11 Train loss: 0.004561187
Lowest train loss achieved till now!
Epoch: 12 Train loss: 0.0043801162
Lowest train loss achieved till now!
Epoch: 13 Train loss: 0.003391266
Lowest train loss achieved till now!
Epoch: 14 Train loss: 0.0027772132
Epoch: 15 Train loss: 0.0032159917
Lowest train loss achieved till now!
Epoch: 16 Train loss: 0.0027653964
Epoch: 17 Train loss: 0.00282878
Lowest train loss achieved till now!
Epoch: 18 Train loss: 0.0024682162
Epoch: 19 Train loss: 0.0028779267
Epoch: 20 Train loss: 0.0028874893
Lowest train loss achieved till now!
Epoch: 21 Train loss: 0.00240

In [11]:
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_x_01.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_test})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_s_01_recons.wav', s_cap, sr)



Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt


In [12]:
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_x_02.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_test})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_s_02_recons.wav', s_cap, sr)



INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt


In [13]:
# Calculating SNR for the first input file

s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)

sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/train_dirty_male.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

y_train = np.transpose(np.abs(S))
X_train_abs = np.abs(X_t)
X_train = np.transpose(X_train_abs)

# sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_x_02.wav', sr=None)
# X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

# X_test_abs = np.abs(X_t)

# X_test = np.transpose(X_test_abs)


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_train})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_train_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/train_recons.wav', s_cap, sr)





# snr = 10*np.log10(np.sum(np.square(s))/np.sum(np.square(np.subtract(s, s_cap))))

snr = 10*np.log10(np.dot(np.transpose(s),s)/np.dot(np.transpose(s-s_cap),(s-s_cap)))
print('Signal to Noise Ratio for input file:',snr)

INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/best_audio_denoising_model.ckpt
Signal to Noise Ratio for input file: 10.767414569854736
