In [1]:
# Loading Google Drive

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Problem 1: Speech Denoising Using 1D CNN [5 points]

## Like you did in homework 1 Q2, install/load librosa. Take the magnitude spectrograms of the dirty signal and the clean signal |X| and |S|.

In [0]:
# Loading the required libraries and data
import librosa
import numpy as np
import tensorflow as tf

s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_dirty_male.wav', sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)

y_train = np.transpose(np.abs(S))
X_train = np.transpose(np.abs(X))

In [3]:
X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))
print(X_train.shape)
print(y_train.shape)

(2459, 513, 1)
(2459, 513)


In [0]:
# Reference: 
# 1. https://adventuresinmachinelearning.com/convolutional-neural-networks-tutorial-tensorflow/
## 2. https://www.kaggle.com/agrigorev/tensorflow-starter-conv1d-embeddings-0-442-lb - 
## 3. https://burakhimmetoglu.com/2017/08/22/time-series-classification-with-tensorflow/ - conv1d example 
# 4. https://blog.goodaudience.com/introduction-to-1d-convolutional-neural-networks-in-keras-for-time-sequences-3a7ff801a2cf



def conv1d(X_input, no_of_filters, filter_size, padding = "same", layer_name = "", activation_function = "relu"):

  if activation_function == "relu":
    activation = tf.nn.relu
  elif activation_function == "elu":
    activation = tf.nn.elu
  
  with tf.variable_scope(layer_name) as scope:
    output = tf.layers.conv1d(inputs = X_input,
                              filters = no_of_filters,
                              kernel_size = filter_size,
                              padding=padding,
                              activation = activation
                              )
    return output


def fully_connected_layer(X_input, no_of_units, layer_name,  batch_norm, training, activation_function):
  with tf.name_scope(layer_name):
    no_of_inputs = int(X_input.shape[1])
    '''Using a truncated normal distribution rather than a regular normal
       distribution ensures that there won’t be any large weights, which
       could slow down training.
    '''
    # Initializing weights using Xavier initialization strategy to avoid the issue of vanishing gradients
    # Reference page 278 of Hands-on Machine Learning with scikit-learn and TensorFlow book
    variance = 2/(no_of_inputs+no_of_units)
    random_values = tf.truncated_normal((no_of_inputs, no_of_units), 
                                        stddev=np.sqrt(variance))
    print("stddev used is:",np.sqrt(variance))
    weights = tf.Variable(random_values, name = "weights")
    biases = tf.Variable(tf.ones([no_of_units])*0.01, name = "biases")
    output = biases + tf.matmul(X_input,weights) 

    # Impemented batch_nomralization using the reference https://medium.com/@jaynilbvb/implementing-batch-normalization-in-tensorflow-db3784f61693
    if batch_norm:
      batchNorm = tf.layers.batch_normalization(output, training=training, momentum=0.99)
    else:
      batchNorm = output
          
    if activation_function == "relu":
      return tf.nn.relu(batchNorm)
    elif activation_function == "elu":
      return tf.nn.elu(batchNorm)
    elif activation_function == "leaky_relu":
      return leaky_relu(batchNorm)
    else:
      return output



In [0]:
# Defining the parameters to be used for defining the CNN
n_inputs = X_train.shape[1] # This is the width
height = 1 # Defining height
n_output_dimension = y_train.shape[1]
n_channels = 1

In [0]:
# Defining the structure of X and y
X = tf.placeholder(tf.float32, shape=(None, n_inputs, n_channels), name="X")
y = tf.placeholder(tf.float32, shape=(None, n_output_dimension), name="y")

# Drop our prob and learning_rate for Optimizer
dropout_prob = 0.9
learning_rate = 0.0001

In [7]:
# Defining the model
# (batch,513,1) -> (batch,257,18)

# (batch,513,1) -> (batch,513,18)
convolution_layer_1 = conv1d(X, no_of_filters = 18, filter_size = 2, 
                             padding = "valid", layer_name = "convolution_layer_1",
                             activation_function = "relu")
# max_pooling_layer_1 = tf.layers.max_pooling1d(inputs=convolution_layer_1, pool_size=2, strides=2, padding='same')

# (batch,257,18) -> (batch,129,36)

# (batch,513,18) -> (batch,257,36)
convolution_layer_2 = conv1d(convolution_layer_1, no_of_filters = 36, filter_size = 2, 
                             padding = "valid", layer_name = "convolution_layer_2", 
                             activation_function = "relu")
max_pooling_layer_2 = tf.layers.max_pooling1d(inputs=convolution_layer_2, 
                                              pool_size=2, strides=2, padding='valid')

# (batch,129,36) -> (batch,65,72)

# (batch,257,36) -> (batch,129,72)
# convolution_layer_3 = conv1d(max_pooling_layer_2, no_of_filters = 72, filter_size = 2, padding = "same", layer_name = "convolution_layer_3", activation_function = "relu")
# max_pooling_layer_3 = tf.layers.max_pooling1d(inputs=convolution_layer_3, pool_size=2, strides=2, padding='same')

# (batch,65,72) -> (batch,33,144)
# convolution_layer_4 = conv1d(max_pooling_layer_3, no_of_filters = 144, filter_size = 2, padding = "same", layer_name = "convolution_layer_4", activation_function = "relu")
# max_pooling_layer_4 = tf.layers.max_pooling1d(inputs=convolution_layer_4, pool_size=2, strides=2, padding='same')

flattened_max_pooling_layer_4_output = tf.layers.flatten(max_pooling_layer_2)
flattened_max_pooling_layer_4_output = tf.nn.dropout(flattened_max_pooling_layer_4_output, keep_prob=dropout_prob)


output_layer = fully_connected_layer(flattened_max_pooling_layer_4_output, n_output_dimension, 
                                     "fully_connected_output_layer", batch_norm = False, training = True, activation_function = "relu")

# output_layer = tf.layers.dense(flattened_max_pooling_layer_4_output, n_output_dimension)






Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.MaxPooling1D instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
stddev used is: 0.014364347119619057


In [8]:
loss = tf.losses.mean_squared_error(labels=y,predictions=output_layer)
loss_function = tf.reduce_mean(loss, name = 'loss_function') # use square error for cost function

# Defining the Adam Optimizer we will use

optimizer = tf.train.AdamOptimizer(learning_rate, name = "Adam-Opt").minimize(loss_function)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Initializing all variables

init = tf.global_variables_initializer()

In [10]:
sess = tf.InteractiveSession()

sess.run(init)

convolution_layer_1_output = sess.run(convolution_layer_1, feed_dict={X: X_train, y: y_train})
print(convolution_layer_1_output.shape)

sess.close()


(2459, 512, 18)


In [11]:
saver = tf.train.Saver()

n_epochs = 100
batch_size = 100

min_loss = 0.005

extra_graphkeys_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# with tf.Session() as sess:
sess = tf.InteractiveSession()
# init.run()
sess.run(init)
for epoch in range(1,n_epochs+1):
  for iteration in range(X_train.shape[0] // batch_size):
    # X_batch, y_batch = mnist.train.next_batch(batch_size)
    # X_batch, y_batch = shuffle(X_batch, y_batch)

    rand_index = np.random.choice(n_inputs, size=batch_size)
    X_batch = X_train[rand_index,:,:] # Transpose to the correct shape
    y_batch = y_train[rand_index,:]


    sess.run(optimizer, feed_dict={X: X_batch, y: y_batch})
  loss_train = sess.run(loss_function, feed_dict={X: X_batch, y: y_batch})
  # loss_test = sess.run(loss_function, feed_dict={X: X_test, y: y_test})
  if loss_train < min_loss:
    min_loss = loss_train
    print("Lowest train loss achieved till now!")
    save_path = saver.save(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt")
    print("Epoch:",epoch, "Train loss:", loss_train)
 
  else:
    if epoch % 100 == 0:
      print("Epoch:",epoch, "Train loss:", loss_train)

sess.close()

Lowest train loss achieved till now!
Epoch: 24 Train loss: 0.004805775
Lowest train loss achieved till now!
Epoch: 25 Train loss: 0.0038954825
Lowest train loss achieved till now!
Epoch: 31 Train loss: 0.0037945984
Lowest train loss achieved till now!
Epoch: 36 Train loss: 0.0037513534
Lowest train loss achieved till now!
Epoch: 37 Train loss: 0.0028538664
Lowest train loss achieved till now!
Epoch: 39 Train loss: 0.0026633993
Lowest train loss achieved till now!
Epoch: 47 Train loss: 0.0025643855
Lowest train loss achieved till now!
Epoch: 49 Train loss: 0.0020939447
Lowest train loss achieved till now!
Epoch: 53 Train loss: 0.002023547
Lowest train loss achieved till now!
Epoch: 57 Train loss: 0.0019493065
Lowest train loss achieved till now!
Epoch: 64 Train loss: 0.0018172745
Lowest train loss achieved till now!
Epoch: 66 Train loss: 0.0015522727
Lowest train loss achieved till now!
Epoch: 74 Train loss: 0.0014223235
Lowest train loss achieved till now!
Epoch: 93 Train loss: 0.00136

In [12]:
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/test_x_01.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)

X_test = np.reshape(X_test,(X_test.shape[0],X_test.shape[1],1))


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_test})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/test_s_01_1d_cnn_recons.wav', s_cap, sr)



INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt


In [13]:
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/test_x_02.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)

X_test = np.reshape(X_test,(X_test.shape[0],X_test.shape[1],1))



sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_test})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/test_s_02_1d_cnn_recons.wav', s_cap, sr)



INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt


In [14]:
# Calculating SNR for the first input file

s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)

sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_dirty_male.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

y_train = np.transpose(np.abs(S))
X_train_abs = np.abs(X_t)
X_train = np.transpose(X_train_abs)

X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))

# sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_x_02.wav', sr=None)
# X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

# X_test_abs = np.abs(X_t)

# X_test = np.transpose(X_test_abs)


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X: X_train})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_train_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/train_1d_cnn_recons.wav', s_cap, sr)





# snr = 10*np.log10(np.sum(np.square(s))/np.sum(np.square(np.subtract(s, s_cap))))

snr = 10*np.log10(np.dot(np.transpose(s),s)/np.dot(np.transpose(s-s_cap),(s-s_cap)))
print('Signal to Noise Ratio for input file using 1D CNN is:',snr)

INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_1d_cnn_model.ckpt
Signal to Noise Ratio for input file using 1D CNN is: 10.177600383758545


In [15]:
# Calculating SNR in the time domain too
sn1, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav')
sn2, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_1d_cnn_recons.wav')
snr=10*np.log10( np.square(sn1).sum() / ( np.square(sn1-sn2) ).sum()  ) 
print('Signal to Noise Ratio for input file using 1D CNN is:',snr)

Signal to Noise Ratio for input file using 1D CNN is: 10.200825929641724


# Problem 2: Speech Denoising Using 2D CNN [5 points]

In [0]:
# Loading the required libraries and data
import librosa
import numpy as np
import tensorflow as tf

s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_dirty_male.wav', sr=None)
X=librosa.stft(sn, n_fft=1024, hop_length=512)

y_train = np.transpose(np.abs(S))
X_train_abs = np.abs(X)
X_train = np.transpose(np.abs(X))

In [17]:
# X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))

for i in range(19):
  X_train = np.insert(X_train, 0, [np.random.uniform(10**(-26), 10**(-50)) for j in range(513)], axis=0)

print(X_train.shape)
print(y_train.shape)

X_train

# X_train[2439:2459,:]

(2478, 513)
(2459, 513)


array([[5.6581788e-27, 2.6224885e-27, 8.6631723e-29, ..., 4.8300266e-27,
        5.7060825e-27, 6.4507390e-27],
       [4.5649867e-27, 9.4505180e-28, 2.5455057e-27, ..., 4.0531400e-27,
        1.4274459e-27, 5.1863360e-27],
       [5.7513299e-27, 5.3132802e-27, 9.1044039e-27, ..., 4.7512965e-27,
        2.9762875e-27, 4.6878152e-27],
       ...,
       [1.5213569e-02, 1.5371690e-02, 1.3715556e-02, ..., 9.3410030e-04,
        7.1277702e-03, 1.5927447e-03],
       [4.9811867e-03, 1.5078177e-03, 2.2639418e-02, ..., 1.3958135e-02,
        7.0389216e-03, 4.5577832e-04],
       [7.7582356e-03, 2.0596350e-02, 2.8300950e-02, ..., 1.2237257e-02,
        1.7999319e-02, 1.6074810e-02]], dtype=float32)

## Therefore, a pair of adjacent images (unless you shuffe the order) will be with 19 overlapped frames. Since your original STFT spectrogram has 2,459 frames, you can create 2,440 such images as your training dataset.

In [0]:
# X_train_image = []

# for i in range(X_train.shape[0]):
#   temp = X_train[i:i+20,:]
#   if len(temp) == 20:
#     X_train_image.append(temp)

# len(X_train_image)

# X_train_image[2439]

In [19]:
X_train_image = np.zeros([y_train.shape[0],20,X_train.shape[1]])

for i in range(X_train.shape[0]):
  temp = X_train[i:i+20,:]
  if len(temp) == 20:
    X_train_image[i,:,:] = temp

len(X_train_image)

X_train_image = np.reshape(X_train_image,(y_train.shape[0], 20, X_train.shape[1],1))
X_train_image.shape






(2459, 20, 513, 1)

In [0]:
# Reference 1: http://aqibsaeed.github.io/2016-11-04-human-activity-recognition-cnn/
# Reference 2: https://www.datacamp.com/community/tutorials/cnn-tensorflow-python
## Reference 3: https://medium.com/data-science-group-iitr/building-a-convolutional-neural-network-in-python-with-tensorflow-d251c3ca8117



def conv2d(X_input, no_of_filters, filter_size, padding = "same", layer_name = "", activation_function = "relu"):

  if activation_function == "relu":
    activation = tf.nn.relu
  elif activation_function == "elu":
    activation = tf.nn.elu
  
  with tf.variable_scope(layer_name) as scope:
    output = tf.layers.conv2d(inputs = X_input,
                              filters = no_of_filters,
                              kernel_size = filter_size,
                              padding=padding,
                              activation = activation
                              )
    return output


def fully_connected_layer(X_input, no_of_units, layer_name,  batch_norm, training, activation_function):
  with tf.name_scope(layer_name):
    no_of_inputs = int(X_input.shape[1])
    '''Using a truncated normal distribution rather than a regular normal
       distribution ensures that there won’t be any large weights, which
       could slow down training.
    '''
    # Initializing weights using Xavier initialization strategy to avoid the issue of vanishing gradients
    # Reference page 278 of Hands-on Machine Learning with scikit-learn and TensorFlow book
    variance = 2/(no_of_inputs+no_of_units)
    random_values = tf.truncated_normal((no_of_inputs, no_of_units), 
                                        stddev=np.sqrt(variance))
    print("stddev used is:",np.sqrt(variance))
    weights = tf.Variable(random_values, name = "weights")
    biases = tf.Variable(tf.ones([no_of_units])*0.01, name = "biases")
    output = biases + tf.matmul(X_input,weights) 

    # Impemented batch_nomralization using the reference https://medium.com/@jaynilbvb/implementing-batch-normalization-in-tensorflow-db3784f61693
    if batch_norm:
      batchNorm = tf.layers.batch_normalization(output, training=training, momentum=0.99)
    else:
      batchNorm = output
          
    if activation_function == "relu":
      return tf.nn.relu(batchNorm)
    elif activation_function == "elu":
      return tf.nn.elu(batchNorm)
    elif activation_function == "leaky_relu":
      return leaky_relu(batchNorm)
    else:
      return output





In [0]:
# Defining the parameters to be used for defining the neural network


n_inputs = X_train_image.shape[2] # This is the width
height = X_train_image.shape[1] # Defining height
n_output_dimension = y_train.shape[1]
n_channels = 1

In [0]:
# Placeholder for X
X_image = tf.placeholder(tf.float32, shape=(None, height, n_inputs, n_channels), name="X")
# Reshape the place_holder into [no_of_images, image_height, image_width, no_of_channels]
# X_image = tf.reshape(X,[-1, height, n_inputs, n_channels])

# Placeholder for y
y = tf.placeholder(tf.float32, shape=(None, n_output_dimension), name="y")

# Drop our prob and learning_rate for Optimizer
dropout_prob = 0.9
learning_rate = 0.0002

In [23]:
# Defining the model

# (batch,20,513,1) -> (batch,20,513,8)
convolution_layer_1 = conv2d(X_image, no_of_filters = 8, filter_size = 4, 
                             padding = "same", layer_name = "convolution_layer_1",
                             activation_function = "relu")

# max_pooling_layer_1 = tf.layers.max_pooling2d(inputs=convolution_layer_1, 
                                              # pool_size=2, strides=2, padding='same')

# (batch,20,513,8) -> (batch,10,257,16)
convolution_layer_2 = conv2d(convolution_layer_1, no_of_filters = 16, filter_size = 2, 
                             padding = "same", layer_name = "convolution_layer_2", 
                             activation_function = "elu")
max_pooling_layer_2 = tf.layers.max_pooling2d(inputs=convolution_layer_2, 
                                              pool_size=2, strides=2, padding='same')
# (batch,10,257,16) -> (batch,5,129,32)
convolution_layer_3 = conv2d(max_pooling_layer_2, no_of_filters = 32, filter_size = 1,
                             padding = "same", layer_name = "convolution_layer_3",
                             activation_function = "elu")
max_pooling_layer_3 = tf.layers.max_pooling2d(inputs=convolution_layer_3, 
                                              pool_size=2, strides=2, padding='same')

# (batch,65,72) -> (batch,33,144)
# convolution_layer_4 = conv1d(max_pooling_layer_3, no_of_filters = 144, filter_size = 2, padding = "same", layer_name = "convolution_layer_4", activation_function = "relu")
# max_pooling_layer_4 = tf.layers.max_pooling1d(inputs=convolution_layer_4, pool_size=2, strides=2, padding='same')

flattened_max_pooling_layer_4_output = tf.reshape(max_pooling_layer_3, (-1, 5*129*32))
flattened_max_pooling_layer_4_output = tf.nn.dropout(flattened_max_pooling_layer_4_output, keep_prob=dropout_prob)


output_layer = fully_connected_layer(flattened_max_pooling_layer_4_output, n_output_dimension, 
                                     "fully_connected_output_layer", batch_norm = False, training = True, activation_function = "relu")

# output_layer = tf.layers.dense(flattened_max_pooling_layer_4_output, n_output_dimension)






Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
stddev used is: 0.00972364317089614


In [0]:
loss = tf.losses.mean_squared_error(labels=y,predictions=output_layer)
loss_function = tf.reduce_mean(loss, name = 'loss_function') # use square error for cost function

# Defining the Adam Optimizer we will use

optimizer = tf.train.AdamOptimizer(learning_rate, name = "Adam-Opt").minimize(loss_function)

In [0]:
# Initializing all variables

init = tf.global_variables_initializer()

In [0]:
# sess = tf.InteractiveSession()

# sess.run(init)

# convolution_layer_2_output = sess.run(max_pooling_layer_2, feed_dict={X_image: X_train_image[0:20,:,:,:], y: y_train})
# print(convolution_layer_2_output.shape)

# sess.close()


In [26]:
saver = tf.train.Saver()

n_epochs = 1000
batch_size = 128

min_loss = 0.005

extra_graphkeys_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# with tf.Session() as sess:
sess = tf.InteractiveSession()
# init.run()
sess.run(init)
for epoch in range(1,n_epochs+1):
  for offset in range(0, X_train_image.shape[0], batch_size):
    # X_batch, y_batch = mnist.train.next_batch(batch_size)
    # X_batch, y_batch = shuffle(X_batch, y_batch)

    # rand_index = np.random.choice(n_inputs, size=batch_size)
    # rand_index = [i for i in range(iteration:iteration+batch_size)]
    X_batch = X_train_image[offset:offset+batch_size,:,:,:] # Transpose to the correct shape
    y_batch = y_train[offset:offset+batch_size,:]


    sess.run(optimizer, feed_dict={X_image: X_batch, y: y_batch})
  loss_train = sess.run(loss_function, feed_dict={X_image: X_batch, y: y_batch})
  # loss_test = sess.run(loss_function, feed_dict={X: X_test, y: y_test})
  if loss_train < min_loss:
    min_loss = loss_train
    print("Lowest train loss achieved till now!")
    save_path = saver.save(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt")
    print("Epoch:",epoch, "Train loss:", loss_train)
 
  else:
    if epoch % 100 == 0:
      print("Epoch:",epoch, "Train loss:", loss_train)

sess.close()

Lowest train loss achieved till now!
Epoch: 9 Train loss: 0.004779021
Lowest train loss achieved till now!
Epoch: 10 Train loss: 0.0044362773
Lowest train loss achieved till now!
Epoch: 11 Train loss: 0.0041842307
Lowest train loss achieved till now!
Epoch: 12 Train loss: 0.0040924973
Lowest train loss achieved till now!
Epoch: 13 Train loss: 0.0038648536
Lowest train loss achieved till now!
Epoch: 15 Train loss: 0.0034482474
Lowest train loss achieved till now!
Epoch: 19 Train loss: 0.0034463932
Lowest train loss achieved till now!
Epoch: 20 Train loss: 0.0033947
Lowest train loss achieved till now!
Epoch: 24 Train loss: 0.0032482287
Lowest train loss achieved till now!
Epoch: 25 Train loss: 0.003169674
Lowest train loss achieved till now!
Epoch: 26 Train loss: 0.003089836
Lowest train loss achieved till now!
Epoch: 28 Train loss: 0.002934055
Lowest train loss achieved till now!
Epoch: 29 Train loss: 0.0027209613
Lowest train loss achieved till now!
Epoch: 32 Train loss: 0.0026845268


In [27]:
saver = tf.train.Saver()
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/test_x_01.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)

print(X_test.shape)

for i in range(19):
  X_test = np.insert(X_test, 0, [np.random.uniform(10**(-26), 10**(-50)) for j in range(513)], axis=0)

print(X_test.shape)


X_test_image = np.zeros([X_test_abs.shape[1],20,X_test_abs.shape[0]])

for i in range(X_test.shape[0]):
  temp = X_test[i:i+20,:]
  if len(temp) == 20:
    X_test_image[i,:,:] = temp

print(len(X_test_image))

X_test_image = np.reshape(X_test_image,(X_test_abs.shape[1], 20, X_test_abs.shape[0],1))
print(X_test_image.shape)

# X_test = np.reshape(X_test,(X_test.shape[0],X_test.shape[1],1))


sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X_image: X_test_image})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/test_s_01_2d_cnn_recons.wav', s_cap, sr)



(142, 513)
(161, 513)
142
(142, 20, 513, 1)
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt


In [28]:
sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/test_x_02.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

X_test_abs = np.abs(X_t)

X_test = np.transpose(X_test_abs)

print(X_test.shape)

for i in range(19):
  X_test = np.insert(X_test, 0, [np.random.uniform(10**(-26), 10**(-50)) for j in range(513)], axis=0)

print(X_test.shape)


X_test_image = np.zeros([X_test_abs.shape[1],20,X_test_abs.shape[0]])

for i in range(X_test.shape[0]):
  temp = X_test[i:i+20,:]
  if len(temp) == 20:
    X_test_image[i,:,:] = temp

print(len(X_test_image))

X_test_image = np.reshape(X_test_image,(X_test_abs.shape[1], 20, X_test_abs.shape[0],1))
print(X_test_image.shape)




sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X_image: X_test_image})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_test_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/test_s_02_2d_cnn_recons.wav', s_cap, sr)



(380, 513)
(399, 513)
380
(380, 20, 513, 1)
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt


In [29]:
# Calculating SNR for the first input file
saver = tf.train.Saver()
s, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav', sr=None)
S=librosa.stft(s, n_fft=1024, hop_length=512)

sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_dirty_male.wav', sr=None)
X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

# y_train = np.transpose(np.abs(S))
# X_train_abs = np.abs(X_t)
# X_train = np.transpose(X_train_abs)

# print(X_train.shape)

# X_train_image = np.zeros([X_train.shape[0],20,X_train.shape[1]])

# for i in range(X_train.shape[0]):
#   temp = X_train[i:i+20,:]
#   if len(temp) == 20:
#     X_train_image[i,:,:] = temp

# len(X_train_image)

# X_train_image = np.reshape(X_train_image,(X_train_abs.shape[1], 20, X_train_abs.shape[0],1))
print(X_train_image.shape)

# X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))

# sn, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_1/Problem_2/test_x_02.wav', sr=None)
# X_t=librosa.stft(sn, n_fft=1024, hop_length=512)

# X_test_abs = np.abs(X_t)

# X_test = np.transpose(X_test_abs)

X_train_abs = np.abs(X_t)
sess = tf.InteractiveSession()
saver.restore(sess, "/content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt")

mod_S_test_predicted = sess.run(output_layer, feed_dict={X_image: X_train_image})

sess.close()

S_cap = np.multiply(np.divide(X_t,X_train_abs),np.transpose(mod_S_test_predicted))

s_cap = librosa.core.istft(S_cap, hop_length=512, length=len(sn))

librosa.output.write_wav('/content/drive/My Drive/Deep_Learning/Assignment_2/train_2d_cnn_recons.wav', s_cap, sr)





# snr = 10*np.log10(np.sum(np.square(s))/np.sum(np.square(np.subtract(s, s_cap))))
print(s.shape)
print(s_cap.shape)
snr = 10*np.log10(np.dot(np.transpose(s),s)/np.dot(np.transpose(s-s_cap),(s-s_cap)))
print('Signal to Noise Ratio for input file using 2D CNN is:',snr)

(2459, 20, 513, 1)
INFO:tensorflow:Restoring parameters from /content/drive/My Drive/Deep_Learning/Assignment_2/best_audio_denoising_2d_cnn_model.ckpt
(1258899,)
(1258899,)
Signal to Noise Ratio for input file using 2D CNN is: 16.21869206428528


In [30]:
# Calculating SNR in the time domain too
sn1, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_clean_male.wav')
sn2, sr=librosa.load('/content/drive/My Drive/Deep_Learning/Assignment_2/train_2d_cnn_recons.wav')
snr=10*np.log10( np.square(sn1).sum() / ( np.square(sn1-sn2) ).sum()  ) 
print('Signal to Noise Ratio for input file using 2D CNN is:',snr)


Signal to Noise Ratio for input file using 2D CNN is: 16.353379487991333
