In [0]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import signal
from google.colab import files
#!pip install librosa # in colab, you'll need to install this
import librosa

In [0]:
def _load_audio_and_stft(path):
    """Load one audio file and compute its short-time Fourier transform.

    The file is read with ``sr=None`` (no resampling requested) and
    transformed with a 1024-point FFT and a hop of 512 samples.

    Returns:
        (samples, sample_rate, stft) as produced by ``librosa.load`` and
        ``librosa.stft``.
    """
    samples, sample_rate = librosa.load(path, sr=None)
    spectrum = librosa.stft(samples, n_fft=1024, hop_length=512)
    return samples, sample_rate, spectrum


# Training pair: the clean target recording and its noisy counterpart.
train_clean_vector, train_clean_sr, train_clean_stft = _load_audio_and_stft('train_clean_male.wav')
train_noisy_vector, train_noisy_sr, train_noisy_stft = _load_audio_and_stft('train_dirty_male.wav')

# Test recordings that the trained model is applied to later on.
test1_vector, test1_sr, test1_stft = _load_audio_and_stft('test_x_01.wav')
test2_vector, test2_sr, test2_stft = _load_audio_and_stft('test_x_02.wav')

## Data Transformation

In [0]:
# Keep only the magnitude of each complex STFT and transpose it so that
# axis 0 indexes time frames and axis 1 indexes frequency bins.
train_Y_org = np.abs(train_clean_stft).T   # training targets (clean speech)
train_X_org = np.abs(train_noisy_stft).T   # training inputs (noisy speech)
test1_X_org = np.abs(test1_stft).T
test2_X_org = np.abs(test2_stft).T

In [0]:
def _stack_windows(frames):
    """Collect every run of 20 consecutive rows of ``frames`` into one array.

    Window ``k`` holds rows ``k`` .. ``k + 19``; there are
    ``len(frames) - 19`` windows in total.
    """
    n_windows = len(frames) - 19
    return np.asarray([frames[start:start + 20, :] for start in range(n_windows)])


# Window k ends at frame k + 19, so the matching target is the clean frame
# k + 19 -- i.e. the clean spectrogram with its first 19 rows removed.
train_Y = train_Y_org[19:, :]

train_X = _stack_windows(train_X_org)
test1_X = _stack_windows(test1_X_org)
test2_X = _stack_windows(test2_X_org)

## 2D CNN Model Configuration

In [0]:
# Network input: a batch of 20-frame windows with 513 values per frame.
x = tf.placeholder(dtype=tf.float32, shape=[None, 20, 513])
# Training target: one 513-value frame per window.
y = tf.placeholder(dtype=tf.float32, shape=[None, 513])
# conv2d expects a trailing channel axis, so append a single channel.
input_layer = tf.reshape(x, shape=[-1, 20, 513, 1])

In [0]:
# Two conv + max-pool stages followed by two dense layers.
# Layers are created in the same order as before so that the automatically
# generated variable scopes do not change.
conv1 = tf.layers.conv2d(
    inputs=input_layer,
    filters=16,
    kernel_size=(4, 4),
    padding="same",
    activation=tf.nn.relu,
)
pool1 = tf.layers.max_pooling2d(
    inputs=conv1,
    pool_size=(2, 2),
    strides=(2, 2),
    padding="same",
)
conv2 = tf.layers.conv2d(
    inputs=pool1,
    filters=32,
    kernel_size=(4, 4),
    padding="same",
    activation=tf.nn.relu,
)
pool2 = tf.layers.max_pooling2d(
    inputs=conv2,
    pool_size=(2, 2),
    strides=(2, 2),
    padding="same",
)

flattened = tf.layers.flatten(pool2)
fc = tf.layers.dense(inputs=flattened, units=2048, activation=tf.nn.relu)
# ReLU on the last layer keeps the 513 predicted values non-negative.
last = tf.layers.dense(inputs=fc, units=513, activation=tf.nn.relu)
output = tf.reshape(last, shape=[-1, 513])

In [0]:
# Training objective: mean squared error between target and prediction.
mse = tf.losses.mean_squared_error(labels=y, predictions=output)

# Monitoring metric, in dB: 10 * log10(sum(y^2) / sum((y - output)^2)).
# The base-10 logarithm is built as ln(ratio) / ln(10).
signal_energy = tf.reduce_sum(y ** 2)
error_energy = tf.reduce_sum((y - output) ** 2)
numerator = tf.log(signal_energy / error_energy)
denominator = tf.log(tf.constant(10, dtype=numerator.dtype))
snr = 10 * (numerator / denominator)

## Model Training

In [9]:
# Hyper-parameters for plain mini-batch gradient descent on the MSE loss.
learning_rate = 0.8
epochs = 100
batch_size = 10
optimiser = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(mse)

# Number of full mini-batches per epoch, and the sample indices that are
# reshuffled at the start of every epoch.
total_batch = len(train_Y) // batch_size
index = np.arange(len(train_X))
# Only whole batches are used. Reshaping the full index array to
# (-1, batch_size) raises ValueError whenever the number of windows is not a
# multiple of batch_size, so the remainder is dropped explicitly instead.
full_batches = len(index) // batch_size

# The session is deliberately left open: later cells reuse `sess` for inference.
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

for epoch in range(epochs):
  avg_cost = snr_value = 0
  np.random.shuffle(index)

  # Shuffle first, then truncate: a different random remainder is skipped
  # each epoch, so every sample still gets used over the course of training.
  batches = index[:full_batches * batch_size].reshape(full_batches, batch_size)
  for idx in batches[:total_batch]:
    _, c, snr1 = sess.run([optimiser, mse, snr], feed_dict={x: train_X[idx], y: train_Y[idx]})
    # Running means of the per-batch loss and SNR over the epoch.
    avg_cost += c / total_batch
    snr_value += snr1 / total_batch

  # Report after the first epoch and then every tenth epoch.
  if epoch == 0 or (epoch + 1) % 10 == 0:
    print("Epoch:", (epoch + 1), "cost =", "{:.3f}".format(avg_cost),"SNR: ","{:.3f}".format(snr_value))


Epoch: 1 cost = 0.066 SNR:  1.682
Epoch: 10 cost = 0.007 SNR:  10.181
Epoch: 20 cost = 0.004 SNR:  12.685
Epoch: 30 cost = 0.003 SNR:  13.568
Epoch: 40 cost = 0.003 SNR:  14.675
Epoch: 50 cost = 0.002 SNR:  15.334
Epoch: 60 cost = 0.002 SNR:  15.269
Epoch: 70 cost = 0.002 SNR:  16.087
Epoch: 80 cost = 0.002 SNR:  16.932
Epoch: 90 cost = 0.001 SNR:  17.199
Epoch: 100 cost = 0.002 SNR:  17.074


### **Cleanup of the noisy sounds with the trained model**

In [0]:
def _predict_magnitudes(windows):
    """Evaluate the `output` tensor for ``windows`` in the existing session."""
    return sess.run(output, feed_dict={x: windows})


# Each input set is pushed through the network in a single sess.run call.
test1_clean = _predict_magnitudes(test1_X)
test2_clean = _predict_magnitudes(test2_X)
train_clean = _predict_magnitudes(train_X)

In [48]:
# The network yields no prediction for the first 19 frames (every input window
# needs 20 frames), so pad the front with 19 rows of random values drawn
# between the minimum and the mean of the predictions. This makes the frame
# count match the noisy STFT again.
# NOTE: `random` is kept as the variable name for compatibility with the
# original cell; it would shadow the stdlib module if that were ever imported.
random = np.random.uniform(low=np.min(train_clean), high=np.mean(train_clean), size=(19, 513))
train_recon = np.append(random, train_clean, axis=0)

# Unit-magnitude phase of the noisy signal. Bins whose magnitude is exactly
# zero would give 0/0 = NaN and corrupt the inverse STFT, so those bins get a
# neutral phase of 1 instead.
noisy_magnitude = np.abs(train_noisy_stft)
Phase = np.divide(
    train_noisy_stft,
    noisy_magnitude,
    out=np.ones_like(train_noisy_stft),
    where=noisy_magnitude > 0,
)

# Predicted magnitude combined with the noisy phase, then back to a waveform.
train_reconstruction = np.multiply(np.transpose(train_recon), Phase)
train_reconstruction = librosa.istft(train_reconstruction, hop_length=512)

# Trim the clean reference to the reconstructed length rather than to a
# hard-coded sample count, so the comparison works for any recording.
s_reduced = train_clean_vector[:len(train_reconstruction)]
signal_energy = np.sum(s_reduced ** 2)
error_energy = np.sum(np.subtract(train_reconstruction, s_reduced) ** 2)
# np.log10 replaces math.log(..., 10): `math` was never imported in this
# notebook, so the original line raised NameError on a fresh kernel.
snr_db = 10 * np.log10(signal_energy / error_energy)
print("SNR of training data:", round(float(snr_db), 2))

SNR of training data: 16.16


### Reconstructing and saving denoised test1 audio

In [0]:
# The model produces one output per 20-frame window, so there is no prediction
# for the first 19 frames; drop those columns from the noisy STFT to align it.
test1_stft_trunc = test1_stft[:, 19:]

# Unit-magnitude phase of the noisy signal. Zero-magnitude bins would give
# 0/0 = NaN and corrupt the inverse STFT, so they get a neutral phase of 1.
test1_magnitude = np.abs(test1_stft_trunc)
Phase = np.divide(
    test1_stft_trunc,
    test1_magnitude,
    out=np.ones_like(test1_stft_trunc),
    where=test1_magnitude > 0,
)

# Predicted magnitude combined with the noisy phase, then back to a waveform.
test1_reconstruction = np.multiply(np.transpose(test1_clean), Phase)
test1_reconstruction = librosa.istft(test1_reconstruction, hop_length=512)

# NOTE(review): librosa.output.write_wav only exists in librosa < 0.8; on newer
# releases this call needs a different writer (e.g. soundfile) -- confirm the
# installed version before running.
librosa.output.write_wav('test1_cleaned_2d.wav', test1_reconstruction, test1_sr)
files.download('test1_cleaned_2d.wav')

### Reconstructing and saving denoised test2 audio

In [0]:
# The model produces one output per 20-frame window, so there is no prediction
# for the first 19 frames; drop those columns from the noisy STFT to align it.
test2_stft_trunc = test2_stft[:, 19:]

# Unit-magnitude phase of the noisy signal. Zero-magnitude bins would give
# 0/0 = NaN and corrupt the inverse STFT, so they get a neutral phase of 1.
test2_magnitude = np.abs(test2_stft_trunc)
Phase = np.divide(
    test2_stft_trunc,
    test2_magnitude,
    out=np.ones_like(test2_stft_trunc),
    where=test2_magnitude > 0,
)

# Predicted magnitude combined with the noisy phase, then back to a waveform.
test2_reconstruction = np.multiply(np.transpose(test2_clean), Phase)
test2_reconstruction = librosa.istft(test2_reconstruction, hop_length=512)

# NOTE(review): librosa.output.write_wav only exists in librosa < 0.8; on newer
# releases this call needs a different writer (e.g. soundfile) -- confirm the
# installed version before running.
librosa.output.write_wav('test2_cleaned_2d.wav', test2_reconstruction, test2_sr)
files.download('test2_cleaned_2d.wav')