## Convolution architecture

In [None]:
!pip install librosa

In [None]:
import numpy as np
import torchvision
import torchvision.transforms as transforms
import torch
from torch import nn, optim
from torch.utils import data
import random
from time import time
import random
from random import randrange
import librosa
from IPython.display import Audio
from scipy.signal import stft 
from scipy.signal import istft
import scipy as sp
import math
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
# Prefer the first CUDA GPU, but fall back to the CPU so every later
# `.to(device)` call still works on machines without a usable GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Prepend 19 near-silent frames (|N(0, 0.001)| noise) to the network input so
# that every original frame can be the last row of a full 20-frame window; the
# network then yields one output frame per original frame (2459 x 513 here).
# NOTE(review): X_T is defined outside this section - presumably the transposed
# magnitude spectrogram of the noisy training signal (it feeds the network
# input below); confirm.
rand_X = np.random.normal(loc=0, scale=0.001, size=(19, 513))
rand_X_abs = np.absolute(rand_X)
X_T1 = np.concatenate([rand_X_abs, X_T], axis=0)

In [None]:
# Sliding-window training pairs: input patch k is the 20 x 513 window of the
# padded input starting at row k, and its target is row k of S_T (1 x 513).
# One pair is produced per row of S_T.
X_patch = [X_T1[start:start + 20, :] for start in range(S_T.shape[0])]
Y_patch = [S_T[row, :] for row in range(S_T.shape[0])]

In [None]:
# Convert the patch lists to float32 tensors on the training device.
# The lists are stacked into a single ndarray first: calling torch.tensor()
# directly on a Python list of ndarrays converts element by element, which is
# extremely slow and makes PyTorch emit a UserWarning. The resulting values
# and shapes are unchanged.
X_tensor = torch.tensor(np.array(X_patch), dtype=torch.float32).to(device)
Y_tensor = torch.tensor(np.array(Y_patch), dtype=torch.float32).to(device)

In [None]:
# Pair each input patch with its target frame and serve them in mini-batches
# of 128, reshuffled every epoch.
train_dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
trainloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=128,
)

In [None]:
# CNN that maps one 1 x 20 x 513 patch to a single 513-value output frame.
# The layers are created in the same order as before, so the Sequential
# indices (0-14) and the printed summary are unchanged.
conv_stack = [
    nn.Conv2d(1, 16, kernel_size=(3, 3), stride=1),    # 20 x 513 -> 18 x 511
    nn.ReLU(),
    nn.Conv2d(16, 32, kernel_size=(3, 3), stride=1),   # 18 x 511 -> 16 x 509
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=(2, 2), stride=2),        # 16 x 509 -> 8 x 254
    nn.ReLU(),  # input is already non-negative here, so this has no effect
    nn.Conv2d(32, 6, kernel_size=(3, 3), stride=1),    # 8 x 254 -> 6 x 252
    nn.ReLU(),
    nn.AvgPool2d(kernel_size=(2, 2), stride=2),        # 6 x 252 -> 3 x 126
    nn.ReLU(),  # input is already non-negative here, so this has no effect
]
dense_head = [
    nn.Flatten(),
    nn.Linear(2268, 1024),  # 2268 = 6 channels * 3 * 126
    nn.ReLU(),
    nn.Linear(1024, 513),
    nn.ReLU(),              # clamps the predicted frame to be non-negative
]
model = nn.Sequential(*conv_stack, *dense_head).to(device)

print(model)

Sequential(
  (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=(2, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): ReLU()
  (6): Conv2d(32, 6, kernel_size=(3, 3), stride=(1, 1))
  (7): ReLU()
  (8): AvgPool2d(kernel_size=(2, 2), stride=2, padding=0)
  (9): ReLU()
  (10): Flatten()
  (11): Linear(in_features=2268, out_features=1024, bias=True)
  (12): ReLU()
  (13): Linear(in_features=1024, out_features=513, bias=True)
  (14): ReLU()
)


In [None]:
# Training objective: mean squared error, averaged over every element of the
# batch ('mean' is nn.MSELoss's default reduction, spelled out for clarity).
criterion = nn.MSELoss(reduction='mean')

In [None]:

# Train the network with Adam (library-default hyper-parameters) for a fixed
# number of epochs, printing the mean mini-batch loss after every epoch and the
# total wall-clock time at the end.
optimizer = optim.Adam(model.parameters())
start = time()
epochs = 200

# Explicitly select training mode so re-running this cell after an evaluation
# step behaves the same as a first run.
model.train()

for i in range(epochs):
    running_loss = 0
    for x, y in trainloader:
        # No-ops when the dataset already lives on `device`; kept so the loop
        # also works with a CPU-resident dataset.
        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        # Add the single input-channel axis expected by Conv2d: (N, 1, 20, 513).
        output = model(x.view(-1, 1, 20, 513))
        loss = criterion(output, y)

        # Back-propagate the loss, then let the optimizer update the weights.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The original used a `for ... else` clause here; since the inner loop has
    # no `break`, that clause always ran, so a plain statement is equivalent
    # and less misleading.
    print("Epoch {} - Training loss: {}".format(i, running_loss/len(trainloader)))
print("\nTraining Time =",time()-start)

Epoch 0 - Training loss: 0.08960044607520104
Epoch 1 - Training loss: 0.07692374847829342
Epoch 2 - Training loss: 0.06757998941466212
Epoch 3 - Training loss: 0.0633871553465724
Epoch 4 - Training loss: 0.057708676904439926
Epoch 5 - Training loss: 0.05119535271078348
Epoch 6 - Training loss: 0.04354371698573232
Epoch 7 - Training loss: 0.038396379072219135
Epoch 8 - Training loss: 0.03499619048088789
Epoch 9 - Training loss: 0.030829435028135777
Epoch 10 - Training loss: 0.02677404214628041
Epoch 11 - Training loss: 0.02420265320688486
Epoch 12 - Training loss: 0.02327483459375799
Epoch 13 - Training loss: 0.0215888031758368
Epoch 14 - Training loss: 0.01898319725878537
Epoch 15 - Training loss: 0.017809338122606277
Epoch 16 - Training loss: 0.01638960442505777
Epoch 17 - Training loss: 0.014857350941747427
Epoch 18 - Training loss: 0.013791473000310361
Epoch 19 - Training loss: 0.013419375289231539
Epoch 20 - Training loss: 0.012355825584381818
Epoch 21 - Training loss: 0.0124039359

In [None]:
# Optionally persist the trained network for later reuse. Passing the module
# itself pickles the whole nn.Sequential object (architecture and weights).
# NOTE: despite the ".h5" extension, the file is a PyTorch pickle, not HDF5.
torch.save(obj=model, f='denoise_problem2.h5')

In [None]:
# Optionally restore the network saved by the cell above.
# * map_location=device lets a checkpoint written on a GPU machine be loaded
#   on whatever device this session is using (including CPU-only hosts).
# * The file holds a fully pickled module, so it needs weights_only=False on
#   PyTorch releases where torch.load defaults to weights_only=True. Older
#   releases do not know that keyword and raise TypeError, hence the fallback.
# SECURITY: unpickling can execute arbitrary code - only load checkpoint files
# you created yourself or otherwise trust.
try:
    model = torch.load('denoise_problem2.h5', map_location=device, weights_only=False)
except TypeError:
    model = torch.load('denoise_problem2.h5', map_location=device)

## SNR calculation on train signals

In [None]:
# Run the trained network over all training patches in one gradient-free
# forward pass, then copy the predicted frames back to host memory as NumPy.
with torch.no_grad():
    train_batch = X_tensor.view(-1, 1, 20, 513)  # add the channel axis
    out = model(train_batch)
    out_numpy = out.cpu().numpy()
print("Number of output ex and features:", out_numpy.shape)

Number of output ex and features: (2459, 513)


In [None]:
# Rebuild a complex spectrogram from the predicted magnitudes and the phase of
# the input spectrogram, then invert it back to a waveform.
# NOTE(review): X and X_abs are defined outside this section - presumably the
# complex STFT of the noisy training signal and its magnitude; confirm.
#
# X / |X| is the unit-magnitude phase term. Bins where |X| == 0 would evaluate
# 0/0 = NaN, and a single NaN bin contaminates the samples produced by the
# inverse STFT, so those bins are left at 0 instead. All other bins are
# computed exactly as before.
X_norm = np.divide(X, X_abs, out=np.zeros_like(X), where=X_abs != 0)
S_pred = np.multiply(X_norm, out_numpy.T)  # network output is (frames, bins)
s_pred = librosa.istft(S_pred, hop_length=512)
print("Length of clean signal:",s_pred.shape)

Length of clean signal: (1258496,)


In [None]:
# Inline audio player for the recovered training signal.
Audio(data=s_pred, rate=sr)

In [None]:
# Signal-to-noise ratio (in dB) of the recovered training signal against the
# reference `s`, which is truncated to the recovered signal's length first:
#   SNR = 10 * log10( sum(s^2) / sum((s - s_pred)^2) )
reference = s[:len(s_pred)]
signal_power = np.sum(reference ** 2)
noise_power = np.sum((reference - s_pred) ** 2)
SNR = 10 * math.log10(signal_power / noise_power)
print(SNR)

17.27724553822406


## Output on Test signals 

In [None]:
def _noisy_spectrogram(path):
    """Load a wav file and return (samples, sample_rate, stft, |stft|).

    The file is read at its native sample rate (sr=None) and transformed with
    a 1024-point FFT and a hop of 512 samples. The magnitude array is what is
    fed to the network.
    """
    samples, rate = librosa.load(path, sr=None)
    spectrum = librosa.stft(samples, n_fft=1024, hop_length=512)
    return samples, rate, spectrum, np.abs(spectrum)


# Load both noisy test recordings together with their (magnitude) spectrograms.
x_test1, sr_test1, X_test1, X_test_abs1 = _noisy_spectrogram('test_x_01.wav')
x_test2, sr_test2, X_test2, X_test_abs2 = _noisy_spectrogram('test_x_02.wav')

In [None]:
# Transpose the magnitude spectrograms so that rows are frames and columns are
# frequency bins, matching the layout of the network input.
X_test_T1 = np.transpose(X_test_abs1)
X_test_T2 = np.transpose(X_test_abs2)

In [None]:
# Prepend 19 near-silent frames (|N(0, 0.001)| noise) to each test spectrogram
# so the network produces one output frame per original frame.
# The two noise blocks are drawn in the same order as before (signal 1 first).
rand_X1, rand_X2 = (np.random.normal(0, 0.001, (19, 513)) for _ in range(2))

rand_X_abs1, rand_X_abs2 = np.abs(rand_X1), np.abs(rand_X2)

# NOTE: X_T1 is re-bound here - from this point on it holds the padded *test*
# signal 1, no longer the padded training input built earlier in the notebook.
X_T1 = np.concatenate([rand_X_abs1, X_test_T1])
X_T2 = np.concatenate([rand_X_abs2, X_test_T2])

In [None]:
# Every full 20 x 513 window of padded test signal 1. Start rows run from 0 to
# (rows - 20) inclusive, so no window is truncated at the end.
X_test_patch1 = [
    X_T1[start:start + 20, :]
    for start in range(X_T1.shape[0] - 19)
]

In [None]:
# Every full 20 x 513 window of padded test signal 2. Start rows run from 0 to
# (rows - 20) inclusive, so no window is truncated at the end.
X_test_patch2 = [
    X_T2[start:start + 20, :]
    for start in range(X_T2.shape[0] - 19)
]

In [None]:
# Convert the test patch lists to float32 tensors on the inference device.
# Each list is stacked into one ndarray first: torch.tensor() on a Python list
# of ndarrays converts element by element, which is extremely slow and makes
# PyTorch emit a UserWarning. Values and shapes are unchanged.
X_test_tensor1 = torch.tensor(np.array(X_test_patch1), dtype=torch.float32).to(device)
X_test_tensor2 = torch.tensor(np.array(X_test_patch2), dtype=torch.float32).to(device)

In [None]:
# Run the network over every patch of both test signals without tracking
# gradients, then bring the predicted frames back to host memory as NumPy.
with torch.no_grad():
    out1 = model(X_test_tensor1.view(-1, 1, 20, 513))
    out2 = model(X_test_tensor2.view(-1, 1, 20, 513))

out_numpy1 = out1.cpu().numpy()
out_numpy2 = out2.cpu().numpy()

print("Number of output ex and features for test signal 1:", out_numpy1.shape)
print("Number of output ex and features for test signal 2:", out_numpy2.shape)

Number of output ex and features for test signal 1: (142, 513)
Number of output ex and features for test signal 2: (380, 513)


In [None]:
# Rebuild the complex spectrogram of test signal 1 from the predicted
# magnitudes and the phase of the noisy input, then invert it to a waveform.
#
# X_test1 / |X_test1| is the unit-magnitude phase term. Bins with zero
# magnitude would evaluate 0/0 = NaN, and a single NaN bin contaminates the
# samples produced by the inverse STFT, so those bins are left at 0 instead.
# All other bins are computed exactly as before.
X_norm1 = np.divide(X_test1, X_test_abs1, out=np.zeros_like(X_test1), where=X_test_abs1 != 0)
S_pred1 = np.multiply(X_norm1, out_numpy1.T)  # network output is (frames, bins)
s_pred1 = librosa.istft(S_pred1, hop_length=512)
print("Length of clean signal:",s_pred1.shape)

Length of clean signal: (72192,)


In [None]:
# Inline audio player for the denoised test signal 1.
Audio(data=s_pred1, rate=sr_test1)

In [None]:
# Rebuild the complex spectrogram of test signal 2 from the predicted
# magnitudes and the phase of the noisy input, then invert it to a waveform.
#
# X_test2 / |X_test2| is the unit-magnitude phase term. Bins with zero
# magnitude would evaluate 0/0 = NaN, and a single NaN bin contaminates the
# samples produced by the inverse STFT, so those bins are left at 0 instead.
# All other bins are computed exactly as before.
X_norm2 = np.divide(X_test2, X_test_abs2, out=np.zeros_like(X_test2), where=X_test_abs2 != 0)
S_pred2 = np.multiply(X_norm2, out_numpy2.T)  # network output is (frames, bins)
s_pred2 = librosa.istft(S_pred2, hop_length=512)
print("Length of clean signal:",s_pred2.shape)

Length of clean signal: (194048,)


In [None]:
# Inline audio player for the denoised test signal 2.
Audio(data=s_pred2, rate=sr_test2)