In [4]:
# Mount Google Drive so the audio files read later from /content/drive/MyDrive are reachable.
from google.colab import drive # Colab-only module that provides drive.mount
 
ROOT = "/content/drive"     # mount point handed to drive.mount below
print(ROOT)                 # echo the mount point (optional)
 
drive.mount(ROOT)           # mount Google Drive at ROOT


/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## This notebook gave SDR = 2.57 and 2.45 at SMR = 0 dB

In [5]:
#!touch helpers2.py

In [6]:
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter, freqz

from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

from numpy import linalg as LA
from numpy.linalg import inv
from helpers2 import Reconstruct, Viz_Y,SMR,get_mixed_signal,SDR,ReconstructSoft,butter_lowpass_filter
import seaborn as sns
import warnings
import math
from tqdm import tqdm
warnings.simplefilter('ignore')

In [7]:
def eval(D,G_test,Ytest):
  """Reconstruct speech and music from the test mixture and print their SDRs.

  D      -- dictionary handed to ``Reconstruct`` as ``B``.
  G_test -- activations handed to ``Reconstruct`` as ``G``.
  Ytest  -- STFT of the test mixture handed to ``Reconstruct`` as ``Yabs``.

  Returns None; the two scores are only printed.

  Reads these notebook globals: Dc, Nm, samplerate, WINDOW, WINDOW_SIZE,
  OVERLAP, NFFT, test_s and test_m.
  NOTE: the name shadows Python's built-in ``eval`` for the rest of the notebook.
  """
  def _back_to_waveform(spectrogram):
    # Inverse STFT using the notebook-wide analysis settings.
    _, waveform = signal.istft(spectrogram,
                               samplerate,
                               window=WINDOW,
                               nperseg=WINDOW_SIZE,
                               noverlap=OVERLAP,
                               nfft=NFFT)
    return waveform

  separated, _masks = Reconstruct(B=D, G=G_test, Ns=Dc.shape[1], Nm=Nm, Yabs=Ytest, p=0.5)
  print('Reconstruction Step .... Done')

  # Index 0 is treated as speech and index 1 as music.
  speech_wave = _back_to_waveform(separated[0])
  music_wave = _back_to_waveform(separated[1])

  speech_score = SDR(s_est=speech_wave, s=test_s)
  music_score = SDR(s_est=music_wave, s=test_m)

  print(f'Speech SDR = {speech_score}')
  print(f'Music SDR = {music_score}')

In [8]:
# Best 1-20 min

# Each file is read first so the excerpt bounds (given in minutes) are converted
# to sample indices with that file's own sample rate rather than a hard-coded 44100.
samplerate_s, data_speech = read("/content/drive/MyDrive/Conversation.wav")
start = 1 * 60 * samplerate_s
end = 20 * 60 * samplerate_s
speech=data_speech[start:end,0]   # minutes 1-20 of the speech file, first channel only
length=speech.shape[0]/samplerate_s
print('Shape of the speech {} ... Length : {:.2f}s ... Sample rate : {}'.format(speech.shape[0],length,samplerate_s))

samplerate_m, data_music = read("/content/drive/MyDrive/music.wav")
start = 1 * 60 * samplerate_m
end = 5 * 60 * samplerate_m
music=data_music[start:end,0]     # minutes 1-5 of the music file, first channel only
length=music.shape[0]/samplerate_m
print('Shape of the music {} ... Length : {:.2f}s ... Sample rate : {}'.format(music.shape[0],length,samplerate_m))


Shape of the speech 50274000 ... Length : 1140.00s ... Sample rate : 44100
Shape of the music 10584000 ... Length : 240.00s ... Sample rate : 44100


In [9]:
fs = 16000                   # target sample rate in Hz
rate = samplerate_s / fs     # ratio between the speech file's rate and the target rate

# Minutes 1-20 of both recordings (indices assume 44100 samples per second).
start = 1 * 60 * 44100
end = 20 * 60 * 44100


def _shrink_by_rate(track):
    """Resample ``track`` to ``len(track) / rate`` samples."""
    return signal.resample(track, int(track.shape[0] / rate))


# "_t" signals: first channel of the long excerpts, brought down to the target rate.
speech_t = _shrink_by_rate(data_speech[start:end, 0])
music_t = _shrink_by_rate(data_music[start:end, 0])
samplerate = int(samplerate_m / rate)
length = music_t.shape[0] / samplerate

print('Shape of the test {} ... Length : {:.2f}s ... Sample rate : {}'.format(music_t.shape[0],length,samplerate))

# NOTE: these two lines overwrite `speech` / `music`, so re-running this cell
# without re-running the loading cell resamples them a second time.
speech = _shrink_by_rate(speech)
music = _shrink_by_rate(music)

print('Downsampled rate = {}'.format(samplerate))

# Low-pass every signal with butter_lowpass_filter(signal, 5000, fs).
speech = butter_lowpass_filter(speech, 5000, fs)
music = butter_lowpass_filter(music, 5000, fs)

music_t = butter_lowpass_filter(music_t, 5000, fs)
speech_t = butter_lowpass_filter(speech_t, 5000, fs)

Shape of the test 18240000 ... Length : 1140.00s ... Sample rate : 16000
Downsampled rate = 16000


## Training STFT :


In [10]:
# STFT analysis settings shared by every stft/istft call in the notebook.
WINDOW = 'hamming'
WINDOW_SIZE = 480
OVERLAP = 0.8 * WINDOW_SIZE
NFFT = 512
STFT_SETTINGS = dict(window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)

# Magnitude spectrograms of the clean training speech and music.
f, t, Y = signal.stft(speech, samplerate, **STFT_SETTINGS)
Yabs_s = np.abs(Y)
f, t, Y = signal.stft(music, samplerate, **STFT_SETTINGS)
Yabs_m = np.abs(Y)

# Training mixture built from the long "_t" excerpts at the requested SMR.
SMR_db = 0
mix, speech_mix, music_mix = get_mixed_signal(speech_t, music_t, SMR_db)

f, t, Ymix = signal.stft(mix, samplerate, **STFT_SETTINGS)
Yabs_mix = np.abs(Ymix)

# Replace exact zeros in the magnitude spectrogram with a small positive value.
Yabs_mix[Yabs_mix == 0] = 0.00001

# Save the mixture as 16-bit PCM (note the path is at the filesystem root).
write("/MixX.wav", samplerate, mix.astype(np.int16))



SMR = -0.00


## Test STFT :

In [11]:
fs = 16000

rate = samplerate_s / fs

# NOTE(review): the test excerpt starts at minute 15, which lies inside the
# 1-20 minute range used for the training signals earlier in the notebook,
# so the test material is not held out from training.
start = 15 * 60 * samplerate_s
step = int(0.1 * 60 * samplerate_s)   # one segment = 0.1 minute of audio
n_segments = 1

# The segments are contiguous, so one slice replaces the former hstack loop.
# float64 matches what stacking onto an empty np.array([]) produced.
stop = start + n_segments * step
test_s = data_speech[start:stop, 0].astype(np.float64)
test_m = data_music[start:stop, 0].astype(np.float64)

test_s = signal.resample(test_s,int(test_s.shape[0]/rate))
test_m = signal.resample(test_m,int(test_m.shape[0]/rate))
samplerate=int(samplerate_m/rate)
# Duration of the test excerpt (previously computed from the unrelated music_t).
length=test_m.shape[0]/samplerate


test_s = butter_lowpass_filter(test_s,5000,fs)
test_m = butter_lowpass_filter(test_m,5000,fs)


################################################################################
SMR_db = 0
test,speech_test,music_test=get_mixed_signal(test_s,test_m,SMR_db)


f,t,Ytest= signal.stft(test,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_test=np.abs(Ytest)

# Replace exact zeros in the magnitude spectrogram with a small positive value.
Yabs_test[Yabs_test==0]=0.00001


SMR = 0.00


# Train First NMF on Clean Speech :

In [12]:
def softmax(x):
  """Softmax along axis 0.

  x -- array-like; for a 2-D input each column is normalised independently.

  Returns an array of the same shape whose entries along axis 0 sum to 1.

  The per-column maximum is subtracted before exponentiating. This leaves the
  result mathematically unchanged but stops ``np.exp`` from overflowing to inf
  (and the quotient from becoming NaN) for large inputs.
  """
  x = np.asarray(x)
  if x.size:
    # Guarded so empty inputs keep their previous behaviour (max of empty raises).
    x = x - x.max(axis=0, keepdims=True)
  e_x = np.exp(x)
  return e_x / e_x.sum(axis=0)

In [13]:
Nc = 8   # number of atoms learned from clean speech
Nm = 8   # number of extra atoms learned later from the mixture (passed to nmf as Nn)

# Factorise the speech magnitude spectrogram; frames are rows after the transpose.
model = NMF(
    n_components=Nc,
    init='random',
    alpha=0.0,
    beta_loss='frobenius',
    solver="mu",
    max_iter=50,
    random_state=7,
)
model.fit(Yabs_s.T)

# Speech dictionary: transposed NMF components, rescaled with MinMaxScaler.
scaler = MinMaxScaler()
Dc = scaler.fit_transform(model.components_.T)


# Train NMF on Noisy Speech :

In [14]:
def nmf(X, Dc, Nn, lamb=0.1, maxit=100):
    """Extend the dictionary ``Dc`` with ``Nn`` new atoms fitted to ``X``.

    X     -- 2-D non-negative matrix to factorise (rows must match Dc's rows).
    Dc    -- existing dictionary; its columns are never overwritten in ``D``,
             they are only re-normalised together with the new ones.
    Nn    -- number of additional columns to learn.
    lamb  -- constant added to the denominator of the activation update.
    maxit -- number of multiplicative-update iterations.

    Returns (Dnorm, H, hist): the column-normalised dictionary [Dc | Dn], the
    activations, and the list of residual norms ``LA.norm(X - Dnorm @ H)``
    recorded after every iteration.

    The random initialisations use ``np.random.rand`` with no seeding inside
    this function, so results vary between runs unless the caller seeds NumPy.
    """

    Nc = Dc.shape[1]
    # Random activations for all Nc + Nn atoms, one column per column of X.
    H = np.random.rand(Nc+Nn, X.shape[1])

    # Random starting point for the Nn new atoms.
    Dn = np.random.rand(X. shape[0], Nn)
    print(f"Shape of Dc {Dc.shape} Shape of Dn {Dn.shape}")
    D = np.hstack([Dc,Dn])
    # Scale every column of D to unit Euclidean norm.
    Dnorm = D / np.sum(D**2, axis=0)**(.5)

    print(f'Dnorm shape {Dnorm.shape} and X shape {X.shape} and H shape {H.shape}')
    hist=[]
    for i in range(maxit):
        # Multiplicative update of the activations; lamb is added to the denominator.
        H = H * (np.matmul(Dnorm.T, X)) / (np.matmul(np.matmul(Dnorm.T, Dnorm), H) + lamb)
        # Multiplicative update of the dictionary. Left-multiplying by the all-ones
        # matrix replicates each column sum across all rows. Only the columns after
        # the first Dc.shape[1] are written back, so the Dc part of D is unchanged.
        D[:,Dc.shape[1]:] = (Dnorm * (np.matmul(X, H.T) + Dnorm * (np.matmul(np.ones((X.shape[0], X.shape[0])), np.matmul(Dnorm, np.matmul(H, H.T)) * Dnorm))) / (np.matmul(Dnorm, np.matmul(H, H.T)) + Dnorm * (np.matmul(np.ones((X.shape[0], X.shape[0])), np.matmul(X, H.T) * Dnorm))))[:,Dc.shape[1]:]
        # Re-normalise all columns (including the Dc ones) after the update.
        Dnorm = D / np.sum(D**2, axis=0)**(.5)
        # Track the residual norm for this iteration.
        hist.append(LA.norm(X-np.matmul(Dnorm,H)))
    #Dnorm[:,Dc.shape[1]:] = softmax(Dnorm[:,Dc.shape[1]:])
    return Dnorm, H,hist

In [15]:
# Fit Nm extra atoms to the training mixture; nmf leaves the Dc columns
# un-updated (only re-normalised) and returns the combined dictionary.
D,H,hist = nmf(Yabs_mix,Dc,Nm)

# Rescale the combined dictionary with a fresh MinMaxScaler fitted on D itself.
scaler = MinMaxScaler()
D =  scaler.fit_transform(D)


Shape of Dc (257, 8) Shape of Dn (257, 8)
Dnorm shape (257, 16) and X shape (257, 190001) and H shape (16, 190001)


# Test NMF :

In [16]:
# Frames as rows, the layout passed to the sklearn estimator.
test_frames = Yabs_test.T

model_test = NMF(
    n_components=Nc + Nm,
    init='nndsvd',
    alpha=0.1,
    beta_loss='frobenius',
    solver="mu",
    max_iter=200,
    random_state=7,
)
# The fit only serves to produce a fitted estimator; its learned components are
# replaced by the trained dictionary D on the next line.
model_test.fit(test_frames)

model_test.components_ = D.T
# Activations of the test mixture with respect to D, one column per frame.
G_test = model_test.transform(test_frames).T

In [17]:
from tqdm import tqdm



def soft(z,a,l=0.02,threshold=5):
  """Non-negative shrinkage: ``max(|z| - threshold, 0)`` element-wise.

  z         -- NumPy array; its first dimension sizes the zero floor.
  a, l      -- accepted for call compatibility but not used in the computation.
  threshold -- amount subtracted from ``|z|`` before clipping at zero. It
               defaults to 5, the value that used to be hard-coded.

  Returns an array of non-negative values. The sign of ``z`` is discarded
  because the absolute value is taken first.
  """
  h = np.maximum(np.abs(z)-threshold,np.zeros(z.shape[0]))
  return h
  
def warm_start_ISTA(x,W,n_components,a,K,l=0.02):
  """Frame-by-frame ISTA where each frame starts from the previous frame's code.

  x            -- 2-D data matrix, one column per frame.
  W            -- dictionary; ``W.shape[1]`` is the number of rows of the result.
  n_components -- size of the identity matrix used in the update; it has to
                  equal ``W.shape[1]`` for the matrix products to be defined.
  a            -- step parameter; the update uses ``1/a``.
  K            -- the inner loop is ``range(1, K)``, i.e. ``K - 1`` iterations.
  l            -- forwarded to ``soft``.

  Returns ``h`` with shape ``(W.shape[1], x.shape[1])``. Column 0 is never
  updated and keeps its random initial value.

  Side effect: reseeds NumPy's global random generator with 7.
  """

  np.random.seed(seed=7)
  h = np.random.rand(W.shape[1] , x.shape[1])
  T = x.shape[1]

  # Both matrices depend only on W and a, so they are built once instead of
  # once per frame and per iteration. The values are the same as before.
  step_matrix = np.identity(n_components) - (1/a)*(np.transpose(W)@W)
  back_projection = (1/a)*np.transpose(W)

  for t_ in tqdm(range(1,T)):
    # Warm start: begin from the previous frame's code.
    h[:,t_] = h[:,t_-1]

    for _ in range(1,K):
      z = step_matrix@h[:,t_] + \
          back_projection@x[:,t_]

      h[:,t_] = soft(z,a,l)
  return h

In [18]:
# Warm-start ISTA activations of the test mixture with respect to dictionary D.
h_ws = warm_start_ISTA(x=Yabs_test, W=D, n_components=16, a=10, K=3)

100%|██████████| 1000/1000 [00:00<00:00, 5169.67it/s]


In [19]:
# Compare the MU activations (G_test) with the warm-start ISTA activations (h_ws).
# eval only prints and returns None, so the cell's value is the tuple (None, None).
eval(D,G_test,Ytest) ,eval(D,h_ws,Ytest)

Reconstruction Step .... Done
Speech SDR = 3.094948486134423
Music SDR = 2.7694485410458274
Reconstruction Step .... Done
Speech SDR = 3.1056233447209425
Music SDR = 2.7780513908153286


(None, None)

In [20]:
# Same comparison as the previous cell, run again: MU activations vs warm-start ISTA.
# eval only prints and returns None, so the cell's value is the tuple (None, None).
eval(D,G_test,Ytest) ,eval(D,h_ws,Ytest)

Reconstruction Step .... Done
Speech SDR = 3.094948486134423
Music SDR = 2.7694485410458274
Reconstruction Step .... Done
Speech SDR = 3.1056233447209425
Music SDR = 2.7780513908153286


(None, None)

# Unfolded ISTA

In [23]:
import torch

In [24]:
# Reconstruct with the MU activations and keep the first mask returned.
Sources,Masks=Reconstruct(B=D,G=G_test,Ns=Dc.shape[1],Nm=Nm,Yabs=Ytest,p=0.5)
M = Masks[0]  # NOTE(review): not referenced again in the visible cells
# Magnitude STFT of the clean test speech. Unfolded_ISTA reads it through the
# global name Y_clean as its training target.
f,t,Y= signal.stft(test_s,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Y_clean=np.abs(Y)
# Replace exact zeros with a small positive value.
Y_clean[Y_clean==0]=0.00001


In [26]:
# Inputs and settings for the unfolded-ISTA experiment.
X = Yabs_test      # magnitude STFT of the test mixture
X_cmplx = Ytest    # complex STFT of the test mixture
W = D              # trained dictionary
# NOTE(review): the later Unfolded_ISTA call passes the literals 100 and 2 for
# alpha and K, so of the three settings below only lambd is forwarded by name.
alpha = 100
K = 3
lambd = 0.01
# As the last expression of the cell, this call's return value is displayed.
torch.autograd.set_detect_anomaly(False)


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f71487257d0>

In [43]:
def speech_mask(B,G,Ns,p):
    """Ratio mask for the first ``Ns`` atoms of a two-part dictionary.

    B  -- dictionary tensor; columns ``[:Ns]`` form the first part, the rest the second.
    G  -- activation tensor; rows are split at ``Ns`` in the same way.
    Ns -- number of atoms in the first part.
    p  -- exponent applied to each partial reconstruction.

    Returns ``(B1 G1)^p / ((B1 G1)^p + (B2 G2)^p + 0.00001)``.
    """
    # Partial reconstructions raised to the power p.
    first_part = torch.pow(torch.matmul(B[:, :Ns], G[:Ns, :]), p)
    second_part = torch.pow(torch.matmul(B[:, Ns:], G[Ns:, :]), p)

    # The small constant keeps the division defined where both parts are zero.
    total = first_part + second_part
    return first_part / (total + 0.00001)


def sigmoid(x):
  """Element-wise logistic sigmoid of the tensor ``x``."""
  return torch.sigmoid(x)

def soft(z,a,l=0.02):
  """Return ``max(|z|, 0)`` element-wise, which is simply ``|z|``.

  ``a`` and ``l`` are accepted but unused, and no threshold is subtracted.
  NOTE: this definition replaces the earlier ``soft`` in the notebook, which
  subtracted a threshold before clipping.
  """
  zero_floor = np.zeros(z.shape[0])
  magnitude = np.abs(z)
  return np.maximum(magnitude, zero_floor)


def error(X,M,Y):
  """Mean-squared error between the target ``Y`` and the masked input ``M * X``."""
  masked_input = M * X
  criterion = torch.nn.MSELoss()
  return criterion(Y, masked_input)

def Unfolded_ISTA(X, W, alpha, K, lambd,epochs,learning_rate):
  """Train one dictionary copy with SGD inside an unfolded ISTA loop.

  X             -- NumPy matrix, one column per frame (converted to a float tensor).
  W             -- NumPy dictionary; tiled K times, and the copy at index 1 is
                   the trainable one, so K must be at least 2.
  alpha, lambd  -- accepted but not referenced in the body (the step sizes in
                   alpha_list are all 1).
  K             -- number of tiled copies; the layer loop is ``range(1, K)``.
  epochs        -- number of passes over the frames.
  learning_rate -- SGD learning rate.

  Returns ``(sigmoid(W2), H)`` as NumPy arrays, where H has one slice per layer.

  Reads the notebook global ``Y_clean`` as the regression target and calls the
  notebook-level helpers ``soft``, ``sigmoid``, ``speech_mask`` and ``error``.
  """
  # NOTE(review): loss_e, epoch_loss and H_clone are assigned but never used below.
  loss_e = 0
  epoch_loss = []
  torch.manual_seed(7)
  X = torch.from_numpy(X).float()
  W = torch.from_numpy(W).float()
  # K stacked copies of the dictionary, one per unfolded layer.
  W = torch.tile(W, (K,1,1))
  # One step-size value per layer, all equal to 1.
  alpha_list = list(torch.tile(torch.tensor([1]).float(),(K,)))
  # Random initial activations, repeated for every layer.
  H = torch.rand(W.shape[2],X.shape[1])
  H = torch.tile(H, (K,1,1))
  H_clone = H.clone()
  Y = torch.from_numpy(Y_clean).float()
  W1 = W[0]
  W2 = W[1]
  #W3 = W[2]

  # Only W2 is handed to the optimiser.
  W1.requires_grad = False
  W2.requires_grad = True
  #W3.requires_grad = True
  params =  [W2] #+ [W2]# + [W3]
  sgd = torch.optim.SGD(params,lr=learning_rate)
  for e in range(epochs):
    # Frames are visited in order starting from index 1; frame 0 is never updated.
    for t in tqdm(np.arange(1,X.shape[1]),position=0, leave=True):

      for k in range(1,K):
        # NOTE(review): torch.eye(16) hard-codes the number of atoms; this
        # presumably has to equal W.shape[2] - confirm before changing W.
        # NOTE(review): the update indexes W[k] rather than W2; whether gradients
        # reach W2 through this path depends on autograd view semantics - confirm.
        if k ==1:
          # First layer: start from this layer's code at the previous frame.
          z = (torch.eye(16) - (1/alpha_list[k])*(sigmoid(W[k].t())@sigmoid(W[k])))@H[k,:,t-1] + (1/alpha_list[k])*sigmoid(W[k].t())@X[:,t]

          H[k,:,t] = soft(z,0,0.001)
        elif k > 1:
          # Deeper layers: start from the previous layer's code at the same frame.
          z = (torch.eye(16) - (1/alpha_list[k])*(sigmoid(W[k].t())@sigmoid(W[k])))@H[k-1,:,t] + (1/alpha_list[k])*sigmoid(W[k].t())@X[:,t]

          H[k,:,t] = soft(z,0,0.001)

        # NOTE(review): this squashes the whole H[k] matrix (every frame) on every
        # time step, not just column t - confirm that this is intended.
        H[k] = sigmoid(H[k])

      sgd.zero_grad()      


          #h0_ = unfolded_cell(W1,H,alpha_list,0,t)
        # h1_k = unfolded_cell(W2,h0_,alpha_list,1,t)
        # h2_k  = unfolded_cell(W3,h1_k.float(),alpha_list,2,t)

      # The mask is built for all frames from sigmoid(W2) and layer-1 activations,
      # with the split hard-coded at 8 atoms and exponent 1; only column t is used.
      mask = speech_mask(sigmoid(W2),H[1],8,1)
      #print(mask)
      # MSE between the clean target frame and the masked mixture frame.
      loss = error(X[:,t],mask[:,t],Y[:,t])
      #print("\n",loss)

      loss.backward()

      # One SGD step per frame.
      sgd.step()
    # Reports the loss of the last frame of the epoch only.
    print(f'Epoch {e} ... Loss = {loss.item()}')
  return (torch.nn.Sigmoid()(W2)).detach().numpy(), H.detach().numpy()


In [44]:
# Train the unfolded model: d_new is the learned dictionary, s the per-layer activations.
d_new, s = Unfolded_ISTA(
    X,
    W,
    alpha=100,
    K=2,
    lambd=lambd,
    epochs=10,
    learning_rate=0.001,
)

100%|██████████| 1000/1000 [00:08<00:00, 115.13it/s]
  1%|          | 11/1000 [00:00<00:09, 108.28it/s]

Epoch 0 ... Loss = 748.6536254882812


100%|██████████| 1000/1000 [00:08<00:00, 115.18it/s]
  1%|          | 11/1000 [00:00<00:09, 101.65it/s]

Epoch 1 ... Loss = 722.3134765625


100%|██████████| 1000/1000 [00:08<00:00, 111.47it/s]
  1%|          | 11/1000 [00:00<00:09, 106.39it/s]

Epoch 2 ... Loss = 709.1109619140625


100%|██████████| 1000/1000 [00:09<00:00, 110.28it/s]
  1%|          | 12/1000 [00:00<00:08, 114.20it/s]

Epoch 3 ... Loss = 702.0897827148438


100%|██████████| 1000/1000 [00:09<00:00, 110.10it/s]
  1%|          | 12/1000 [00:00<00:08, 113.26it/s]

Epoch 4 ... Loss = 697.96240234375


100%|██████████| 1000/1000 [00:09<00:00, 108.54it/s]
  1%|          | 12/1000 [00:00<00:08, 112.39it/s]

Epoch 5 ... Loss = 695.3065185546875


100%|██████████| 1000/1000 [00:08<00:00, 111.38it/s]
  1%|          | 11/1000 [00:00<00:09, 103.62it/s]

Epoch 6 ... Loss = 693.5171508789062


100%|██████████| 1000/1000 [00:09<00:00, 110.68it/s]
  1%|          | 11/1000 [00:00<00:09, 109.64it/s]

Epoch 7 ... Loss = 692.2645874023438


100%|██████████| 1000/1000 [00:08<00:00, 111.15it/s]
  1%|          | 12/1000 [00:00<00:08, 117.10it/s]

Epoch 8 ... Loss = 691.4091186523438


100%|██████████| 1000/1000 [00:08<00:00, 113.42it/s]

Epoch 9 ... Loss = 690.8125





In [45]:
# Replace exact zeros in the learned dictionary with a small positive value, in place.
d_new[d_new == 0] = 0.0001

In [53]:
# (heading, dictionary, activations) for each method, in the order reported.
sdr_runs = [
    ("SDR Evaluation using MU \n", D, G_test),                 # mu
    ("SDR Evaluation using Warm Start ISTA \n", D, h_ws),      # warm start ISTA
    ("SDR Evaluation using Unfolded ISTA \n", d_new, s[1]),    # DU ISTA
]

for run_index, (heading, dictionary, activations) in enumerate(sdr_runs):
    if run_index:
        # Blank separator between consecutive reports (none after the last).
        print("\n")
    print(heading)
    eval(dictionary, activations, Ytest)

SDR Evaluation using MU 

Reconstruction Step .... Done
Speech SDR = 3.094948486134423
Music SDR = 2.7694485410458274


SDR Evaluation using Warm Start ISTA 

Reconstruction Step .... Done
Speech SDR = 3.1056233447209425
Music SDR = 2.7780513908153286


SDR Evaluation using Unfolded ISTA 

Reconstruction Step .... Done
Speech SDR = 2.8690409717475474
Music SDR = 2.5497031564942993
