In [1]:
# Mount Google Drive
from google.colab import drive # import drive from google colab
 
ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)
 
drive.mount(ROOT)           # we mount the google drive at /content/drive
%cd "/content/drive/My Drive/PFE"
%pwd

/content/drive
Mounted at /content/drive
/content/drive/My Drive/PFE


'/content/drive/My Drive/PFE'

In [None]:
!apt install octave
!apt install liboctave-dev  # development files
!pip install oct2py

In [4]:


from oct2py import Oct2Py
oc = Oct2Py()
script = '''
  function [SDR,SIR,SAR,perm]=bss_eval_sources(se,s)
%%% Errors %%%
if nargin<2, error('Not enough input arguments.'); end
[nsrc,nsampl]=size(se);
[nsrc2,nsampl2]=size(s);
if nsrc2~=nsrc, error('The number of estimated sources and reference sources must be equal.'); end
if nsampl2~=nsampl, error('The estimated sources and reference sources must have the same duration.'); end

%%% Performance criteria %%%
% Computation of the criteria for all possible pair matches
SDR=zeros(nsrc,nsrc);
SIR=zeros(nsrc,nsrc);
SAR=zeros(nsrc,nsrc);
for jest=1:nsrc,
    for jtrue=1:nsrc,
        [s_true,e_spat,e_interf,e_artif]=bss_decomp_mtifilt(se(jest,:),s,jtrue,512);
        [SDR(jest,jtrue),SIR(jest,jtrue),SAR(jest,jtrue)]=bss_source_crit(s_true,e_spat,e_interf,e_artif);
    end
end
% Selection of the best ordering
perm=perms(1:nsrc);
nperm=size(perm,1);
meanSIR=zeros(nperm,1);
for p=1:nperm,
    meanSIR(p)=mean(SIR((0:nsrc-1)*nsrc+perm(p,:)));
end
[meanSIR,popt]=max(meanSIR);
perm=perm(popt,:).';
SDR=SDR((0:nsrc-1).'*nsrc+perm);
SIR=SIR((0:nsrc-1).'*nsrc+perm);
SAR=SAR((0:nsrc-1).'*nsrc+perm);

return;



function [s_true,e_spat,e_interf,e_artif]=bss_decomp_mtifilt(se,s,j,flen)

if nargin<4, error('Not enough input arguments.'); end
[nchan2,nsampl2]=size(se);
[nsrc,nsampl,nchan]=size(s);
if nchan2~=nchan, error('The number of channels of the true source images and the estimated source image must be equal.'); end
if nsampl2~=nsampl, error('The duration of the true source images and the estimated source image must be equal.'); end

%%% Decomposition %%%
% True source image
s_true=[reshape(s(j,:,:),nsampl,nchan).',zeros(nchan,flen-1)];
% Spatial (or filtering) distortion
e_spat=project(se,s(j,:,:),flen)-s_true;
% Interference
e_interf=project(se,s,flen)-s_true-e_spat;
% Artifacts
e_artif=[se,zeros(nchan,flen-1)]-s_true-e_spat-e_interf;

return;



function sproj=project(se,s,flen)

% SPROJ Least-squares projection of each channel of se on the subspace
% spanned by delayed versions of the channels of s, with delays between 0
% and flen-1

[nsrc,nsampl,nchan]=size(s);
s=reshape(permute(s,[3 1 2]),nchan*nsrc,nsampl);

%%% Computing coefficients of least squares problem via FFT %%%
% Zero padding and FFT of input data
s=[s,zeros(nchan*nsrc,flen-1)];
se=[se,zeros(nchan,flen-1)];
fftlen=2^nextpow2(nsampl+flen-1);
sf=fft(s,fftlen,2);
sef=fft(se,fftlen,2);
% Inner products between delayed versions of s
G=zeros(nchan*nsrc*flen);
for k1=0:nchan*nsrc-1,
    for k2=0:k1,
        ssf=sf(k1+1,:).*conj(sf(k2+1,:));
        ssf=real(ifft(ssf));
        ss=toeplitz(ssf([1 fftlen:-1:fftlen-flen+2]),ssf(1:flen));
        G(k1*flen+1:k1*flen+flen,k2*flen+1:k2*flen+flen)=ss;
        G(k2*flen+1:k2*flen+flen,k1*flen+1:k1*flen+flen)=ss.';
    end
end
% Inner products between se and delayed versions of s
D=zeros(nchan*nsrc*flen,nchan);
for k=0:nchan*nsrc-1,
    for i=1:nchan,
        ssef=sf(k+1,:).*conj(sef(i,:));
        ssef=real(ifft(ssef,[],2));
        D(k*flen+1:k*flen+flen,i)=ssef(:,[1 fftlen:-1:fftlen-flen+2]).';
    end
end

%%% Computing projection %%%
% Distortion filters
C=G\D;
C=reshape(C,flen,nchan*nsrc,nchan);
% Filtering
sproj=zeros(nchan,nsampl+flen-1);
for k=1:nchan*nsrc,
    for i=1:nchan,
        sproj(i,:)=sproj(i,:)+fftfilt(C(:,k,i).',s(k,:));
    end
end

return;



function [SDR,SIR,SAR]=bss_source_crit(s_true,e_spat,e_interf,e_artif)


if nargin<4, error('Not enough input arguments.'); end
[nchant,nsamplt]=size(s_true);
[nchans,nsampls]=size(e_spat);
[nchani,nsampli]=size(e_interf);
[nchana,nsampla]=size(e_artif);
if ~((nchant==nchans)&&(nchant==nchani)&&(nchant==nchana)), error('All the components must have the same number of channels.'); end
if ~((nsamplt==nsampls)&&(nsamplt==nsampli)&&(nsamplt==nsampla)), error('All the components must have the same duration.'); end

%%% Energy ratios %%%
s_filt=s_true+e_spat;
% SDR
SDR=10*log10(sum(sum(s_filt.^2))/sum(sum((e_interf+e_artif).^2)))
% SIR
SIR=10*log10(sum(sum(s_filt.^2))/sum(sum(e_interf.^2)))
% SA
SAR=10*log10(sum(sum((s_filt+e_interf).^2))/sum(sum(e_artif.^2)))
return;

         '''

with open("myScript.m","w+") as f:
    f.write(script)


In [5]:
import torch
import numpy as np
from scipy.io.wavfile import read, write
from scipy import signal
from scipy.signal import butter, lfilter, freqz
import matplotlib.pyplot as plt
import seaborn as sns


from Model import Net,Netsmall,AEModel
from boltzman_machine import RBM
from helpers import Reconstruct, Viz_Y


from torch.optim import LBFGS, Adam,SGD
from torch import nn
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
import warnings 

import tqdm
from helpers2 import *
warnings.simplefilter('ignore')

In [None]:
!ffmpeg -i  "/content/drive/My Drive/PFE/vocal_10.mp3" "vocal.wav"
!ffmpeg -i  "/content/drive/My Drive/PFE/piano_10.mp3" "music.wav"

In [184]:
def gain_params(s1,s2,y):
 
        l2_s1 = torch.norm(s1.float(),2, None)
                 
        l2_s2 = torch.norm(s2.float(),2, None)
               
        l2_Y = torch.norm(y.float(),2, None)
              
        # u is initialized by the l2-norm of the initial NMF source estimate sˆ1 
        # divided by the l2-norm of the mixed signal y
        u = torch.div(l2_s1, l2_Y) 
 
        # v is initialized by the same manner
        v = torch.div(l2_s2, l2_Y)
 
        return u,v
    
def gain_params_vec(s1,s2,y):
 
        l2_s1 = torch.norm(s1.float().t(),2, True)
                 
        l2_s2 = torch.norm(s2.float().t(),2, True)
               
        l2_Y = torch.norm(y.float().t(),2, True)
              
        # u is initialized by the l2-norm of the initial NMF source estimate sˆ1 
        # divided by the l2-norm of the mixed signal y
        u = torch.div(l2_s1, l2_Y) 
 
        # v is initialized by the same manner
        v = torch.div(l2_s2, l2_Y)
 
        return u,v

def gain_params_matrix(s1,s2,y):

  u = y / s1
  v = y / s2

  return u,v
    
    
def feed_(x):
    
    f = model(x)
    f1 = f[0] 
    f2 = f[1]
    return f1,f2
 
def energy_1(x_source1):
    
    f1,f2 = feed_(x_source1)
    e1 = (1 - f1 ).pow(2) + f2.pow(2)
    return e1 
 
def energy_2(x_source2):
 
    f1,f2 = feed_(x_source2)
    e2 = f1.pow(2) + (1 - f2 ).pow(2) 
    return e2
 



 
def E_err_vec(s1,s2,y,u,v):
  
    return torch.norm(u*s1 + v*s2 - y,'fro',None)
 
def E_err_vec_nouv(s1,s2,y):
  
    return torch.norm(s1 + s2 - y,'fro',None)
 
def nonneg_constraint_sum(s1,s2,u,v):
    
    Rs1=min([s1.min().item(),0])**2
    Rs2=min([s2.min().item(),0])**2
    Ru=min([u,0])**2
    Rv=min([v,0])**2
    
    return Rs1+Rs2+Ru+Rv
 
def reconstruct(est1,est2,u,v,Y,p):
    
    speech_num = (u * est1)**p
    music_num = (v * est2)**p
    den = (u * est1)**p + (v * est2)**p
    
    speech_est = (speech_num/den)*Y
    music_est = (music_num/den)*Y
    
    return speech_est,music_est
 
def Criteria(s1,s2,Yabs,u,v,i,lambd,beta) :
    
    # Feed forward and get energy 1 and 2
    e1 = energy_1(s1[:,i].float())
    e2 = energy_2(s2[:,i].float())
    
    # Get least square error :
    e_rr=E_err_vec(s1[:,i], s2[:,i], Yabs[:,i],u[i],v[i])
    
    # Non negative constraint
    R = nonneg_constraint_sum(s1[:,i], s2[:,i], u[i], v[i])
    
    # Compute total Loss :
    
    #print(f'E1 = {e1:.2f} ... E2 = {e2:.2f} ... Err = {e_rr:.2f} ... R = {R:.2f} \n')
    return e1 + e2 + lambd*e_rr + beta*R


def Criteria_nouv(s1,s2,Yabs,i,lambd,beta) :
    
    # Feed forward and get energy 1 and 2
    e1 = energy_1(s1[:,i].float())
    e2 = energy_2(s2[:,i].float())
    
    # Get least square error :
    e_rr=E_err_vec_nouv(s1[:,i], s2[:,i], Yabs[:,i])
    
    # Non negative constraint
    #R = nonneg_constraint_sum(s1[:,i], s2[:,i], u[:,i], v[:,i])
    
    # Compute total Loss :
    
    #print(f'E1 = {e1:.2f} ... E2 = {e2:.2f} ... Err = {e_rr:.2f} ... R = {R:.2f} \n')
    return e1 + e2 + lambd*e_rr

In [185]:
def eval(speech_estx,music_estx,ux,vx,scaler1,scaler2,p,WINDOW = 'hamming',WINDOW_SIZE=480,OVERLAP = 0.8 ,NFFT=512,save=False,ret=False):

  OVERLAP = OVERLAP * WINDOW_SIZE

  speech_estx = speech_estx.cpu().detach().numpy()
  music_estx = music_estx.cpu().detach().numpy()
  ux = ux.cpu().detach().numpy()
  vx = vx.cpu().detach().numpy()

  ux[ux==0]=0.00001
  vx[vx==0]=0.00001
  speech_estx = np.transpose(scaler1.inverse_transform(np.transpose(speech_estx)))
  music_estx = np.transpose(scaler2.inverse_transform(np.transpose(music_estx)))
  # Omit negative values
  speech_estx[speech_estx<0]=0.00001
  music_estx[music_estx<0]=0.00001

  speech_est_c,music_est_c=reconstruct(speech_estx,music_estx,ux,vx,Y,p)



  #Recover ISTFT
  _, speech_est_sig =  signal.istft(speech_est_c,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  _, music_est_sig =  signal.istft(music_est_c,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  speech_est_sig = speech_est_sig[:speech_test.shape[0]]
  music_est_sig = music_est_sig[:music_test.shape[0]]

  #sdr_speech = SDR(s_est=speech_est_sig,s=speech_test)
  #sdr_music = SDR(s_est=music_est_sig, s=music_test)
  oc.myScript(speech_est_sig,speech_test)
  oc.myScript(music_est_sig,music_test)       
       

      
  #print('SDR Speech = {:.3f} ... SDR Music = {:.3f}'.format(sdr_speech,sdr_music))

  if save :
    np.savetxt('Matlab_SDR/speech_orig.txt',speech_test)
    np.savetxt('Matlab_SDR/speech_est.txt',speech_est_sig)

  if ret :
    return speech_est_sig


In [186]:
def reconstruct(est1,est2,Y,p):
    
    speech_num = (est1)**p
    music_num = (est2)**p
    den = (est1)**p + (est2)**p
    
    speech_est = (speech_num/den)*Y
    music_est = (music_num/den)*Y
    
    return speech_est,music_est

def eval_nouv(speech_estx,music_estx,scaler1,scaler2,p,WINDOW = 'hamming',WINDOW_SIZE=480,OVERLAP = 0.8 ,NFFT=512,save=False,ret=False):

  OVERLAP = OVERLAP * WINDOW_SIZE

  speech_estx = speech_estx.cpu().detach().numpy()
  music_estx = music_estx.cpu().detach().numpy()

  speech_estx = np.transpose(scaler1.inverse_transform(np.transpose(speech_estx)))
  music_estx = np.transpose(scaler2.inverse_transform(np.transpose(music_estx)))
  # Omit negative values
  speech_estx[speech_estx<0]=0.00001
  music_estx[music_estx<0]=0.00001

  speech_est_c,music_est_c=reconstruct(speech_estx,music_estx,Y,p)



  #Recover ISTFT
  _, speech_est_sig =  signal.istft(speech_est_c,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  _, music_est_sig =  signal.istft(music_est_c,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  speech_est_sig = speech_est_sig[:speech_test.shape[0]]
  music_est_sig = music_est_sig[:music_test.shape[0]]

  #sdr_speech = SDR(s_est=speech_est_sig,s=speech_test)
  #sdr_music = SDR(s_est=music_est_sig, s=music_test)
  oc.myScript(speech_est_sig,speech_test)
  oc.myScript(music_est_sig,music_test)       
       

      
  #print('SDR Speech = {:.3f} ... SDR Music = {:.3f}'.format(sdr_speech,sdr_music))

  if save :
    np.savetxt('Matlab_SDR/speech_orig.txt',speech_test)
    np.savetxt('Matlab_SDR/speech_est.txt',speech_est_sig)

  if ret :
    return speech_est_sig


# 1.Data Preprocessing :

* Speech_train : Used to train NMF speech.
* music_train : used to train NMF music.
* Y_music and Y_speech : complex spectrogram pf train.
* Yabs_music and Yabs_speech : abs cpetrogram train.
* Ytest and Yabs_test : Spectrograms of test.
* test , speech_t, music_t : signals of test

In [8]:
start = 1 * 60 * 44100
end = 2 * 60 * 44100 


PATH = '/content/drive/My Drive/PFE/'
samplerate_s, data_speech = read(PATH+"DATA/Conversation.wav")
samplerate_m, data_music = read(PATH+"DATA/Bigmusic2.wav")
fs = 16000

rate = samplerate_s / fs




speech_train = data_speech[start : end,0]
music_train = data_music[start:end, 0]


speech_train = signal.resample(speech_train,int(speech_train.shape[0]/rate))
music_train = signal.resample(music_train,int(music_train.shape[0]/rate))
samplerate=int(samplerate_m/rate)
length=music_train.shape[0]/samplerate

print('Shape of the test {} ... Length : {:.2f}s ... Sample rate : {}'.format(music_train.shape[0],length,samplerate))
print('Downsampled rate = {}'.format(samplerate))


music_train = butter_lowpass_filter(music_train,5000,fs)
speech_train = butter_lowpass_filter(speech_train,5000,fs)



samplerate = fs
"""
STFT of SPEECH AND MUSIC
"""

WINDOW = 'hamming'
WINDOW_SIZE=480
OVERLAP = 0.8 * WINDOW_SIZE
NFFT=512

_,_,Y_speech= signal.stft(speech_train,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_speech=np.abs(Y_speech)

Yabs_speech[Yabs_speech==0]=0.00001

_,_,Y_music= signal.stft(music_train,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_music = np.abs(Y_music)

Yabs_music[Yabs_music==0]=0.00001

Shape of the test 960000 ... Length : 60.00s ... Sample rate : 16000
Downsampled rate = 16000


# 2. Test DATA :


In [29]:
fs = 16000
SMR_db = 0
N_test_utterances = 1

rate = samplerate_s / fs

# 5 min for AE 10 - 15
start = 20 * 60 * 44100
step = int(0.5 * 60 * 44100)

test_list_s = []
test_list_m = []
test_list = []
spectres = []
spectres_abs = []
spectres_clean_speech = []
spectres_clean_speech_abs = []


for i in range(N_test_utterances):

  # Clean Speech 
  test_s = data_speech[start+i*step:start+(i+1)*step,0]
  test_s = signal.resample(test_s,int(test_s.shape[0]/rate))
  test_s = butter_lowpass_filter(test_s,5000,fs)

  # Clean Music
  test_m = data_music[start+i*step:start+(i+1)*step,0]
  test_m = signal.resample(test_m,int(test_m.shape[0]/rate))
  test_m = butter_lowpass_filter(test_m,5000,fs)

  # Get mixed signal
  test,speech_test,music_test=get_mixed_signal(test_s,test_m,SMR_db)


  test_list.append(test)
  test_list_s.append(speech_test)
  test_list_m.append(music_test)

  # STFT Mixed signal
  _,_,Ytest= signal.stft(test,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
  Yabs_test=np.abs(Ytest)
  Yabs_test[Yabs_test==0]=0.00001

  spectres.append(Ytest)
  spectres_abs.append(Yabs_test)

  # STFT Clean SPeech
  _,_,Ytest= signal.stft(speech_test,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
  Yabs_test=np.abs(Ytest)
  Yabs_test[Yabs_test==0]=0.00001

  spectres_clean_speech.append(Ytest)
  spectres_clean_speech_abs.append(Yabs_test)


SMR = -0.00


In [30]:
np.save("Signals/speech_list.npy",np.array(test_list_s))
np.save("Signals/music_list.npy",np.array(test_list_m))
np.save("Signals/test_list.npy",np.array(test_list))
np.save("Signals/mix_spectres.npy",np.array(spectres))
np.save("Signals/mix_spectres_abs.npy",np.array(spectres_abs))
np.save("Signals/clean_speech_spectres.npy",np.array(spectres_clean_speech))
np.save("Signals/clean_speech_spectres_abs.npy",np.array(spectres_clean_speech_abs))

# 3. Train NMF Speech and Music :

In [None]:
Ns = 16

model = NMF(n_components=Ns, init='nndsvd',alpha=0.0,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=7)
model.fit(np.transpose(Yabs_speech))
Ws = np.transpose(model.components_)

Nm = 16

model = NMF(n_components=Nm, init='nndsvd',alpha=0.0,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=7)
model.fit(np.transpose(Yabs_music))
Wm = np.transpose(model.components_)

B = np.hstack([Ws,Wm])
#np.save('B16.npy',B)

# 3. Apply Test NMF :

In [215]:
B = np.load('B.npy')
#scaler = MinMaxScaler()
#B = scaler.fit_transform(B)

p = 3
Ns=8
Nm=8


In [216]:
sdr_speech_list=[]
sdr_music_list=[]

speech_est_list=[]
music_est_list=[]
for i in range(N_test_utterances):

  model_test = NMF(n_components=Ns+Nm, init='nndsvd',alpha=100,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=7)
  model_test.fit(np.transpose(spectres_abs[i]))

  model_test.components_ = np.transpose(B)
  G_test = model_test.transform(np.transpose(spectres_abs[i]))

  print('Testing NMF .... Done')
  Sources,Masks=Reconstruct(B=B,G=np.transpose(G_test),Ns=Ns,Nm=Nm,Yabs=spectres[i],p=p)

  print('Reconstruction Step .... Done')
  speech_est = Sources[0]
  music_est = Sources[1]

  speech_est_list.append(Sources[0])
  music_est_list.append(Sources[1])

  _, speech_est =  signal.istft(speech_est,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  _, music_est =  signal.istft(music_est,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

  speech_est = speech_est[:test_list_s[i].shape[0]]
  music_est = music_est[:test_list_m[i].shape[0]]

  sdr_speech = SDR(s_est=speech_est,s=test_list_s[i])
  sdr_music = SDR(s_est=music_est, s=test_list_m[i])

  sdr_speech_list.append(sdr_speech)
  sdr_music_list.append(sdr_music)
  print('SDR Speech = {:.2f} ... SDR Music = {:.2f}'.format(sdr_speech,sdr_music))
  oc.myScript(speech_est,test_list_s[i])



Testing NMF .... Done
Reconstruction Step .... Done
SDR Speech = 2.50 ... SDR Music = 2.50
SDR =  4.3288
SIR = Inf
SAR =  4.3288


In [33]:
   ## Save the lists of spectrogram estimates of tests :

np.save("Signals/speech_ests.npy",np.array(speech_est_list))
np.save("Signals/music_ests.npy",np.array(music_est_list))

In [34]:
np.savetxt("Matlab_SDR/nmf_est.txt",np.array(speech_est))


In [None]:
np.save("Matlab_SDR/nmf_est.npy",np.array(speech_est))


## 3.2. Bootstraping to compute SDR:

In [None]:
N_boot = 1000
sdr_speech_means=[]
sdr_music_means=[]
for i in range(N_boot):

  sdr_speech_means.append(np.random.choice(np.array(sdr_speech_list),size=len(sdr_speech_list),replace=True).mean())
  sdr_music_means.append(np.random.choice(np.array(sdr_music_list),size=len(sdr_music_list),replace=True).mean())

print('Bootstrap SDR speech = {} ... SDR Music = {}'.format(np.array(sdr_speech_means).mean(),np.array(sdr_music_means).mean()))

Bootstrap SDR speech = 2.6117202387173104 ... SDR Music = 2.612618558798549


# 4. Train the DNN :

In [35]:
class AEModel(nn.Module):

    def __init__(self,d):
        super(AEModel, self).__init__()

        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()
        self.fc1 = nn.Linear(d, 512, bias=False)  # d is dimension of the input.
        self.fc2 = nn.Linear(512, 128, bias=False)
        self.fc3 = nn.Linear(128, 64, bias=False)
        self.fc4 = nn.Linear(64, 128, bias=False)
        self.fc5 = nn.Linear(128, 512, bias=False)
        self.fc6 = nn.Linear(512, d, bias=False)


    def forward(self, x1):

        x1 = self.relu(self.fc1(x1))
        x1 = self.relu(self.fc2(x1))
        x1 = self.relu(self.fc3(x1))
        x1 = self.relu(self.fc4(x1))
        x1 = self.relu(self.fc5(x1))
        x1 = self.fc6(x1)

        return x1

def AEcost(x,y):

  return torch.norm(x - y,'fro',None)


In [36]:
class Net(nn.Module):

    def __init__(self,d):
        super(Net, self).__init__()

        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()
        self.fc1 = nn.Linear(d, 512, bias=False)  # d is dimension of the input.
        self.fc2 = nn.Linear(512, 128, bias=False)
        self.fc3 = nn.Linear(128, 64, bias=False)
        self.fc4 = nn.Linear(64, 2, bias=False)


    def forward(self, x1):

        x1 = self.relu(self.fc1(x1))
        x1 = self.relu(self.fc2(x1))
        x1 = self.relu(self.fc3(x1))
        x1 = self.sigmoid(self.fc4(x1))

        return x1


## 4.1. Compute initial estimates of U and V :

In [205]:
N_uterance = 0
s1 = abs(np.load("Signals/speech_ests.npy"))[N_uterance]
s2 = abs(np.load("Signals/music_ests.npy"))[N_uterance]
Yabs = np.load("Signals/mix_spectres_abs.npy")[N_uterance]
Y = np.load("Signals/mix_spectres.npy")[N_uterance]
speech_test = np.load("Signals/speech_list.npy")[N_uterance]
music_test = np.load("Signals/music_list.npy")[N_uterance]
Y_clean = np.load("Signals/clean_speech_spectres.npy")[N_uterance]
Y_clean_abs = np.load("Signals/clean_speech_spectres_abs.npy")[N_uterance]
"""
u,v = gain_params_matrix(s1, s2, Yabs)

u = torch.tensor(u)
v = torch.tensor(v)
u0 = u ; v0 = v


u00 = u0.detach().numpy()
v00 = v0.detach().numpy()
"""
scaler = MinMaxScaler()
scaler.fit(np.transpose(Yabs))

Yabs = np.transpose(scaler.transform(np.transpose(Yabs)))
s1 = np.transpose(scaler.transform(np.transpose(s1)))
s2 = np.transpose(scaler.transform(np.transpose(s2)))

s1 = torch.tensor(s1)
s2 = torch.tensor(s2)
Yabs = torch.tensor(Yabs)





## 4.3. Build Model :

In [206]:
AE = AEModel(257)
AE.load_state_dict(torch.load('./AE-DNN-weights-Big3.pt',map_location=torch.device('cpu')))

model = Net(257)

In [207]:
model.fc1.weight = AE.fc1.weight
model.fc2.weight = AE.fc2.weight
model.fc3.weight = AE.fc3.weight
#model.fc3.weight = AE.fc3.weight


In [208]:

model.fc1.weight.requires_grad=False
model.fc2.weight.requires_grad=False
model.fc3.weight.requires_grad=False
#model.fc4.weight.requires_grad=False


In [209]:
print(torch.cuda.is_available())

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

s1 = s1.to(device)
s2 = s2.to(device)
u = u.to(device)
v = v.to(device)
Yabs = Yabs.to(device)

s1.requires_grad = True
s2.requires_grad = True
u.requires_grad = True
v.requires_grad = True
Yabs.requires_grad = False

model = model.to(device)


True
cuda:0


## 4.5. Training Loop :

In [25]:
samplerate = 16000

In [160]:
params = list(model.parameters())+[u]+[v]
optimizer = SGD(params,lr=0.0001)

In [None]:

model.train()
N_epochs = 20
for e in range(N_epochs):


    loss_e=0.0
    for i in tqdm(range(s1.shape[1]),leave=True,position=0):

            optimizer.zero_grad()
            loss = Criteria(s1,s2,Yabs,u,v,i,25,1)
            loss.backward()
            # fill zeros into the first row of grad
            u.grad.data[:i].fill_(0)
            u.grad.data[i+1:].fill_(0)  
            v.grad.data[:i].fill_(0)
            v.grad.data[i+1:].fill_(0)
            s1.grad.data[:,:i].fill_(0)
            s1.grad.data[:,i+1:].fill_(0)
            s2.grad.data[:,:i].fill_(0)
            s2.grad.data[:,i+1:].fill_(0)



            optimizer.step()
            loss_e+=loss
    eval(s1,s2,u,v,scaler,scaler,p=1)
    print('Epoch {} , Loss = {:.3f}'.format(e,loss_e/s1.shape[1]))

In [53]:
eval(s1,s2,u,v,scaler,scaler,save=False,p=2)  

SDR =  4.0098
SIR = Inf
SAR =  4.0098
SDR =  3.4012
SIR = Inf
SAR =  3.4012


In [117]:
params = list(model.parameters())+[u]+[v]
optimizer = Adam(params,lr=0.01)

In [None]:
model.train()
N_epochs = 20
bs = 50
N_batches = int(s1.shape[1]/bs)

for e in range(N_epochs):
    
    loss_e=0.0
    for i in tqdm(range(N_batches),leave=True,position=0):

            loss_batch = 0.0
            optimizer.zero_grad()
            for b in range(bs):
           
              loss_batch += Criteria(s1,s2,Yabs,u,v,i*bs+b,60,1)

            loss_batch/=bs
            #print(f'Batch N°{i} ... Loss = {loss_batch/bs:.3f}')
            loss_batch.backward()
            # fill zeros into the first row of grad
            u.grad.data[:i*bs].fill_(0)
            u.grad.data[(i+1)*bs:].fill_(0) 


            v.grad.data[:i*bs].fill_(0)
            v.grad.data[(i+1)*bs:].fill_(0)

            s1.grad.data[:,:i*bs].fill_(0)
            s1.grad.data[:,(i+1)*bs:].fill_(0)

            s2.grad.data[:,:i*bs].fill_(0)
            s2.grad.data[:,(i+1)*bs:].fill_(0)

            optimizer.step()
            loss_e+=loss_batch

    eval(s1,s2,u,v,scaler,scaler,p=1)       
    print('Epoch {} , Loss = {:.3f}'.format(e,loss_e/N_batches))

In [None]:

#model_path='./Model/DNN-BIG.pt'
#torch.save(model.state_dict(), model_path)
"""
np.save('./Model/s1_1',s1.cpu().detach().numpy())
np.save('./Model/s2_1',s2.cpu().detach().numpy())
np.save('./Model/u_1',u.cpu().detach().numpy())
np.save('./Model/v_1',v.cpu().detach().numpy())


In [125]:
u*s1 + v*s2 - Yabs

tensor([[2.7528e-02, 7.6878e-02, 1.0964e-01,  ..., 2.5380e-01, 7.3391e-01,
         1.0000e+00],
        [2.8483e-02, 5.4513e-02, 1.0983e-01,  ..., 3.0935e-01, 8.0447e-01,
         9.0042e-01],
        [3.0174e-02, 3.8446e-02, 4.1283e-02,  ..., 2.0930e-01, 2.9827e-01,
         2.8237e-01],
        ...,
        [1.8095e-02, 0.0000e+00, 4.4440e-02,  ..., 1.9368e-01, 6.2700e-01,
         9.3658e-01],
        [1.7288e-02, 1.0353e-04, 4.2306e-02,  ..., 2.8934e-01, 5.9627e-01,
         8.4545e-01],
        [1.7779e-02, 8.9594e-04, 4.2357e-02,  ..., 3.2266e-01, 5.8065e-01,
         8.7627e-01]], device='cuda:0', dtype=torch.float64,
       grad_fn=<SubBackward0>)

## 4.6. Training Loop for u , v matrix :

In [161]:
params = list(model.parameters())+[u]+[v]
optimizer = Adam(params,lr=0.0001)

In [None]:
model.train()
N_epochs = 20
bs = 2
N_batches = int(s1.shape[1]/bs)

for e in range(N_epochs):
    
    loss_e=0.0
    for i in tqdm(range(N_batches),leave=True,position=0):

            loss_batch = 0.0
            optimizer.zero_grad()
            for b in range(bs):
           
              loss_batch += Criteria_matrix(s1,s2,Yabs,u,v,i*bs+b,1,1)

            loss_batch/=bs
            loss_batch.backward()

            # fill zeros into the first row of grad
            u.grad.data[:,:i*bs].fill_(0)
            u.grad.data[:,(i+1)*bs:].fill_(0) 


            v.grad.data[:,:i*bs].fill_(0)
            v.grad.data[:,(i+1)*bs:].fill_(0)

            s1.grad.data[:,:i*bs].fill_(0)
            s1.grad.data[:,(i+1)*bs:].fill_(0)

            s2.grad.data[:,:i*bs].fill_(0)
            s2.grad.data[:,(i+1)*bs:].fill_(0)

            optimizer.step()
            loss_e+=loss_batch

    eval(s1,s2,u,v,scaler,scaler,p=1)       
    print('Epoch {} , Loss = {:.3f}'.format(e,loss_e/N_batches))

## 4.7. Training Loop No u v :

In [210]:
params = list(model.parameters())+[s1]+[s2]
optimizer = Adam(params,lr=0.001)

In [218]:

model.train()
N_epochs = 20
bs = 10
N_batches = int(s1.shape[1]/bs)

for e in range(N_epochs):
    
    loss_e=0.0
    for i in tqdm(range(N_batches),leave=True,position=0):

            loss_batch = 0.0
            optimizer.zero_grad()
            for b in range(bs):
           
              loss_batch += Criteria_nouv(s1,s2,Yabs,i*bs+b,10,1)

            loss_batch/=bs
            loss_batch.backward()

            # fill zeros into the first row of grad


            s1.grad.data[:,:i*bs].fill_(0)
            s1.grad.data[:,(i+1)*bs:].fill_(0)

            s2.grad.data[:,:i*bs].fill_(0)
            s2.grad.data[:,(i+1)*bs:].fill_(0)

            optimizer.step()
            loss_e+=loss_batch

    eval_nouv(s1,s2,scaler,scaler,p=1)       
    print('Epoch {} , Loss = {:.3f}'.format(e,loss_e/N_batches))

100%|██████████| 500/500 [00:10<00:00, 47.05it/s]


SDR =  4.5072
SIR = Inf
SAR =  4.5072
SDR =  4.0809
SIR = Inf
SAR =  4.0809


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 0 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.06it/s]


SDR =  4.5382
SIR = Inf
SAR =  4.5382
SDR =  4.0648
SIR = Inf
SAR =  4.0648


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1 , Loss = 3.228


100%|██████████| 500/500 [00:10<00:00, 47.10it/s]


SDR =  4.5073
SIR = Inf
SAR =  4.5073
SDR =  4.0810
SIR = Inf
SAR =  4.0810


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 2 , Loss = 3.160


100%|██████████| 500/500 [00:10<00:00, 47.12it/s]


SDR =  4.5383
SIR = Inf
SAR =  4.5383
SDR =  4.0649
SIR = Inf
SAR =  4.0649


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 3 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.10it/s]


SDR =  4.5074
SIR = Inf
SAR =  4.5074
SDR =  4.0810
SIR = Inf
SAR =  4.0810


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 4 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.09it/s]


SDR =  4.5384
SIR = Inf
SAR =  4.5384
SDR =  4.0649
SIR = Inf
SAR =  4.0649


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 5 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.06it/s]


SDR =  4.5074
SIR = Inf
SAR =  4.5074
SDR =  4.0810
SIR = Inf
SAR =  4.0810


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 6 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.15it/s]


SDR =  4.5384
SIR = Inf
SAR =  4.5384
SDR =  4.0649
SIR = Inf
SAR =  4.0649


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 7 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.11it/s]


SDR =  4.5075
SIR = Inf
SAR =  4.5075
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 8 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.12it/s]


SDR =  4.5384
SIR = Inf
SAR =  4.5384
SDR =  4.0649
SIR = Inf
SAR =  4.0649


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 9 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.06it/s]


SDR =  4.5075
SIR = Inf
SAR =  4.5075
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 10 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.11it/s]


SDR =  4.5385
SIR = Inf
SAR =  4.5385
SDR =  4.0650
SIR = Inf
SAR =  4.0650


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 11 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.14it/s]


SDR =  4.5075
SIR = Inf
SAR =  4.5075
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 12 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.10it/s]


SDR =  4.5385
SIR = Inf
SAR =  4.5385
SDR =  4.0650
SIR = Inf
SAR =  4.0650


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 13 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.07it/s]


SDR =  4.5075
SIR = Inf
SAR =  4.5075
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 14 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.10it/s]


SDR =  4.5385
SIR = Inf
SAR =  4.5385
SDR =  4.0650
SIR = Inf
SAR =  4.0650


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 15 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.13it/s]


SDR =  4.5076
SIR = Inf
SAR =  4.5076
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 16 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.13it/s]


SDR =  4.5386
SIR = Inf
SAR =  4.5386
SDR =  4.0650
SIR = Inf
SAR =  4.0650


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 17 , Loss = 3.229


100%|██████████| 500/500 [00:10<00:00, 47.00it/s]


SDR =  4.5075
SIR = Inf
SAR =  4.5075
SDR =  4.0811
SIR = Inf
SAR =  4.0811


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 18 , Loss = 3.161


100%|██████████| 500/500 [00:10<00:00, 47.13it/s]


SDR =  4.5386
SIR = Inf
SAR =  4.5386
SDR =  4.0650
SIR = Inf
SAR =  4.0650
Epoch 19 , Loss = 3.229


In [223]:
eval_nouv(s1,s2,scaler,scaler,p=1)       


SDR =  4.5386
SIR = Inf
SAR =  4.5386
SDR =  4.0650
SIR = Inf
SAR =  4.0650


In [None]:
# In absolute values :

PATH = '/content/drive/My Drive/PFE/'

speech_estx = np.load(PATH+'s1.npy')
music_estx = np.load(PATH+'s2.npy')

ux = np.load(PATH+'u.npy')
vx= np.load(PATH+'v.npy')
speech_estx = scaler_s1.inverse_transform(speech_estx)
music_estx = scaler_s2.inverse_transform(music_estx)


# Omit negative values
speech_estx[speech_estx<0]=0.00001
music_estx[music_estx<0]=0.00001

speech_est_c,music_est_c=reconstruct(speech_estx,music_estx,ux,vx,Y,1)



#Recover ISTFT
_, speech_est_sig =  signal.istft(speech_est_c,
                      samplerate,
                      window = WINDOW,
                      nperseg=WINDOW_SIZE,
                      noverlap=OVERLAP,
                      nfft = NFFT)

_, music_est_sig =  signal.istft(music_est_c,
                      samplerate,
                      window = WINDOW,
                      nperseg=WINDOW_SIZE,
                      noverlap=OVERLAP,
                      nfft = NFFT)


speech_est_sig = speech_est_sig[:speech_test.shape[0]]
music_est_sig = music_est_sig[:music_test.shape[0]]

sdr_speech = SDR(s_est=speech_est_sig,s=speech_test)
sdr_music = SDR(s_est=music_est_sig, s=music_test)
    
print('SDR Speech = {:.2f} ... SDR Music = {:.2f}'.format(sdr_speech,sdr_music))

#write(PATH+'tests/SpeechDNN.wav", samplerate, speech_est_sig.astype(np.int16))
#write(PATH+'tests/MusicDNN.wav", samplerate, music_est_sig.astype(np.int16))

SDR Speech = 2.64 ... SDR Music = 2.64


# AutoEncoders :

* Input : S1 Estimate of speech.
* Output : Y_clean - M * S1.

In [None]:
class AEModel(nn.Module):

    def __init__(self,d):
        super(AEModel, self).__init__()

        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()
        self.fc1 = nn.Linear(d, 128, bias=True)  # d is dimension of the input.
        self.fc2 = nn.Linear(128, 64, bias=True)
        self.fc3 = nn.Linear(64, 128, bias=True)
        self.fc4 = nn.Linear(128, d, bias=True)

        self.fc1_ = nn.Linear(d, 128, bias=True)  # d is dimension of the input.
        self.fc2_ = nn.Linear(128, 64, bias=True)
        self.fc3_ = nn.Linear(64, 128, bias=True)
        self.fc4_ = nn.Linear(128, d, bias=True)
       

    def forward(self, x1,x2):

        x1 = self.relu(x1)
        x1 = self.relu(self.fc1(x1))
        x1 = self.relu(self.fc2(x1))
        x1 = self.relu(self.fc3(x1))
        x1 = self.relu(self.fc4(x1))

        x2 = self.relu(x2)
        x2 = self.relu(self.fc1_(x2))
        x2 = self.relu(self.fc2_(x2))
        x2 = self.relu(self.fc3_(x2))
        x2 = self.relu(self.fc4_(x2))



        return x1,x2

In [None]:

def cost(Yabs,s1,s2,u,v):

  loss1 = torch.norm(u*s1 + v*s2 - Yabs,'fro',None)/257
  loss2 = torch.norm(s1 - s2 ,'fro',None)

  #print(f'Loss1 = {loss1.item()} ... Loss2 = {loss2.item()}')
  return loss1



In [None]:
params = list(ModelAE.parameters())+[u]+[v]
optimizer = SGD(params,lr=0.001)

In [None]:
ModelAE.train()
N_epochs = 20

for e in range(N_epochs):

  loss_e=0.0

  for i in tqdm(range(s1.shape[1]),leave=True,position=0):

    a,b = ModelAE(s1[:,i],s2[:,i])

    optimizer.zero_grad()
    
    loss1 = cost(Yabs[:,i],a,b,u[i],v[i])
    loss1.backward()

    optimizer.step()

    u.grad.data[:i].fill_(0)
    u.grad.data[i+1:].fill_(0)  
    v.grad.data[:i].fill_(0)
    v.grad.data[i+1:].fill_(0)


    loss_e+= loss1.item()

  print("Epoch {} ... Loss = {}".format(e,loss_e))



In [None]:

ss1=[]
ss2=[]

for i in tqdm(range(s1.shape[1]),leave=True,position=0):

  a,b = ModelAE(s1[:,i],s2[:,i])
  ss1.append(a.cpu().detach().numpy())
  ss2.append(b.cpu().detach().numpy())

100%|██████████| 1701/1701 [00:01<00:00, 970.04it/s]


In [None]:
s1_est=np.transpose(np.array(ss1))
s2_est=np.transpose(np.array(ss2))

s1_est = (u.cpu().detach().numpy()*s1_est)/(u.cpu().detach().numpy()*s1_est+v.cpu().detach().numpy()*s2_est+0.001)

In [None]:


s1_est = s1_est * Y / Yabs.cpu().detach().numpy() 
s1_est[s1_est==0]=0.0001



In [None]:

  #Recover ISTFT
_, speech_est_sig =  signal.istft(s1_est,
                        samplerate,
                        window = WINDOW,
                        nperseg=WINDOW_SIZE,
                        noverlap=OVERLAP,
                        nfft = NFFT)

sdr_speech = SDR(s_est=speech_est_sig,s=speech_test)
print(sdr_speech)

0.00892209308257177


# AE FOr the DNN


In [None]:
class AEModel(nn.Module):

    def __init__(self,d):
        super(AEModel, self).__init__()

        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()
        self.fc1 = nn.Linear(d, 512, bias=False)  # d is dimension of the input.
        self.fc2 = nn.Linear(512, 128, bias=False)
        self.fc3 = nn.Linear(128, 64, bias=False)
        self.fc4 = nn.Linear(64, 128, bias=False)
        self.fc5 = nn.Linear(128, 512, bias=False)
        self.fc6 = nn.Linear(512, d, bias=False)


    def forward(self, x1):

        x1 = self.relu(self.fc1(x1))
        x1 = self.relu(self.fc2(x1))
        x1 = self.relu(self.fc3(x1))
        x1 = self.relu(self.fc4(x1))
        x1 = self.relu(self.fc5(x1))
        x1 = self.fc6(x1)

        return x1

def AEcost(x,y):

  return torch.norm(x - y,'fro',None)


In [None]:
ModelAE = AEModel(257)
ModelAE.load_state_dict(torch.load('./AE-DNN-weights-Big.pt',map_location=torch.device('cpu')))

N_uterance = 0
s1 = abs(np.load("Signals/speech_ests.npy"))[N_uterance]
s2 = abs(np.load("Signals/music_ests.npy"))[N_uterance]
Yabs = np.load("Signals/mix_spectres_abs.npy")[N_uterance]

scaler = MinMaxScaler()
scaler.fit(np.transpose(Yabs))

s1 = np.transpose(scaler.transform(np.transpose(s1)))
s2 = np.transpose(scaler.transform(np.transpose(s2)))


s1 = torch.tensor(s1).float()
s2 = torch.tensor(s2).float()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

s1 = s1.to(device)
s2 = s2.to(device)



ModelAE = ModelAE.to(device)


In [None]:
params = list(ModelAE.parameters())
optimizer = SGD(params,lr=0.002)

In [None]:
ModelAE.train()
N_epochs = 10

for e in range(N_epochs):

  loss_e=0.0

  i1 = 0
  i2 = 0

  for i in tqdm(range(2*s1.shape[1]),leave=True,position=0):

    optimizer.zero_grad()

    if i%2 == 0 :

      loss = AEcost(ModelAE(s1[:,i1]),s1[:,i1])
      i1 += 1

    if i%2 == 1:

      loss = AEcost(ModelAE(s1[:,i2]),s1[:,i2])
      i2 += 1


    loss.backward()
    optimizer.step()

    loss_e += loss.item()
  print('Epoch {} ... Loss = {:.2f}'.format(e,loss_e))


100%|██████████| 100002/100002 [02:53<00:00, 577.87it/s]
  0%|          | 59/100002 [00:00<02:49, 588.01it/s]

Epoch 0 ... Loss = 20155.65


100%|██████████| 100002/100002 [02:51<00:00, 583.26it/s]
  0%|          | 59/100002 [00:00<02:51, 583.28it/s]

Epoch 1 ... Loss = 19772.15


100%|██████████| 100002/100002 [02:52<00:00, 580.86it/s]
  0%|          | 55/100002 [00:00<03:05, 539.84it/s]

Epoch 2 ... Loss = 19410.54


100%|██████████| 100002/100002 [02:54<00:00, 573.90it/s]
  0%|          | 65/100002 [00:00<02:35, 643.60it/s]

Epoch 3 ... Loss = 19050.05


100%|██████████| 100002/100002 [02:53<00:00, 574.85it/s]
  0%|          | 55/100002 [00:00<03:01, 549.74it/s]

Epoch 4 ... Loss = 18661.71


100%|██████████| 100002/100002 [02:57<00:00, 563.45it/s]
  0%|          | 60/100002 [00:00<02:47, 597.02it/s]

Epoch 5 ... Loss = 18278.52


100%|██████████| 100002/100002 [03:00<00:00, 553.95it/s]
  0%|          | 54/100002 [00:00<03:05, 537.43it/s]

Epoch 6 ... Loss = 17916.12


100%|██████████| 100002/100002 [02:58<00:00, 558.91it/s]
  0%|          | 63/100002 [00:00<02:39, 628.18it/s]

Epoch 7 ... Loss = 17561.88


100%|██████████| 100002/100002 [02:55<00:00, 570.10it/s]
  0%|          | 65/100002 [00:00<02:35, 642.57it/s]

Epoch 8 ... Loss = 17216.21


100%|██████████| 100002/100002 [02:51<00:00, 583.86it/s]

Epoch 9 ... Loss = 16888.40





In [None]:
torch.save(ModelAE.state_dict(), './AE-DNN-weights-Big3.pt')

2.2619

# Stacked Auto Encoder :

In [None]:
class AEModel(nn.Module):

    def __init__(self,d):
        super(AEModel, self).__init__()
        self.sigmoid = torch.nn.Sigmoid()
        self.relu = torch.nn.ReLU()


        self.fc1 = nn.LSTM(input_size=d, hidden_size=128, bias=True)  # d is dimension of the input.
        self.fc2 = nn.LSTM(input_size=128, hidden_size=64, bias=True)
        self.fc3 = nn.LSTM(input_size=64, hidden_size=128, bias=True)
        self.fc4 = nn.LSTM(input_size=128, hidden_size=d, bias=True)
       

    def forward(self, x):

        h_t1,_ = self.fc1(x)
        h_t2,_ = self.fc2(h_t1)
        h_t3,_ = self.fc3(h_t2)
        y,_ = self.fc4(h_t3)

        return y

In [None]:
def create_sequences(data, seq_length,overlap):
    xs = []

    for i in range(len(data)-seq_length-overlap):
        x = data[i:(i+seq_length)]
        xs.append(x)

    return np.array(xs)


In [None]:
N_uterance = 0
s1 = abs(np.load("speech_ests.npy"))[N_uterance][:,:-1]
s2 = abs(np.load("music_ests.npy"))[N_uterance][:,:-1]
Yabs = np.load("mix_spectres_abs.npy")[N_uterance][:,:-1]
Y = np.load("mix_spectres.npy")[N_uterance][:,:-1]
speech_test = np.load("speech_list.npy")[N_uterance]
music_test = np.load("music_list.npy")[N_uterance]
Y_clean = np.load("clean_speech_spectres.npy")[N_uterance]
Y_clean_abs = np.load("clean_speech_spectres_abs.npy")[N_uterance]

In [None]:
seq_len = 20
N_seq = int(s1.shape[1]/seq_len)

s_seq=s.reshape((seq_len,N_seq,514))
print(f'Shape of s = {s_seq.shape}')
s_seq = torch.tensor(s_seq).float()

Shape of s = (20, 500, 514)
