In [1]:
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from numpy.linalg import inv
from scipy import signal
from scipy.io.wavfile import read, write
from sklearn.decomposition import NMF, non_negative_factorization
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

from helpers import Reconstruct, Viz_Y, SMR, get_mixed_signal, SDR
warnings.simplefilter('ignore')

# 1. Read speech and music data:

In [23]:
# Speech: read the WAV file and keep column 0 of its first 100000 samples.
samplerate_s, data_speech = read("../data/male_vocal.wav")
speech = data_speech[:100000, 0]
length = len(speech) / samplerate_s
print(
    'Shape of the speech {}'.format(len(speech)),
    'Length : {:.2f}s'.format(length),
    'Sample rate : {}'.format(samplerate_s),
    sep='\n',
)

# Music: read the WAV file and cut column 0 to the same number of samples as the speech.
samplerate_m, data_music = read("../data/piano.wav")
music = data_music[:len(speech), 0]
length = len(music) / samplerate_m
print(
    'Shape of the music {}'.format(len(music)),
    'Length : {:.2f}s'.format(length),
    'Sample rate : {}'.format(samplerate_m),
    sep='\n',
)

# Pre-mixed recording: keep column 0 only.
samplerate_t, test = read("../data/mixed_signal.wav")
test = test[:, 0]


Shape of the speech 100000
Length : 2.27s
Sample rate : 44100
Shape of the music 100000
Length : 2.27s
Sample rate : 44100


In [None]:
# Mix speech and music with the project helper.
# NOTE(review): the third argument (10) is presumably the target
# speech-to-music ratio in dB -- confirm against helpers.get_mixed_signal.
mixed, _, _ = get_mixed_signal(speech, music, 10)

# Clip to the int16 range before casting: samples outside [-32768, 32767]
# would otherwise wrap around (or be undefined for floats) in astype().
# In-range samples are unaffected.
int16_limits = np.iinfo(np.int16)
mixed_int16 = np.clip(mixed, int16_limits.min, int16_limits.max).astype(np.int16)
write("../../mixed.wav", samplerate_s, mixed_int16)

## Apply STFT :

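### Parameters we can change:

* `window`: type of window function
* `nperseg`: length of each window (segment), in samples
* `noverlap`: number of samples shared by consecutive windows
* `nfft`: FFT length; must be at least the window length (segments are zero-padded when it is larger)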

In [None]:

# Build the mixture from the speech and music signals.
# NOTE: `speech` and `music` are rebound to the helper's outputs, so re-running
# this cell feeds the previously returned signals back into get_mixed_signal.
test, speech, music = get_mixed_signal(speech, music, 10)

# STFT settings shared by the three transforms below.
WINDOW = 'hamming'
WINDOW_SIZE = 480
OVERLAP = int(round(0.6 * WINDOW_SIZE))  # 288 samples; an integer sample count
NFFT = 512

# Magnitude spectrogram of the speech.
f, t, Y = signal.stft(speech, samplerate_s, window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)
Yabs_s = np.abs(Y)

# Magnitude spectrogram of the music.
f, t, Y = signal.stft(music, samplerate_m, window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)
Yabs_m = np.abs(Y)

# Magnitude spectrogram of the mixture. `test` was just rebuilt from `speech`
# and `music`, so it is analysed at the speech sample rate rather than at
# samplerate_t (the rate of the unrelated mixed_signal.wav file). Only the
# returned f/t axes depend on this value. `Y` keeps the complex mixture STFT.
f, t, Y = signal.stft(test, samplerate_s, window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)
Yabs_t = np.abs(Y)

print('Shape of spectrogram speech : {}'.format(Yabs_s.shape))
print('Shape of spectrogram music: {}'.format(Yabs_m.shape))

## If we apply the elbow method, the optimal number of components is 32

In [None]:
# Replace exact zeros in each magnitude spectrogram, in place, with a small
# positive value (0.0001).
np.putmask(Yabs_s, Yabs_s == 0, 0.0001)
np.putmask(Yabs_t, Yabs_t == 0, 0.0001)
np.putmask(Yabs_m, Yabs_m == 0, 0.0001)


In [21]:
def _magnitude_stft(x, samplerate, window, nperseg, noverlap, nfft, floor=0.0001):
    """Return (complex STFT, magnitude) of x; exact-zero magnitudes are set to floor."""
    _, _, spec = signal.stft(x, samplerate, window=window, nperseg=nperseg,
                             noverlap=noverlap, nfft=nfft)
    magnitude = np.abs(spec)
    magnitude[magnitude == 0] = floor
    return spec, magnitude


def _fit_bases(magnitude, n_components):
    """Fit an Itakura-Saito NMF on magnitude.T and return its components_ matrix."""
    # `alpha=0.0` is intentionally not passed: the keyword was removed in
    # scikit-learn 1.2 and no regularisation is the default in every version.
    model = NMF(n_components=n_components, init='random', beta_loss='itakura-saito',
                solver="mu", max_iter=200, random_state=0)
    model.fit(np.transpose(magnitude))
    return model.components_


def Evaluation(speech, music, Ns, Nm, SMR_db, samplerate,
               window='hamming', window_size=480, overlap_ratio=0.6, nfft=512):
    """Separate a speech/music mixture with NMF and score both estimates.

    Steps:
      1. Mix `speech` and `music` with ``get_mixed_signal(speech, music, SMR_db)``.
      2. Compute STFT magnitudes of the returned speech, music and mixture.
      3. Learn `Ns` speech bases and `Nm` music bases with Itakura-Saito NMF.
      4. Keep the stacked bases fixed and estimate the mixture activations.
      5. Call ``Reconstruct`` with the bases, activations and complex mixture
         STFT, invert its first two outputs with the ISTFT, and score them
         with ``SDR``.

    Parameters
    ----------
    speech, music : 1-D arrays
        Source signals passed to ``get_mixed_signal``.
    Ns, Nm : int
        Number of NMF components for speech and for music.
    SMR_db : number
        Third argument of ``get_mixed_signal`` (presumably the target
        speech-to-music ratio in dB -- confirm in helpers).
    samplerate : int
        Sampling rate handed to ``stft`` / ``istft``.
    window, window_size, overlap_ratio, nfft : optional
        STFT settings; the defaults reproduce the previously hard-coded
        values ('hamming', 480, 0.6 * window_size = 288 samples, 512).

    Returns
    -------
    (sdr_speech, sdr_music)
        Values returned by ``SDR`` for the speech and music estimates.
    """
    test, speech, music = get_mixed_signal(speech, music, SMR_db)

    stft_kwargs = dict(window=window,
                       nperseg=window_size,
                       noverlap=int(round(overlap_ratio * window_size)),
                       nfft=nfft)

    _, Yabs_s = _magnitude_stft(speech, samplerate, **stft_kwargs)
    _, Yabs_m = _magnitude_stft(music, samplerate, **stft_kwargs)
    # The complex mixture STFT is kept explicitly: Reconstruct receives it below.
    Y_mix, Yabs_t = _magnitude_stft(test, samplerate, **stft_kwargs)

    # Dictionary = speech bases stacked on top of music bases.
    B = np.vstack([_fit_bases(Yabs_s, Ns), _fit_bases(Yabs_m, Nm)])

    # Estimate the mixture activations with the dictionary held fixed.
    # update_H=False is what NMF.transform does internally, so this yields the
    # same activations as fitting a throw-away model and overwriting its
    # components_, without paying for that extra fit.
    G_test, _, _ = non_negative_factorization(
        np.transpose(Yabs_t), H=B, n_components=Ns + Nm, init='random',
        update_H=False, solver="mu", beta_loss='itakura-saito',
        max_iter=200, random_state=0)

    Sources, Masks = Reconstruct(B=np.transpose(B), G=np.transpose(G_test),
                                 Ns=Ns, Nm=Nm, Yabs=Y_mix, p=2)

    # Sources[0] is scored against speech and Sources[1] against music.
    _, speech_est = signal.istft(Sources[0], samplerate, **stft_kwargs)
    _, music_est = signal.istft(Sources[1], samplerate, **stft_kwargs)

    # Trim each estimate to at most the length of its reference signal.
    speech_est = speech_est[:speech.shape[0]]
    music_est = music_est[:music.shape[0]]

    sdr_speech = SDR(s_est=speech_est, s=speech)
    sdr_music = SDR(s_est=music_est, s=music)

    return sdr_speech, sdr_music
    

In [24]:
# SDR results: one row per SMR value, one column per component count.
SDR_MUSIC = []
SDR_SPEECH = []

# The loop variable is deliberately not called `SMR`: that name is the helper
# imported from `helpers`, and rebinding it to an int would break any later
# call to SMR(...) in this kernel.
for smr_db in tqdm([-5, 0, 5]):

    SDR_MUSIC_Ncomp = []
    SDR_SPEECH_Ncomp = []

    # The same number of components is used for speech and for music.
    for Ns in [2, 16, 32, 64, 128]:

        print('Evaluation SMR = {} Ns = Nm = {}'.format(smr_db, Ns))
        sdr_speech, sdr_music = Evaluation(speech=speech, music=music, Ns=Ns, Nm=Ns,
                                           SMR_db=smr_db, samplerate=samplerate_s)
        SDR_SPEECH_Ncomp.append(sdr_speech)
        SDR_MUSIC_Ncomp.append(sdr_music)
        print('Speech SDR = {} ... Music SDR = {}'.format(sdr_speech, sdr_music))

    SDR_MUSIC.append(SDR_MUSIC_Ncomp)
    SDR_SPEECH.append(SDR_SPEECH_Ncomp)



  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A

Evaluation SMR = -5 Ns = Nm = 2
SMR = -5.00
Speech SDR = 0.19582603420746408 ... Music SDR = 5.195826034207477
Evaluation SMR = -5 Ns = Nm = 16
SMR = -5.00
Speech SDR = 0.5226139877333418 ... Music SDR = 5.522613987733356
Evaluation SMR = -5 Ns = Nm = 32
SMR = -5.00
Speech SDR = 0.3215587291009167 ... Music SDR = 5.321558729100929
Evaluation SMR = -5 Ns = Nm = 64
SMR = -5.00
Speech SDR = 0.4577974691203778 ... Music SDR = 5.45779746912039
Evaluation SMR = -5 Ns = Nm = 128
SMR = -5.00



 33%|████████████████████████████                                                        | 1/3 [00:15<00:31, 15.96s/it][A

Speech SDR = 0.782115245989678 ... Music SDR = 5.782115245989691
Evaluation SMR = 0 Ns = Nm = 2
Speech SDR = 1.845279041389463 ... Music SDR = -1.7639280969679798
Evaluation SMR = 0 Ns = Nm = 16
Speech SDR = 5.181001282810592 ... Music SDR = 1.5717941115601848
Evaluation SMR = 0 Ns = Nm = 32
Speech SDR = 4.908447082149917 ... Music SDR = 1.2992399464111961
Evaluation SMR = 0 Ns = Nm = 64
Speech SDR = 3.0694553728398946 ... Music SDR = -0.5397517543050638
Evaluation SMR = 0 Ns = Nm = 128



 67%|████████████████████████████████████████████████████████                            | 2/3 [00:31<00:15, 15.96s/it][A

Speech SDR = 2.8672957298735593 ... Music SDR = -0.7419113672207176
Evaluation SMR = 5 Ns = Nm = 2


TypeError: unsupported operand type(s) for *: 'numpy.ndarray' and 'Tensor'

In [None]:
# Convert the nested SDR lists to arrays and store them as .npy files.
SDR_MUSIC_ARRAY = np.array(SDR_MUSIC)
SDR_SPEECH_ARRAY = np.array(SDR_SPEECH)

# np.save does not create missing directories, so make sure ./SDR exists.
os.makedirs('./SDR', exist_ok=True)
np.save('./SDR/Music_sdr', SDR_MUSIC_ARRAY)
np.save('./SDR/Speech_sdr', SDR_SPEECH_ARRAY)

# Estimation of the sources:

In [None]:
# NOTE(review): `Sources` is only assigned inside Evaluation() in the visible
# cells; it must exist at notebook scope before this cell can run -- confirm
# where it is meant to come from.
int16_info = np.iinfo(np.int16)

for i in range(2):

    # Back to the time domain with the same STFT settings used for analysis.
    _, xrec = signal.istft(Sources[i],
                           samplerate_t,
                           window=WINDOW,
                           nperseg=WINDOW_SIZE,
                           noverlap=OVERLAP,
                           nfft=NFFT)

    # Clip before the int16 cast so out-of-range samples saturate instead of
    # wrapping around; in-range samples are unaffected.
    xrec = np.clip(xrec, int16_info.min, int16_info.max)
    write("../../example"+str(i)+".wav", samplerate_t, xrec.astype(np.int16))