In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from numpy.linalg import inv
from helpers import Reconstruct, Viz_Y,SMR,get_mixed_signal,SDR
import seaborn as sns
import warnings
import torch
import math
from tqdm import tqdm
import librosa
warnings.simplefilter('ignore')

# 1. Read speech and music data:

In [16]:
# Portion of each recording that is kept, in seconds from the start of the file.
SEGMENT_START_S = 10
SEGMENT_END_S = 7 * 60


def load_segment(path, start_s=SEGMENT_START_S, end_s=SEGMENT_END_S):
    """Read a WAV file and cut the [start_s, end_s) window out of its first channel.

    The window bounds are converted to sample indices with the sample rate
    stored in the file, so the cut has the requested duration whatever the
    rate is (the bounds used to be hard-coded for 44100 Hz).

    Parameters
    ----------
    path : str
        Location of the WAV file.
    start_s, end_s : int
        Window bounds in seconds.

    Returns
    -------
    samplerate : int
        Sample rate reported by the file.
    data : numpy.ndarray
        The full, unmodified array returned by ``scipy.io.wavfile.read``.
    segment : numpy.ndarray
        1-D slice of the first channel covering the requested window.
    """
    samplerate, data = read(path)
    # Multi-channel files are (n_samples, n_channels); mono files are 1-D and
    # would raise on a `[:, 0]` index.
    channel = data[:, 0] if data.ndim > 1 else data
    return samplerate, data, channel[start_s * samplerate:end_s * samplerate]


samplerate_s, data_speech, speech = load_segment("../../DATA/vocal_11.wav")
# Kept under its historical name: sample index at which the kept window ends.
N_minutes = SEGMENT_END_S * samplerate_s
length = speech.shape[0] / samplerate_s
print('Shape of the speech {}'.format(speech.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_s))

samplerate_m, data_music, music = load_segment("../../DATA/piano_10.wav")
length = music.shape[0] / samplerate_m
print('Shape of the music {}'.format(music.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_m))


Shape of the speech 18081000
Length : 410.00s
Sample rate : 44100
Shape of the music 18081000
Length : 410.00s
Sample rate : 44100


# Get test data:

## Apply STFT :

### We can change :

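* window : type of analysis window (e.g. `'hamming'`)
* nperseg : length of each window, in samples
* noverlap : number of samples shared by two consecutive windows
* nfft : FFT length, must be >= nperseg (each frame is zero-padded up to nfft)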

In [None]:
# Build the mixture. The third argument is the same one `Evaluation` and `Test`
# pass as `SMR_db`.
# NOTE(review): this rebinds `speech` and `music` to whatever the helper
# returns, so re-running the cell feeds its own output back in — confirm that
# `get_mixed_signal` is idempotent on already-processed inputs.
test,speech,music=get_mixed_signal(speech,music,10)

# `samplerate_t` was used below without ever being defined, so this cell raised
# NameError on a fresh kernel. The mixture is built from the two sources, so it
# takes the speech sample rate.
samplerate_t = samplerate_s

WINDOW = 'hamming'
WINDOW_SIZE = 480
# scipy documents `noverlap` as an integer number of samples.
OVERLAP = int(0.6 * WINDOW_SIZE)
NFFT = 512

# Magnitude spectrograms of the two sources and of the mixture.
f,t,Y= signal.stft(speech,samplerate_s,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_s=np.abs(Y)

f,t,Y= signal.stft(music,samplerate_m,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_m=np.abs(Y)

# After this call `Y` holds the complex STFT of the mixture.
f,t,Y= signal.stft(test,samplerate_t,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_t=np.abs(Y)
print('Shape of spectrogram speech : {}'.format(Yabs_s.shape))
print('Shape of spectrogram music: {}'.format(Yabs_m.shape))
print('Shape of spectrogram mixture: {}'.format(Yabs_t.shape))

# Tuning Loop :

In [None]:
def Evaluation(speech, music,Ns,Nm,SMR_db,samplerate,p):
    """Train per-source NMF bases, separate one mixture excerpt and score it.

    Steps, in order:
      1. ``get_mixed_signal(speech, music, SMR_db)`` produces the mixture and
         the two reference signals; samples 882000 .. 2*882000 of each are
         kept (20 s at 44.1 kHz).
      2. Magnitude STFTs (Hamming window of 480 samples, 60 % overlap,
         512-point FFT) are computed for the full ``speech``, the full
         ``music`` and the mixture excerpt; exact zeros are replaced by 1e-4.
      3. One NMF per source (Itakura-Saito loss, multiplicative updates,
         50 iterations) yields ``Ns`` speech bases and ``Nm`` music bases.
      4. A third NMF is fitted on the mixture only so that it can be
         transformed; its components are then overwritten with the stacked
         source bases before ``transform`` computes the mixture activations.
      5. ``Reconstruct`` turns bases, activations and the complex mixture
         STFT into per-source spectrograms, which are inverted with ``istft``
         and truncated to the reference length.

    Parameters
    ----------
    speech, music : array-like
        Source signals used both for training and for building the mixture.
    Ns, Nm : int
        Number of NMF components for speech and for music.
    SMR_db : number
        Forwarded as the third argument of ``get_mixed_signal``.
    samplerate : number
        Sampling frequency handed to ``stft`` / ``istft``.
    p : number
        Forwarded to ``Reconstruct``.

    Returns
    -------
    tuple
        ``(sdr_speech, sdr_music)`` as returned by ``SDR``.
    """
    # Settings shared by every STFT / inverse STFT and by every NMF model.
    stft_kwargs = dict(window='hamming', nperseg=480, noverlap=0.6 * 480, nfft=512)
    nmf_kwargs = dict(init='random', alpha=0.0, beta_loss='itakura-saito',
                      solver="mu", max_iter=50, random_state=0)
    excerpt = slice(882000, 2*882000)

    def spectrogram(x):
        # Returns the complex STFT and its magnitude with exact zeros floored.
        complex_spec = signal.stft(x, samplerate, **stft_kwargs)[2]
        magnitude = np.abs(complex_spec)
        magnitude[magnitude == 0] = 0.0001
        return complex_spec, magnitude

    mixture, speech_ref, music_ref = (
        sig[excerpt] for sig in get_mixed_signal(speech, music, SMR_db)
    )

    _, mag_speech = spectrogram(speech)
    _, mag_music = spectrogram(music)
    mix_spec, mag_mix = spectrogram(mixture)

    # One dictionary per source, learned on frames-as-rows data.
    bases_speech = NMF(n_components=Ns, **nmf_kwargs).fit(mag_speech.T).components_
    bases_music = NMF(n_components=Nm, **nmf_kwargs).fit(mag_music.T).components_
    bases = np.vstack([bases_speech, bases_music])

    # The fit only puts the estimator in a fitted state; the learned
    # components are discarded in favour of the pre-trained ones.
    mix_model = NMF(n_components=Ns+Nm, **nmf_kwargs)
    mix_model.fit(mag_mix.T)
    mix_model.components_ = bases
    activations = mix_model.transform(mag_mix.T)

    Sources, Masks = Reconstruct(B=bases.T, G=activations.T, Ns=Ns, Nm=Nm, Yabs=mix_spec, p=p)

    def to_waveform(spec, reference):
        # Back to the time domain, cut to the reference length.
        wave = signal.istft(spec, samplerate, **stft_kwargs)[1]
        return wave[:reference.shape[0]]

    speech_est = to_waveform(Sources[0], speech_ref)
    music_est = to_waveform(Sources[1], music_ref)

    return SDR(s_est=speech_est, s=speech_ref), SDR(s_est=music_est, s=music_ref)
    

In [None]:
# Hyper-parameter grids explored by the tuning loop.
SMR_GRID = [0]
N_COMPONENTS_GRID = [2,64,128]
P_GRID = [2]

# Results are nested as [SMR value][number of components][p].
SDR_MUSIC=[]
SDR_SPEECH=[]

# The loop variable must not be called `SMR`: that name is a helper imported
# from `helpers`, and rebinding it to a number would shadow the function for
# the rest of the session.
for smr_db in tqdm(SMR_GRID):

    SDR_MUSIC_Ncomp=[]
    SDR_SPEECH_Ncomp=[]

    for Ns in tqdm(N_COMPONENTS_GRID):

        SDR_MUSIC_P=[]
        SDR_SPEECH_P=[]

        for p in P_GRID:

            print('Evaluation SMR = {} Ns = Nm = {} p = {}'.format(smr_db,Ns,p))
            # Speech and music always get the same number of components here.
            sdr_speech,sdr_music=Evaluation(speech=speech, music=music,Ns=Ns,Nm=Ns,SMR_db=smr_db,samplerate=samplerate_s,p=p)
            SDR_SPEECH_P.append(sdr_speech)
            SDR_MUSIC_P.append(sdr_music)
            print('Speech SDR = {} ... Music SDR = {}'.format(sdr_speech,sdr_music))

        SDR_MUSIC_Ncomp.append(SDR_MUSIC_P)
        SDR_SPEECH_Ncomp.append(SDR_SPEECH_P)

    SDR_MUSIC.append(SDR_MUSIC_Ncomp)
    SDR_SPEECH.append(SDR_SPEECH_Ncomp)


In [None]:
# Persist the tuning results. `np.save` appends the `.npy` extension itself.
SDR_MUSIC_ARRAY=np.array(SDR_MUSIC)
SDR_SPEECH_ARRAY=np.array(SDR_SPEECH)
# `np.save` does not create missing directories, so make sure the target
# folder exists instead of failing after the whole tuning loop has run.
os.makedirs('./SDR', exist_ok=True)
np.save('./SDR/Music_sdr2',SDR_MUSIC_ARRAY)
np.save('./SDR/Speech_sdr2',SDR_SPEECH_ARRAY)

# Test on one configuration :

In [20]:
def Test(speech, music,Ns,Nm,SMR_db,samplerate,p):
    """Separate one mixture excerpt, print the SDRs and write the audio files.

    The mixture and the reference signals come from
    ``get_mixed_signal(speech, music, SMR_db)``; samples 882000 .. 3*882000
    are kept (40 s at 44.1 kHz). Speech and music NMF bases (Itakura-Saito
    loss, multiplicative updates, ``alpha=0.5``, 50 iterations) are learned on
    the full ``speech`` / ``music`` magnitude spectrograms, stacked, and
    plugged into a third NMF whose ``transform`` gives the mixture
    activations. ``Reconstruct`` and ``istft`` then yield the two estimates.

    Side effects: writes ``../../Tests/Test.wav`` (the mixture),
    ``../../Tests/Speech.wav`` and ``../../Tests/Music.wav`` (the estimates)
    as 16-bit PCM at ``samplerate``, and prints progress and SDR values.

    Parameters
    ----------
    speech, music : array-like
        Source signals used both for training and for building the mixture.
    Ns, Nm : int
        Number of NMF components for speech and for music.
    SMR_db : number
        Forwarded as the third argument of ``get_mixed_signal``.
    samplerate : int
        Sampling frequency used for the STFTs and for the written WAV files.
    p : number
        Forwarded to ``Reconstruct``.

    Returns
    -------
    None
    """
    def to_int16(samples):
        # Saturate before the cast: out-of-range values would otherwise wrap
        # around (or be undefined for floats) and produce loud clicks.
        limits = np.iinfo(np.int16)
        return np.clip(samples, limits.min, limits.max).astype(np.int16)

    test,speech_test,music_test=get_mixed_signal(speech,music,SMR_db)
    excerpt = slice(882000, 3*882000)
    test=test[excerpt]
    speech_test=speech_test[excerpt]
    music_test = music_test[excerpt]

    # Use the `samplerate` argument: the previous global `samplerate_t` is not
    # defined anywhere, so the function only worked with leftover kernel state.
    write("../../Tests/Test.wav", samplerate, to_int16(test))

    WINDOW = 'hamming'
    WINDOW_SIZE=480
    # scipy documents `noverlap` as an integer number of samples.
    OVERLAP = int(0.6 * WINDOW_SIZE)
    NFFT=512

    f,t,Y= signal.stft(speech,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
    Yabs_s=np.abs(Y)

    f,t,Y= signal.stft(music,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
    Yabs_m=np.abs(Y)

    # `Y` keeps the complex mixture STFT; it is handed to Reconstruct below.
    f,t,Y= signal.stft(test,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
    Yabs_t=np.abs(Y)

    # Replace exact zeros in the magnitudes by a small positive value before
    # fitting with the Itakura-Saito loss.
    Yabs_s[Yabs_s==0]=0.0001
    Yabs_t[Yabs_t==0]=0.0001
    Yabs_m[Yabs_m==0]=0.0001

    # Only the learned bases are needed; the training activations are unused.
    model = NMF(n_components=Ns, init='random',alpha=0.5,beta_loss='itakura-saito',solver="mu",max_iter=50, random_state=0)
    model.fit(np.transpose(Yabs_s))
    B_s = model.components_

    print('Train NMF 1 ... Done')

    model = NMF(n_components=Nm, init='random',alpha=0.5,beta_loss='itakura-saito',solver="mu",max_iter=50, random_state=0)
    model.fit(np.transpose(Yabs_m))
    B_m = model.components_

    print('Train NMF 2 ... Done')

    B=np.vstack([B_s,B_m])

    # The fit only puts the estimator in a fitted state; its components are
    # then replaced by the pre-trained bases so `transform` solves for the
    # activations with the bases held fixed.
    model_test = NMF(n_components=Ns+Nm, init='random',alpha=0.5,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=0)
    model_test.fit(np.transpose(Yabs_t))

    model_test.components_=B
    G_test=model_test.transform(np.transpose(Yabs_t))

    Sources,Masks=Reconstruct(B=np.transpose(B),G=np.transpose(G_test),Ns=Ns,Nm=Nm,Yabs=Y,p=p)

    _, speech_est =  signal.istft(Sources[0],
                          samplerate,
                          window = WINDOW,
                          nperseg=WINDOW_SIZE,
                          noverlap=OVERLAP,
                          nfft = NFFT)

    _, music_est =  signal.istft(Sources[1],
                          samplerate,
                          window = WINDOW,
                          nperseg=WINDOW_SIZE,
                          noverlap=OVERLAP,
                          nfft = NFFT)

    # Cut the estimates to the reference length before scoring.
    speech_est = speech_est[:speech_test.shape[0]]
    music_est = music_est[:music_test.shape[0]]

    sdr_speech = SDR(s_est=speech_est,s=speech_test)
    sdr_music = SDR(s_est=music_est, s=music_test)

    print('SDR Speech = {:.2f} ... SDR Music = {:.2f}'.format(sdr_speech,sdr_music))

    write("../../Tests/Speech.wav", samplerate, to_int16(speech_est))
    write("../../Tests/Music.wav", samplerate, to_int16(music_est))

In [21]:
# Single run: 8 NMF components per source, SMR_db=0, p=2. Prints the SDRs and
# writes the mixture and both estimates under ../../Tests/.
Test(speech=speech, music=music,Ns=8,Nm=8,SMR_db=0,samplerate=samplerate_s,p=2)

Train NMF 1 ... Done
Train NMF 2 ... Done
SDR Speech = 0.68 ... SDR Music = -2.12
