In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from numpy.linalg import inv
from helpers import Reconstruct, Viz_Y,SMR,get_mixed_signal,SDR
import seaborn as sns
import warnings
import torch
import math
from tqdm import tqdm
import librosa
warnings.simplefilter('ignore')

# 1. Read speech and music data:

In [2]:
# Clip boundaries in seconds: drop the first 10 s of each recording and keep
# everything up to the 10-minute mark.
CLIP_START_S = 10
CLIP_END_S = 10 * 60


def load_clip(path, start_s=CLIP_START_S, end_s=CLIP_END_S):
    """Read a WAV file and cut a clip from its first channel.

    The clip boundaries are converted from seconds to samples with the
    file's own sample rate, so they stay correct for files that are not
    sampled at 44100 Hz.

    Parameters
    ----------
    path : str
        Path of the WAV file to read.
    start_s, end_s : int
        Clip start and end, in whole seconds from the beginning of the file.

    Returns
    -------
    samplerate : int
        Sample rate reported by the file.
    data : numpy.ndarray
        The complete array returned by ``scipy.io.wavfile.read``.
    clip : numpy.ndarray
        1-D slice of the first channel between ``start_s`` and ``end_s``.
    """
    samplerate, data = read(path)
    # Mono files come back 1-D; multi-channel files are (n_samples, n_channels).
    first_channel = data if data.ndim == 1 else data[:, 0]
    return samplerate, data, first_channel[start_s * samplerate:end_s * samplerate]


def describe_clip(label, clip, samplerate):
    """Print the size, duration and sample rate of a clip; return the duration in seconds."""
    duration = clip.shape[0] / samplerate
    print('Shape of the {} {}'.format(label, clip.shape[0]))
    print('Length : {:.2f}s'.format(duration))
    print('Sample rate : {}'.format(samplerate))
    return duration


samplerate_s, data_speech, speech = load_clip("../../DATA/vocal_11.wav")
# Kept under its original name for later cells: sample index of the clip end
# (10 minutes expressed in samples), not a number of minutes.
N_minutes = CLIP_END_S * samplerate_s
length = describe_clip('speech', speech, samplerate_s)

samplerate_m, data_music = None, None
samplerate_m, data_music, music = load_clip("../../DATA/piano_10.wav")
length = describe_clip('music', music, samplerate_m)

# The following cells treat both signals as sharing a single sample rate.
assert samplerate_s == samplerate_m, (
    'speech and music must share a sample rate, got {} and {}'.format(samplerate_s, samplerate_m)
)


Shape of the speech 26019000
Length : 590.00s
Sample rate : 44100
Shape of the music 26019000
Length : 590.00s
Sample rate : 44100


In [3]:
# Downsample both signals by the same integer factor to cut the cost of the
# STFT / NMF steps that follow.
# NOTE: this cell overwrites `speech`, `music` and `samplerate_m` in place, so
# running it twice downsamples twice -- re-run the loading cell first.
rate = 2
speech, music = (
    signal.resample(track, int(track.shape[0] / rate))
    for track in (speech, music)
)
# Both signals are assumed to start at the same rate; the speech rate is
# simply set to the reduced music rate.
samplerate_s = samplerate_m = int(samplerate_m / rate)


print('Downsampled rate = {}'.format(samplerate_s))

Downsampled rate = 22050


# Test on one configuration:

In [7]:
# Sample rate for the test-stage audio; identical to the (downsampled) music rate.
samplerate_t = samplerate_m

In [16]:
def _to_int16(samples):
    """Convert audio samples to int16 for WAV export, saturating out-of-range values.

    A bare ``astype(np.int16)`` does not saturate: floats outside the int16
    range come out as wrapped / undefined values, which is audible as loud
    clicks. Clipping first leaves in-range samples unchanged and pins the
    rest to the int16 limits.
    """
    bounds = np.iinfo(np.int16)
    return np.clip(samples, bounds.min, bounds.max).astype(np.int16)


def Test(speech, music,Ns,Nm,SMR_db,samplerate,p):
    """Run one speech/music separation experiment with pre-trained NMF dictionaries.

    Steps:
      1. Mix ``speech`` and ``music`` with ``get_mixed_signal`` and keep a fixed
         excerpt of the mixture (and of the matching references) as test data.
      2. Learn one Itakura-Saito NMF dictionary on the magnitude STFT of
         ``speech`` and one on that of ``music``.
      3. Stack the two dictionaries, keep them fixed, and estimate the
         activations of the test mixture.
      4. Rebuild the two sources with ``Reconstruct``, invert the STFT, print
         the SDR of each estimate and write the results as WAV files.

    Parameters
    ----------
    speech, music : array-like
        Source signals; they are passed to ``get_mixed_signal`` and to
        ``scipy.signal.stft`` (assumed 1-D and equally long -- TODO confirm).
    Ns, Nm : int
        Number of NMF components for the speech and the music dictionary.
    SMR_db : float
        Forwarded to ``get_mixed_signal`` (presumably the speech-to-music
        ratio in dB -- confirm against helpers).
    samplerate : int
        Sample rate used for every STFT / inverse STFT and for the WAV files
        written by this function.
    p : number
        Forwarded unchanged to ``Reconstruct`` (meaning defined in helpers).

    Returns
    -------
    None
        Results are printed and written to ``../../Tests/``.

    Raises
    ------
    ValueError
        If the mixture is too short to contain the test excerpt.
    """
    # STFT analysis/synthesis settings shared by every transform below.
    WINDOW = 'hamming'
    WINDOW_SIZE = 256
    # The overlap must be a whole number of samples: 0.6 * 256 = 153.6 -> 153.
    OVERLAP = int(0.6 * WINDOW_SIZE)
    NFFT = 512

    # Test excerpt: block number 5 of 882000-sample blocks, i.e. samples
    # [4410000, 5292000) -- 200 s to 240 s at the 22050 Hz rate this notebook uses.
    SEGMENT_LEN = 882000
    SEGMENT_INDEX = 5
    excerpt = slice(SEGMENT_INDEX * SEGMENT_LEN, (SEGMENT_INDEX + 1) * SEGMENT_LEN)

    def _stft(x):
        """Complex STFT of ``x`` with the shared settings."""
        _, _, spec = signal.stft(x, samplerate, window=WINDOW, nperseg=WINDOW_SIZE,
                                 noverlap=OVERLAP, nfft=NFFT)
        return spec

    def _istft(spec):
        """Time-domain signal for ``spec`` with the shared settings."""
        _, x = signal.istft(spec, samplerate, window=WINDOW, nperseg=WINDOW_SIZE,
                            noverlap=OVERLAP, nfft=NFFT)
        return x

    def _magnitude(spec):
        """Magnitude spectrogram with exact zeros replaced by a small floor."""
        mag = np.abs(spec)
        # Exact zeros are replaced so the Itakura-Saito loss never divides by zero.
        mag[mag == 0] = 0.0001
        return mag

    def _nmf(n_components, max_iter):
        """NMF estimator with the settings used for every factorisation here."""
        # NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in
        # 1.2 (replaced by alpha_W / alpha_H); update when the environment is upgraded.
        return NMF(n_components=n_components, init='random', alpha=0.2,
                   beta_loss='itakura-saito', solver="mu", max_iter=max_iter,
                   random_state=0)

    # Set the test data:
    test, speech_test, music_test = get_mixed_signal(speech, music, SMR_db)
    test = test[excerpt]
    speech_test = speech_test[excerpt]
    music_test = music_test[excerpt]
    if len(test) == 0:
        raise ValueError(
            'mixture is too short for the test excerpt starting at sample {}'.format(excerpt.start)
        )

    # Use the function's own sample rate instead of a notebook-level global.
    write("../../Tests/Test.wav", samplerate, _to_int16(test))

    # NOTE(review): the dictionaries are trained on the full `speech` / `music`
    # signals, which contain the test excerpt, so the reported SDR is not a
    # held-out figure.
    Yabs_s = _magnitude(_stft(speech))
    Yabs_m = _magnitude(_stft(music))
    Y_t = _stft(test)
    Yabs_t = _magnitude(Y_t)

    # Rows of the fitted `components_` are the spectral bases (frames are
    # samples, frequency bins are features after the transpose).
    B_s = _nmf(Ns, 20).fit(np.transpose(Yabs_s)).components_

    print('Training Speech NMF  .... Done')

    B_m = _nmf(Nm, 20).fit(np.transpose(Yabs_m)).components_

    print('Training Music NMF .... Done')

    B = np.vstack([B_s, B_m])

    # fit() is only used to bring the estimator into a fitted state; the bases
    # it learns are discarded and replaced by the stacked pre-trained
    # dictionary, so transform() estimates activations for fixed bases.
    model_test = _nmf(Ns + Nm, 50)
    model_test.fit(np.transpose(Yabs_t))

    model_test.components_ = B
    G_test = model_test.transform(np.transpose(Yabs_t))

    print('Testing NMF .... Done')
    # The complex mixture STFT (not its magnitude) is what the original code
    # passes as `Yabs` -- presumably so the estimates keep the mixture phase;
    # confirm against helpers.Reconstruct.
    Sources, _ = Reconstruct(B=np.transpose(B), G=np.transpose(G_test),
                             Ns=Ns, Nm=Nm, Yabs=Y_t, p=p)

    print('Reconstruction Step .... Done')
    # Trim the inverse STFTs to the length of the reference signals before scoring.
    speech_est = _istft(Sources[0])[:speech_test.shape[0]]
    music_est = _istft(Sources[1])[:music_test.shape[0]]

    sdr_speech = SDR(s_est=speech_est, s=speech_test)
    sdr_music = SDR(s_est=music_est, s=music_test)

    print('SDR Speech = {:.2f} ... SDR Music = {:.2f}'.format(sdr_speech,sdr_music))

    write("../../Tests/SpeechX.wav", samplerate, _to_int16(speech_est))
    write("../../Tests/MusicX.wav", samplerate, _to_int16(music_est))

In [17]:
# Single experiment: 64 speech components, 64 music components, SMR_db = 0.
Test(
    speech=speech,
    music=music,
    Ns=64,
    Nm=64,
    SMR_db=0,
    samplerate=samplerate_m,
    p=3,
)

SMR = 0.00
Training Speech NMF  .... Done
Training Music NMF .... Done
Testing NMF .... Done
Reconstruction Step .... Done
SDR Speech = 0.27 ... SDR Music = 2.71
