In [3]:
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter, freqz

from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from numpy.linalg import inv
from helpers import Reconstruct, Viz_Y,SMR,get_mixed_signal,SDR
import seaborn as sns
import warnings
import torch
import math
from tqdm import tqdm
warnings.simplefilter('ignore')

# 1. Read speech and music data

In [4]:
# Target sample rate (Hz): every signal is downsampled to this before analysis.
fs = 16000

# Segment boundaries in seconds. Training uses 1:00-7:20 and testing uses
# 9:40-10:10 of each recording.
train_start_s, train_end_s = 1 * 60, 7 * 60 + 20
test_start_s, test_end_s = 9 * 60 + 40, 10 * 60 + 10


def resample_to_fs(x, source_rate):
    """Resample `x` from `source_rate` (Hz) to the target rate `fs`.

    The output length is rounded rather than truncated so that floating-point
    error in the ratio cannot silently drop one sample.
    """
    n_out = int(round(x.shape[0] * fs / source_rate))
    return signal.resample(x, n_out)


# Training segments: first channel only, indexed with each file's own sample
# rate instead of a hard-coded 44100.
samplerate_s, data_speech = read("../../DATA/vocal_11.wav")
start, end = train_start_s * samplerate_s, train_end_s * samplerate_s
speech=data_speech[start:end,0]
length=speech.shape[0]/samplerate_s
print('Shape of the speech {} ... Length : {:.2f}s ... Sample rate : {}'.format(speech.shape[0],length,samplerate_s))

samplerate_m, data_music = read("../../DATA/piano_10.wav")
start, end = train_start_s * samplerate_m, train_end_s * samplerate_m
music=data_music[start:end,0]
length=music.shape[0]/samplerate_m
print('Shape of the music {} ... Length : {:.2f}s ... Sample rate : {}'.format(music.shape[0],length,samplerate_m))

# Downsampling ratio of the speech file; kept as a module-level name for any
# later cell that may still reference it.
rate = samplerate_s / fs

# Test segments, taken from a part of the recordings disjoint from training.
speech_t = data_speech[test_start_s * samplerate_s : test_end_s * samplerate_s, 0]
music_t = data_music[test_start_s * samplerate_m : test_end_s * samplerate_m, 0]
start, end = test_start_s * samplerate_s, test_end_s * samplerate_s

speech_t = resample_to_fs(speech_t, samplerate_s)
music_t = resample_to_fs(music_t, samplerate_m)
# After resampling every signal is at exactly `fs`.
samplerate = fs
length=music_t.shape[0]/samplerate

print('Shape of the test {} ... Length : {:.2f}s ... Sample rate : {}'.format(music_t.shape[0],length,samplerate))

speech = resample_to_fs(speech, samplerate_s)
music = resample_to_fs(music, samplerate_m)


print('Downsampled rate = {}'.format(samplerate))


def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    Parameters
    ----------
    cutoff : float
        Cutoff frequency, in the same unit as `fs`.
    fs : float
        Sampling frequency of the signal the filter will be applied to.
    order : int, optional
        Filter order (default 5).

    Returns
    -------
    tuple
        The `(b, a)` numerator / denominator coefficients returned by
        `scipy.signal.butter`.
    """
    # `butter` is given the cutoff as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    return butter(order, cutoff / nyquist, btype='low', analog=False)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass filter `data` with a Butterworth filter.

    The coefficients come from `butter_lowpass(cutoff, fs, order)` and are
    applied in a single forward pass with `scipy.signal.lfilter`.

    Returns the filtered signal produced by `lfilter`.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return lfilter(numerator, denominator, data)


# Low-pass every training and test signal at 4000 Hz with the same
# Butterworth filter.
speech, music, music_t, speech_t = (
    butter_lowpass_filter(track, 4000, fs)
    for track in (speech, music, music_t, speech_t)
)

Shape of the speech 16758000 ... Length : 380.00s ... Sample rate : 44100
Shape of the music 16758000 ... Length : 380.00s ... Sample rate : 44100
Shape of the test 480000 ... Length : 30.00s ... Sample rate : 16000
Downsampled rate = 16000


# Test on one configuration:

In [5]:
# Sample rate used for the WAV files written by Test(); Test() reads this
# module-level name directly rather than receiving it as an argument.
samplerate_t=samplerate

In [8]:
def _to_int16(samples):
    """Cast audio samples to int16, saturating instead of wrapping around.

    A bare `astype(np.int16)` on values outside [-32768, 32767] wraps around,
    which is audible as loud clicks in the written WAV file. In-range values
    are converted exactly as `astype(np.int16)` would convert them.
    """
    limits = np.iinfo(np.int16)
    return np.clip(samples, limits.min, limits.max).astype(np.int16)


def Test(speech, music,speech_t,music_t,Ns,Nm,SMR_db,samplerate,p):
    """Train speech/music NMF dictionaries and separate one test mixture.

    Steps:
      1. Mix `speech_t` and `music_t` with `get_mixed_signal(..., SMR_db)` and
         write the mixture to ../../Tests/Test.wav.
      2. Compute magnitude STFTs of the training speech, training music and
         the test mixture.
      3. Fit one NMF per training source (`Ns` and `Nm` components), stack
         the two dictionaries and min-max scale the result.
      4. Compute activations of the test mixture on that fixed dictionary.
      5. Call `Reconstruct` to obtain the source STFTs, invert them, print the
         SDR of both estimates and write them to ../../Tests/.

    Parameters
    ----------
    speech, music : array-like
        Training signals for the speech and music dictionaries.
    speech_t, music_t : array-like
        Test signals that are mixed together before separation.
    Ns, Nm : int
        Number of NMF components for speech and music respectively.
    SMR_db : float
        Forwarded to `get_mixed_signal`.
    samplerate : int
        Passed as `fs` to `signal.stft` / `signal.istft`.
    p : object
        Forwarded unchanged to `Reconstruct`.

    Returns
    -------
    B : ndarray
        The stacked, min-max scaled dictionary (speech rows first, then music).

    Notes
    -----
    WAV files are written with the module-level `samplerate_t`, not with the
    `samplerate` argument.
    NOTE(review): consider unifying the two once every caller passes the
    post-resampling rate as `samplerate`.
    """

    # Set the test data:
    test,speech_test,music_test=get_mixed_signal(speech_t,music_t,SMR_db)
    write("../../Tests/Test.wav", samplerate_t, _to_int16(test))

    WINDOW = 'barthann'
    WINDOW_SIZE=480
    # 60 % overlap between frames, as an integer number of samples.
    OVERLAP = int(0.6 * WINDOW_SIZE)
    NFFT=512

    stft_kwargs = dict(window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)

    # Only the STFT matrices are needed; the frequency / time axes are discarded.
    _, _, Y_s = signal.stft(speech, samplerate, **stft_kwargs)
    Yabs_s=np.abs(Y_s)

    _, _, Y_m = signal.stft(music, samplerate, **stft_kwargs)
    Yabs_m=np.abs(Y_m)

    # The complex test STFT is kept: it is what gets handed to Reconstruct below.
    _, _, Y_t = signal.stft(test, samplerate, **stft_kwargs)
    Yabs_t=np.abs(Y_t)

    # Replace exact zeros with a small positive value before fitting with the
    # Itakura-Saito loss.
    Yabs_s[Yabs_s==0]=0.00001
    Yabs_t[Yabs_t==0]=0.00001
    Yabs_m[Yabs_m==0]=0.00001

    # NOTE(review): the `alpha` keyword was replaced by `alpha_W` / `alpha_H` in
    # newer scikit-learn releases - confirm against the installed version.
    model = NMF(n_components=Ns, init='nndsvd',alpha=0.2,beta_loss='itakura-saito',solver="mu",max_iter=70, random_state=7)
    model.fit(np.transpose(Yabs_s))
    B_s = model.components_

    print('Training Speech NMF  .... Done')

    model = NMF(n_components=Nm, init='nndsvd',alpha=0.2,beta_loss='itakura-saito',solver="mu",max_iter=70, random_state=7)
    model.fit(np.transpose(Yabs_m))
    B_m = model.components_

    print('Training Music NMF .... Done')

    # Joint dictionary: speech components first, music components after.
    B=np.vstack([B_s,B_m])

    scaler = MinMaxScaler()
    B = scaler.fit_transform(B)

    # The fit below only serves to obtain a fitted estimator; its learned
    # components are immediately replaced by the pre-trained dictionary B, so
    # transform() computes activations on that fixed dictionary.
    model_test = NMF(n_components=Ns+Nm, init='nndsvd',alpha=0.2,beta_loss='itakura-saito',solver="mu",max_iter=200, random_state=7)
    model_test.fit(np.transpose(Yabs_t))

    model_test.components_=B
    G_test=model_test.transform(np.transpose(Yabs_t))

    print('Testing NMF .... Done')
    # Despite the keyword name, the complex STFT of the mixture is passed here
    # (as in the original code), not its magnitude.
    Sources,Masks=Reconstruct(B=np.transpose(B),G=np.transpose(G_test),Ns=Ns,Nm=Nm,Yabs=Y_t,p=p)

    print('Reconstruction Step .... Done')
    speech_est = Sources[0]
    music_est = Sources[1]

    _, speech_est = signal.istft(speech_est, samplerate, **stft_kwargs)
    _, music_est = signal.istft(music_est, samplerate, **stft_kwargs)

    # Trim the estimates to the length of the reference signals.
    speech_est = speech_est[:speech_test.shape[0]]
    music_est = music_est[:music_test.shape[0]]

    sdr_speech = SDR(s_est=speech_est,s=speech_test)
    sdr_music = SDR(s_est=music_est, s=music_test)

    print('SDR Speech = {:.2f} ... SDR Music = {:.2f}'.format(sdr_speech,sdr_music))

    write("../../Tests/SpeechX.wav", samplerate_t, _to_int16(speech_est))
    write("../../Tests/MusicX.wav", samplerate_t, _to_int16(music_est))

    return B

In [9]:
# All signals were resampled to `samplerate` (16 kHz) above, so that is the
# rate handed to the STFT / ISTFT, not the original file rate `samplerate_m`.
B2=Test(speech=speech, music=music, speech_t = speech_t, music_t = music_t, Ns=10,Nm=10,SMR_db=5,samplerate=samplerate,p=3)

SMR = 5.00
Training Speech NMF  .... Done
Training Music NMF .... Done
Testing NMF .... Done
Reconstruction Step .... Done
SDR Speech = 4.39 ... SDR Music = -0.61


In [10]:
# Persist the dictionary returned by Test() (the SMR_db=5 run above) under
# the name 'B5' in the current working directory.
np.save('B5',B2)