In [85]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from numpy.linalg import inv
from helpers import Reconstruct, Viz_Y
import seaborn as sns
import warnings
import torch
warnings.simplefilter('ignore')

# 1.Read speech and music data:

In [95]:
# Upper bound on the number of samples kept from each training recording.
# It is larger than both recordings loaded here (see the printed shapes),
# so in practice the whole file is used.
MAX_SAMPLES = 100000000


def _first_channel(audio):
    """Return channel 0 of a (samples, channels) array; mono arrays pass through.

    scipy.io.wavfile.read returns a 1-D array for single-channel files, so an
    unconditional ``audio[:, 0]`` would raise IndexError on mono input.
    """
    return audio[:, 0] if audio.ndim > 1 else audio


# --- Speech training recording ---
samplerate_s, data_speech = read("../data/male_vocal.wav")
speech = _first_channel(data_speech)[:MAX_SAMPLES]
length = speech.shape[0] / samplerate_s
print('Shape of the speech {}'.format(speech.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_s))

# --- Music training recording ---
samplerate_m, data_music = read("../data/piano.wav")
music = _first_channel(data_music)[:MAX_SAMPLES]
length = music.shape[0] / samplerate_m
print('Shape of the music {}'.format(music.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_m))

# --- Mixture to be separated (kept at full length) ---
samplerate_t, test = read("../data/mixed_signal.wav")
test = _first_channel(test)


Shape of the speech 5018580
Length : 113.80s
Sample rate : 44100
Shape of the music 5046139
Length : 114.42s
Sample rate : 44100


In [2]:
def SDR(s_est, s):
    """
    Signal-to-distortion ratio between a reference and its estimate, in dB.

        SDR = 10 * log10( ||s||^2 / ||s - s_est||^2 )

    Parameters
    ----------
    s_est : array-like
        Estimated signal or spectrogram.
    s : array-like
        Reference (original) signal or spectrogram, same shape as ``s_est``.

    Returns
    -------
    torch.Tensor
        0-d float64 tensor holding the SDR in dB. The value is ``inf`` when
        the estimate matches a non-zero reference exactly.
    """
    # Convert before subtracting so the difference is computed in float64
    # (integer inputs such as int16 audio would otherwise overflow).
    reference = torch.as_tensor(s, dtype=torch.float64)
    estimate = torch.as_tensor(s_est, dtype=torch.float64)

    # Power is the *squared* L2 norm. Using the plain norm together with the
    # 10*log10 factor would report only half of the true dB value.
    signal_power = reference.norm(p=2) ** 2
    distorsion_power = (reference - estimate).norm(p=2) ** 2
    SDR_db = 10 * torch.log10(signal_power / distorsion_power)

    return SDR_db

## Apply STFT :

### We can change :

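* window : type of window function
* nperseg : length of each window, in samples
* noverlap : number of samples shared by consecutive windows
* nfft : FFT length; at least the window size (each segment is zero-padded up to nfft)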

In [97]:
WINDOW = 'hamming'
WINDOW_SIZE = 480
# 60 % overlap between consecutive frames, stored as an integer sample count
# so the value is explicit instead of relying on implicit float truncation.
OVERLAP = int(round(0.6 * WINDOW_SIZE))
NFFT = 512

# STFT settings shared by the speech, music and mixture transforms.
STFT_KWARGS = dict(window=WINDOW, nperseg=WINDOW_SIZE, noverlap=OVERLAP, nfft=NFFT)

# Magnitude spectrogram of the speech training signal.
f, t, Y = signal.stft(speech, samplerate_s, **STFT_KWARGS)
Yabs_s = np.abs(Y)

# Magnitude spectrogram of the music training signal.
f, t, Y = signal.stft(music, samplerate_m, **STFT_KWARGS)
Yabs_m = np.abs(Y)

# Mixture last: `f`, `t` and the complex `Y` stay bound to the mixture STFT,
# and `Y` is what the reconstruction cell further down reads.
f, t, Y = signal.stft(test, samplerate_t, **STFT_KWARGS)
Yabs_t = np.abs(Y)

print('Shape of spectrogram speech : {}'.format(Yabs_s.shape))
print('Shape of spectrogram music: {}'.format(Yabs_m.shape))

Shape of spectrogram speech : (257, 26140)
Shape of spectrogram music: (257, 26283)


## If we apply the elbow method, the optimal number of components is 32

In [98]:
# Replace exact zeros, in place, with a small positive floor. The NMF fits
# below use the Itakura-Saito loss, for which scikit-learn requires an input
# matrix without zeros.
np.putmask(Yabs_s, Yabs_s == 0, 0.0001)
np.putmask(Yabs_t, Yabs_t == 0, 0.0001)
np.putmask(Yabs_m, Yabs_m == 0, 0.0001)


In [99]:
# Factorise the speech magnitude spectrogram (transposed, so time frames are
# the rows) into 64 components with multiplicative updates.
model = NMF(
    n_components=64,
    init='random',
    solver="mu",
    beta_loss='itakura-saito',
    alpha=0.7,
    max_iter=100,
    random_state=0,
)
G_s = model.fit_transform(Yabs_s.T)  # per-frame activations
B_s = model.components_              # learned speech components (one per row)

In [100]:
# Same factorisation settings as for speech, applied to the music spectrogram.
# Note that `model` is rebound here; the speech results live on in G_s / B_s.
model = NMF(
    n_components=64,
    init='random',
    solver="mu",
    beta_loss='itakura-saito',
    alpha=0.7,
    max_iter=100,
    random_state=0,
)
G_m = model.fit_transform(Yabs_m.T)  # per-frame activations
B_m = model.components_              # learned music components (one per row)

In [101]:
# Joint dictionary: speech components first, music components after them.
B = np.concatenate((B_s, B_m), axis=0)
print(B.shape)

(128, 257)


In [102]:
# Estimate activations of the mixture against the fixed joint dictionary B.
# n_components=128 matches the number of rows of B (64 speech + 64 music).
model_test = NMF(n_components=128, init='random',alpha=0.7,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=0)
# NOTE(review): the components learned by this fit are discarded on the next
# line; presumably fit() is only called so the estimator counts as fitted
# before transform() -- confirm, since this runs a full 128-component NMF.
model_test.fit(np.transpose(Yabs_t))
# Swap in the pre-trained speech+music dictionary. Statement order matters:
# this must come after fit() (which would overwrite it) and before transform().
model_test.components_=B
# Activations of the mixture frames with the dictionary held at B.
G_test=model_test.transform(np.transpose(Yabs_t))

# Estimation of the sources:

In [103]:
# Split the mixture into its two sources with the project helper `Reconstruct`.
# B and G_test are passed transposed, i.e. frequency-major like the spectrograms.
# NOTE(review): `Y` is the complex STFT left over from the last signal.stft
# call (the mixture), not the magnitude Yabs_t, even though the parameter is
# named `Yabs` -- confirm this is what Reconstruct expects.
Sources, Masks = Reconstruct(
    B=B.T,
    G=G_test.T,
    Ns=64,
    Nm=64,
    Yabs=Y,
    p=5,
)

Source shape = (257, 26283)


In [104]:
# Representable range of 16-bit PCM samples.
int16_limits = np.iinfo(np.int16)

for i in range(2):
    # Back to the time domain with the same framing used for the forward STFT.
    _, xrec = signal.istft(Sources[i],
                           samplerate_t,
                           window=WINDOW,
                           nperseg=WINDOW_SIZE,
                           noverlap=OVERLAP,
                           nfft=NFFT)

    # Round to the nearest integer and clip before casting: a bare
    # astype(np.int16) truncates toward zero and wraps around for samples
    # outside [-32768, 32767], which produces loud clicks in the output.
    pcm = np.clip(np.rint(xrec), int16_limits.min, int16_limits.max).astype(np.int16)
    write("../../example"+str(i)+".wav", samplerate_t, pcm)