In [8]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import read, write
from scipy import signal
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from numpy.linalg import inv
from helpers import Reconstruct, Viz_Y
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

# 1.Read speech and music data:

In [55]:
samplerate_s, data_speech = read("../data/male_vocal.wav")
speech=data_speech[:10000,0]
length=speech.shape[0]/samplerate_s
print('Shape of the speech {}'.format(music.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_s))

samplerate_m, data_music = read("../data/piano.wav")
music=data_music[:100000,0]
length=music.shape[0]/samplerate_m
print('Shape of the music {}'.format(music.shape[0]))
print('Length : {:.2f}s'.format(length))
print('Sample rate : {}'.format(samplerate_m))

samplerate_t, test = read("../data/mixed_signal.wav")
test=test[:100000,0]


Shape of the speech 100000
Length : 0.23s
Sample rate : 44100
Shape of the music 100000
Length : 2.27s
Sample rate : 44100


## Apply STFT :

### We can change :

* Window : Type of window
* nperseg : length of window
* noverlap : overlap between windows.
* nfft : fft length > window size

In [56]:
WINDOW = 'hamming'
WINDOW_SIZE=480
OVERLAP = 0.6 * WINDOW_SIZE
NFFT=512

f,t,Y= signal.stft(speech,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_s=np.abs(Y)

f,t,Y= signal.stft(music,samplerate,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_m=np.abs(Y)

f,t,Y= signal.stft(test,samplerate_t,window=WINDOW,nperseg=WINDOW_SIZE,noverlap=OVERLAP,nfft=NFFT)
Yabs_t=np.abs(Y)
print('Shape of spectrogram speech : {}'.format(Yabs_s.shape))
print('Shape of spectrogram music: {}'.format(Yabs_m.shape))

Shape of spectrogram speech : (257, 54)
Shape of spectrogram music: (257, 522)


## If we apply the elbow method the optimal number of componenets will be 32

In [57]:
model = NMF(n_components=8, init='random',alpha=0.0,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=0)
G_s = model.fit_transform(np.transpose(Yabs_s))
B_s = model.components_

In [58]:
model = NMF(n_components=8, init='random',alpha=0.0,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=0)
G_m = model.fit_transform(np.transpose(Yabs_m))
B_m = model.components_

In [59]:
B=np.vstack([B_s,B_m])
print(B.shape)

(16, 257)


In [66]:
model_test = NMF(n_components=16, init='random',alpha=0.0,beta_loss='itakura-saito',solver="mu",max_iter=100, random_state=0)
model_test.fit(np.transpose(Yabs_t))
model_test.components_=B
G_test=model_test.transform(np.transpose(Yabs_t))

In [72]:
np.matmul(np.transpose(B),np.transpose(G_test)).shape

(257, 522)

# Estimation of the magnitude of the sources:

In [None]:
percents = pourcentage(n_components,B,G)
Sources=[]

for i in range(n_components):
    
    Sources.append(np.multiply(percents[i],Yabs))
    
print('Source shape = {}'.format(Sources[0].shape))

In [None]:
fig,ax=plt.subplots(1,n_components,figsize=(20,8))

for i in range(n_components):
        
        ax[i].pcolormesh(t, f,Sources[i],vmin=0, vmax=20, shading='gouraud')
        ax[i].set_title('STFT Magnitude')
        ax[i].set_ylabel('Frequency [Hz]')
        ax[i].set_xlabel('Time [sec]')

In [None]:
for i in range(n_components):
    
    _, xrec =  signal.istft(Sources[i],
                          samplerate,
                          window = "hamming",
                          nperseg=480,
                          noverlap=480*0.5,
                          nfft = 512)
    write("../data/example"+str(i)+".wav", samplerate, xrec.astype(np.int16))