In [None]:
import librosa  as lr
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import soundfile


In [None]:
def normalization(data):
    """Squash every feature column into the 0..1 range via a scaled tanh.

    Each column is standardised with its own mean and standard deviation
    (statistics taken along axis 0) and the z-score ``z`` is mapped through
    ``0.5 * (tanh(0.01 * z) + 1)``.

    Parameters
    ----------
    data : array_like
        Values to normalise; axis 0 is the axis the statistics are taken over.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``data``. A column whose values are all
        identical (zero standard deviation) maps to exactly 0.5 instead of NaN.
    """
    data = np.asarray(data)
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # A constant column has std == 0, which would produce 0 / 0 = NaN and
    # poison everything computed from it. Dividing by 1 instead keeps the
    # z-score at 0, i.e. the midpoint 0.5 of the output range.
    safe_std = np.where(std == 0, np.ones_like(std), std)
    return 0.5 * (np.tanh(0.01 * ((data - mean) / safe_std)) + 1)

In [None]:
def feature_extraction(mixture,vocal):
  """Build context-windowed mixture features and the matching mask targets.

  Both files are loaded at 16 kHz, converted to magnitude spectrograms
  (512-point STFT, hop 256), normalised with ``normalization`` and combined
  into a ratio mask ``vocal / mixture``. The normalised mixture spectrogram
  is then edge-padded and cut into overlapping windows of 21 frames, one
  window centred on every original frame.

  Parameters
  ----------
  mixture : path accepted by ``librosa.load``
      Audio file containing the full mix.
  vocal : path accepted by ``librosa.load``
      Audio file containing the isolated vocal of the same track.

  Returns
  -------
  mixture_data : numpy.ndarray
      Context windows, shape ``(n_frames, 21, n_bins)``.
  mask : numpy.ndarray
      Target mask for the centre frame of each window, shape
      ``(n_frames, n_bins)``.
  """
  sample_rate = 16000
  stft_args   = dict(n_fft=512, hop_length=256, win_length=512)
  n_context   = 21               # frames per input window
  n_pad       = n_context // 2   # context frames on each side of the centre

  # loading the data (the returned sample rate equals the requested one)
  mixture_wave, _ = lr.load(mixture, sr=sample_rate)
  vocal_wave, _   = lr.load(vocal, sr=sample_rate)

  # magnitude of the STFT -> spectrogram, laid out as (frames, bins)
  mixture_data = np.abs(lr.stft(mixture_wave, **stft_args)).T
  vocal_data   = np.abs(lr.stft(vocal_wave, **stft_args)).T

  # If the two files differ in length the windows and the mask would end up
  # with different frame counts; keep only the frames both files share.
  n_frames     = min(len(mixture_data), len(vocal_data))
  mixture_data = mixture_data[:n_frames]
  vocal_data   = vocal_data[:n_frames]

  # normalisation of the data
  mixture_data = normalization(mixture_data)
  vocal_data   = normalization(vocal_data)

  # generating the mask; eps keeps the denominator away from zero
  mask = np.divide(vocal_data, mixture_data + np.finfo(float).eps)

  # Repeat the first/last frame so every original frame can sit at the
  # centre of a full window.
  padded = np.pad(mixture_data, ((n_pad, n_pad), (0, 0)), mode="edge")

  # Row i of frame_idx lists the padded-frame indices of window i, so a single
  # fancy-indexing step gathers all windows: (n_frames, n_context, n_bins).
  frame_idx    = np.arange(n_frames)[:, np.newaxis] + np.arange(n_context)
  mixture_data = padded[frame_idx]

  return mixture_data, mask

In [None]:
# Gather context windows and target masks for training tracks 1..50.
features_per_track = []
masks_per_track = []
for track in range(1, 51):
  track_id = str(track)
  mixture_path = "C:\\Users\\sseela\\Downloads\\Training\\mixture\\mixture" + track_id + ".wav"
  vocal_path = "C:\\Users\\sseela\\Downloads\\Training\\Vocals\\vocal" + track_id + ".wav"
  track_features, track_mask = feature_extraction(mixture_path, vocal_path)
  features_per_track.append(track_features)
  masks_per_track.append(track_mask)

# Stack the per-track arrays along the frame axis to form the dataset:
# x holds the model inputs, y the corresponding masks.
x = np.vstack(features_per_track)
y = np.vstack(masks_per_track)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
# Two stacked LSTMs read a window of 21 frames x 257 bins; the dense layer
# emits one 257-bin mask per window.
model = tf.keras.Sequential([
    LSTM(128, input_shape=(21, 257), return_sequences=True),
    LSTM(128, return_sequences=False),
    # NOTE(review): tanh limits predictions to (-1, 1), while the training
    # targets are non-negative ratios that can exceed 1 - confirm that this
    # output activation is intended.
    Dense(257, activation='tanh'),
])
# The target is a continuous mask, so 'accuracy' (a classification metric)
# is not meaningful here; track mean squared error next to the MAE loss.
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_squared_error'])
model.summary()

In [None]:
# Train on the stacked context windows (x) against their target masks (y).
# Keras spells these arguments `epochs` and `batch_size`; the previous
# `epoch` / `batchsize` keywords are rejected with a TypeError.
model.fit(x, y, epochs=100, batch_size=256, verbose=1)

In [None]:
def feature_extraction_test(stft_data, n_context=21):
  """Cut a (frames, bins) array into overlapping context windows.

  The array is edge-padded along the frame axis (first and last frame
  repeated) so that exactly one window is produced per input frame, with
  that frame at index ``n_context // 2`` of its window.

  Parameters
  ----------
  stft_data : array_like
      Frame-major data; axis 0 is the frame axis.
  n_context : int, optional
      Number of frames per window. Defaults to 21.

  Returns
  -------
  numpy.ndarray
      Array of shape ``(n_frames, n_context, ...)`` holding one window per
      input frame. An input without frames yields an empty array of that
      shape.

  Raises
  ------
  ValueError
      If ``n_context`` is smaller than 1.
  """
  if n_context < 1:
    raise ValueError("n_context must be a positive integer")

  stft_data = np.asarray(stft_data)
  n_frames  = len(stft_data)
  if n_frames == 0:
    # Nothing to pad from; return an empty batch with the right trailing shape.
    return np.empty((0, n_context) + stft_data.shape[1:], dtype=stft_data.dtype)

  # Split the n_context - 1 context frames around the centre. For odd
  # n_context both sides are equal; for even n_context the left side gets the
  # extra frame, which still yields exactly one window per input frame.
  pad_before = n_context // 2
  pad_after  = n_context - 1 - pad_before

  pad_width = [(pad_before, pad_after)] + [(0, 0)] * (stft_data.ndim - 1)
  padded    = np.pad(stft_data, pad_width, mode="edge")

  # Row i of frame_idx lists the padded-frame indices belonging to window i.
  frame_idx = np.arange(n_frames)[:, np.newaxis] + np.arange(n_context)
  return padded[frame_idx]

In [None]:
def test_SMM(mixture, output_path='C:\\Python\\Python38\\ma.wav'):
    """Apply the trained mask model to a mixture and write the result as audio.

    The mixture is loaded at 16 kHz and transformed with the same STFT
    settings used for training. Its normalised magnitude spectrogram is cut
    into context windows and fed to the module-level ``model``; the predicted
    mask is multiplied onto the complex STFT, which is then inverted and
    written to ``output_path``.

    Parameters
    ----------
    mixture : path accepted by ``librosa.load``
        Audio file to separate. (This argument used to be overwritten by a
        hard-coded path inside the function.)
    output_path : str, optional
        Destination of the separated audio; defaults to the location that was
        previously hard-coded.
    """
    mixture_wave, sr = lr.load(mixture, sr=16000)
    mixture_stft     = lr.stft(mixture_wave, n_fft=512, hop_length=256, win_length=512)

    # Work in (frames, bins) layout, matching both the training features and
    # the shape of the model output.
    mixture_stft = np.transpose(mixture_stft)

    # Normalise the magnitude (not the complex STFT), as done for training.
    mixture_stft_norm = normalization(np.absolute(mixture_stft))
    mask_generation   = model.predict(feature_extraction_test(mixture_stft_norm))

    # Scaling the complex STFT by the real-valued mask keeps the mixture phase.
    # NOTE(review): the mask was trained as a ratio of *normalised* magnitudes
    # but is applied to the raw STFT here - confirm this is intended.
    masked_stft = mixture_stft * mask_generation

    # istft expects (bins, frames), so undo the transpose first.
    separated = lr.istft(np.transpose(masked_stft), hop_length=256, win_length=512)
    soundfile.write(output_path, separated, sr)
