In [33]:
import tensorflow.keras.backend as K
import tensorflow.keras as keras
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import pandas as pd
import os
from pathlib import *
import glob
import shutil
import IPython.display as ipd
%pylab
%matplotlib inline

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [43]:
# Dataset locations. These are hard-coded absolute paths, so the notebook only
# runs unchanged on a machine where they exist.
DATA_PATH = Path('/net/store/cv/users/ybrima/scratch/data')
# NOTE(review): this picks whichever entry os.listdir() happens to return last;
# listdir order is arbitrary, so confirm this really resolves to the intended file.
ZIP_PATH =  Path( DATA_PATH, os.listdir(DATA_PATH)[-1])
# Directory whose entries become CLASSES and which get_files() scans for .wav files.
INPUT_PATH = Path('/net/store/cv/users/ybrima/scratch/data/archive/16000_pcm_speeches/')

In [44]:
# Class names are the entries of INPUT_PATH, in os.listdir() order; a class's
# integer label is its position in this list.
CLASSES = os.listdir(INPUT_PATH)
# NOTE(review): "LENGHT" looks like a typo for "LENGTH"; it is not renamed here
# because cells outside this view may reference the current spelling.
FRAME_LENGHT =  1024
# Sampling rate passed to librosa.load() when audio is read in save_array().
SAMPLE_RATE =  16000
HOP_LENGTH =  512
# These two values mirror the default arguments of save_array().
n_fft=2048
num_mfcc=13

In [None]:
def build_model(input_shape,output_shape):
    """Assemble and compile the fully connected speaker classifier.

    The input is flattened and fed through five identical stages
    (Dense with ReLU -> BatchNormalization -> Dropout(0.3)) of width
    32, 64, 128, 64 and 32, followed by a softmax output layer.

    Args:
        input_shape: value forwarded to ``keras.Input(shape=...)``.
        output_shape: number of units in the final softmax layer.

    Returns:
        A ``keras.Model`` named "speaker_model", compiled with RMSprop,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    hidden_widths = (32, 64, 128, 64, 32)

    inputs = keras.Input(shape=input_shape, name="input_layer")
    features = keras.layers.Flatten()(inputs)

    # Layers are created in the same order as an unrolled stack would create them.
    for width in hidden_widths:
        features = keras.layers.Dense(width, activation="relu")(features)
        features = keras.layers.BatchNormalization()(features)
        features = keras.layers.Dropout(0.3)(features)

    outputs = keras.layers.Dense(output_shape, activation="softmax")(features)

    model = keras.Model(inputs=inputs, outputs=outputs, name="speaker_model")
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(),
        metrics=["accuracy"],
    )
    return model

In [31]:
def get_files(INPUT_PATH):
    """Index every .wav file below INPUT_PATH together with its class label.

    Each entry of INPUT_PATH is treated as a class name; its label is the
    entry's position in the ``os.listdir(INPUT_PATH)`` listing. Entries that
    are directories are searched recursively for ``*.wav`` files.

    The class list is derived from the INPUT_PATH argument itself rather than
    from a module-level variable, so the labels always agree with the returned
    class list and the function works for any directory.

    Args:
        INPUT_PATH: directory containing one sub-directory per class.

    Returns:
        A tuple ``(data, classes)`` where ``data`` is a DataFrame with the
        columns ``file`` (path of the .wav file) and ``class`` (integer label),
        and ``classes`` is the list of entry names indexed by label.
    """
    # Non-directory entries stay in the list so label indices match a plain
    # os.listdir() of the same directory.
    classes = os.listdir(INPUT_PATH)

    records = {'file': [], 'class': []}
    for label, entry in enumerate(classes):
        class_dir = Path(INPUT_PATH, entry)
        if not class_dir.is_dir():
            continue
        for wav_path in class_dir.glob("**/*.wav"):
            # glob() already yields paths prefixed with class_dir; joining them
            # onto class_dir again would duplicate the prefix for relative inputs.
            records['file'].append(wav_path)
            records['class'].append(label)

    return pd.DataFrame(records), classes

In [34]:
# Build the file/label index for the dataset; this also rebinds CLASSES to the
# class list returned by get_files().
ds, CLASSES = get_files(INPUT_PATH)

In [53]:
def save_array(df,file_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Load every audio file listed in df, extract MFCCs and save them as .npz.

    The archive holds three arrays:
        x: the raw signals returned by ``librosa.load``.
        q: the MFCC matrix of each signal, transposed.
        y: the ``class`` value of each row of df.

    Args:
        df: DataFrame with a ``file`` column (audio path) and a ``class``
            column (label).
        file_path: destination. If it is an existing directory the archive is
            written to ``<file_path>/speakers.npz``; otherwise it is used as
            the archive file name (``.npz`` is appended when missing).
        num_mfcc: number of MFCC coefficients to compute.
        n_fft: FFT window size forwarded to ``librosa.feature.mfcc``.
        hop_length: hop length forwarded to ``librosa.feature.mfcc``.
        num_segments: currently unused; kept so existing callers keep working.

    Note:
        Assumes all clips yield the same number of samples so the per-file
        results stack into regular arrays -- TODO confirm for this dataset.
    """
    signals = []
    mfccs = []
    labels = []
    for k in range(df.shape[0]):
        row = df.iloc[k]
        # Resample on load so every signal shares the configured rate.
        signal, sample_rate = librosa.load(row['file'], sr=SAMPLE_RATE)

        signals.append(signal)
        # Store this row's label (appending the label list to itself was a bug).
        labels.append(row['class'])

        # Keyword arguments: recent librosa releases no longer accept the
        # signal and sample rate positionally.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=num_mfcc,
                                    n_fft=n_fft, hop_length=hop_length)
        mfccs.append(mfcc.T)

    # Resolve the real output file so the data lands where the message says.
    target = Path(file_path)
    if target.is_dir():
        target = target / "speakers.npz"
    elif target.suffix != ".npz":
        # np.savez would append the extension anyway; do it here so the
        # reported path is the actual one.
        target = target.with_name(target.name + ".npz")

    np.savez(target, x=np.array(signals), q=np.array(mfccs), y=np.array(labels))
    print(f"Data written to storage successfully, path = {target}")

In [None]:
# Extract signals and MFCC features for every file indexed in `ds` and persist
# them as an .npz archive. This loads each audio file, so it can be slow.
save_array(ds,INPUT_PATH)

In [None]:
# NOTE(review): presumably (time frames, MFCC coefficients) per sample -- confirm
# that 30 matches the frame count of the features produced by save_array().
input_shape = (30,13)
# One softmax unit per entry of CLASSES.
output_shape =  len(CLASSES)
model = build_model(input_shape,output_shape)