# Q2 - Preprocessing Pipeline for the MUSAN Corpus

## Import Libraries

In [None]:
import os
import h5py
import librosa
import python_speech_features as psf
import numpy as np
import pickle
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')

## Pipeline

In [None]:
def _strip_silence(signal, sr):
    """Return `signal` with its silent stretches cut out.

    Silence is detected by `librosa.effects.split` on non-overlapping 25 ms
    frames with `top_db=40`. An empty array is returned when no non-silent
    interval is found.
    """
    frame = int(0.025 * sr)
    inter = librosa.effects.split(signal, top_db=40, frame_length=frame, hop_length=frame)
    if len(inter) == 0:
        # np.concatenate raises on an empty list, so short-circuit here
        return signal[:0]
    return np.concatenate([signal[start:end] for start, end in inter], 0)


def _equalise_loudness(sig, sr):
    """Scale each non-overlapping 250 ms frame of `sig` to unit standard deviation."""
    frame_len = .25 * sr
    frms = psf.sigproc.framesig(sig, frame_len, frame_len)
    std = frms.std(axis=-1, keepdims=True)
    # A constant (e.g. all-zero) frame has std == 0; leave it unscaled instead
    # of writing NaN/inf into the signal.
    std[std == 0] = 1.0
    # NOTE(review): framesig appears to zero-pad the last frame, which lowers
    # its std and so boosts the tail of the signal more — confirm if this matters.
    return psf.sigproc.deframesig(frms / std, siglen=len(sig), frame_len=frame_len, frame_step=frame_len)


def extract_features(root_path, target_path, clip_len = 0.5, names_path = None):
    """Cut every .wav file under `root_path` into clips and store their MFCCs in an HDF5 file.

    Each file found (recursively) in the 'noise', 'music' and 'speech'
    sub-directories of `root_path` is loaded at 16 kHz, stripped of silence,
    loudness-equalised and split into clips of `clip_len` seconds that
    overlap by half a clip. For every full-length clip a group named
    '<clip index>-<wav file name>' is created in the HDF5 file, holding
    a float32 'mfcc' dataset and an integer 'label' dataset
    (0 = noise, 1 = music, 2 = speech). The list of group names is pickled
    to `names_path`.

    Parameters
    ----------
    root_path : str or Path
        Directory containing the 'noise', 'music' and 'speech' folders.
    target_path : str or Path
        HDF5 file to create. If it already exists the user is asked whether
        to remove it; any answer other than 'y' aborts without touching it.
    clip_len : float
        Clip length in seconds.
    names_path : str or Path, optional
        Where to pickle the list of group names. Defaults to
        "../data/f_list_{clip_len}s.txt".

    Raises
    ------
    ValueError
        If `clip_len` is too short to give a hop of at least one sample.
    """
    sr = 16000

    # Work in whole samples: comparing len(clip) against the float clip_len*sr
    # silently rejects every clip when the product is not an integer.
    clip_samples = int(clip_len * sr)
    hop = clip_samples // 2
    if hop < 1:
        raise ValueError('clip_len = {} is too short for a {} Hz signal'.format(clip_len, sr))

    root_path = Path(root_path)
    target_path = Path(target_path)

    if target_path.exists():
        if input('Target path exists... REMOVE? [Y/N] :').strip().lower()=='y':
            os.remove(str(target_path))
        else:
            # Opening with mode 'w' below would truncate the file anyway,
            # so declining has to stop here to actually preserve it.
            print('Target file kept, nothing extracted.')
            return

    catg = ['noise', 'music', 'speech']
    labeldict = {x : i for i, x in enumerate(catg)}
    fdict = {c : list((root_path/c).glob('**/*.wav')) for c in catg}

    names = []

    with h5py.File(target_path, mode = 'w') as fl:

        for key, files in fdict.items():
            print('\nProcessing', key, 'files: total =', len(files))
            for i, file in enumerate(files):
                if not (i+1) % 5:
                    print(key.upper(), 'File', i+1, 'of', len(files))

                # Load the audio file, resampled to sr
                signal, _ = librosa.load(file, sr = sr)

                sig = _strip_silence(signal, sr)
                if len(sig) == 0:
                    continue
                sig = _equalise_loudness(sig, sr)

                # Segment into clips with 50% overlap; only start positions
                # that leave room for a full clip are visited.
                starts = range(0, len(sig) - clip_samples + 1, hop)
                for count, x in enumerate(starts):
                    clip = sig[x : x + clip_samples]
                    mfcc = psf.mfcc(clip, samplerate=sr, numcep=20, nfilt=32, winlen=0.025, winstep=0.015).astype(np.float32)

                    # file.name is the bare file name on every platform
                    name = '{}-{}'.format(count, file.name)
                    names.append(name)
                    grp = fl.create_group(name)
                    grp['mfcc'] = mfcc
                    grp['label'] = labeldict[key]

    if names_path is None:
        names_path = "../data/f_list_{}s.txt".format(clip_len)

    print("Saving file name list")
    with open(names_path, "wb") as fp:
        pickle.dump(names, fp)

## Main Function

In [None]:
# Toy dataset: run the pipeline on the musan_toy directory with clip_len = 2

root_path = Path("../data/musan_toy")

extract_features(
    root_path,
    target_path='../data/musan_toy_dataset_2s.h5',
    clip_len=2,
)

In [None]:
# Full dataset: one extraction run, and one output file, per clip length

root_path = Path("../data/musan")

clip_lengths = [0.5, 1, 2]

for clip in clip_lengths:
    # The clip length is embedded in the output file name
    target_path = '../data/musan_data_derived_{}s.h5'.format(clip)
    extract_features(root_path, target_path=target_path, clip_len=clip)

In [None]:
# Check: show the 'mfcc' dataset stored for one clip of the toy dataset

# Open read-only inside a context manager so the HDF5 handle is closed again
# instead of staying open for the rest of the session.
with h5py.File('../data/musan_toy_dataset_2s.h5', mode = 'r') as db:
    # Print while the file is still open; a dataset of a closed file
    # can no longer be inspected.
    print(db['0-music-fma-0000.wav']['mfcc'])