# Q4 - Preprocessing Pipeline for the GTZAN Corpus

## Import Libraries

In [5]:
import json
import os
import pickle
import tempfile
from pathlib import Path

import h5py
import librosa
import numpy as np
import python_speech_features as psf

import warnings
warnings.filterwarnings('ignore')

# logmmse is a separate third-party package; install it before running this cell:
#
#     pip install logmmse

from logmmse import logmmse_from_file, logmmse

In [6]:
def _strip_silence(signal, sr, top_db=40):
    """Concatenate the intervals librosa reports as non-silent; None if there are none."""
    intervals = librosa.effects.split(signal, top_db=top_db,
                                      frame_length=int(0.025*sr), hop_length=int(0.015*sr))
    if len(intervals) == 0:
        return None
    return np.concatenate([signal[start:end] for start, end in intervals], 0)


def _equalise_loudness(sig, sr, frame_sec=.25):
    """Divide each non-overlapping `frame_sec` frame by its own standard deviation."""
    frame_len = frame_sec*sr
    frames = psf.sigproc.framesig(sig, frame_len, frame_len)
    spread = frames.std(axis=-1, keepdims=True)
    # A constant frame has zero deviation; leave it untouched instead of
    # producing NaN/inf that would silently poison the MFCCs.
    spread[spread == 0] = 1.0
    return psf.sigproc.deframesig(frames/spread, siglen=len(sig),
                                  frame_len=frame_len, frame_step=frame_len)


def extract_features(root_path, target_path, clip_len = 0.5):
    """Build an HDF5 file of MFCC clips from the music/speech recordings.

    Every ``*.wav`` found under ``<root_path>/music_wav`` and
    ``<root_path>/speech_wav`` is denoised with LogMMSE, loaded at 16 kHz,
    stripped of silence, loudness-equalised and cut into half-overlapping
    clips of ``clip_len`` seconds. Each full-length clip becomes one HDF5
    group named ``'<clip index>-<file name>'`` holding an ``mfcc`` dataset
    (float32) and an integer ``label`` (noise=0, music=1, speech=2).

    The source recordings are never modified: the denoised audio is written
    to a scratch file that is deleted when the function finishes.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Directory containing the ``music_wav`` and ``speech_wav`` folders.
    target_path : str or pathlib.Path
        HDF5 file to create. If it already exists the user is asked whether
        to replace it; any answer other than ``y`` leaves it untouched and
        returns without extracting anything.
    clip_len : float, optional
        Clip duration in seconds (default 0.5).

    Raises
    ------
    ValueError
        If ``clip_len`` is too small to give a clip of at least two samples.

    Side effects
    ------------
    Besides ``target_path``, the list of group names is pickled to
    ``../data/gtzan_f_list_<clip_len>s.txt`` (relative to the working
    directory; a pickle despite the extension).
    """
    sample_rate = 16000
    # One integer clip size is used for slicing, hopping and the length check,
    # so a non-integer clip_len*sr can no longer reject every clip.
    clip_samples = int(clip_len * sample_rate)
    hop_samples = clip_samples // 2
    if hop_samples < 1:
        raise ValueError('clip_len={!r} is too short to form a clip'.format(clip_len))

    root_path = Path(root_path)
    target_path = Path(target_path)

    if target_path.exists():
        if input('Target path exists... REMOVE? [Y/N] :').lower() != 'y':
            # Honour the refusal: opening with mode 'w' would wipe the file.
            print('Keeping existing file, nothing extracted:', str(target_path))
            return
        # No explicit delete needed: mode 'w' below truncates the file.

    catg = ['music', 'speech']
    labeldict = {x : i for i, x in enumerate(['noise', 'music', 'speech'])}
    roots = {c: root_path/'{}_wav'.format(c) for c in catg}
    fdict = {k: list(roots[k].glob('**/*.wav')) for k in roots}

    names = []

    with tempfile.TemporaryDirectory() as scratch_dir, \
            h5py.File(target_path, mode = 'w') as fl:

        denoised_path = str(Path(scratch_dir)/'denoised.wav')

        for key, files in fdict.items():
            print('\nProcessing', key, 'files: total =', len(files))
            for i, file in enumerate(files):
                if (i+1) % 5 == 0:
                    print(key.upper(), 'File', i+1, 'of', len(files))

                # Remove background noise with LogMMSE. The result goes to a
                # scratch file so the corpus is not overwritten and repeated
                # runs do not denoise already-denoised audio.
                logmmse_from_file(file, denoised_path)

                # Load the denoised audio, resampled to the working rate
                signal, sr = librosa.load(denoised_path, sr = sample_rate)

                # Start by removing silence
                sig = _strip_silence(signal, sr)
                if sig is None:
                    print('Skipping', str(file), '- no non-silent audio found')
                    continue

                # Equalise loudness
                sig = _equalise_loudness(sig, sr)

                # Segment files into half-overlapping clips; `count` numbers
                # every start position, including the short trailing ones
                # that are dropped.
                for count, x in enumerate(range(0, len(sig), hop_samples)):

                    clip = sig[x : x + clip_samples]
                    if len(clip) != clip_samples:
                        continue

                    mfcc = psf.mfcc(clip, samplerate=sr, numcep=20, nfilt=32,
                                    winlen=0.025, winstep=0.015).astype(np.float32)

                    # file.name is the bare file name on every platform
                    name = '{}-{}'.format(str(count), file.name)
                    names.append(name)
                    grp = fl.create_group(name)
                    grp['mfcc'] = mfcc
                    grp['label'] = labeldict[key]

    print("Saving file name list\n")
    with open("../data/gtzan_f_list_{}s.txt".format(clip_len), "wb") as fp:
        pickle.dump(names, fp)

## Main Method

In [7]:
# Folder holding the music_wav / speech_wav recordings, relative to the notebook.
root_path = Path("../data/music_speech")

# Clip durations in seconds; each one gets its own HDF5 feature file.
clip_lens = [0.5, 1, 2]
target_template = '../data/gtzan_dataset_{}s.h5'

for clip in clip_lens:
    target_file = target_template.format(clip)
    extract_features(root_path, target_file, clip_len=clip)

Target path exists... REMOVE? [Y/N] :y

Processing music files: total = 64
MUSIC File 5 of 64
MUSIC File 10 of 64
MUSIC File 15 of 64
MUSIC File 20 of 64
MUSIC File 25 of 64
MUSIC File 30 of 64
MUSIC File 35 of 64
MUSIC File 40 of 64
MUSIC File 45 of 64
MUSIC File 50 of 64
MUSIC File 55 of 64
MUSIC File 60 of 64

Processing speech files: total = 64
SPEECH File 5 of 64
SPEECH File 10 of 64
SPEECH File 15 of 64
SPEECH File 20 of 64
SPEECH File 25 of 64
SPEECH File 30 of 64
SPEECH File 35 of 64
SPEECH File 40 of 64
SPEECH File 45 of 64
SPEECH File 50 of 64
SPEECH File 55 of 64
SPEECH File 60 of 64
Saving file name list
Target path exists... REMOVE? [Y/N] :y

Processing music files: total = 64
MUSIC File 5 of 64
MUSIC File 10 of 64
MUSIC File 15 of 64
MUSIC File 20 of 64
MUSIC File 25 of 64
MUSIC File 30 of 64
MUSIC File 35 of 64
MUSIC File 40 of 64
MUSIC File 45 of 64
MUSIC File 50 of 64
MUSIC File 55 of 64
MUSIC File 60 of 64

Processing speech files: total = 64
SPEECH File 5 of 64
SPEEC