In [11]:
import numpy as np
import pandas as pd

import os
import argparse
import shutil

from pathlib import Path
from tqdm import tqdm
from scipy.stats import skew

import librosa

# Seed NumPy's legacy global RNG so NumPy-driven randomness repeats across runs.
np.random.seed(1001)

# NOTE(review): presumably meant to silence TensorFlow's C++ logging; that only
# works if the variable is set before TensorFlow is first imported - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Dataset locations, resolved relative to the notebook's working directory.
DATAROOT = Path('../../..') / 'ESC-50'
EXTRA = Path('../../..') / 'fsd2018_extra'

# Feature-extraction constants.
# NOTE(review): none of these are referenced by the cells visible here (the
# config dicts carry their own sampling_rate / n_mfcc) - verify they are used.
SAMPLE_RATE = 16000
NUM_MFCC = 30
FRAME = 512
NUM_PCA = 65

In [12]:
# Load the clip metadata table (one row per audio file) from the dataset's meta folder.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, so the CSV
# presumably carries a saved index; consider index_col=0 if nothing relies on it.
df_train = pd.read_csv(DATAROOT / 'meta' / 'ESC50.csv')

In [21]:
# Distinct category labels among rows whose fold is not 5, in order of first appearance.
# NOTE(review): fold 5 is presumably the held-out fold - confirm.
classes = df_train.loc[df_train['fold'] != 5, 'category'].unique()
classes

array(['dog', 'chirping_birds', 'vacuum_cleaner', 'thunderstorm',
       'door_wood_knock', 'can_opening', 'crow', 'clapping', 'fireworks',
       'chainsaw', 'airplane', 'mouse_click', 'pouring_water', 'train',
       'sheep', 'water_drops', 'church_bells', 'clock_alarm',
       'keyboard_typing', 'wind', 'footsteps', 'frog', 'cow',
       'brushing_teeth', 'car_horn', 'crackling_fire', 'helicopter',
       'drinking_sipping', 'rain', 'insects', 'laughing', 'hen', 'engine',
       'breathing', 'crying_baby', 'hand_saw', 'coughing',
       'glass_breaking', 'snoring', 'toilet_flush', 'pig',
       'washing_machine', 'clock_tick', 'sneezing', 'rooster',
       'sea_waves', 'siren', 'cat', 'door_wood_creaks', 'crickets'],
      dtype=object)

In [13]:
# Preview the first rows of the metadata table.
df_train.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [None]:
def get_config_default():
    """Build the default audio/feature settings for the two approaches.

    Returns:
        list[dict]: ``[confLH, confX]``. Each dict holds the keys ``folder``,
        ``sampling_rate``, ``duration``, ``hop_length``, ``fmin``, ``fmax``,
        ``n_mels``, ``n_mfcc``, ``n_fft`` and ``audio_split``.
    """

    confLH, confX = {}, {}
    confs = [confLH, confX]

    confLH['folder'] = Path('LH')
    confX['folder'] = Path('X')
    # Alternative folder for a 5-second-duration variant: Path('X5')

    # Approach LH parameters: highest resolution
    confLH['sampling_rate'] = 44100
    confLH['duration'] = 4
    confLH['hop_length'] = 882  # 20 ms at 44.1 kHz
    confLH['fmin'] = 20
    confLH['fmax'] = confLH['sampling_rate'] // 2  # Nyquist frequency
    confLH['n_mels'] = 128
    confLH['n_mfcc'] = 30
    confLH['n_fft'] = confLH['n_mels'] * 20
    confLH['audio_split'] = 'head'

    # Approach X parameters: longer sound at a lower sampling rate
    confX['sampling_rate'] = 26000
    confX['duration'] = 5
    confX['hop_length'] = 520  # 20 ms at 26 kHz
    confX['fmin'] = 20
    confX['fmax'] = confX['sampling_rate'] // 2  # Nyquist frequency
    confX['n_mels'] = 48
    # Fixed: this value used to be written to confLH, which overwrote LH's 30
    # and left confX without an 'n_mfcc' entry at all.
    confX['n_mfcc'] = 13
    confX['n_fft'] = confX['n_mels'] * 20
    confX['audio_split'] = 'dont_crop'

    return confs


def get_config(confLH, confX, trying_as_toy=True):
    """Add training settings to both approach configs, in place.

    Args:
        confLH: settings dict for approach LH; mutated and returned.
        confX: settings dict for approach X; mutated and returned.
        trying_as_toy: when truthy, use the quick smoke-run values
            (2 folds, 2 epochs, learning rate 0.001); otherwise the full-run
            values (5 folds, 200 epochs, learning rate 0.0001).

    Returns:
        tuple: ``(confLH, confX)``, the same objects that were passed in.
    """
    if trying_as_toy:
        fold_count, step_size, epoch_count = 2, 0.001, 2
    else:
        fold_count, step_size, epoch_count = 5, 0.0001, 200

    # Only the normalisation mode and the validation limit differ per approach.
    per_approach = (
        (confLH, 'samplewise', 'manually_verified_only'),
        (confX, 'featurewise', None),
    )
    for target, normalize_mode, valid_limit in per_approach:
        target['n_fold'] = fold_count
        target['normalize'] = normalize_mode
        target['valid_limit'] = valid_limit
        target['random_state'] = 42
        target['test_size'] = 0.2
        target['batch_size'] = 32
        target['learning_rate'] = step_size
        target['epochs'] = epoch_count
        target['verbose'] = 1

    return confLH, confX

In [6]:
def get_mfcc_feature(data, conf):
    """ Generate mfcc features with mean and standard deviation
        all librosa features have hop_length=512 by default
    """

    try:
        ft1 = librosa.feature.mfcc(data, sr=conf['sampling_rate'], n_mfcc=conf['n_mfcc'])
        ft2 = librosa.feature.zero_crossing_rate(data, hop_length=conf['hop_length'])[0]
        ft3 = librosa.feature.spectral_rolloff(data, sr=conf['sampling_rate'], hop_length=conf['hop_length'])[0]
        ft4 = librosa.feature.spectral_centroid(data, sr=conf['sampling_rate'], hop_length=conf['hop_length'])[0]
        ft5 = librosa.feature.spectral_contrast(data, sr=conf['sampling_rate'], n_bands=6, fmin=200.0)[0]
        ft6 = librosa.feature.spectral_bandwidth(data, sr=conf['sampling_rate'], hop_length=conf['hop_length'])[0]
        ft1_trunc = np.hstack((np.mean(ft1, axis=1),
                               np.std(ft1, axis=1),
                               skew(ft1, axis=1),
                               np.max(ft1, axis=1),
                               np.median(ft1, axis=1),
                               np.min(ft1, axis=1)))
        ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
        ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
        ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
        ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
        ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
        return pd.Series(np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc)))
    except Exception as error:
        print('bad file', error)
        return pd.Series([0] * 210)