# In [1]:
import essentia
import essentia.standard as es
import glob
import librosa
import soundfile as sf
from essentia.standard import *
import numpy as np
import ntpath
import pandas as pd
import logging
from multiprocessing import Pool
import traceback
# Write log records of level DEBUG and above to audio_feature_extraction.log.
logging.basicConfig(
    level=logging.DEBUG,
    filename='audio_feature_extraction.log',
)

import argparse

# Command-line interface: the input directories are mandatory, the output
# path falls back to a default.
parser = argparse.ArgumentParser(description='Script to create audio features')
parser.add_argument(
    '--in_wav_dirs',
    required=True,
    nargs='+',
    help='Video directories that contains wav files',
)
parser.add_argument(
    '--csv_path',
    default="csv/audio_features",
    type=str,
    help='.csv file contain audio features',
)

# Parsed once at module level; the two values below are what the rest of the
# script consumes.
args = parser.parse_args()
in_wav_dirs, csv_path = args.in_wav_dirs, args.csv_path

def get_audio_features(f):
    """Run Essentia's MusicExtractor on one file and keep its non-array values.

    The extractor is configured to compute only the 'mean' and 'stdev'
    statistics for the low-level, rhythm and tonal descriptor groups. It
    returns two values; only the first one is used here.

    Args:
        f: Audio file passed to the extractor.

    Returns:
        dict mapping descriptor name to value, excluding every descriptor
        whose name contains "metadata" and every descriptor whose value is a
        numpy array.
    """
    extractor = es.MusicExtractor(
        lowlevelStats=['mean', 'stdev'],
        rhythmStats=['mean', 'stdev'],
        tonalStats=['mean', 'stdev'],
    )
    stats, _frames = extractor(f)

    info = {}
    for name in stats.descriptorNames():
        if "metadata" in name:
            continue
        value = stats[name]
        # Array-valued descriptors do not fit a single table cell; skip them.
        if isinstance(value, np.ndarray):
            continue
        info[name] = value

    return info

def task(file, ID):
    """Extract the features of one file, turning any failure into a sentinel.

    Takes ``(file, ID)`` as separate positional arguments so it can be mapped
    with ``Pool.starmap``.

    Args:
        file: Audio file handed to ``get_audio_features``.
        ID: Identifier returned together with the features on success.

    Returns:
        ``(features, ID)`` when extraction succeeds, ``(None, None)`` when it
        raises. The failure is logged and never propagated, so one bad file
        does not abort the whole batch.
    """
    try:
        feature = get_audio_features(file)
    except Exception:
        # Name the offending file: a bare traceback cannot be matched to its
        # input. logging.exception appends the traceback to the record.
        logging.exception("failed to process file %s", file)
        return (None, None)

    logging.info("processed file {}".format(file))
    return (feature, ID)

    
def get_all_audio_features_parallel(list_dir, out_csv, processes=7):
    """Extract features for every ``*.wav`` file in ``list_dir`` and save a CSV.

    Each file is processed by ``task`` in a worker pool. Files whose
    extraction failed (``task`` returned ``None`` features) are left out of
    the table. The CSV has an ``ID`` column followed by one column per key of
    the first successful feature dict.

    Args:
        list_dir: Iterable of directory paths; only files directly inside
            each directory that match ``*.wav`` are picked up.
        out_csv: Path of the CSV file to write. Its parent directory is
            created if it does not exist.
        processes: Number of worker processes for the pool (default 7).

    Returns:
        None. If no wav file is found, or no file could be processed, a
        warning is logged and no CSV is written.
    """
    import os  # local import: the file's import block does not provide os

    files = []
    for directory in list_dir:
        files += glob.glob(directory + "/*.wav")

    if not files:
        logging.warning("no .wav files found in %s; nothing written", list_dir)
        return

    # Make sure the destination exists before the expensive extraction, not
    # after it, so a bad output path cannot throw away the whole run.
    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The ID is the file name with every ".mp3" / ".wav" occurrence removed.
    IDs = [ntpath.basename(f).replace(".mp3", "").replace(".wav", "") for f in files]

    with Pool(processes=processes) as pool:
        results = pool.starmap(task, zip(files, IDs))

    succeeded = [pair for pair in results if pair[0] is not None]
    if not succeeded:
        # Unpacking zip(*[]) would raise an unhelpful ValueError here.
        logging.warning("none of the %d files could be processed; nothing written", len(files))
        return

    features, ids = zip(*succeeded)

    # Columns follow the keys of the first successful file.
    data = {"ID": list(ids)}
    for k in features[0]:
        data[k] = [feature[k] for feature in features]

    pd.DataFrame(data=data).to_csv(out_csv)
    
# Only start the extraction when this file is the program entry point. Under
# multiprocessing's "spawn" start method each worker re-imports this module;
# without the guard every worker would try to launch the pipeline again.
if __name__ == "__main__":
    get_all_audio_features_parallel(list_dir=in_wav_dirs, out_csv=csv_path)
# Example of a direct call (out_csv is a required argument):
# get_all_audio_features_parallel(["/data/zalo/hit-song-prediction/train-wav-samples", "/data/zalo/hit-song-prediction/test-wav-samples"], out_csv="csv/audio_features")