# Feature selection example
## Extract audio features 
Use the `pyAudioAnalysis` library to extract audio features for all WAV files in the folders `audio/music` and `audio/speech`. The features are cached in a pickle file (`data2.pkl`) so that they are not recomputed every time this notebook is executed. 

In [6]:
import os
from pyAudioAnalysis import MidTermFeatures as mtf
from pyAudioAnalysis import audioTrainTest as at
import numpy as np
import pickle
import os.path

# Load the normalised feature matrix X, the labels y and the feature names
# from the pickle cache if possible; otherwise extract them from the WAV
# folders and (re)write the cache.
CACHE_FILE = 'data2.pkl'

X = y = feature_names = None
if os.path.isfile(CACHE_FILE):
    # if features already calculated --> load
    # NOTE: pickle can execute arbitrary code on load; only use a cache file
    # that this notebook produced itself.
    try:
        with open(CACHE_FILE, 'rb') as f:
            X = pickle.load(f)
            y = pickle.load(f)
            # Older caches stored only X and y; feature_names is needed by
            # the feature-selection cell, so a cache without it is stale.
            feature_names = pickle.load(f)
    except (EOFError, pickle.UnpicklingError):
        # Truncated / empty / old-format cache: fall through and recompute.
        X = y = feature_names = None

if feature_names is None:
    # if features not already calculated: extract features from scratch
    dirs = ['audio/music', 'audio/speech']
    # extract features from directories of WAV files
    # (the numeric arguments are presumably mid-term window/step and
    # short-term window/step — confirm against the pyAudioAnalysis docs):
    mid_term_features = []
    for audio_dir in dirs:
        dir_features, _, feature_names = mtf.directory_feature_extraction(audio_dir, 1, 1, 0.1, 0.1)
        mid_term_features.append(dir_features)
    # convert list of feature matrices to x, y format:
    x, y = at.features_to_matrix(mid_term_features)
    # standardise each feature column to zero mean / unit variance
    m = x.mean(axis=0)
    s = np.std(x, axis=0)
    # a constant column has zero std; divide by 1 instead to avoid NaN/inf
    s = np.where(s == 0, 1.0, s)
    X = (x - m) / s
    # Write the cache only after extraction succeeded, so a failure above
    # never leaves an empty cache file behind.
    with open(CACHE_FILE, 'wb') as f:
        pickle.dump(X, f)
        pickle.dump(y, f)
        pickle.dump(feature_names, f)
print(X.shape)

(40, 138)


## Feature selection

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from itertools import compress
# Fit a logistic-regression model on the features and let SelectFromModel
# decide which columns to keep (no threshold is passed, so its default applies).
sfm_selector = SelectFromModel(estimator=LogisticRegression())
sfm_selector.fit(X, y)
# get_support() is converted to a list of flags, one per feature; keep the
# names whose flag is truthy.
# NOTE(review): compress() stops at the shorter input, so feature_names is
# assumed to line up one-to-one with the columns of X — confirm.
list_of_selected_features = list(compress(feature_names, sfm_selector.get_support().tolist()))
# Show the selected feature names as the cell output.
list_of_selected_features

['zcr_mean',
 'energy_mean',
 'energy_entropy_mean',
 'spectral_centroid_mean',
 'spectral_entropy_mean',
 'spectral_rolloff_mean',
 'mfcc_1_mean',
 'mfcc_2_mean',
 'mfcc_3_mean',
 'mfcc_4_mean',
 'mfcc_5_mean',
 'mfcc_6_mean',
 'mfcc_8_mean',
 'mfcc_10_mean',
 'mfcc_12_mean',
 'chroma_1_mean',
 'chroma_3_mean',
 'chroma_6_mean',
 'chroma_7_mean',
 'chroma_10_mean',
 'chroma_11_mean',
 'chroma_std_mean',
 'delta mfcc_6_mean',
 'delta mfcc_9_mean',
 'delta chroma_11_mean',
 'zcr_std',
 'energy_std',
 'energy_entropy_std',
 'spectral_centroid_std',
 'spectral_spread_std',
 'spectral_entropy_std',
 'spectral_rolloff_std',
 'mfcc_1_std',
 'mfcc_2_std',
 'mfcc_3_std',
 'mfcc_4_std',
 'mfcc_5_std',
 'mfcc_6_std',
 'mfcc_7_std',
 'mfcc_10_std',
 'chroma_3_std',
 'chroma_5_std',
 'chroma_6_std',
 'chroma_7_std',
 'delta zcr_std',
 'delta energy_std',
 'delta energy_entropy_std',
 'delta spectral_centroid_std',
 'delta spectral_entropy_std',
 'delta spectral_rolloff_std',
 'delta mfcc_1_std',
 

['zcr_mean',
 'energy_mean',
 'energy_entropy_mean',
 'spectral_centroid_mean',
 'spectral_entropy_mean',
 'spectral_rolloff_mean',
 'mfcc_1_mean',
 'mfcc_2_mean',
 'mfcc_3_mean',
 'mfcc_4_mean',
 'mfcc_5_mean',
 'mfcc_6_mean',
 'mfcc_8_mean',
 'mfcc_10_mean',
 'mfcc_12_mean',
 'chroma_1_mean',
 'chroma_3_mean',
 'chroma_6_mean',
 'chroma_7_mean',
 'chroma_10_mean',
 'chroma_11_mean',
 'chroma_std_mean',
 'delta mfcc_6_mean',
 'delta mfcc_9_mean',
 'delta chroma_11_mean',
 'zcr_std',
 'energy_std',
 'energy_entropy_std',
 'spectral_centroid_std',
 'spectral_spread_std',
 'spectral_entropy_std',
 'spectral_rolloff_std',
 'mfcc_1_std',
 'mfcc_2_std',
 'mfcc_3_std',
 'mfcc_4_std',
 'mfcc_5_std',
 'mfcc_6_std',
 'mfcc_7_std',
 'mfcc_10_std',
 'chroma_3_std',
 'chroma_5_std',
 'chroma_6_std',
 'chroma_7_std',
 'delta zcr_std',
 'delta energy_std',
 'delta energy_entropy_std',
 'delta spectral_centroid_std',
 'delta spectral_entropy_std',
 'delta spectral_rolloff_std',
 'delta mfcc_1_std',
 