# Feature selection example
## Extract audio features 
Use the `pyAudioAnalysis` library to extract audio features for all WAV files in the folders `audio/music` and `audio/speech`. A pickle cache file stores the extracted features so that they are not recomputed every time this notebook is executed.

In [11]:
import os
from pyAudioAnalysis import MidTermFeatures as mtf
from pyAudioAnalysis import audioTrainTest as at
import numpy as np
import pickle
import os.path

# Load the audio features from the pickle cache when it exists; otherwise
# extract them from the WAV folders, standardize them and write the cache.
CACHE_PATH = 'data2.pkl'

if os.path.isfile(CACHE_PATH):
    # if features already calculated --> load
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that was produced by this notebook.
    with open(CACHE_PATH, 'rb') as f:
        X = pickle.load(f)
        y = pickle.load(f)
        mid_term_features = pickle.load(f)
        feature_names = pickle.load(f)
else:
    # if features not already calculated: extract features from scratch
    dirs = ['audio/music', 'audio/speech']
    # extract features from directories of WAV files
    # (numeric arguments are presumably the mid-term window/step and the
    # short-term window/step, in seconds -- TODO confirm against pyAudioAnalysis)
    f1, _, feature_names = mtf.directory_feature_extraction(dirs[0], 1, 1, 0.1, 0.1)
    f2, _, feature_names = mtf.directory_feature_extraction(dirs[1], 1, 1, 0.1, 0.1)
    mid_term_features = [f1, f2]
    # convert list of feature matrices to x, y format:
    x, y = at.features_to_matrix(mid_term_features)
    # standardize every feature column to zero mean / unit variance
    m = x.mean(axis=0)
    s = np.std(x, axis=0)
    # a constant feature has zero std; divide by 1 instead so it maps to 0
    # rather than NaN/inf
    s[s == 0] = 1.0
    X = (x - m) / s
    # Open the cache only after extraction succeeded, so that a failure above
    # never leaves an empty/corrupt cache file behind for the next run.
    with open(CACHE_PATH, 'wb') as f:
        pickle.dump(X, f)
        pickle.dump(y, f)
        pickle.dump(mid_term_features, f)
        pickle.dump(feature_names, f)
print(X.shape)

(40, 138)


## Feature selection

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from itertools import compress
# Fit a selector driven by a logistic-regression model on the standardized
# features and their class labels.
sfm_selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
# get_support() gives one boolean per feature; keep the names flagged True.
selected_mask = sfm_selector.get_support()
list_of_selected_features = [
    name for name, is_selected in zip(feature_names, selected_mask) if is_selected
]
print(list_of_selected_features)

['zcr_mean', 'energy_mean', 'energy_entropy_mean', 'spectral_centroid_mean', 'spectral_entropy_mean', 'spectral_rolloff_mean', 'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean', 'mfcc_8_mean', 'mfcc_10_mean', 'mfcc_12_mean', 'chroma_1_mean', 'chroma_3_mean', 'chroma_6_mean', 'chroma_7_mean', 'chroma_10_mean', 'chroma_11_mean', 'chroma_std_mean', 'delta mfcc_6_mean', 'delta mfcc_9_mean', 'delta chroma_11_mean', 'zcr_std', 'energy_std', 'energy_entropy_std', 'spectral_centroid_std', 'spectral_spread_std', 'spectral_entropy_std', 'spectral_rolloff_std', 'mfcc_1_std', 'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std', 'mfcc_5_std', 'mfcc_6_std', 'mfcc_7_std', 'mfcc_10_std', 'chroma_3_std', 'chroma_5_std', 'chroma_6_std', 'chroma_7_std', 'delta zcr_std', 'delta energy_std', 'delta energy_entropy_std', 'delta spectral_centroid_std', 'delta spectral_entropy_std', 'delta spectral_rolloff_std', 'delta mfcc_1_std', 'delta mfcc_3_std', 'delta mfcc_4_std', 'delta mfcc

In [25]:
from sklearn.linear_model import LogisticRegression
# Rank the features by the absolute value of their logistic-regression
# coefficient and report the five weakest and the five strongest ones.
model = LogisticRegression()
model.fit(X, y)
# coef_[0] is the first (here: only used) row of coefficients, one per feature
importance = np.abs(model.coef_[0])
# feature names ordered from smallest to largest absolute coefficient
ascending_order = np.argsort(importance)
feature_names_s = [feature_names[idx] for idx in ascending_order]
less_important_5 = feature_names_s[:5]
# last five names, strongest first
most_important_5 = feature_names_s[-5:][::-1]
print("Less important features:", less_important_5, sep="\n")
print("Most important features:", most_important_5, sep="\n")

Less important features:
['delta spectral_centroid_mean', 'delta chroma_2_mean', 'mfcc_7_mean', 'delta chroma_3_mean', 'chroma_12_std']
Most important features:
['energy_entropy_mean', 'mfcc_8_mean', 'mfcc_12_mean', 'energy_entropy_std', 'mfcc_1_std']


In [34]:
import plotly
import plotly.subplots
import plotly.graph_objs as go
import numpy as np
import matplotlib.pyplot as plt

def get_color_combinations(n_classes):
    """Build one CSS-style 'rgba(r,g,b,a)' color string per class.

    The colors are sampled from matplotlib's 'jet' colormap at evenly spaced
    integer indices. The colormap yields channel values in [0, 1]; the red,
    green and blue channels are scaled to 0-255 integers because that is the
    range an 'rgba(...)' string expects (alpha stays in [0, 1]).

    Args:
        n_classes: number of colors to generate.

    Returns:
        A list with n_classes color strings; an empty list if n_classes < 1.

    Raises:
        ValueError: if n_classes exceeds 255 (the sampling step becomes 0).
    """
    if n_classes < 1:
        return []
    # plt.get_cmap is used instead of plt.cm.get_cmap, which newer matplotlib
    # releases no longer provide.
    clr_map = plt.get_cmap('jet')
    step = 255 // n_classes
    # start half a step in, so the samples are centered within the colormap
    range_cl = range(step // 2, 255, step)
    clr = []
    for i in range(n_classes):
        red, green, blue, alpha = clr_map(range_cl[i])
        clr.append('rgba({},{},{},{})'.format(int(round(red * 255)),
                                              int(round(green * 255)),
                                              int(round(blue * 255)),
                                              alpha))
    return clr

# Plot per-class normalized histograms of the raw (un-standardized) values of
# the five least important features (row 1) and the five most important
# features (row 2). Each subplot is addressed by the position of its feature
# in the title list, so every histogram sits under its own title.
list_of_feature_mtr = mid_term_features
n_columns = 5
class_names = ['music', 'speech']
n_features = len(feature_names)
n_bins = 12
n_rows = 2
# subplot titles are assigned row by row: first the less important features,
# then the most important ones
figs = plotly.subplots.make_subplots(rows=n_rows, cols=n_columns,
                                     subplot_titles=less_important_5 + most_important_5)
figs['layout'].update(height=(n_rows * 250))
clr = get_color_combinations(len(class_names))
feature_names_list = list(feature_names)
for row, row_features in enumerate([less_important_5, most_important_5], start=1):
    for col, feature_name in enumerate(row_features, start=1):
        feature_idx = feature_names_list.index(feature_name)
        # stack this feature's column over all classes to get a common bin range
        all_values = np.vstack([class_mtr[:, feature_idx:feature_idx + 1]
                                for class_mtr in list_of_feature_mtr])
        # n_bins equal-width bins that include the maximum value
        bins = np.linspace(all_values.min(), all_values.max(), n_bins + 1)
        cbins = (bins[0:-1] + bins[1:]) / 2
        for fi, class_mtr in enumerate(list_of_feature_mtr):
            # load the color for the current class (fi)
            mark_prop = dict(color=clr[fi], line=dict(color=clr[fi], width=3))
            # compute the histogram of the current feature and normalize:
            h, _ = np.histogram(class_mtr[:, feature_idx], bins=bins)
            total = h.sum()
            # avoid 0/0 when a class contributes no samples
            h = h.astype(float) / total if total > 0 else h.astype(float)
            # show the legend only once, on the first subplot
            scatter_1 = go.Scatter(x=cbins, y=h, name=class_names[fi],
                                   marker=mark_prop,
                                   showlegend=(row == 1 and col == 1))
            figs.add_trace(scatter_1, row=row, col=col)
# style the subplot titles (stored as layout annotations)
for annotation in figs['layout']['annotations']:
    annotation['font'] = dict(size=10, color='#224488')
figs.show()