# FMA: A Dataset For Music Analysis

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

* This notebook evalutates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on both audio and spectrograms.

In [56]:
import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC # Support Vector Classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
import utils

from sklearn.feature_selection import SelectKBest, SelectPercentile

In [2]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('tracks.csv') # (106574, 52)
features = utils.load('features.csv') # (106574, 518)
echonest = utils.load('echonest.csv') # (13129, 249)

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

  'category', categories=SUBSETS, ordered=True)


((106574, 52), (106574, 518), (13129, 249))

## Subset
We use 'medium' to build models. features_all: all 'features' data for <medium and small> subset.

In [3]:
# medium: 25000
subset = tracks.index[tracks['set', 'subset'] <= 'medium'] # track.index for <medium and small> subset

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

# features_all: features inner join echonest
features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]

# features_all: all 'features' data for <medium and small> subset.
features_all = features.loc[subset]

tracks.shape, features_all.shape, features.shape

Not enough Echonest features: (13129, 767)


((25000, 52), (25000, 518), (106574, 518))

In [4]:
ipd.display(features_all.head(1))
ipd.display(features.head(1))
ipd.display(echonest.head(1))

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448


feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448


Unnamed: 0_level_0,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest
Unnamed: 0_level_1,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,...,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_2,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,...,214,215,216,217,218,219,220,221,222,223
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749


In [5]:
# get <tracks.index> for training data, validation set and test set to separate them. 
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

# genres in 'genre_top';
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))

# genres in 'genres_all';
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (35): [' ', '-', '/', 'B', 'C', 'E', 'F', 'H', 'I', 'J', 'L', 'O', 'P', 'R', 'S', 'T', 'a', 'c', 'd', 'e', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'x', 'y', 'z']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]


## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [57]:
# tracks: entire tracks sets including train/val/test sets;
# features: entire feature sets used in models: features as X matrix;
# columns: specified features subset extracted from 'features' used in model;
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    # labels
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    # train, val, test: tracks.index for training data, validation set and test set. 
    # labels: genres in 'genre_top'/'genres_all';
    y_train = enc.fit_transform(labels[train]) # labels
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    
    # columns: columns used as features;
    # features: entire features set;
    X_train = features.loc[train, columns].as_matrix()
    X_val = features.loc[val, columns].as_matrix()
    X_test = features.loc[test, columns].as_matrix()
    
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    
    # StandardScaler: Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler(copy=False) 
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)
    
    return y_train, y_val, y_test, X_train, X_val, X_test

In [58]:
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features, 'spectral_contrast', multi_label=False, verbose=False)
print(min(y_test), max(y_test))

0 15


In [8]:
tracks.head(5)

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
134,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music


### 1.2 Single genre
What we are going to do in this project.
We use 'genre_top'(16 labels) as y, then output accuracy for various classifiers with 

In [9]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()).insert(0, 'dim') # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test) # accuracy for function clf.
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores, times


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [None]:
classifiers = {
    'LR': LogisticRegression(),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_social': ('echonest', 'social_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}

# features.columns.levels[0]: 
#     Index(['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
#         'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
#         'spectral_rolloff', 'tonnetz', 'zcr'],
#         dtype='object', name='feature')
for name in features.columns.levels[0]:
    feature_sets[name] = name
    

feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

# test_classifiers_features:
scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

ipd.display(classifiers.keys()); 
ipd.display(feature_sets.keys())

ipd.display(classifiers)
ipd.display(feature_sets)

In [None]:
ipd.display(features.columns.levels[0])
ipd.display(features.columns.levels[1])
ipd.display(features.columns.levels[2])

In [55]:
for name in features.columns.levels[0]:
    feature_sets[name] = name
    
# update: adds dictionary dict2's key-values pairs in to dict.
feature_sets.update({ 
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

list(features.columns.levels[0])

['chroma_cens',
 'chroma_cqt',
 'chroma_stft',
 'mfcc',
 'rmse',
 'spectral_bandwidth',
 'spectral_centroid',
 'spectral_contrast',
 'spectral_rolloff',
 'tonnetz',
 'zcr']

### 1.2.2 Single genres with all features

Todo:
* use above methods to deal with all features for different classifiers.

### Precision and Recall

In [137]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()).insert(0, 'dim') # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores_test = pd.DataFrame(columns = columns, index = feature_sets.keys())
    scores_train = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        
        # Guzhiwei ********
        model = SelectKBest(k=300)
        fit = model.fit(X_train, y_train)
        X_train = fit.transform(X_train)
        X_test = fit.transform(X_test)
        
        # Guzhiwei *********
        
        scores_test.loc[fset_name, 'dim'] = X_train.shape[1]
        scores_train.loc[fset_name, 'dim'] = X_train.shape[1]
        labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
#         n_sample_train = [  58,  495,  142,   13, 5050, 1801, 1215, 1761, 1045,  814,  306,
#          408,  945, 5681,   94,   94]
#         n_sample_test = [  8,  62,  18,   6, 632, 225, 152, 220, 174, 102,  39,  51, 119,
#         711,  42,  12]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            # train the model;
            clf.fit(X_train, y_train)
            
            # for training data
            Y_predict_train = clf.predict(X_train)
            Precision_recall_train = precision_recall_fscore_support(y_train, Y_predict_train)
            score_train = clf.score(X_train, y_train) # accuracy for function clf.
            scores_train.loc[fset_name, clf_name] = score_train
            confusion_train = confusion_matrix(y_train, Y_predict_train, labels=labels) #, sample_weight=Precision_recall_train[3])
            
            # for test data
            Y_predict_test = clf.predict(X_test)
            Precision_recall_test = precision_recall_fscore_support(y_test, Y_predict_test)
            score_test = clf.score(X_test, y_test) # accuracy for function clf.
            scores_test.loc[fset_name, clf_name] = score_test
            confusion_test = confusion_matrix(y_test, Y_predict_test, labels=labels) #, sample_weight=Precision_recall_test[3])
            
            # for time
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [139]:
classifiers = {
#    'LR': LogisticRegression(),
    # Inverse of regularization strength; must be a positive float. Like in support vector machines, 
    # smaller values specify stronger regularization.
#    'LR_l2': LogisticRegression(penalty='l1', C=0.3)
#     'NN': MLPClassifier(hidden_layer_sizes=(280, 18, 16), activation='relu', alpha=2, max_iter=200)
#     'kNN': KNeighborsClassifier(n_neighbors=200),
#     'SVCrbf': SVC(kernel='rbf', C=1.6),
#     'SVCpoly1': SVC(kernel='poly', degree=1),
#     'linSVC1': SVC(kernel="linear"),
#     'linSVC2': LinearSVC(),
   'linSVC2': LinearSVC(penalty='l2', loss='squared_hinge', 
                           dual=True, tol=0.0001, C=0.009, multi_class='ovr', 
                           fit_intercept=True, intercept_scaling=1, 
                           class_weight=None, verbose=0, random_state=None, 
                           max_iter=500),
#     'linSVC2': LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
#                        tol=1e-3, C=0.1, multi_class='ovr', 
#                            fit_intercept=True, intercept_scaling=1, 
#                            class_weight=None, verbose=0, random_state=None, 
#                            max_iter=1000),
#     #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
#     'DT': DecisionTreeClassifier(max_depth=5),
#     'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     'AdaBoost': AdaBoostClassifier(n_estimators=10),
#     'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
#     'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
#     'NB': GaussianNB(),
#     'QDA': QuadraticDiscriminantAnalysis(),
}

# ['mfcc','std']: indicate level[0]: 'mfcc', level[1]: 'std';
# ['mfcc', 'spectral_contrast']: idicate 2 different level[0]: 'mfcc', 'spectral_contrast'

# feature_sets = features.columns.levels[0];
feature_sets = {
     'all': ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
           'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
           'spectral_rolloff', 'tonnetz', 'zcr']
}
# #    'echonest_audio': ('echonest', 'audio_features'),
# #    'echonest_social': ('echonest', 'social_features'),
# #    'echonest_temporal': ('echonest', 'temporal_features'),
# #    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
# #    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
# }

# features.columns.levels[0]: 
#     Index(['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
#         'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
#         'spectral_rolloff', 'tonnetz', 'zcr'],
#         dtype='object', name='feature')
# for name in features.columns.levels[0]:
#     feature_sets[name] = name
    

# feature_sets.update({
#     'mfcc/contrast': ['mfcc', 'spectral_contrast'],
#     'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
#     'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
#     'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
#     'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
#     'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
#     'all_non-echonest': list(features.columns.levels[0])
# })

# test_classifiers_features:
# scores.test, scores.train, times = test_classifiers_features(classifiers, feature_sets)
scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores_test))
ipd.display(format_scores(scores_train))
#ipd.display(format_scores(score))
ipd.display(times.style.format('{:.4f}'))
ipd.display(Precision_recall_train)
ipd.display(Precision_recall_test)

# Score for LR model with all features: 61.10% (dim: 518).




  'precision', 'predicted', average, warn_for)


Unnamed: 0,dim,linSVC2
all,300,61.45%


Unnamed: 0,dim,linSVC2
all,300,67.37%


Unnamed: 0,linSVC2
all,43.9244


(array([ 0.        ,  0.71551724,  1.        ,  0.        ,  0.65624013,
         0.55215827,  0.63233083,  0.67056758,  0.59104938,  0.69959677,
         0.74358974,  0.93055556,  0.625     ,  0.70241099,  0.        ,
         0.71794872]),
 array([ 0.        ,  0.83838384,  0.01408451,  0.        ,  0.82257426,
         0.34092171,  0.69218107,  0.65076661,  0.36650718,  0.42628993,
         0.18954248,  0.98529412,  0.02116402,  0.88206302,  0.        ,
         0.29787234]),
 array([ 0.        ,  0.77209302,  0.02777778,  0.        ,  0.73005272,
         0.42155853,  0.66090373,  0.66051873,  0.45245127,  0.52977099,
         0.30208333,  0.95714286,  0.04094166,  0.78205228,  0.        ,
         0.42105263]),
 array([  58,  495,  142,   13, 5050, 1801, 1215, 1761, 1045,  814,  306,
         408,  945, 5681,   94,   94]))

(array([ 0.        ,  0.72368421,  0.        ,  0.        ,  0.659601  ,
         0.41875   ,  0.26219512,  0.73300971,  0.36065574,  0.42372881,
         0.86666667,  0.79032258,  0.        ,  0.65244537,  0.        ,  0.        ]),
 array([ 0.        ,  0.88709677,  0.        ,  0.        ,  0.83702532,
         0.29777778,  0.28289474,  0.68636364,  0.12643678,  0.24509804,
         0.33333333,  0.96078431,  0.        ,  0.88185654,  0.        ,  0.        ]),
 array([ 0.        ,  0.79710145,  0.        ,  0.        ,  0.73779637,
         0.34805195,  0.2721519 ,  0.70892019,  0.18723404,  0.31055901,
         0.48148148,  0.86725664,  0.        ,  0.75      ,  0.        ,  0.        ]),
 array([  8,  62,  18,   6, 632, 225, 152, 220, 174, 102,  39,  51, 119,
        711,  42,  12]))

### confustion matrix

In [None]:
ipd.display(confusion_train)
ipd.display(confusion_test)

In [None]:
# import itertools
# import numpy as np
# import matplotlib.pyplot as plt

# from sklearn import svm, datasets
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix

# # import some data to play with
# iris = datasets.load_iris()
# X = iris.data
# y = iris.target
# class_names = iris.target_names

# # Split the data into a training set and a test set
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# # Run classifier, using a model that is too regularized (C too low) to see
# # the impact on the results
# classifier = svm.SVC(kernel='linear', C=0.01)
# y_pred = classifier.fit(X_train, y_train).predict(X_test)

# def plot_confusion_matrix(cm, classes,
#                           normalize=False,
#                           title='Confusion matrix',
#                           cmap=plt.cm.Blues):
#     """
#     This function prints and plots the confusion matrix.
#     Normalization can be applied by setting `normalize=True`.
#     """
#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     print(cm)

#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)

#     fmt = '.2f' if normalize else 'd'
#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         plt.text(j, i, format(cm[i, j], fmt),
#                  horizontalalignment="center",
#                  color="white" if cm[i, j] > thresh else "black")

#     plt.tight_layout()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')

In [None]:
# from matplotlib.backends.backend_pdf import PdfPages
# plt.figure()
# plot1 = plot_confusion_matrix(confusion_train, classes=[i for i in "ABCDEFGHIJKLMNOP"], normalize=True,
#                       title='Confusion matrix for training data: with normalization')

# # Plot normalized confusion matrix
# plt.figure()
# plot2 = plot_confusion_matrix(confusion_test, classes=[i for i in "ABCDEFGHIJKLMNOP"], normalize=True,
#                       title='Confusion matrix for test data: with normalization')
# #plt.show()

# pp = PdfPages('foo.pdf')
# pp.savefig(plot1)
# pp.savefig(plot2)
# pp.close()

# plt.show()

In [None]:
## normalize the matrices

df_test = pd.DataFrame(confusion_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
df_test_norm = (df_test-df_test.mean())/df_test.std()

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_train = pd.DataFrame(confusion_train, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
# df_train_norm = (df_train-df_train.mean())/df_train.std()*100
df_train_norm = df_train/(df_train.mean()+0.000001)

# df_cm = pd.DataFrame(confusion_train, index = [i for i in "ABCDEFGHIJKLMNOP"],
#                   columns = [i for i in "ABCDEFGHIJKLMNOP"])
plt.figure(figsize = (10,7))
sn.heatmap(df_train, annot=True, cmap="Blues", fmt='.1f')
plt.figure(figsize = (10,7))
sn.heatmap(df_train_norm, annot=True, cmap="Blues", fmt='.1f')
plt.show()

In [None]:
df_test = pd.DataFrame(confusion_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
# df_test_norm = (df_test-df_test.mean())/df_test.std()
df_test_norm = df_test/(df_test.mean()+0.00001)

# df_cm_test = pd.DataFrame(df_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
#                   columns = [i for i in "ABCDEFGHIJKLMNOP"])
plt.figure(figsize = (10,7))
sn.heatmap(df_test, annot=True, cmap="Blues", fmt='.1f')
plt.figure(figsize = (10,7))
sn.heatmap(df_test_norm, annot=True, cmap="Blues", fmt='.1f')
plt.show()

In [None]:
#features.loc[:,feature_sets]
#name, fset = tqdm_notebook(feature_sets.items(), desc='features')
features.loc[:, ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
             'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
             'spectral_rolloff', 'tonnetz', 'zcr']].shape

### 1.2.3 SVM with regularization


In [None]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
from sklearn.metrics import precision_recall_fscore_support
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()) # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores_test = pd.DataFrame(columns = columns, index = feature_sets.keys())
    scores_train = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1,
                                    average=None, warn_for=(‘precision’, ’recall’, ’f-score’), 
                                    sample_weight=None)
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores_test.loc[fset_name, 'dim'] = X_train.shape[1]
        scores_train.loc[fset_name, 'dim'] = X_train.shape[1]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score_test = clf.score(X_test, y_test) # accuracy for function clf.
            score_train = clf.score(X_train, y_train) # accuracy for function clf.
            scores_test.loc[fset_name, clf_name] = score_test
            scores_train.loc[fset_name, clf_name] = score_train
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores_test, scores_train, times


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [None]:
from tensorflow.contrib.learn.python.learn.estimators import svm
classifiers = {
#    'LR': LogisticRegression(),
#     'kNN': KNeighborsClassifier(n_neighbors=200),
#     'SVCrbf': SVC(kernel='rbf'),
#     'SVCpoly1': SVC(kernel='poly', degree=1),
#     'linSVC1': SVC(kernel="linear"),
#    'linSVC2': LinearSVC(),
#       'linSVC2': LinearSVC(penalty='l2', loss='squared_hinge', 
#                            dual=True, tol=0.0001, C=0.008, multi_class='ovr', 
#                            fit_intercept=True, intercept_scaling=1, 
#                            class_weight=None, verbose=0, random_state=None, 
#                            max_iter=1000),
#     'linSVC2': LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
#                        tol=1e-3, C=0.08, multi_class='ovr', 
#                            fit_intercept=True, intercept_scaling=1, 
#                            class_weight=None, verbose=0, random_state=None, 
#                            max_iter=1000),
#     #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
#     'DT': DecisionTreeClassifier(max_depth=5),
#     'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     'AdaBoost': AdaBoostClassifier(n_estimators=10),
#     'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
#     'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
#     'NB': GaussianNB(),
#     'QDA': QuadraticDiscriminantAnalysis(),
}

# ['mfcc','std']: indicate level[0]: 'mfcc', level[1]: 'std';
# ['mfcc', 'spectral_contrast']: idicate 2 different level[0]: 'mfcc', 'spectral_contrast'

# feature_sets = features.columns.levels[0];
feature_sets = {
     'all': ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
           'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
           'spectral_rolloff', 'tonnetz', 'zcr']
}
# #    'echonest_audio': ('echonest', 'audio_features'),
# #    'echonest_social': ('echonest', 'social_features'),
# #    'echonest_temporal': ('echonest', 'temporal_features'),
# #    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
# #    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
# }

# features.columns.levels[0]: 
#     Index(['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
#         'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
#         'spectral_rolloff', 'tonnetz', 'zcr'],
#         dtype='object', name='feature')
# for name in features.columns.levels[0]:
#     feature_sets[name] = name
    

# feature_sets.update({
#     'mfcc/contrast': ['mfcc', 'spectral_contrast'],
#     'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
#     'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
#     'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
#     'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
#     'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
#     'all_non-echonest': list(features.columns.levels[0])
# })

# test_classifiers_features:
# scores.test, scores.train, times = test_classifiers_features(classifiers, feature_sets)
scores_test, scores_train, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores_test))
ipd.display(format_scores(scores_train))
#ipd.display(format_scores(score))
ipd.display(times.style.format('{:.4f}'))

# Score for LR model with all features: 61.10% (dim: 518).

### 1.2.3 Learning Curve

Todo:
* Learning curve for both SVM and softmax

### 1.3 Multiple genres

Todo:
* Ignore rare genres? Count them higher up in the genre tree? On the other hand it's not much tracks.

In [None]:
classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression()),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

# multi_label: use 'genres_all'(151) as y; 
scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

## 2 Deep learning on raw audio

Other architectures:
* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade.

In [None]:
labels_onehot = MultiLabelBinarizer().fit_transform(tracks['track', 'genres_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and making some optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:
* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg
    * resampling is very slow --> use `kaiser_fast`
    * does not work with multi-processing, for keras `fit_generator()`
* pydub is a high-level interface for audio modification, uses ffmpeg to load
    * store a temporary `.wav`
* directly pipe ffmpeg output
    * fastest method
* [pyAV](https://github.com/mikeboers/PyAV) may be a fastest alternative by linking to ffmpeg libraries

In [None]:
# Just be sure that everything is fine. Multiprocessing is tricky to debug.
utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())
SampleLoader(train, batch_size=2).__next__()[0].shape

In [None]:
# Keras parameters.
NB_WORKER = len(os.sched_getaffinity(0))  # number of usables CPUs
params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}

### 2.1 Fully connected neural network

* Two layers with 10 hiddens is no better than random, ~11%.

Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches means reduced training time, so increase batch time until memory exhaustion. Number of workers and queue size have no influence on speed.

In [None]:
loader = utils.FfmpegLoader(sampling_rate=2000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
print('Dimensionality: {}'.format(loader.shape))

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Dense(output_dim=1000, input_shape=loader.shape))
model.add(Activation("relu"))
model.add(Dense(output_dim=100))
model.add(Activation("relu"))
model.add(Dense(output_dim=labels_onehot.shape[1]))
model.add(Activation("softmax"))

optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)
#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);

loss

### 2.2 Convolutional neural network

* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.
* Missing: track segmentation and class averaging (majority voting)
* Compared with log-scaled mel-spectrograms instead of strided convolution as first layer.
* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html

In [None]:
loader = utils.FfmpegLoader(sampling_rate=16000)
#loader = utils.LibrosaLoader(sampling_rate=16000)
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)

keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((-1, 1), input_shape=loader.shape))
print(model.output_shape)

model.add(Conv1D(128, 512, subsample_length=512))
print(model.output_shape)
model.add(Activation("relu"))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

model.add(Conv1D(32, 8))
print(model.output_shape)
model.add(Activation("relu"))
model.add(MaxPooling1D(4))

print(model.output_shape)
#model.add(Dropout(0.25))
model.add(Flatten())
print(model.output_shape)
model.add(Dense(100))
model.add(Activation("relu"))
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)

loss

### 2.3 Recurrent neural network

## 3 Deep learning on extracted audio features

Look at:
* Pre-processing in Keras: https://github.com/keunwoochoi/kapre
* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017
* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras
* Pre-processor: https://github.com/bmcfee/pumpp

### 3.1 ConvNet on MFCC

* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. Chun
* Missing: track segmentation and majority voting.
* Best seen: 17.6%

In [None]:
class MfccLoader(utils.Loader):
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    shape = (13, 2582)
    def load(self, filename):
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape

In [None]:
keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

loss