# FMA: A Dataset For Music Analysis

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

* This notebook evalutates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on both audio and spectrograms.

In [1]:
import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook

import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
from keras.models import Sequential

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier


import utils
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.sparse
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, SelectPercentile

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('data/tracks.csv')
features = utils.load('data/features.csv')
echonest = utils.load('data/echonest.csv')
print(echonest.shape)

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

(13129, 249)


((106574, 52), (106574, 518), (13129, 249))

## Subset

In [3]:
subset = tracks.index[tracks['set', 'subset'] <= 'medium']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

Not enough Echonest features: (13129, 767)


((25000, 52), (25000, 518))

In [4]:
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

genres = list(MultiLabelBinarizer().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (35): [' ', '-', '/', 'B', 'C', 'E', 'F', 'H', 'I', 'J', 'L', 'O', 'P', 'R', 'S', 'T', 'a', 'c', 'd', 'e', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'x', 'y', 'z']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]


## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [5]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    X_train = features.loc[train, columns].as_matrix()
    X_val = features.loc[val, columns].as_matrix()
    X_test = features.loc[test, columns].as_matrix()
    
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    
    # Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)
    
    return y_train, y_val, y_test, X_train, X_val, X_test

# Softmax for baseline

In [12]:
##### softmax regression ########
## loss function of softmax regression
def getLoss(w,x,y,lam):
    m = x.shape[0] #number of training example
    y_mat = oneHotIt(y)  #convert the interger class coding in to a one-hot representation
    scores = np.dot(x,w) #comput raw class scores given input and current weight
    prob = softmax(scores) #perform softmax on these scores to get their probabilities
    loss = (-1 / m) * np.sum(y_mat * np.log(prob)) + (lam/2)*np.sum(w*w) #We then find the loss of the probabilities
    grad = (-1 / m) * np.dot(x.T, (y_mat - prob)) + lam*w #And compute the gradient for the loss
    
    return loss, grad

## unidimentional array of labels into a one-hot varient
def oneHotIt(Y):
    m = Y.shape[0]
    OHX = scipy.sparse.csr_matrix((np.ones(m),(Y,np.array(range(m)))))
    OHX = np.array(OHX.todense()).T
    return OHX

def softmax(z):
    z -= np.max(z)
    sm = (np.exp(z).T / np.sum(np.exp(z),axis=1)).T
    return sm

## determine the probabilities and predictions for each class when given a set of input data:
def getProbsAndPreds(someX,w):
    probs = softmax(np.dot(someX,w))
    preds = np.argmax(probs,axis=1)
    return probs, preds

##
def getAccuracy(someX,someY,w):
    prob, prede = getProbsAndPreds(someX,w)
    accuracy = sum(prede == someY)/(float(len(someY)))
    return accuracy

    



In [13]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    columns = list(classifiers.keys()).insert(0, 'dim')
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())

    
    columns = ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
             'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
             'spectral_rolloff', 'tonnetz', 'zcr']
    
    y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features, columns, multi_label=False, verbose=False);


    
    M = X_train.shape[0]/10
    X_learning = np.empty(10)
    Y_train_curve = np.empty(10)
    Y_test_curve = np.empty(10)
    for j in range(10):
        MJ = int(M*j)
        X_train_this = np.delete(X_train,np.s_[0:MJ],axis=0)
        Y_train_this = np.delete(y_train,np.s_[0:MJ],axis=0)
        

        
        w = np.zeros([X_train_this.shape[1], 16])   #len(np.unique(Y_train_this)=16
        lam = 1
        iterations = 5000
        learningRate = 1e-4
        losses = []
               
        for i in range(0,iterations):
            loss, grad = getLoss(w,X_train_this,Y_train_this,lam)
            losses.append(loss)
            w = w - (learningRate * grad)
        
        X_learning[j] = (10-j)*M
        Y_train_curve[j] = getAccuracy(X_train_this,Y_train_this,w)
        Y_test_curve[j] = getAccuracy(X_val,y_val,w)
        print('Training Accuracy:', Y_train_curve[j])
        print('Test Accuracy:', Y_test_curve[j])
      
    plt.plot(X_learning, Y_train_curve, linewidth = 2.0, color = 'red')
    plt.plot(X_learning, Y_test_curve, linewidth = 2.0, color = 'blue')
    plt.show()
    #print('Training Accuracy:', getAccuracy(X_train,y_train,w))
    #print('Test Accuracy:', getAccuracy(X_test,y_test,w))


def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])
classifiers = {
}
feature_sets = {
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})
test_classifiers_features(classifiers, feature_sets)



shape:  (19922, 518) (19922,)
2.72712255201
Training Accuracy: 0.479670715792
Test Accuracy: 0.47260007773
shape:  (17930, 518) (17930,)
2.72705942451
Training Accuracy: 0.482208588957
Test Accuracy: 0.471822774971
shape:  (15938, 518) (15938,)
2.72656854697
Training Accuracy: 0.482933868741
Test Accuracy: 0.472211426351
shape:  (13946, 518) (13946,)
2.72629859806
Training Accuracy: 0.485730675462
Test Accuracy: 0.474154683249
shape:  (11954, 518) (11954,)
2.72631561022
Training Accuracy: 0.487870168981
Test Accuracy: 0.474931986009
shape:  (9961, 518) (9961,)
2.72590218012
Training Accuracy: 0.492219656661
Test Accuracy: 0.475709288768
shape:  (7969, 518) (7969,)
2.72544865504
Training Accuracy: 0.492784540093
Test Accuracy: 0.471822774971
shape:  (5977, 518) (5977,)
2.72425531267
Training Accuracy: 0.497574033796
Test Accuracy: 0.474543334629
shape:  (3985, 518) (3985,)
2.72301800337
Training Accuracy: 0.502885821832
Test Accuracy: 0.474931986009
shape:  (1993, 518) (1993,)
2.72654753076
Training Accuracy: 0.508278976417
Test Accuracy: 0.482316362223

# SVM for baseline

In [None]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()).insert(0, 'dim') # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores_test = pd.DataFrame(columns = columns, index = feature_sets.keys())
    scores_train = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        
        # Guzhiwei ********
        model = SelectKBest(k=200)
        fit = model.fit(X_train, y_train)
        X_train = fit.transform(X_train)
        X_test = fit.transform(X_test)
        
        # Guzhiwei *********
        
        scores_test.loc[fset_name, 'dim'] = X_train.shape[1]
        scores_train.loc[fset_name, 'dim'] = X_train.shape[1]
        labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            # train the model;
            clf.fit(X_train, y_train)
            
            # for training data
            Y_predict_train = clf.predict(X_train)
            Precision_recall_train = precision_recall_fscore_support(y_train, Y_predict_train)
            score_train = clf.score(X_train, y_train) # accuracy for function clf.
            scores_train.loc[fset_name, clf_name] = score_train
            confusion_train = confusion_matrix(y_train, Y_predict_train, labels=labels) #, sample_weight=Precision_recall_train[3])
            
            # for test data
            Y_predict_test = clf.predict(X_test)
            Precision_recall_test = precision_recall_fscore_support(y_test, Y_predict_test)
            score_test = clf.score(X_test, y_test) # accuracy for function clf.
            scores_test.loc[fset_name, clf_name] = score_test
            confusion_test = confusion_matrix(y_test, Y_predict_test, labels=labels) #, sample_weight=Precision_recall_test[3])
            
            # for time
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

classifiers = {
     'SVC'
}

# feature_sets = features.columns.levels[0];
feature_sets = {
     'all': ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
           'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
           'spectral_rolloff', 'tonnetz', 'zcr']
}


# test_classifiers_features:
# scores.test, scores.train, times = test_classifiers_features(classifiers, feature_sets)
scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores_test))
ipd.display(format_scores(scores_train))
#ipd.display(format_scores(score))
ipd.display(times.style.format('{:.4f}'))
ipd.display(Precision_recall_train)
ipd.display(Precision_recall_test)