# FMA: A Dataset For Music Analysis

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Baselines

* This notebook evalutates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on both audio and spectrograms.

In [2]:
import time
import os

import IPython.display as ipd
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC # Support Vector Classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
import utils

from sklearn.feature_selection import SelectKBest, SelectPercentile

Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('data/tracks.csv') # (106574, 52)
features = utils.load('data/features.csv') # (106574, 518)
echonest = utils.load('data/echonest.csv') # (13129, 249)

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

  'category', categories=SUBSETS, ordered=True)


((106574, 52), (106574, 518), (13129, 249))

## Subset
We use 'medium' to build models. features_all: all 'features' data for <medium and small> subset.

In [4]:
# medium: 25000
subset = tracks.index[tracks['set', 'subset'] <= 'medium'] # track.index for <medium and small> subset

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

# features_all: features inner join echonest
features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]

# features_all: all 'features' data for <medium and small> subset.
features_all = features.loc[subset]

tracks.shape, features_all.shape, features.shape

Not enough Echonest features: (13129, 767)


((25000, 52), (25000, 518), (106574, 518))

In [5]:
ipd.display(features_all.head(1))
ipd.display(features.head(1))
ipd.display(echonest.head(1))

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448


feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448


Unnamed: 0_level_0,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest
Unnamed: 0_level_1,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,...,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_2,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,...,214,215,216,217,218,219,220,221,222,223
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749


In [6]:
# get <tracks.index> for training data, validation set and test set to separate them. 
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

#test = tracks.index[tracks['set', 'split'] != 'training']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

# genres in 'genre_top';
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))

# genres in 'genres_all';
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (35): [' ', '-', '/', 'B', 'C', 'E', 'F', 'H', 'I', 'J', 'L', 'O', 'P', 'R', 'S', 'T', 'a', 'c', 'd', 'e', 'g', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'x', 'y', 'z']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]


## 1 Multiple classifiers and feature sets

Todo:
* Cross-validation for hyper-parameters.
* Dimensionality reduction?

### 1.1 Pre-processing

In [7]:
# tracks: entire tracks sets including train/val/test sets;
# features: entire feature sets used in models: features as X matrix;
# columns: specified features subset extracted from 'features' used in model;
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    # labels
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    # train, val, test: tracks.index for training data, validation set and test set. 
    # labels: genres in 'genre_top'/'genres_all';
    y_train = enc.fit_transform(labels[train]) # labels
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    
    # columns: columns used as features;
    # features: entire features set;
    X_train = features.loc[train, columns].as_matrix()
    X_val = features.loc[val, columns].as_matrix()
    X_test = features.loc[test, columns].as_matrix()
    
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    
    # StandardScaler: Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler(copy=False) 
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)
    
    return y_train, y_val, y_test, X_train, X_val, X_test

### 1.2 Single genre
What we are going to do in this project.
We use 'genre_top'(16 labels) as y, then output accuracy for various classifiers with 

In [None]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features, 'spectral_contrast', multi_label=False, verbose=False)
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()).insert(0, 'dim') # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test) # accuracy for function clf.
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores, times


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

ipd.display(classifiers.keys()); 
ipd.display(feature_sets.keys())

ipd.display(classifiers)
ipd.display(feature_sets)

In [None]:
ipd.display(features.columns.levels[0])
ipd.display(features.columns.levels[1])
ipd.display(features.columns.levels[2])

In [None]:
for name in features.columns.levels[0]:
    feature_sets[name] = name
    
# update: adds dictionary dict2's key-values pairs in to dict.
feature_sets.update({ 
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

list(features.columns.levels[0])

# Rogistic Regression + Learning curve

In [None]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    columns = list(classifiers.keys()).insert(0, 'dim')
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())    
    columns = ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
             'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
             'spectral_rolloff', 'tonnetz', 'zcr']
    
    y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features, columns, multi_label=False, verbose=False);
    

    #print("x size, y size: ", X_train_all.shape, y_train.shape)
    model = SelectKBest(k=275)
    fit = model.fit(X_train, y_train)
    X_train = fit.transform(X_train)
    X_test = fit.transform(X_test)
    
    M = X_train.shape[0]/10
    X_learning = np.empty(10)
    Y_train_curve = np.empty(10)
    Y_test_curve = np.empty(10)
    for j in range(10):
        MJ = int(M*j)
        X_train_this = np.delete(X_train,np.s_[0:MJ],axis=0)
        Y_train_this = np.delete(y_train,np.s_[0:MJ],axis=0)
        

        
        w = np.zeros([X_train_this.shape[1], 16])   #len(np.unique(Y_train_this)=16
        lam = 1
        iterations = 5000
        learningRate = 1e-4
        losses = []
        
        
        for i in range(0,iterations):
            loss, grad = getLoss(w,X_train_this,Y_train_this,lam)
            losses.append(loss)
            w = w - (learningRate * grad)

            
        #print(loss)
        
        X_learning[j] = (10-j)*M
        Y_train_curve[j] = getAccuracy(X_train_this,Y_train_this,w)
        Y_test_curve[j] = getAccuracy(X_test,y_test,w)
        print('Training Accuracy:', Y_train_curve[j])
        print('Test Accuracy:', Y_test_curve[j])
      
    plt.plot(X_learning, Y_train_curve, linewidth = 2.0, color = 'red')
    plt.plot(X_learning, Y_test_curve, linewidth = 2.0, color = 'blue')
    plt.show()
    #print('Training Accuracy:', getAccuracy(X_train,y_train,w))
    #print('Test Accuracy:', getAccuracy(X_test,y_test,w))


def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])


classifiers = {
    'LR': LogisticRegression(),
}

feature_sets = {
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})
test_classifiers_features(classifiers, feature_sets)

# KNN + PCA + Model Selection

In [None]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=141, weights='distance')
neigh.fit(X_train, y_train)
score = neigh.score(X_test, y_test)
print('Accuracy: {:.2%}'.format(score))


#searching optimal k without PCA transformation
accuracy = []
for i in range(1, 10):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)

    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]
    print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))
    
    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)


    from sklearn.neighbors import KNeighborsClassifier
    neigh = KNeighborsClassifier(n_neighbors=i, weights='distance')
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(score))
    accuracy.append(score)
    
#searching optimal number of PC with k=20
accuracy = []
for i in range(125, 145):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)

    #transform to PC space
    estimator = PCA(n_components = i)
    X_train = estimator.fit_transform(X_train)
    #variance_explained = estimator.explained_variance_ratio_
    #print('{:.2%} variance explained'.format(np.sum(variance_explained)))
    X_test = estimator.transform(X_test)

    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    accuracy = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(accuracy))
    
    
#search for optimal k with 200 selected feature 
medium = tracks['set', 'subset'] <= 'medium'

train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'

y_train = tracks.loc[medium & train, ('track', 'genre_top')]
y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
y_test = tracks.loc[medium & test, ('track', 'genre_top')]
y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
X_train = features.loc[medium & train,:]
X_test = features.loc[medium & test,:]

# Be sure training samples are shuffled.
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
scaler = skl.preprocessing.StandardScaler(copy=False)
scaler.fit_transform(X_train)
scaler.transform(X_test)

model = SelectKBest(k=169)
fit = model.fit(X_train, y_train)
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
neigh.fit(X_train, y_train)
score = neigh.score(X_train, y_train)
print('Accuracy: {:.2%}'.format(score))


#search for optimal feature size with k = 20
medium = tracks['set', 'subset'] <= 'medium'

train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'

#searching optimal k without PCA transformation
for i in range(160, 170):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)
    
    model = SelectKBest(k=i)
    fit = model.fit(X_train, y_train)
    X_train = fit.transform(X_train)
    X_test = fit.transform(X_test)
    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(score))
    
    

#searching optimal number of WEIGHTED PC with k=20 
for i in range(200,500,50):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)

    #transform to PC space
    estimator = PCA(n_components = i)
    X_train = estimator.fit_transform(X_train)
    X_test = estimator.transform(X_test)

    variance_explained = estimator.explained_variance_ratio_
    LAMBDA = np.diag(variance_explained) #diagonal matrix of loadings
    print('{:.2%} variance explained'.format(np.sum(variance_explained)))
    
    X_train = np.dot(X_train, LAMBDA) #PC weighted by eigenvalue
    X_test = np.dot(X_test, LAMBDA) #PC weighted by eigenvalue

    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    accuracy = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(accuracy))

# Neural Network + Model selection 

In [None]:
import keras
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape
from keras.models import Sequential
columns = ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
             'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
             'spectral_rolloff', 'tonnetz', 'zcr']
    
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features, columns, multi_label=False, verbose=False);

model = SelectKBest(k=300)
fit = model.fit(X_train, y_train)
X_train = fit.transform(X_train)
X_test = fit.transform(X_val)
y_test = y_val

model = Sequential([
    Dense(100,input_dim=300),
    Activation('sigmoid'),
    Dense(33,input_dim=100),
    Activation('sigmoid'),
    Dense(16),
    Activation('softmax'),
])

# For a multi-class classification problem
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])



# Convert labels to categorical one-hot encoding
one_hot_labels = keras.utils.to_categorical(y_train, num_classes=16)

# Train the model, iterating on the data in batches of 32 samples
model.fit(X_train, one_hot_labels, epochs=20, batch_size=200)
ylabels = keras.utils.to_categorical(y_test, num_classes=16)
model.evaluate(X_test, ylabels)

# Neural Network + PCA + learning curve

In [None]:
X_train = PCA(n_components=250).fit_transform(X_train)
X_test = PCA(n_components=250).fit_transform(X_train)
M = X_train.shape[0]/10
X_learning = np.empty(10)
Y_train_curve = np.empty(10)
Y_test_curve = np.empty(10)
for j in range(10):
    MJ = int(M*j)
    
    model = Sequential([
    Dense(100,input_dim=250),
    Activation('sigmoid'),
    Dense(32,input_dim=100),
    Activation('sigmoid'),
    Dense(16),
    Activation('softmax'),

    ])

# For a multi-class classification problem
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])



    
    
    X_train_this = np.delete(X_train,np.s_[0:MJ],axis=0)
    Y_train_this = np.delete(y_train,np.s_[0:MJ],axis=0)
    X_learning[j] = (10-j)*M
    
    one_hot_labels = keras.utils.to_categorical(Y_train_this, num_classes=16)
    trainingObj = model.fit(X_train_this, one_hot_labels, epochs=20, batch_size=200)
    Y_train_curve[j] = trainingObj.history['acc'][19]   
    ylabels = keras.utils.to_categorical(y_test, num_classes=16)
    testingObj = model.evaluate(X_test, ylabels)
    Y_test_curve[j] = testingObj[1]
    

plt.plot(X_learning, Y_train_curve, linewidth = 2.0, color = 'red')
plt.plot(X_learning, Y_test_curve, linewidth = 2.0, color = 'blue')
plt.show()



# SVM_rbf + Model selection

In [14]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()).insert(0, 'dim') # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores_test = pd.DataFrame(columns = columns, index = feature_sets.keys())
    scores_train = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        
        # Guzhiwei ********
        model = SelectKBest(k=200)
        fit = model.fit(X_train, y_train)
        X_train = fit.transform(X_train)
        X_test = fit.transform(X_test)
        
        # Guzhiwei *********
        
        scores_test.loc[fset_name, 'dim'] = X_train.shape[1]
        scores_train.loc[fset_name, 'dim'] = X_train.shape[1]
        labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            # train the model;
            clf.fit(X_train, y_train)
            
            # for training data
            Y_predict_train = clf.predict(X_train)
            Precision_recall_train = precision_recall_fscore_support(y_train, Y_predict_train)
            score_train = clf.score(X_train, y_train) # accuracy for function clf.
            scores_train.loc[fset_name, clf_name] = score_train
            confusion_train = confusion_matrix(y_train, Y_predict_train, labels=labels) #, sample_weight=Precision_recall_train[3])
            
            # for test data
            Y_predict_test = clf.predict(X_test)
            Precision_recall_test = precision_recall_fscore_support(y_test, Y_predict_test)
            score_test = clf.score(X_test, y_test) # accuracy for function clf.
            scores_test.loc[fset_name, clf_name] = score_test
            confusion_test = confusion_matrix(y_test, Y_predict_test, labels=labels) #, sample_weight=Precision_recall_test[3])
            
            # for time
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [15]:
classifiers = {
     'SVCrbf': SVC(kernel='rbf', C=1.6, gamma = 0.003),
}

# feature_sets = features.columns.levels[0];
feature_sets = {
     'all': ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
           'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
           'spectral_rolloff', 'tonnetz', 'zcr']
}


# test_classifiers_features:
# scores.test, scores.train, times = test_classifiers_features(classifiers, feature_sets)
scores_test, scores_train, times, Precision_recall_train, Precision_recall_test, confusion_train, confusion_test = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores_test))
ipd.display(format_scores(scores_train))
#ipd.display(format_scores(score))
ipd.display(times.style.format('{:.4f}'))
ipd.display(Precision_recall_train)
ipd.display(Precision_recall_test)

# Score for LR model with all features: 61.10% (dim: 518).

  'precision', 'predicted', average, warn_for)





Unnamed: 0,dim,SVCrbf
all,200,63.39%


Unnamed: 0,dim,SVCrbf
all,200,76.45%


Unnamed: 0,SVCrbf
all,261.4798


(array([ 0.        ,  0.87025948,  1.        ,  0.        ,  0.72935706,
         0.72789969,  0.74065041,  0.78636364,  0.73413174,  0.81639929,
         0.85430464,  0.99502488,  0.72687225,  0.78165473,  0.        ,
         0.796875  ]),
 array([ 0.        ,  0.88080808,  0.11267606,  0.        ,  0.90079208,
         0.64464187,  0.74979424,  0.68767746,  0.58660287,  0.56265356,
         0.42156863,  0.98039216,  0.17460317,  0.90301003,  0.        ,
         0.54255319]),
 array([ 0.        ,  0.87550201,  0.20253165,  0.        ,  0.80606007,
         0.68374558,  0.74519427,  0.73371706,  0.65212766,  0.66618182,
         0.56455142,  0.98765432,  0.28156997,  0.83796145,  0.        ,
         0.64556962]),
 array([  58,  495,  142,   13, 5050, 1801, 1215, 1761, 1045,  814,  306,
         408,  945, 5681,   94,   94]))

(array([ 0.        ,  0.80952381,  0.        ,  0.        ,  0.63771712,
         0.4       ,  0.36231884,  0.73262032,  0.49494949,  0.65079365,
         0.85      ,  0.92592593,  0.34782609,  0.70227273,  0.        ,
         0.4       ]),
 array([ 0.        ,  0.82258065,  0.        ,  0.        ,  0.81329114,
         0.41777778,  0.32894737,  0.62272727,  0.2816092 ,  0.40196078,
         0.43589744,  0.98039216,  0.06722689,  0.86919831,  0.        ,
         0.16666667]),
 array([ 0.        ,  0.816     ,  0.        ,  0.        ,  0.71488178,
         0.40869565,  0.34482759,  0.67321867,  0.35897436,  0.4969697 ,
         0.57627119,  0.95238095,  0.11267606,  0.77686989,  0.        ,
         0.23529412]),
 array([  8,  62,  18,   6, 632, 225, 152, 220, 174, 102,  39,  51, 119,
        711,  42,  12]))

### confustion matrix

In [None]:
ipd.display(confusion_train)
ipd.display(confusion_test)

In [None]:
## normalize the matrices

df_test = pd.DataFrame(confusion_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
df_test_norm = (df_test-df_test.mean())/df_test.std()

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_train = pd.DataFrame(confusion_train, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
# df_train_norm = (df_train-df_train.mean())/df_train.std()*100
df_train_norm = df_train/(df_train.sum()+0.000001)*100

# df_cm = pd.DataFrame(confusion_train, index = [i for i in "ABCDEFGHIJKLMNOP"],
#                   columns = [i for i in "ABCDEFGHIJKLMNOP"])
plt.figure(figsize = (10,7))
sn.heatmap(df_train, annot=True, cmap="Blues", fmt='.1f')
plt.figure(figsize = (10,7))
sn.heatmap(df_train_norm, annot=True, cmap="Blues", fmt='.0f')
plt.show()

In [None]:
df_test = pd.DataFrame(confusion_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
                  columns = [i for i in "ABCDEFGHIJKLMNOP"])
# df_test_norm = (df_test-df_test.mean())/df_test.std()
df_test_norm = df_test/(df_test.sum()+0.00001)*100

# df_cm_test = pd.DataFrame(df_test, index = [i for i in "ABCDEFGHIJKLMNOP"],
#                   columns = [i for i in "ABCDEFGHIJKLMNOP"])
plt.figure(figsize = (10,7))
sn.heatmap(df_test, annot=True, cmap="Blues", fmt='.1f')
plt.figure(figsize = (10,7))
sn.heatmap(df_test_norm, annot=True, cmap="Blues", fmt='.1f')
plt.show()

In [None]:
np.unique(y_train)

In [None]:
plt.hist(y_train, bins=16)
plt.title('Training data distribution')
plt.show()

In [None]:
plt.hist(y_test, bins=16)
plt.title('Test data distribution')
plt.show()

In [None]:
len(y_test)

In [None]:
#features.loc[:,feature_sets]
#name, fset = tqdm_notebook(feature_sets.items(), desc='features')
features.loc[:, ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
             'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
             'spectral_rolloff', 'tonnetz', 'zcr']].shape

### 1.2.3 SVM with regularization


In [None]:
# classifiers: a dict with a key(name) and a classifier function;
# feature_sets: a dict with a key(name) and a set of features: specified features subset extracted from 'features' used in model;
# Function usage: compute score for each classifier with each feature_sets as features;
from sklearn.metrics import precision_recall_fscore_support
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    
    columns = list(classifiers.keys()) # insert a column 'dim';
    
    # an accuracy dataframe and time dataframe;
    # columns: classifiers.keys;  index: feature_sets.keys();
    scores_test = pd.DataFrame(columns = columns, index = feature_sets.keys())
    scores_train = pd.DataFrame(columns = columns, index = feature_sets.keys())
    times = pd.DataFrame(columns = classifiers.keys(), index = feature_sets.keys())
    
    precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1,
                                    average=None, warn_for=(‘precision’, ’recall’, ’f-score’), 
                                    sample_weight=None)
    
    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):
        
        # pre-process: columns = fset, that is, it only uses only one feature per iteration.
        # multi_label=False: use 'genre_top'(16) as labels y;
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores_test.loc[fset_name, 'dim'] = X_train.shape[1]
        scores_train.loc[fset_name, 'dim'] = X_train.shape[1]
        
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score_test = clf.score(X_test, y_test) # accuracy for function clf.
            score_train = clf.score(X_train, y_train) # accuracy for function clf.
            scores_test.loc[fset_name, clf_name] = score_test
            scores_train.loc[fset_name, clf_name] = score_train
            times.loc[fset_name, clf_name] = time.process_time() - t
            
    return scores_test, scores_train, times


def format_scores(scores):
    
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [None]:
from tensorflow.contrib.learn.python.learn.estimators import svm
classifiers = {

}

# feature_sets = features.columns.levels[0];
feature_sets = {
     'all': ['chroma_cens', 'chroma_cqt', 'chroma_stft', 'mfcc', 'rmse',
           'spectral_bandwidth', 'spectral_centroid', 'spectral_contrast',
           'spectral_rolloff', 'tonnetz', 'zcr']
}


# test_classifiers_features:
# scores.test, scores.train, times = test_classifiers_features(classifiers, feature_sets)
scores_test, scores_train, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores_test))
ipd.display(format_scores(scores_train))
#ipd.display(format_scores(score))
ipd.display(times.style.format('{:.4f}'))

# Score for LR model with all features: 61.10% (dim: 518).

In [None]:
keras.backend.clear_session()

model = keras.models.Sequential()
model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))
print(model.output_shape)

model.add(Conv2D(3, 13, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(15, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Conv2D(65, 1, 10, subsample=(1, 4)))
model.add(Activation("relu"))
print(model.output_shape)

model.add(Flatten())
print(model.output_shape)
model.add(Dense(labels_onehot.shape[1]))
model.add(Activation("softmax"))
print(model.output_shape)

optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)
#optimizer = keras.optimizers.Adam()#lr=1e-5)#
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)
loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)
loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)
#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)

loss

In [None]:
class MfccLoader(utils.Loader):
    raw_loader = utils.FfmpegLoader(sampling_rate=22050)
    #shape = (13, 190)  # For segmented tracks.
    shape = (13, 2582)
    def load(self, filename):
        import librosa
        x = self.raw_loader.load(filename)
        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.
        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)
        return mfcc

loader = MfccLoader()
SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)
loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape