In [None]:
import time

import IPython.display as ipd
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import utils

In [None]:
# Read the three FMA metadata tables in a fixed order (tracks, features, echonest).
tracks, features, echonest = map(
    utils.load,
    (
        'fma_metadata/tracks.csv',
        'fma_metadata/features.csv',
        'fma_metadata/echonest.csv',
    ),
)

# Show (rows, columns) of each table as the cell result.
tuple(frame.shape for frame in (tracks, features, echonest))

((106574, 52), (106574, 518), (13129, 249))

In [3]:
# Preview the Echonest table as the cell result. The rendered output is a
# truncated view and shows a three-level column header
# (e.g. echonest / audio_features / acousticness) indexed by track_id.
echonest

Unnamed: 0_level_0,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest,echonest
Unnamed: 0_level_1,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,audio_features,metadata,metadata,...,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features,temporal_features
Unnamed: 0_level_2,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,album_date,album_name,...,214,215,216,217,218,219,220,221,222,223
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661,,,...,-1.992303,6.805694,0.233070,0.192880,0.027455,0.06408,3.676960,3.61288,13.316690,262.929749
3,0.374408,0.528643,0.817461,0.001851,0.105880,0.461818,126.957,0.269240,,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.082770,6.01864,16.673548,325.581085
5,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661,,,...,-2.288358,11.527109,0.256821,0.237820,0.060122,0.06014,5.926490,5.86635,16.013849,356.755737
10,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590,2008-03-11,Constant Hitmaker,...,-3.662988,21.508228,0.283352,0.267070,0.125704,0.08082,8.414010,8.33319,21.317064,483.403809
134,0.452217,0.513238,0.560410,0.019443,0.096567,0.525519,114.290,0.894072,,,...,-1.452696,2.356398,0.234686,0.199550,0.149332,0.06440,11.267070,11.20267,26.454180,751.147705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124857,0.007592,0.790364,0.719288,0.853114,0.720715,0.082550,141.332,0.890461,,,...,-1.721207,4.686078,0.213789,0.208800,0.007911,0.06395,2.040730,1.97678,8.144532,147.040405
124862,0.041498,0.843077,0.536496,0.865151,0.547949,0.074001,101.975,0.476845,,,...,-0.647897,1.282306,0.214586,0.181860,0.011247,0.06240,0.922360,0.85996,1.794739,6.321268
124863,0.000124,0.609686,0.895136,0.846624,0.632903,0.051517,129.996,0.496667,,,...,-0.771613,1.623510,0.180471,0.128185,0.010103,0.06222,2.251160,2.18894,5.578341,89.180328
124864,0.327576,0.574426,0.548327,0.452867,0.075928,0.033388,142.009,0.569274,,,...,-2.054143,7.927149,0.250178,0.219205,0.014851,0.06390,1.487440,1.42354,2.173092,12.503966


In [None]:
def test_classifiers_features(classifiers, data, labels):
    columns = ["dim"] + list(classifiers.keys())
    scores = pd.DataFrame(columns=columns, index=["echonest_all"])
    times = pd.DataFrame(columns=classifiers.keys())
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=42)
    scores['dim'] = X_train.shape[1]
    for clf_name, clf in classifiers.items():
        print(clf_name, "====================start")
        t = time.process_time()
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(score)
        scores[clf_name] = score
        times[clf_name] = time.process_time() - t
        print(clf_name, "====================end")
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [8]:
# Build a two-column table (track index as a column + "genre") from the
# top-level genre of every track that has one.
label = (
    tracks[("track", "genre_top")]
    .dropna()
    .rename("genre")   # name the Series so the value column comes out as "genre"
    .reset_index()
)

In [9]:
# Take the Echonest temporal features, keep only tracks that have a genre
# label (inner join on track_id), then discard the join key.
echonest_temporal_data = (
    echonest[("echonest", "temporal_features")]
    .merge(label, how="inner", on="track_id")
    .drop(columns="track_id")
)

In [10]:
# Separate the target column from the feature columns. Both right-hand
# expressions are evaluated on the merged table before either name is rebound.
label, echonest_temporal_data = (
    echonest_temporal_data["genre"],
    echonest_temporal_data.drop(columns="genre"),
)

In [None]:
# Single seed handed to every estimator that accepts one, so that re-running
# the cell reproduces the same scores.
rstate = 42

# NOTE(review): the saved output of this cell shows LogisticRegression stopping
# at its iteration limit; the warning suggests scaling the data or raising
# max_iter. Left unchanged here because either fix changes the reported scores.
classifiers = {
    'LR': LogisticRegression(n_jobs=-1, random_state=rstate),
    'kNN': KNeighborsClassifier(n_neighbors=200, n_jobs=-1),
    'SVCrbf': SVC(kernel='rbf', random_state=rstate),
    # Tree-based estimators are randomized; without a seed their scores
    # differ from run to run.
    'DT': DecisionTreeClassifier(max_depth=100, random_state=rstate),
    'RF1': RandomForestClassifier(max_depth=20, n_estimators=20, random_state=rstate),
}

# Fit and score every classifier on the Echonest temporal features.
scores, times = test_classifiers_features(classifiers, echonest_temporal_data, label)




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6323215049166311
0.5083368961094484
0.5814450619923044
0.5664814023086789
0.6883283454467721


In [None]:
# Render the styled score table: the best score in the row is highlighted and
# scores are shown as percentages.
ipd.display(format_scores(scores))

Unnamed: 0,dim,LR,kNN,SVCrbf,DT,RF1
echonest_all,224,63.23%,50.83%,58.14%,56.65%,68.83%


Unnamed: 0,LR,kNN,SVCrbf,DT,RF1


In [13]:
# Write the accuracy table to disk; index=False leaves the row label out
# (switch to index=True to keep it).
scores.to_csv(path_or_buf='scores_.csv', index=False)

# Write the timing table the same way.
times.to_csv(path_or_buf='times_.csv', index=False)