In [1]:
%matplotlib inline

import os
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#Built-in module from FMA
import utils


In [2]:
supervised_tracks = pd.read_csv('tracks_supervised.csv', index_col=0)

### DataFrames:
- supervised_tracks: Contains audio features and labels for genres


In [3]:
print(supervised_tracks.head(), supervised_tracks.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13129 entries, 2 to 124911
Data columns (total 12 columns):
acousticness        13129 non-null float64
danceability        13129 non-null float64
energy              13129 non-null float64
instrumentalness    13129 non-null float64
liveness            13129 non-null float64
speechiness         13129 non-null float64
tempo               13129 non-null float64
valence             13129 non-null float64
year_released       13129 non-null float64
genres_top          13129 non-null object
genres_all          13129 non-null object
duration            13129 non-null float64
dtypes: float64(10), object(2)
memory usage: 1.3+ MB
          acousticness  danceability    energy  instrumentalness  liveness  \
track_id                                                                     
2             0.416675      0.675894  0.634476          0.010628  0.177647   
3             0.374408      0.528643  0.817461          0.001851  0.105880   
5          

In [4]:
#print(audio_features.head(), audio_features.info())

### Preliminary Machine Learning

#### MeanShift

In [5]:
from sklearn.cluster import MeanShift
#from sklearn.cluster import AgglomerativeClustering

In [6]:
X = supervised_tracks[['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence']]

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

In [7]:
print(labels)
print(cluster_centers)
print(cluster_centers.shape)
print("number of estimated clusters : %d" % n_clusters_)


[0 0 0 ..., 0 0 0]
[[  5.12189903e-01   5.39468855e-01   5.28416565e-01   6.39504180e-01
    1.85235361e-01   8.90235131e-02   1.09530824e+02   4.42817303e-01]]
(1, 8)
number of estimated clusters : 1


In the first run of MeanShift, only 1 cluster was identified. To improve the result, I'll try the following:
- tuning parameters
- modifying / normalizing the features

In [9]:
from sklearn.preprocessing import Normalizer

norm = Normalizer()
X_new = Normalizer().fit_transform(X)
pd.DataFrame(X_new,columns=X.columns).head(10)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,0.002511,0.004073,0.003824,6.4e-05,0.001071,0.00096,0.999974,0.003475
1,0.002949,0.004164,0.006439,1.5e-05,0.000834,0.003637,0.999957,0.002121
2,0.000435,0.007436,0.006996,7e-06,0.003721,0.001243,0.999921,0.0062
3,0.008529,0.005899,0.008286,0.008652,0.001035,0.000296,0.999837,0.008636
4,0.003956,0.00449,0.004903,0.00017,0.000845,0.004598,0.999929,0.007822
5,0.000541,0.001325,0.003082,0.00424,0.001136,0.000155,0.999984,0.000814
6,0.003486,0.0068,0.002461,0.006202,0.000797,0.000362,0.999932,0.00565
7,0.028769,0.013014,0.002258,0.010314,0.003155,0.000796,0.999428,0.004895
8,0.004485,0.002565,0.005575,0.006159,0.0006,0.000535,0.999952,0.000628
9,0.007071,0.003451,0.004994,0.007189,0.002082,0.000697,0.99991,0.006132


In [10]:
ms = MeanShift()
ms.fit(X_new)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

In [11]:
#print(labels)
#print(cluster_centers)
print("unique labels", labels_unique)
print(cluster_centers.shape)
print("number of estimated clusters : %d" % n_clusters_)


unique labels [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
(32, 8)
number of estimated clusters : 32


In [12]:
for center in cluster_centers:
    print(center)

[  3.51707727e-03   3.79090685e-03   4.36475808e-03   4.98134812e-03
   1.32732358e-03   6.20965217e-04   9.99943500e-01   3.30116821e-03]
[ 0.00395012  0.00665657  0.00478975  0.02595981  0.00272784  0.00137623
  0.99960468  0.00237762]
[ 0.02539936  0.01099698  0.00300099  0.01057238  0.00439752  0.00120894
  0.99952145  0.00542535]
[ 0.0073403   0.01264887  0.00975572  0.01588068  0.00130752  0.01444928
  0.99955601  0.01011208]
[  1.83393493e-02   5.38413238e-03   1.12711831e-02   1.76620307e-02
   1.27947554e-02   8.78630807e-04   9.99421952e-01   1.33426183e-02]
[  4.87904297e-02   1.69784747e-02   5.27476937e-03   8.69503375e-05
   6.71735380e-03   1.72852545e-03   9.98624700e-01   1.99811938e-03]
[ 0.00822875  0.01606381  0.03168794  0.03559801  0.00341067  0.00164987
  0.99850939  0.01917138]
[  7.74644755e-02   2.71507305e-02   1.20572754e-04   7.48124014e-02
   7.15151949e-03   3.24638772e-03   9.93778118e-01   2.92442498e-03]
[ 0.02116622  0.01318676  0.02050669  0.01987001

In [None]:
from sklearn.pipeline import Pipeline


In [None]:
# # Plot result
# from itertools import cycle

# plt.figure(1)
# plt.clf()

# colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
# for k, col in zip(range(n_clusters_), colors):
#     my_members = labels == k
#     cluster_center = cluster_centers[k]
#     plt.plot(X.loc[my_members, 0], X.loc[my_members, 1], col + '.')
#     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
#              markeredgecolor='k', markersize=14)
# plt.title('Estimated number of clusters: %d' % n_clusters_)
# plt.show()

#### Affinity Propagation

In [None]:
# from sklearn.cluster import AffinityPropagation
# from sklearn import metrics


# # #############################################################################
# # Compute Affinity Propagation
# af = AffinityPropagation(preference=-50).fit(X)
# cluster_centers_indices = af.cluster_centers_indices_
# labels = af.labels_

# n_clusters_ = len(cluster_centers_indices)

# print('Estimated number of clusters: %d' % n_clusters_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# # #############################################################################
# # Plot result
# import matplotlib.pyplot as plt
# from itertools import cycle

# plt.close('all')
# plt.figure(1)
# plt.clf()

# colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
# for k, col in zip(range(n_clusters_), colors):
#     class_members = labels == k
#     cluster_center = X[cluster_centers_indices[k]]
#     plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
#     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
#              markeredgecolor='k', markersize=14)
#     for x in X[class_members]:
#         plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

# plt.title('Estimated number of clusters: %d' % n_clusters_)
# plt.show()