In [None]:
# Download the helper library (the mgd wheel published on its GitHub release page)

!wget https://github.com/ufrpe-2021-2-IA/music-dataset/releases/download/v1.1.2/mgd-1.1.2-py3-none-any.whl
# Install mgd, letting pip also look for distributions in /content/.
# NOTE(review): assumes wget saved the wheel in /content/ (the working directory on Colab) — confirm.
%pip install mgd --find-links /content/

In [None]:
# Install scikit-learn-extra: it provides the sklearn_extra package from which KMedoids is imported.
%pip install scikit-learn-extra

In [None]:
from mgd import utils

# Fetch the 'standardized' variant of the dataset through the mgd helper.
# NOTE(review): presumably this writes /content/mgd_standardized.csv, the file read further down — confirm.
utils.download_dataset('standardized')

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import cluster
from sklearn import metrics
from sklearn import mixture
from sklearn_extra.cluster import KMedoids

In [None]:
# Read the dataset and shuffle its rows: frac=1.0 keeps every row, and the
# fixed random_state makes the shuffled order repeatable.
dataset_path = '/content/mgd_standardized.csv'
df = pd.read_csv(dataset_path).sample(frac=1.0, random_state=22)

In [None]:
# Preview of the dataset entries (head() shows the first 5 rows by default)
df.head()

Unnamed: 0,song_id,label,spectral_centroid_mean,spectral_centroid_std,spectral_centroid_skew,spectral_centroid_kurtosis,spectral_centroid_median,spectral_centroid_min,spectral_centroid_max,spectral_flatness_mean,...,tonnetz_5_median,tonnetz_5_min,tonnetz_5_max,tonnetz_6_mean,tonnetz_6_std,tonnetz_6_skew,tonnetz_6_kurtosis,tonnetz_6_median,tonnetz_6_min,tonnetz_6_max
866,reggae.00050.wav,8,1.051107,2.725637,-0.59808,-0.874107,0.514739,-0.777964,1.086332,2.379048,...,-0.249972,0.428833,-0.983983,-0.213158,-0.59812,-1.254527,-0.329291,0.100623,0.138318,-0.820549
191,classical.00045.wav,1,-1.433061,-1.777644,-0.641979,-0.505925,-1.328526,-0.233223,-2.024723,-0.7138,...,0.676203,-0.793815,0.032886,0.545841,0.705776,-1.230407,-0.486097,0.858529,-1.03342,0.194184
732,pop.00049.wav,7,-0.299668,1.565697,0.457804,0.237259,-0.438698,-1.08447,1.849367,-0.350016,...,0.127714,-2.060203,1.700123,1.705215,2.553196,0.975014,-0.222565,0.602329,-1.850041,2.671904
657,metal.00078.wav,6,0.855397,-0.106286,-1.315603,-0.755641,1.010388,0.951476,-0.072471,0.353623,...,0.192174,0.566299,-1.019629,0.150987,-0.962032,-0.095019,0.41084,0.337541,1.022577,-0.806552
155,classical.00065.wav,1,-1.919163,-1.916638,0.209252,-0.260501,-1.827764,-0.638904,-2.322155,-0.738876,...,-0.468952,-1.782476,-0.037417,-0.578533,1.093656,0.794091,-0.565256,-1.324233,-0.675676,0.526491


In [None]:
# The first two columns identify each song (song id and genre label)...
true_labels = df.iloc[:, :2]
# ...and every remaining column is a feature, kept as a plain NumPy array.
features = df.iloc[:, 2:].to_numpy()

In [None]:
# These seeds were generated with np.random.randint(0, 999999, size=50);
# they are hard-coded so that every run of the notebook uses the same ones.
seeds = [
    8230, 652865, 115153, 13895, 897030,
    770879, 635255, 832508, 722384, 843355,
    896943, 677075, 783267, 918119, 93487,
    11157, 808101, 123187, 104329, 684938,
    232375, 998432, 635025, 552263, 536805,
    664894, 100581, 189723, 375784, 425864,
    972287, 598820, 168946, 970498, 174848,
    637504, 967059, 647788, 871312, 575300,
    939588, 730937, 729146, 815384, 6936,
    239314, 292608, 934670, 518483, 748275,
]

In [None]:
# Result containers: one entry per clustering algorithm, each mapping a
# metric name to the list of values collected over the runs. Every list is a
# distinct object, so the algorithms never share storage.
results = {
    algorithm: {
        metric: []
        for metric in ('calinski',
                       'davies',
                       'clusters',
                       'adjusted_rand_score',
                       'adjusted_mutual_info_score')
    }
    for algorithm in ('k-means', 'k-medoids', 'gmm')
}

# K-Means

In [None]:
# Collect this cell's runs locally and publish them with a single assignment
# at the end, so re-running the cell replaces the k-means results instead of
# appending a second batch of runs to them.
k_means_runs = {metric: [] for metric in results['k-means']}

# One independent run per seed (rather than a hard-coded range(50)), so the
# number of runs always matches the seed list.
for seed in seeds:
  # Instantiate the model.
  # - `algorithm` is left at the library default: the former 'auto' value was
  #   deprecated in scikit-learn 1.1 and is rejected by later releases.
  # - `n_init` is pinned to 10 (the historical default) because newer
  #   releases changed the default, which would alter the results.
  k_means = cluster.KMeans(max_iter=5000,
                           init='k-means++',
                           n_clusters=10,
                           n_init=10,
                           random_state=seed,
                           tol=0.0001)

  # Training: learn the clusters
  k_means.fit(features)

  # Cluster assigned to each sample
  clusters = k_means.predict(features)

  # Internal metrics (use only the features and the assignments)
  davies = metrics.davies_bouldin_score(features, clusters)
  calinski = metrics.calinski_harabasz_score(features, clusters)

  # External metrics (compare the assignments with the genre labels)
  adjusted_rand_score = metrics.adjusted_rand_score(true_labels['label'], clusters)
  adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(true_labels['label'], clusters)

  # Record this run
  k_means_runs['clusters'].append(clusters)
  k_means_runs['davies'].append(davies)
  k_means_runs['calinski'].append(calinski)
  k_means_runs['adjusted_rand_score'].append(adjusted_rand_score)
  k_means_runs['adjusted_mutual_info_score'].append(adjusted_mutual_info_score)

# Publish the complete batch of runs at once
results['k-means'] = k_means_runs
  

# K-medoids

In [None]:
# Collect this cell's runs locally and publish them with a single assignment
# at the end, so re-running the cell replaces the k-medoids results instead
# of appending a second batch of runs to them.
k_medoids_runs = {metric: [] for metric in results['k-medoids']}

# One independent run per seed (rather than a hard-coded range(50)), so the
# number of runs always matches the seed list.
for seed in seeds:
  # Instantiate the model
  k_medoids = KMedoids(max_iter=5000,
                       init='k-medoids++',
                       n_clusters=10,
                       random_state=seed)

  # Training: learn the clusters
  k_medoids.fit(features)

  # Cluster assigned to each sample
  clusters = k_medoids.predict(features)

  # Internal metrics (use only the features and the assignments)
  davies = metrics.davies_bouldin_score(features, clusters)
  calinski = metrics.calinski_harabasz_score(features, clusters)

  # External metrics (compare the assignments with the genre labels)
  adjusted_rand_score = metrics.adjusted_rand_score(true_labels['label'], clusters)
  adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(true_labels['label'], clusters)

  # Record this run
  k_medoids_runs['clusters'].append(clusters)
  k_medoids_runs['davies'].append(davies)
  k_medoids_runs['calinski'].append(calinski)
  k_medoids_runs['adjusted_rand_score'].append(adjusted_rand_score)
  k_medoids_runs['adjusted_mutual_info_score'].append(adjusted_mutual_info_score)

# Publish the complete batch of runs at once
results['k-medoids'] = k_medoids_runs

# Gaussian Mixture

In [None]:
# Collect this cell's runs locally and publish them with a single assignment
# at the end, so re-running the cell replaces the GMM results instead of
# appending a second batch of runs to them.
gmm_runs = {metric: [] for metric in results['gmm']}

# One independent run per seed (rather than a hard-coded range(50)), so the
# number of runs always matches the seed list.
for seed in seeds:
  # Instantiate the model
  gmm = mixture.GaussianMixture(max_iter=5000,
                                n_components=10,
                                random_state=seed)

  # Training: learn the mixture components
  gmm.fit(features)

  # Component (cluster) assigned to each sample
  clusters = gmm.predict(features)

  # Internal metrics (use only the features and the assignments)
  davies = metrics.davies_bouldin_score(features, clusters)
  calinski = metrics.calinski_harabasz_score(features, clusters)

  # External metrics (compare the assignments with the genre labels)
  adjusted_rand_score = metrics.adjusted_rand_score(true_labels['label'], clusters)
  adjusted_mutual_info_score = metrics.adjusted_mutual_info_score(true_labels['label'], clusters)

  # Record this run
  gmm_runs['clusters'].append(clusters)
  gmm_runs['davies'].append(davies)
  gmm_runs['calinski'].append(calinski)
  gmm_runs['adjusted_rand_score'].append(adjusted_rand_score)
  gmm_runs['adjusted_mutual_info_score'].append(adjusted_mutual_info_score)

# Publish the complete batch of runs at once
results['gmm'] = gmm_runs

# Análise dos Resultados

In [None]:
def mean_std_metric(alg: str, metric: str, precision=3):
  """Summarise one metric of one algorithm over all of its recorded runs.

  Args:
    alg: Key of the algorithm in the global ``results`` dictionary.
    metric: Key of the metric inside ``results[alg]``.
    precision: Number of decimal places kept in both returned values.

  Returns:
    A ``(mean, standard deviation)`` tuple, each rounded to ``precision``
    decimal places.
  """
  values = np.array(results[alg][metric])
  mean = round(values.mean(), precision)
  std = round(values.std(), precision)
  return mean, std

## Média do Davies

In [None]:
# (mean, std) of the Davies-Bouldin score, one line per algorithm
for alg in ('k-means', 'k-medoids', 'gmm'):
  print(mean_std_metric(alg, 'davies'))

(2.663, 0.141)
(3.467, 0.318)
(2.832, 0.153)


## Média do Calinski

In [None]:
# (mean, std) of the Calinski-Harabasz score, one line per algorithm
for alg in ('k-means', 'k-medoids', 'gmm'):
  print(mean_std_metric(alg, 'calinski'))

(61.803, 0.479)
(50.028, 3.806)
(60.503, 0.779)


## Média adjusted_rand_score

In [None]:
# (mean, std) of the adjusted Rand score, one line per algorithm
for alg in ('k-means', 'k-medoids', 'gmm'):
  print(mean_std_metric(alg, 'adjusted_rand_score'))

(0.201, 0.013)
(0.168, 0.012)
(0.198, 0.013)


## Média adjusted_mutual_info_score

In [None]:
# (mean, std) of the adjusted mutual information score, one line per algorithm
for alg in ('k-means', 'k-medoids', 'gmm'):
  print(mean_std_metric(alg, 'adjusted_mutual_info_score'))

(0.325, 0.011)
(0.289, 0.019)
(0.327, 0.011)


# Visualização através de Principal Components Analysis (PCA)

Para visualização dos resultados, utilizamos uma técnica de redução de dimensionalidade chamada PCA. Assim, conseguimos reduzir as características para 2 dimensões e colorir de acordo com o cluster/rótulo.

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

In [None]:
# Project the features onto their first two principal components.
# random_state is fixed because, with the default svd_solver='auto', PCA may
# select a randomized SVD solver depending on the data shape; seeding it keeps
# the 2-D coordinates (and therefore the plots) reproducible across runs.
pca_2d = PCA(n_components=2, random_state=22)
features_2d = pca_2d.fit_transform(features)

In [None]:
def get_clusters(alg: str):
  """Return the cluster assignments of the best recorded run of an algorithm.

  The best run is the one with the lowest Davies-Bouldin score stored in the
  global ``results`` dictionary.

  Args:
    alg: Key of the algorithm in ``results``.

  Returns:
    A list with one string per sample: the cluster id converted to ``str``.
  """
  runs = results[alg]
  best_run = np.argmin(runs['davies'])
  return [str(cluster_id) for cluster_id in runs['clusters'][best_run]]

## 2 Dimensões

### Grupos originais (gêneros)

In [None]:
# Genre name for each numeric label id: the position in the list is the id.
label_mapper = dict(enumerate([
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]))

# One row per song: genre name, song id and its 2-D PCA coordinates.
genre_points = pd.DataFrame({
    'genre': [label_mapper[label] for label in true_labels['label']],
    'id': true_labels['song_id'],
    'x': features_2d[:, 0],
    'y': features_2d[:, 1],
})

# Rows are sorted by genre before plotting; colour and marker symbol both
# encode the genre, and hovering a point shows the song id.
fig = px.scatter(genre_points.sort_values(by='genre'),
                 x="x",
                 y="y",
                 color="genre",
                 symbol="genre",
                 hover_data=['id'])
fig.show()


### Clusters

#### K-Means

In [None]:
# One row per song: k-means cluster (as returned by get_clusters), genre
# label, song id and 2-D PCA coordinates.
kmeans_points = pd.DataFrame({
    'cluster': get_clusters('k-means'),
    'genre': true_labels['label'],
    'id': true_labels['song_id'],
    'x': features_2d[:, 0],
    'y': features_2d[:, 1],
})

# Rows are sorted by cluster before plotting; colour and marker symbol both
# encode the cluster, and hovering a point shows the song id and genre label.
fig = px.scatter(kmeans_points.sort_values(by='cluster'),
                 x="x",
                 y="y",
                 symbol="cluster",
                 color="cluster",
                 hover_data=['id', 'genre'])

fig.show()


#### K-Medoids

In [None]:
# One row per song: k-medoids cluster (as returned by get_clusters), genre
# label, song id and 2-D PCA coordinates.
kmedoids_points = pd.DataFrame({
    'cluster': get_clusters('k-medoids'),
    'genre': true_labels['label'],
    'id': true_labels['song_id'],
    'x': features_2d[:, 0],
    'y': features_2d[:, 1],
})

# Rows are sorted by cluster before plotting; colour and marker symbol both
# encode the cluster, and hovering a point shows the song id and genre label.
fig = px.scatter(kmedoids_points.sort_values(by='cluster'),
                 x="x",
                 y="y",
                 symbol="cluster",
                 color="cluster",
                 hover_data=['id', 'genre'])

fig.show()


#### GMM

In [None]:
# One row per song: GMM cluster (as returned by get_clusters), genre label,
# song id and 2-D PCA coordinates.
gmm_points = pd.DataFrame({
    'cluster': get_clusters('gmm'),
    'genre': true_labels['label'],
    'id': true_labels['song_id'],
    'x': features_2d[:, 0],
    'y': features_2d[:, 1],
})

# Rows are sorted by cluster before plotting; colour and marker symbol both
# encode the cluster, and hovering a point shows the song id and genre label.
fig = px.scatter(gmm_points.sort_values(by='cluster'),
                 x="x",
                 y="y",
                 symbol="cluster",
                 color="cluster",
                 hover_data=['id', 'genre'])

fig.show()


# Salvar Resultados

In [None]:
import pickle
from google.colab import files

# Name of the pickle file that stores the whole results dictionary
file_name = 'results_dict.pkl'

# Serialise every recorded run (metrics and cluster assignments) to disk
with open(file_name, 'wb+') as pickle_file:
  pickle.dump(results, pickle_file)

In [None]:
# Hand the pickle file to the google.colab `files` helper for download.
# NOTE(review): presumably this triggers a browser download and only works inside Colab — confirm.
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Read the pickle written above back into a separate variable.
# NOTE(review): presumably a round-trip sanity check; results_ is not used afterwards in the visible cells.
with open(file_name, 'rb') as pickle_file:
  results_ = pickle.load(pickle_file)