In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

In [12]:
data = pd.read_csv("unpopular_songs_full.csv")

In [13]:
columns = ['danceability', 'loudness', 'duration_ms', 'instrumentalness','valence', 'acousticness']
df = data[columns]
df

Unnamed: 0,danceability,loudness,duration_ms,instrumentalness,valence,acousticness
0,0.722,-8.203,220549,0.0000,0.9270,0.67000
1,0.955,-11.392,149543,0.0595,0.0934,0.00480
2,0.753,-11.799,130019,0.0000,0.6000,0.57300
3,0.650,-10.818,116193,0.0000,0.5810,0.86500
4,0.690,-15.624,111827,0.0000,0.1050,0.76400
...,...,...,...,...,...,...
4068,0.862,-9.364,139442,0.0000,0.3850,0.01460
4069,0.686,-10.253,97440,0.0000,0.4020,0.01010
4070,0.725,-13.018,91742,0.0000,0.1390,0.00189
4071,0.763,-8.889,113868,0.0000,0.3020,0.00531


In [4]:
scaler = StandardScaler()
df = scaler.fit_transform(df)
df

array([[ 0.70212452,  0.43970598,  0.09349236, -0.57343199,  1.67698269,
         1.02426181],
       [ 1.89545935, -0.08463916, -0.52910405, -0.40250266, -1.37236672,
        -0.96331055],
       [ 0.86089439, -0.15155935, -0.70029483, -0.57343199,  0.48080076,
         0.73443241],
       ...,
       [ 0.71748935, -0.35199106, -1.03591608, -0.57343199, -1.2055597 ,
        -0.97200543],
       [ 0.91211048,  0.32691176, -0.84191039, -0.57343199, -0.60929776,
        -0.9617867 ],
       [ 0.71236774,  0.09096467, -0.20733658, -0.57343199,  0.3564271 ,
        -0.77955273]])

In [None]:
def davies_bouldin(X, labels):
    clusters = np.unique(labels)
    k = len(unique_labels)
    centres = np.array([X[labels == i].mean(axis=0) for i in clusters])

    cluster_distances = np.zeros((k, k))

    for i in range(k):
        for j in range(k):
            if i != j:
                distance = np.linalg.norm(centres[i] - centres[j])
                cluster_distances[i, j] = distance

    score = np.zeros(k)

    for i in range(k):
        max_diameters = 0
        for j in range(k):
            if i != j:
                diameter = np.max([np.linalg.norm(X[labels == i] - centres[i]), np.linalg.norm(X[labels == j] - centres[j])])
                if diameter > max_diameters:
                    max_diameters = diameter

        score[i] = max_diameters / cluster_distances[i, np.argmax(np.delete(cluster_distances[i, :], i))]

    return np.mean(score)

In [6]:
X = df
y = data['popularity']

scores_train = []
silhouettes_train = []
rand_train = []
scores_test = []
silhouettes_test = []
rand_test = []

for i in range(10):
    print("Itération :",i+1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)
    
    kmeans = KMeans(n_clusters=10)  
    kmeans.fit(X_train)
    
    clusters = kmeans.predict(X_test)

    scores_train.append( davies_bouldin_score(X_train, kmeans.labels_))
    silhouettes_train.append(silhouette_score(X_train, kmeans.labels_))
    rand_train.append(adjusted_rand_score(y_train, kmeans.labels_))
    print("Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index):",davies_bouldin_score(X_train, kmeans.labels_), silhouette_score(X_train, kmeans.labels_), adjusted_rand_score(y_train, kmeans.labels_))

        # Étape 5 : Calculer les scores sur le jeu de test
    scores_test.append(davies_bouldin_score(X_test, clusters))
    silhouettes_test.append(silhouette_score(X_test, clusters))
    rand_test.append(adjusted_rand_score(y_test, clusters))
    print("Scores sur le jeu de test (DB, Silhouette, Rand Index)", davies_bouldin_score(X_test, clusters), silhouette_score(X_test, clusters), adjusted_rand_score(y_test, clusters))



print("KMeans :\n")
print("Moyenne des scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index):",np.mean(scores_train), np.mean(silhouettes_train), np.mean(rand_train))
print("Moyenne des scores sur le jeu de test (DB, Silhouette, Rand Index)",np.mean(scores_test), np.mean(silhouettes_test), np.mean(rand_test))


Itération : 1


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2127926975479368 0.22016512761155943 -0.0026907572055941504
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.2188953575913835 0.21871056814406822 -0.0021354254183962415
Itération : 2


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2299962095280348 0.2213900241076404 0.0007966160005763929
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.2710257486489698 0.21726760660558717 -0.0024185087271262444
Itération : 3


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.317923415290002 0.22324820193356268 -0.003756938013779615
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.3772412667266694 0.20636651857325508 0.0020569262892738134
Itération : 4


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.1769366416860723 0.22218314211341278 4.2206044003186534e-05
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.3701811134296473 0.2047624762830889 -0.00326164913041378
Itération : 5


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2257015487427574 0.22169364657748014 -0.0017416333597559258
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.2254541904471499 0.2120182448854089 0.0011610979331291601
Itération : 6


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.23594029327333 0.22338787787364162 -0.0023051089847934488
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.1986490251909967 0.2179133966034063 0.00241227661492925
Itération : 7


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.314815664778973 0.22160147364643493 0.0003632750284555863
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.3882858216508303 0.2055158723689498 -0.0032090971243990784
Itération : 8


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2543261398611425 0.21895345150569637 -0.0013318534471571809
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.1857383889714048 0.22579010950984602 -0.0015992212091856992
Itération : 9


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.244976487193067 0.22009494080663483 -0.002880674979637682
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.225737403234366 0.21759724845333747 0.0028850056945133066
Itération : 10


  super()._check_params_vs_input(X, default_n_init=10)


Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2481109426396826 0.2242901772626575 -0.00042537879451418694
Scores sur le jeu de test (DB, Silhouette, Rand Index) 1.242302281817289 0.21321263327280535 -0.0016952975765696292
KMeans :

Moyenne des scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 1.2461520040540999 0.22170080634387204 -0.0013930247712197026
Moyenne des scores sur le jeu de test (DB, Silhouette, Rand Index) 1.2703510597708707 0.21391546746997533 -0.0005803892654245142


In [7]:
scores_train = []
silhouettes_train = []
rand_train = []
scores_test = []
silhouettes_test = []
rand_test = []

for i in range(10):
    print("Itération :",i+1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)
    
    gmm = GaussianMixture(n_components=10)
    gmm.fit(X_train)
    
    clusters = gmm.predict(X_test)

    scores_train.append( davies_bouldin_score(X_train, gmm.predict(X_train)))
    silhouettes_train.append(silhouette_score(X_train, gmm.predict(X_train)))
    rand_train.append(adjusted_rand_score(y_train, gmm.predict(X_train)))
    print("Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index):",davies_bouldin_score(X_train, gmm.predict(X_train)), silhouette_score(X_train, gmm.predict(X_train)), adjusted_rand_score(y_train, gmm.predict(X_train)))

        # Étape 5 : Calculer les scores sur le jeu de test
    scores_test.append(davies_bouldin_score(X_test, clusters))
    silhouettes_test.append(silhouette_score(X_test, clusters))
    rand_test.append(adjusted_rand_score(y_test, clusters))
    print("Scores sur le jeu de test (DB, Silhouette, Rand Index)",davies_bouldin_score(X_test, clusters), silhouette_score(X_test, clusters), adjusted_rand_score(y_test, clusters))



print("Gaussian Mixture :\n")
print("Moyenne des scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index):",np.mean(scores_train), np.mean(silhouettes_train), np.mean(rand_train))
print("Moyenne des scores sur le jeu de test (DB, Silhouette, Rand Index)",np.mean(scores_test), np.mean(silhouettes_test), np.mean(rand_test))


Itération : 1
Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 3.271792495501825 0.011511909171955624 0.003471727423021604
Scores sur le jeu de test (DB, Silhouette, Rand Index) 4.11372589928917 -0.0048119305089468975 -0.017762598612460483
Itération : 2
Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 3.295385181809607 0.006488798386281572 -0.0038742929141073095
Scores sur le jeu de test (DB, Silhouette, Rand Index) 4.210654570215031 -0.0017610013634669428 -0.010491764990232985
Itération : 3
Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 3.0843377622991994 0.041502332499324175 -0.007098302356814998
Scores sur le jeu de test (DB, Silhouette, Rand Index) 3.8910901297299154 0.024377519418094115 -0.012882892895736674
Itération : 4
Scores sur le jeu d'apprentissage (DB, Silhouette, Rand Index): 3.0284514826594684 0.0023341169914713276 -0.012695531426212192
Scores sur le jeu de test (DB, Silhouette, Rand Index) 2.9304241035777294 -0.011056774842

In [8]:
kmeans = KMeans(n_clusters=10)
kmeans_clusters = kmeans.fit_predict(X)

gmm = GaussianMixture(n_components=10)
gmm_clusters = gmm.fit_predict(X)

rand_index = adjusted_rand_score(kmeans_clusters, gmm_clusters)

print("Rand Index entre K-Means et GMM :",rand_index)


  super()._check_params_vs_input(X, default_n_init=10)


Rand Index entre K-Means et GMM : 0.21384797788184182


In [14]:
import pandas as pd
df['popularity']= data['popularity']

# Supposons que vous avez un DataFrame 'df' contenant vos données, y compris les niveaux de popularité.
# Supposons également que 'kmeans_clusters' contient les clusters obtenus par K-Means.

# Ajouter les informations de clustering au DataFrame
df['KMeans_Cluster'] = kmeans_clusters

# Compter le nombre d'occurrences de chaque niveau de popularité dans chaque cluster
cluster_popularity_counts = df.groupby(['KMeans_Cluster', 'popularity']).size().unstack(fill_value=0)

# Afficher les résultats
print(cluster_popularity_counts)


popularity        0    1   2   3   4  5  6   7   8  9
KMeans_Cluster                                       
0               289   91  29  15   4  4  2  14  15  1
1               557  161  44  29  14  4  6  29  31  1
2               232   29   5   0   2  0  0   4   3  0
3                 4    2   0   0   0  0  0   0   0  0
4               389  158  51  18  13  5  2  21  25  1
5                89   35  21   2   3  4  1   1   6  0
6               485  130  40  16   9  5  2  24  37  1
7               238   66  13   6   5  3  1   4   2  0
8               163  111  24   5   5  4  1   2   4  0
9               130   53  13   2   1  1  1   0   0  0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['popularity']= data['popularity']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KMeans_Cluster'] = kmeans_clusters


In [15]:
vrai_labels = df['popularity']

# Calculer le Rand Index
rand_index = adjusted_rand_score(vrai_labels, kmeans_clusters)

# Afficher le résultat
print(f"Rand Index : {rand_index}")

Rand Index : -0.0017754010995472875


In [25]:
# Calculer les distances entre chaque chanson et les centres des clusters
distances = cdist(df_features, centers, metric='euclidean')

# Identifier les indices des chansons les plus proches de chaque centre de cluster
indices_plus_proches = np.argmin(distances, axis=0)

# Afficher les résultats
for i, indice_chanson in enumerate(indices_plus_proches):
    print(f"Centre du Cluster {i + 1} - Chanson la plus proche : {data['track_name'].iloc[indice_chanson]}")

ValueError: XA and XB must have the same number of columns (i.e. feature dimension.)

In [19]:
# Supposons que 'kmeans' est votre modèle K-Means déjà entraîné
# et 'df_features' est votre DataFrame avec les caractéristiques utilisées pour l'entraînement

# Accéder aux centroides des clusters
centroides = kmeans.cluster_centers_

# Créer un DataFrame pour les centroides pour une analyse plus facile
df_centroides = pd.DataFrame(centroides, columns=df_features.columns)

# Visualiser les valeurs des centroides
print(df_centroides)


   danceability  loudness  duration_ms  instrumentalness   valence  \
0      0.256809  0.070722    -0.278911         -0.531860  0.591195   
1      0.737421  0.537575     0.068170         -0.504912  1.217662   
2     -1.657397 -2.224911    -0.142580          1.843513 -1.425691   
3     -0.744730 -0.284632    14.564022         -0.087089 -0.014255   
4     -0.708322  0.628038     0.034188         -0.487832 -0.261554   
5     -0.026978  0.052002     2.312243          0.629456 -0.086748   
6      0.745846  0.185848    -0.332408         -0.541692 -0.456544   
7     -1.005819 -0.738218     0.025533         -0.408644 -0.955380   
8     -0.271800  0.132232    -0.154394          1.786636 -0.340898   
9      0.238785 -1.307776    -0.430658          1.930702  0.064983   

   acousticness  
0      0.974694  
1     -0.468640  
2      1.396309  
3      0.091524  
4     -0.680217  
5     -0.193029  
6     -0.607751  
7      1.326126  
8     -0.736271  
9      1.570117  
