### Spectral Clustering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import SpectralClustering, KMeans
from sklearn.neighbors import kneighbors_graph
from scipy import sparse, linalg

In [3]:
# Forward slashes work on every platform; the previous "data\spotify.csv"
# relied on the invalid escape sequence "\s" and only resolved on Windows.
data_unstructured = pd.read_csv("data/spotify.csv")
# Drop the first column of the CSV.
data_unstructured = data_unstructured.iloc[:, 1:]

# Numeric audio features used as clustering input.
feature_columns = ["acousticness", "danceability", "energy", "instrumentalness",
                   "liveness", "loudness", "speechiness", "tempo", "valence"]
relevant_columns = ["track_name", "genre"] + feature_columns

data = data_unstructured.loc[:, relevant_columns]
data = data[data["genre"] != "Gaming"]
# Merge genre spellings. A new frame is built instead of calling
# replace(..., inplace=True) on a column selection: that form acts on a
# temporary object under pandas copy-on-write and can leave `data` unchanged.
data = data.assign(genre=data["genre"].replace({"K-pop": "Pop", "reggae": "Reggae"}))

X = data[feature_columns].to_numpy()

X

array([[3.48000e-01, 5.91000e-01, 7.18000e-01, ..., 3.68000e-02,
        1.17995e+02, 4.68000e-01],
       [1.33000e-01, 5.68000e-01, 6.86000e-01, ..., 9.03000e-02,
        1.48294e+02, 4.00000e-01],
       [6.48000e-01, 6.00000e-01, 5.35000e-01, ..., 1.07000e-01,
        7.19120e+01, 2.69000e-01],
       ...,
       [3.57000e-01, 7.69000e-01, 9.62000e-01, ..., 6.91000e-02,
        1.07008e+02, 7.08000e-01],
       [1.41000e-01, 7.76000e-01, 9.32000e-01, ..., 1.23000e-01,
        1.04988e+02, 7.75000e-01],
       [1.85000e-01, 8.17000e-01, 9.50000e-01, ..., 4.58000e-02,
        1.33041e+02, 9.70000e-01]])

In [None]:
# Reference: https://juanitorduz.github.io/spectral_clustering/
def generate_graph_laplacian(df, nn):
    """Build the dense, unnormalized graph Laplacian of a k-NN graph.

    Parameters
    ----------
    df : array-like
        Samples, forwarded to ``kneighbors_graph`` as ``X``.
    nn : int
        Number of neighbors used for each sample.

    Returns
    -------
    numpy.ndarray
        Dense Laplacian of the symmetrized connectivity graph.
    """
    knn = kneighbors_graph(X=df, n_neighbors=nn, mode='connectivity')
    # Average the graph with its transpose so the adjacency is symmetric.
    symmetric_adjacency = (1/2) * (knn + knn.T)
    laplacian_sparse = sparse.csgraph.laplacian(
        csgraph=symmetric_adjacency, normed=False
    )
    return laplacian_sparse.toarray()

def compute_spectrum_graph_laplacian(graph_laplacian):
    """Compute real eigenvalues and eigenvectors of a graph Laplacian.

    A real symmetric matrix is decomposed with ``scipy.linalg.eigh``, the
    solver intended for symmetric input: it returns real eigenvalues in
    ascending order and orthonormal eigenvectors, so no imaginary part has
    to be discarded. Any other input falls back to the general solver
    ``scipy.linalg.eig`` and the result is projected onto the real numbers.

    Parameters
    ----------
    graph_laplacian : array-like
        Square matrix to decompose.

    Returns
    -------
    eigenvals : numpy.ndarray
        Real eigenvalues.
    eigenvcts : numpy.ndarray
        Real eigenvectors; column ``i`` belongs to ``eigenvals[i]``.
    """
    laplacian = np.asarray(graph_laplacian)
    is_real_symmetric = (
        np.isrealobj(laplacian)
        and laplacian.ndim == 2
        and laplacian.shape[0] == laplacian.shape[1]
        and np.allclose(laplacian, laplacian.T)
    )
    if is_real_symmetric:
        eigenvals, eigenvcts = linalg.eigh(laplacian)
        return eigenvals, eigenvcts

    # General (non-symmetric) input: keep only the real parts.
    eigenvals, eigenvcts = linalg.eig(laplacian)
    return np.real(eigenvals), np.real(eigenvcts)

def project_and_transpose(eigenvals, eigenvcts, num_ev):
    """Collect the eigenvectors of the ``num_ev`` smallest eigenvalues.

    Parameters
    ----------
    eigenvals : numpy.ndarray
        Eigenvalues; they are ranked in ascending order.
    eigenvcts : numpy.ndarray
        Matrix whose columns are indexed by the ranking of ``eigenvals``.
    num_ev : int
        Number of leading (smallest) eigenvalues to keep.

    Returns
    -------
    pandas.DataFrame
        One column per selected eigenvector, named ``v_0``, ``v_1``, ...
    """
    ascending_order = np.argsort(eigenvals)
    # squeeze collapses a single selected index to a scalar; the DataFrame
    # constructor below still yields one column in that case.
    chosen = np.squeeze(ascending_order[:num_ev])
    selected_vectors = eigenvcts[:, chosen]

    proj_df = pd.DataFrame(selected_vectors)
    proj_df.columns = [f"v_{position}" for position in proj_df.columns]
    return proj_df
    
def run_k_means(df, n_clusters):
    """Fit K-means on ``df`` and return the cluster index of every row.

    Parameters
    ----------
    df : array-like
        Samples to cluster.
    n_clusters : int
        Number of clusters to form.

    Returns
    -------
    numpy.ndarray
        Predicted cluster index for each sample of ``df``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=25)
    # fit() returns the estimator, so the prediction on the same data is chained.
    return model.fit(df).predict(df)

def spectral_clustering(df, n_neighbors, n_clusters):
    """Cluster ``df`` with the spectral clustering pipeline defined above.

    Steps: k-NN graph Laplacian -> eigen-decomposition -> embedding made of
    the eigenvectors of the ``n_clusters`` smallest eigenvalues -> K-means.

    Parameters
    ----------
    df : array-like
        Samples to cluster.
    n_neighbors : int
        Neighbors per sample in the k-NN graph.
    n_clusters : int
        Number of eigenvectors kept for the embedding.

    Returns
    -------
    list of str
        One label per sample, formatted as ``c_<cluster index>``.
    """
    laplacian = generate_graph_laplacian(df, n_neighbors)
    spectrum_values, spectrum_vectors = compute_spectrum_graph_laplacian(laplacian)
    embedding = project_and_transpose(spectrum_values, spectrum_vectors, n_clusters)
    # K-means is asked for one cluster per embedding column.
    assignments = run_k_means(embedding, embedding.columns.size)
    return [f"c_{label}" for label in assignments]

In [None]:
# Clustering settings: neighbors per sample in the k-NN graph, and cluster count.
N_NEIGHBORS = 500
N_CLUSTERS = 4

# spectral_clustering returns labels "c_0", "c_1", ...; map them to
# 1-based display names ("Cluster 1", "Cluster 2", ...).
cluster_display_names = {f"c_{i}": f"Cluster {i + 1}" for i in range(N_CLUSTERS)}
raw_labels = spectral_clustering(X, N_NEIGHBORS, N_CLUSTERS)

# Build a new frame instead of assigning into `data` and then calling
# replace(..., inplace=True) on the column selection: the in-place form acts
# on a temporary object under pandas copy-on-write and can leave the raw
# "c_*" labels in place.
data = data.assign(
    cluster=[cluster_display_names.get(label, label) for label in raw_labels]
)

In [None]:
#2D cross-section between valence and danceability

fig, [first, second] = plt.subplots(figsize = (15, 5), ncols = 2)

#True Labels
sns.scatterplot(x = "valence", y = "danceability",  hue = "genre", alpha = 0.60,
                data = data, ax = first)
first.set_title("True Song Labels Based on Valence and Danceability");
first.set_xlabel("Valence (overall positiveness of a song)", size = 12, weight = "bold");
first.set_ylabel("Danceability", size = 12, weight = "bold");

#Cluster Assignments
# Pin the hue order to the sorted cluster names so that colors are assigned
# deterministically instead of by order of first appearance in the data.
cluster_order = sorted(data["cluster"].unique())
sns.scatterplot(x = "valence", y = "danceability",  hue = "cluster", alpha = 0.60,
                hue_order = cluster_order, data = data, ax = second)
second.set_title("Spectral Clustering");
second.set_xlabel("Valence (overall positiveness of a song)", size = 12, weight = "bold");
second.set_ylabel("Danceability", size = 12, weight = "bold");
# Rebuild the legend from the labels seaborn generated. Hard-coding
# "Cluster 1".."Cluster 4" here could attach a name to the wrong color.
handles, labels  =  second.get_legend_handles_labels()
second.legend(handles, labels);

In [None]:
#2D cross-section between loudness and energy

fig, [first, second] = plt.subplots(figsize = (15, 5), ncols = 2)

#True Labels
sns.scatterplot(x = "loudness", y = "energy",  hue = "genre", alpha = 0.60,
                data = data, ax = first)
first.set_title("True Song Labels Based on Loudness and Energy");
first.set_xlabel("Loudness", size = 12, weight = "bold");
first.set_ylabel("Energy", size = 12, weight = "bold");

#Cluster Assignments
# Pin the hue order to the sorted cluster names so that colors are assigned
# deterministically instead of by order of first appearance in the data.
cluster_order = sorted(data["cluster"].unique())
sns.scatterplot(x = "loudness", y = "energy",  hue = "cluster", alpha = 0.60,
                hue_order = cluster_order, data = data, ax = second)
# A stray "2" expression that followed this statement was removed.
second.set_title("Spectral Clustering");
second.set_xlabel("Loudness", size = 12, weight = "bold");
second.set_ylabel("Energy", size = 12, weight = "bold");
# Rebuild the legend from the labels seaborn generated. Hard-coding
# "Cluster 1".."Cluster 4" here could attach a name to the wrong color.
handles, labels  =  second.get_legend_handles_labels()
second.legend(handles, labels);