In [None]:
''' Install Spotipy package '''

!pip install spotipy
!pip install -U kaleido

In [None]:
''' Mount Google Drive '''

from google.colab import drive

# The CSV inputs and plot outputs used by later cells live under this mount.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
''' IMPORTS '''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN, KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

from sklearn.manifold import TSNE
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.offline import plot
from plotly.subplots import make_subplots
import kaleido
import plotly

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import warnings
# Silences every warning for the rest of the session. NOTE(review): this also
# hides deprecation notices from the libraries used below, so API changes only
# surface once they become hard errors.
warnings.filterwarnings('ignore')

In [None]:
''' READ DATA '''

# Load the four source tables in one pass; the paths are relative to the
# notebook's working directory.
data, genre_data, year_data, artist_data = map(pd.read_csv, (
    'drive/MyDrive/USML/v2/data.csv',
    'drive/MyDrive/USML/v2/data_by_genres.csv',
    'drive/MyDrive/USML/v2/data_by_year.csv',
    'drive/MyDrive/USML/v2/data_by_artist.csv',
))

In [None]:
''' READ DIMENSION REDUCED DATA '''

# Precomputed embeddings; later cells read their 'x' and 'y' columns to draw
# the cluster scatter plots.

# PCA
pca_music, pca_genre, pca_artist = map(pd.read_csv, (
    'drive/MyDrive/USML/v2/pca_music.csv',
    'drive/MyDrive/USML/v2/pca_genre.csv',
    'drive/MyDrive/USML/v2/pca_artist.csv',
))

# TSNE
tsne_music, tsne_genre, tsne_artist = map(pd.read_csv, (
    'drive/MyDrive/USML/v2/tsne_music.csv',
    'drive/MyDrive/USML/v2/tsne_genre.csv',
    'drive/MyDrive/USML/v2/tsne_artist.csv',
))

<h2> Clustering Algorithms </h2>

<h3> DBSCAN </h3>

In [None]:
''' Clustering music data '''

# Cluster every track with DBSCAN on the standardized audio features, then
# show the labels on the precomputed PCA and t-SNE embeddings side by side.
X = data[['danceability', 'energy', 'key',
          'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# NOTE(review): eps = 0.1 is measured in the standardized 13-feature space;
# confirm it is not so tight that most points end up labelled as noise.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('dbscan', DBSCAN(eps = 0.1, min_samples = 5))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run the whole clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# DBSCAN labels noise points -1; that label is not a cluster, so it is left
# out of the count.
n_found_clusters = len(np.unique(clusters[clusters != -1]))
print('Number of clusters = ', n_found_clusters)

# assumes pca_music / tsne_music rows are in the same order as `data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_music['x'],
    'y_component': pca_music['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_music['x'],
    'y_component': tsne_music['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of songs using DBSCAN (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/dbscan-music.png')

In [None]:
''' Clustering genre data '''

# Cluster the per-genre averages with DBSCAN on the standardized audio
# features, then show the labels on the precomputed PCA and t-SNE embeddings.
X = genre_data[['danceability', 'energy', 'key',
                'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# NOTE(review): eps = 0.1 is measured in the standardized 13-feature space;
# confirm it is not so tight that most points end up labelled as noise.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('dbscan', DBSCAN(eps = 0.1, min_samples = 3))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run the whole clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_genre / tsne_genre rows are in the same order as `genre_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_genre['x'],
    'y_component': pca_genre['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_genre['x'],
    'y_component': tsne_genre['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of genres using DBSCAN (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/dbscan-genre.png')

In [None]:
''' Clustering artist data '''

# Cluster the per-artist averages with DBSCAN on the standardized audio
# features, then show the labels on the precomputed PCA and t-SNE embeddings.
X = artist_data[['danceability', 'energy', 'key',
                 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# NOTE(review): eps = 0.1 is measured in the standardized 13-feature space;
# confirm it is not so tight that most points end up labelled as noise.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('dbscan', DBSCAN(eps = 0.1, min_samples = 10))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run the whole clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# DBSCAN labels noise points -1; that label is not a cluster, so it is left
# out of the count.
n_found_clusters = len(np.unique(clusters[clusters != -1]))
print('Number of clusters = ', n_found_clusters)

# assumes pca_artist / tsne_artist rows are in the same order as `artist_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_artist['x'],
    'y_component': pca_artist['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_artist['x'],
    'y_component': tsne_artist['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of artists using DBSCAN (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/dbscan-artist.png')

<h3> KMeans </h3>

In [None]:
''' Clustering music data '''

# K-Means (20 clusters, fixed seed) on the standardized audio features of every
# track, drawn on the precomputed PCA and t-SNE embeddings.
X = data[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'popularity',
]]

cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=20, random_state=0)),
])

# fit() returns the fitted pipeline, so the labels can be read off directly.
clusters = cluster_pipeline.fit(X).predict(X)

df_temp_pca = pd.DataFrame({
    'x_component': pca_music['x'],
    'y_component': pca_music['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_music['x'],
    'y_component': tsne_music['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(embedding, x='x_component', y='y_component', color='cluster')
    for embedding in (df_temp_pca, df_temp_tsne)
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of songs using KMeans (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

fig.write_image('drive/MyDrive/USML/v2/plots/clustering/kmeans-music.png')

In [None]:
''' Clustering genre data '''

# K-Means (10 clusters, fixed seed) on the standardized per-genre audio
# features, drawn on the precomputed PCA and t-SNE embeddings.
X = genre_data[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'popularity',
]]

cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=0)),
])

# fit() returns the fitted pipeline, so the labels can be read off directly.
clusters = cluster_pipeline.fit(X).predict(X)

df_temp_pca = pd.DataFrame({
    'x_component': pca_genre['x'],
    'y_component': pca_genre['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_genre['x'],
    'y_component': tsne_genre['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(embedding, x='x_component', y='y_component', color='cluster')
    for embedding in (df_temp_pca, df_temp_tsne)
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of genres using KMeans (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

fig.write_image('drive/MyDrive/USML/v2/plots/clustering/kmeans-genre.png')

In [None]:
''' Clustering artist data '''

# K-Means (15 clusters, fixed seed) on the standardized per-artist audio
# features, drawn on the precomputed PCA and t-SNE embeddings.
X = artist_data[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'popularity',
]]

cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=15, random_state=0)),
])

# fit() returns the fitted pipeline, so the labels can be read off directly.
clusters = cluster_pipeline.fit(X).predict(X)

# Number of distinct labels that were actually assigned.
print('Number of clusters = ', len(Counter(clusters)))

df_temp_pca = pd.DataFrame({
    'x_component': pca_artist['x'],
    'y_component': pca_artist['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_artist['x'],
    'y_component': tsne_artist['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(embedding, x='x_component', y='y_component', color='cluster')
    for embedding in (df_temp_pca, df_temp_tsne)
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of artists using KMeans (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

fig.write_image('drive/MyDrive/USML/v2/plots/clustering/kmeans-artist.png')

<h3> Spectral </h3>

In [None]:
''' Clustering music data '''

# Spectral clustering is run on the first rows only (presumably to keep the
# pairwise affinity computation tractable); the embeddings are cut to the same
# number of rows so the labels line up.
n_rows = 10000

X = data[['danceability', 'energy', 'key',
          'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']].head(n_rows)

# random_state pins the estimator's random initialization so the plot is
# reproducible, matching the seeded KMeans and GMM cells.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('spectral', SpectralClustering(n_clusters=20, random_state=0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run this expensive clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_music / tsne_music rows are in the same order as `data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_music['x'][:n_rows],
    'y_component': pca_music['y'][:n_rows],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_music['x'][:n_rows],
    'y_component': tsne_music['y'][:n_rows],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of songs using Spectral clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/spectral-music.png')

In [None]:
''' Clustering genre data '''

# Spectral clustering of the per-genre averages on the standardized audio
# features, drawn on the precomputed PCA and t-SNE embeddings.
X = genre_data[['danceability', 'energy', 'key',
                'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# random_state pins the estimator's random initialization so the plot is
# reproducible, matching the seeded KMeans and GMM cells.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('spectral', SpectralClustering(n_clusters=10, random_state=0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run the clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_genre / tsne_genre rows are in the same order as `genre_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_genre['x'],
    'y_component': pca_genre['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_genre['x'],
    'y_component': tsne_genre['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of genres using Spectral clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/spectral-genre.png')

In [None]:
''' Clustering artist data '''

# Spectral clustering is run on the first rows only (presumably to keep the
# pairwise affinity computation tractable); the embeddings are cut to the same
# number of rows so the labels line up.
n_rows = 10000

X = artist_data[['danceability', 'energy', 'key',
                 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']].head(n_rows)

# random_state pins the estimator's random initialization so the plot is
# reproducible, matching the seeded KMeans and GMM cells.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('spectral', SpectralClustering(n_clusters=15, random_state=0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only run this expensive clustering twice.
clusters = cluster_pipeline.fit_predict(X)

# Number of distinct labels that were actually assigned.
print('Number of clusters = ', len(Counter(clusters).keys()))

# assumes pca_artist / tsne_artist rows are in the same order as `artist_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_artist['x'][:n_rows],
    'y_component': pca_artist['y'][:n_rows],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_artist['x'][:n_rows],
    'y_component': tsne_artist['y'][:n_rows],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of artists using Spectral clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/spectral-artist.png')

<h3> GMM </h3>

In [None]:
''' Clustering music data '''

# Gaussian mixture (20 components, fixed seed) on the standardized audio
# features of every track, drawn on the precomputed PCA and t-SNE embeddings.
X = data[['danceability', 'energy', 'key',
          'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('gmm', GaussianMixture(n_components = 20, random_state = 0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only train the mixture twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_music / tsne_music rows are in the same order as `data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_music['x'],
    'y_component': pca_music['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_music['x'],
    'y_component': tsne_music['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of songs using GMM (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/gmm-music.png')

In [None]:
''' Clustering genre data '''

# Gaussian mixture (10 components, fixed seed) on the standardized per-genre
# audio features, drawn on the precomputed PCA and t-SNE embeddings.
X = genre_data[['danceability', 'energy', 'key',
                'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('gmm', GaussianMixture(n_components = 10, random_state = 0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only train the mixture twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_genre / tsne_genre rows are in the same order as `genre_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_genre['x'],
    'y_component': pca_genre['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_genre['x'],
    'y_component': tsne_genre['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of genres using GMM (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/gmm-genre.png')

In [None]:
''' Clustering artist data '''

# Gaussian mixture (15 components, fixed seed) on the standardized per-artist
# audio features, drawn on the precomputed PCA and t-SNE embeddings.
X = artist_data[['danceability', 'energy', 'key',
                 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('gmm', GaussianMixture(n_components = 15, random_state = 0))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only train the mixture twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_artist / tsne_artist rows are in the same order as `artist_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_artist['x'],
    'y_component': pca_artist['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_artist['x'],
    'y_component': tsne_artist['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of artists using GMM (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/gmm-artist.png')

<h3> Agglomerative </h3>

In [None]:
''' Clustering music data '''

# Agglomerative clustering is run on the first rows only (presumably to bound
# memory and run time); the embeddings are cut to the same number of rows so
# the labels line up.
n_rows = 30000

X = data[['danceability', 'energy', 'key',
          'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
          'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']].head(n_rows)

# Euclidean distance is the estimator's default, so it is left implicit: the
# `affinity` keyword was renamed to `metric` and then removed from
# scikit-learn, so passing it fails on current releases.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('agglomerative', AgglomerativeClustering(n_clusters = 20, linkage = 'complete'))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only build the whole hierarchy twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_music / tsne_music rows are in the same order as `data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_music['x'][:n_rows],
    'y_component': pca_music['y'][:n_rows],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_music['x'][:n_rows],
    'y_component': tsne_music['y'][:n_rows],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of songs using Agglomerative Clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/agglomerative-music.png')

In [None]:
''' Clustering genre data '''

# Agglomerative clustering (complete linkage, 10 clusters) of the per-genre
# averages on the standardized audio features, drawn on the precomputed PCA
# and t-SNE embeddings.
X = genre_data[['danceability', 'energy', 'key',
                'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Euclidean distance is the estimator's default, so it is left implicit: the
# `affinity` keyword was renamed to `metric` and then removed from
# scikit-learn, so passing it fails on current releases.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('agglomerative', AgglomerativeClustering(n_clusters = 10, linkage = 'complete'))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only build the whole hierarchy twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_genre / tsne_genre rows are in the same order as `genre_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_genre['x'],
    'y_component': pca_genre['y'],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_genre['x'],
    'y_component': tsne_genre['y'],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of genres using Agglomerative Clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/agglomerative-genre.png')

In [None]:
''' Clustering artist data '''

# Agglomerative clustering is run on the first rows only (presumably to bound
# memory and run time); the embeddings are cut to the same number of rows so
# the labels line up.
n_rows = 30000

X = artist_data[['danceability', 'energy', 'key',
                 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']].head(n_rows)

# Euclidean distance is the estimator's default, so it is left implicit: the
# `affinity` keyword was renamed to `metric` and then removed from
# scikit-learn, so passing it fails on current releases.
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('agglomerative', AgglomerativeClustering(n_clusters = 15, linkage = 'complete'))])

# fit_predict() fits the pipeline itself; a separate fit() beforehand would
# only build the whole hierarchy twice.
clusters = cluster_pipeline.fit_predict(X)

# assumes pca_artist / tsne_artist rows are in the same order as `artist_data` — TODO confirm
df_temp_pca = pd.DataFrame({
    'x_component': pca_artist['x'][:n_rows],
    'y_component': pca_artist['y'][:n_rows],
    'cluster': clusters,
})

df_temp_tsne = pd.DataFrame({
    'x_component': tsne_artist['x'][:n_rows],
    'y_component': tsne_artist['y'][:n_rows],
    'cluster': clusters,
})

figures = [
    px.scatter(df_temp_pca, x = 'x_component', y = 'y_component', color = 'cluster'),
    px.scatter(df_temp_tsne, x = 'x_component', y = 'y_component', color = 'cluster'),
]

# One subplot column per embedding: PCA on the left, t-SNE on the right.
fig = make_subplots(rows=1, cols=len(figures))

for subplot_col, figure in enumerate(figures, start=1):
    for scatter_trace in figure.data:
        fig.append_trace(scatter_trace, row=1, col=subplot_col)
fig.update_layout(title={'text': 'Clustering of artists using Agglomerative Clustering (visualized with PCA vs. tSNE)', 'font': {'size': 20}})
fig.show()

# Save a static copy of the figure next to the other clustering plots.
fig.write_image('drive/MyDrive/USML/v2/plots/clustering/agglomerative-artist.png')