# Visualizing Genres using Clustering Models

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

## Data Pre-processing

In [2]:
spotify = pd.read_csv('../data/tops/features.csv',index_col=0)
spotify.dropna(how='any', inplace=True)
spotify.head()

Unnamed: 0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,Callaita,0.61,0.624,2.0,-4.773,1.0,0.309,0.6,2e-06,0.243,0.244,176.169,audio_features,2TH65lNHgvLxCKXM3apjxI,spotify:track:2TH65lNHgvLxCKXM3apjxI,https://api.spotify.com/v1/tracks/2TH65lNHgvLx...,https://api.spotify.com/v1/audio-analysis/2TH6...,250534.0,4.0
1,Sacrifices (with EARTHGANG & J. Cole feat. Smi...,0.825,0.672,0.0,-7.108,1.0,0.418,0.0858,0.0,0.123,0.246,147.01,audio_features,7wTA0NKIm6T7nP2kaymU2a,spotify:track:7wTA0NKIm6T7nP2kaymU2a,https://api.spotify.com/v1/tracks/7wTA0NKIm6T7...,https://api.spotify.com/v1/audio-analysis/7wTA...,382307.0,4.0
2,Envy Me,0.74,0.488,1.0,-7.664,0.0,0.27,0.234,0.0,0.241,0.584,149.042,audio_features,7rvyVWja33WG9R97oeJAjx,spotify:track:7rvyVWja33WG9R97oeJAjx,https://api.spotify.com/v1/tracks/7rvyVWja33WG...,https://api.spotify.com/v1/audio-analysis/7rvy...,120133.0,4.0
3,Me Odias?,0.786,0.581,1.0,-6.387,0.0,0.197,0.552,0.0,0.0859,0.33,131.874,audio_features,5I31UVARw4Di9SooJGN9el,spotify:track:5I31UVARw4Di9SooJGN9el,https://api.spotify.com/v1/tracks/5I31UVARw4Di...,https://api.spotify.com/v1/audio-analysis/5I31...,234550.0,4.0
4,Quiero,0.651,0.737,9.0,-7.416,1.0,0.0415,0.0447,0.0,0.156,0.703,104.832,audio_features,0iEW2LlHNXVyZhwO96sL3z,spotify:track:0iEW2LlHNXVyZhwO96sL3z,https://api.spotify.com/v1/tracks/0iEW2LlHNXVy...,https://api.spotify.com/v1/audio-analysis/0iEW...,209583.0,4.0


In [3]:
features =['danceability','energy','key','loudness', 'mode', 
           'speechiness', 'acousticness', 'instrumentalness', 
           'liveness','valence','tempo','duration_ms', 'time_signature']

x = spotify.loc[:,features].values
x = StandardScaler().fit_transform(x)

## Principal Component Analysis

In [4]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(x)

principal_df = pd.DataFrame(data=principal_components, columns=['principal_component_1', 'principal_component_2'])

kmeans_pca = KMeans(n_clusters=8, random_state=0, init='random').fit(principal_df)
predicted_pca = kmeans_pca.fit_predict(music_data_pca)

principal_df['prediction'] = predicted_pca
principal_df.set_index(keys=spotify.index, inplace=True)
principal_df.head()

NameError: name 'music_data_pca' is not defined

In [None]:
spotify['pca_pred'] = predicted_pca

## t-SNE

In [None]:
tsne = TSNE(n_components=2)
x_embedded = tsne.fit_transform(x)

In [None]:
x_embedded

In [None]:
tsne = TSNE(n_components=2)
x_embedded = tsne.fit_transform(x)
tsne_df = pd.DataFrame(data=x_embedded, columns=['tsne_1', 'tsne_2'])

kmeans_tsne = KMeans(n_clusters=8, random_state=0, init='random').fit(tsne_df)
predicted_tsne = kmeans_tsne.fit_predict(tsne_df)

tsne_df['prediction'] = predicted_tsne
tsne_df.set_index(keys=spotify.index, inplace=True)
tsne_df.head()

In [None]:
spotify['tsne_pred'] = predicted_tsne

## Grouped DataFrames

In [None]:
pred_0_pca = principal_df[principal_df['prediction'] == 0]
pred_1_pca = principal_df[principal_df['prediction'] == 1]
pred_2_pca = principal_df[principal_df['prediction'] == 2]
pred_3_pca = principal_df[principal_df['prediction'] == 3]
pred_4_pca = principal_df[principal_df['prediction'] == 4]

pred_0_tsne = tsne_df[tsne_df['prediction'] == 0]
pred_1_tsne = tsne_df[tsne_df['prediction'] == 1]
pred_2_tsne = tsne_df[tsne_df['prediction'] == 2]
pred_3_tsne = tsne_df[tsne_df['prediction'] == 3]
pred_4_tsne = tsne_df[tsne_df['prediction'] == 4]

## Comparison Plot

In [None]:
plt.figure(figsize=(20,10))
plt.subplot(1,2,1)
sns.scatterplot(x='principal_component_1', y='principal_component_2', data=principal_df, 
                hue='prediction', palette=sns.color_palette('hls',8), 
                style='prediction', markers=["X","X","X","X","X","X","X","X"])

for center, i in zip(kmeans_pca.cluster_centers_, range(8)):
    if not i:
        plt.scatter(center[0], center[1], marker='*',s=200, c='black', label='Centroid')
    else:
        plt.scatter(center[0], center[1], marker='*',s=200, c='black')
    
plt.xlabel('First Principal Component (PCA)')
plt.ylabel('Second Principal Component (PCA)')
plt.title('k-Means Clustering via PCA', pad=20)
plt.legend(shadow=True)

plt.subplot(1,2,2)
sns.scatterplot(x='tsne_1', y='tsne_2', data=tsne_df, 
                hue='prediction', palette=sns.color_palette('hls',8),
                style='prediction', markers=['X','X','X','X','X','X','X','X'])

for center, i in zip(kmeans_tsne.cluster_centers_, range(8)):
    if not i:
        plt.scatter(center[0], center[1], marker='*',s=200, c='black', label='Centroid')
    else:
        plt.scatter(center[0], center[1], marker='*',s=200, c='black')

plt.xlabel('t-SNE One')
plt.ylabel('t-SNE Two')
plt.title('k-Means Clustering via t-SNE Reduction', pad=20)
plt.legend(shadow=True)
plt.show();