In [None]:
# installs

# colab
# ! pip install scikit-plot

# local/server
# pip install scikit-plot

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_samples, silhouette_score

import scikitplot as skplot

In [None]:
# COLAB - auth big query

# from google.colab import auth
# auth.authenticate_user()
# print('Authenticated')

In [None]:
# Load the whole spotify_2018 table from BigQuery into a DataFrame.
PROJECT = "questrom"
SQL = "SELECT * from `questrom.datasets.spotify_2018`"

# project_id is the second parameter of read_gbq; naming it makes the call self-explanatory
spotify = pd.read_gbq(SQL, project_id=PROJECT)

In [None]:
# the shape: (rows, columns) of the frame returned by the query
spotify.shape

In [None]:
# quick review: first three rows of the raw table
spotify.head(3)

In [None]:
# info: per-column dtype and non-null count, to spot missing values and text columns
spotify.info()

In [None]:
# summary statistics, transposed so each described column is a row
# (makes differences in scale between features easy to scan)
spotify.describe().T

In [None]:
# Build the clustering feature matrix X from the raw table.
# time_signature, mode and key are left out along with the name/artists
# descriptors; the remaining columns are keyed by the `id` column.
# Field documentation:
# https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject
COLS = ['time_signature', 'mode', 'key', 'name', 'artists']

# set_index moves `id` out of the columns and into the row index in one step
X = spotify.drop(columns=COLS).set_index('id')

In [None]:
# quick preview: first three rows of the feature matrix (row index is the id column)
X.head(3)

In [None]:
# Standardize the features: the raw columns are on different scales, and
# distance-based clustering should give every variable equal weight.
sc = StandardScaler()
sc.fit(X)
xs = sc.transform(X)

# the scaler returns a bare array, so re-attach the original row and column labels
X = pd.DataFrame(data=xs, columns=X.columns, index=X.index)

In [None]:
# confirm the scaling: each column should now show mean ~0 and std ~1
# (std is not exactly 1 because describe() uses the sample std, the scaler the population std)
X.describe().T

In [None]:
# Hierarchical clustering: draw one dendrogram per linkage method, side by
# side, so the tree shapes can be compared visually.
METHODS = ['single', 'complete', 'average', 'ward']
plt.figure(figsize=(15,5))

for panel, method in enumerate(METHODS, start=1):
  # build the merge tree for this linkage method
  tree = linkage(X.values, method=method)

  # subplot positions are 1-based, hence start=1 above
  plt.subplot(1, len(METHODS), panel)
  plt.title(method)
  dendrogram(tree, labels=X.index, leaf_rotation=90, leaf_font_size=10)

plt.show()

In [None]:
# average and ward linkage both appear to have favorable properties to me
# we can think of genres at a macro or a micro level
# for example: https://www.musicgenreslist.com/
# the dataset appears to be a list of the top 100 songs
# I am going to use average linkage because I want to test whether the top songs span several genres


In [None]:
# Average-linkage tree on the scaled features; `avg` is reused when cutting
# the tree into flat clusters.
avg = linkage(X.values, method="average")

plt.figure(figsize=(10, 6))
dendrogram(
  avg,
  labels=X.index,
  leaf_rotation=90,
  leaf_font_size=10,
  color_threshold=4,  # same height as the reference line drawn below
)

# horizontal reference line at the candidate cut height
plt.axhline(y=4)
plt.show()

In [None]:
# Cut the average-linkage tree at distance 4 (the height marked on the
# dendrogram) to get one flat cluster label per song.
hc_labs = fcluster(avg, t=4, criterion="distance")

# silhouette: per-song values for the detailed plot, overall mean as a single score
hc_ssamps = silhouette_samples(X, hc_labs)
hc_silo = silhouette_score(X, hc_labs)

# which cluster ids the cut produced
np.unique(hc_labs)

In [None]:
# KMeans sweep: fit one model per candidate k and record the inertia and the
# mean silhouette score, to be plotted against k afterwards.
KS = range(2, 30)

# storage: one entry per k, in the same order as KS
inertia = []
silo = []

for k in KS:
  # random_state pins the centroid initialisation so the two curves are
  # reproducible on Restart & Run All; n_init is explicit because its default
  # differs between scikit-learn releases.
  km = KMeans(n_clusters=k, n_init=10, random_state=42)
  # fit_predict returns the training labels directly (no second pass over X)
  labs = km.fit_predict(X)
  inertia.append(km.inertia_)
  silo.append(silhouette_score(X, labs))


In [None]:
# Plot the sweep diagnostics side by side: inertia (look for the elbow) and
# mean silhouette score (higher is better) against the number of clusters.
plt.figure(figsize=(15,5))

# seaborn >= 0.12 only accepts x and y as keywords; list(KS) gives it a plain
# sequence instead of a range object.
plt.subplot(1, 2, 1)
plt.title("Inertia")
sns.lineplot(x=list(KS), y=inertia)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")

plt.subplot(1, 2, 2)
plt.title("Silhouette Score")
sns.lineplot(x=list(KS), y=silo)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Mean silhouette score")

plt.show()

In [None]:
# Print the silhouette score for the ten smallest k values.
# Each score is paired with its k taken from KS, rather than a hard-coded
# offset, so the printed k stays correct if the sweep range is changed.
for n_clusters, s in zip(KS, silo[:10]):
  print(n_clusters, s)

In [None]:
# 9 looks like a good number: we get an improvement in the silhouette score, and it is
# approximately where the elbow in the inertia curve could be placed



In [None]:
# Final KMeans model with 9 clusters.
# KMeans cluster ids are arbitrary and depend on the random initialisation, so
# the seed is fixed to keep the ids stable across runs; n_init is explicit
# because its default differs between scikit-learn releases.
# NOTE(review): the written observations further down cite specific cluster
# ids - re-check them against this seeded fit.
k9 = KMeans(n_clusters=9, n_init=10, random_state=42)
k9_labs = k9.fit_predict(X)

# metrics: overall mean silhouette and the per-song values
k9_silo = silhouette_score(X, k9_labs)
k9_ssamps = silhouette_samples(X, k9_labs)

# which cluster ids were assigned
np.unique(k9_labs)

In [None]:
# Compare the two solutions via their silhouette plots.
# This one: per-song silhouette values grouped by hierarchical-clustering label.

skplot.metrics.plot_silhouette(X, hc_labs, title="HClust", figsize=(15,5))
plt.show()

In [None]:
# Same silhouette plot for the 9-cluster KMeans labels, for comparison with the HClust plot.
skplot.metrics.plot_silhouette(X, k9_labs, title="KMeans - 9", figsize=(15,5))
plt.show()

In [None]:
# I like the KMeans fit.  There are a few negative silhouette values, which suggests those songs
# may sit closer to a neighboring cluster than to their own, but some of the clusters look pretty good.

In [None]:
# Profile the songs: attach the KMeans label to the original (unscaled) table.
# k9_labs is a plain array in the row order of X, and X was derived from
# spotify without reordering rows, so this positional assignment lines up
# only as long as spotify has not been sorted or filtered in between.
spotify['k9_labs'] = k9_labs

In [None]:
# profile: mean of every numeric column per KMeans cluster, on the original
# (unscaled) values so the numbers are interpretable.
# numeric_only=True is required on pandas >= 2.0, where mean() raises a
# TypeError on non-numeric columns instead of silently dropping them
# (the frame still carries the id/name/artists columns, presumably text).
profile = spotify.groupby('k9_labs').mean(numeric_only=True)
profile

In [None]:
# Heatmap of the cluster profiles.
# Each profile column is z-scored across clusters so that columns on very
# different scales share one diverging colour scale centred on 0.
# A separately named scaler is used so the fitted feature scaler `sc` from the
# scaling step is not overwritten.
profile_scaler = StandardScaler()
profile_scaled = profile_scaler.fit_transform(profile)

plt.figure(figsize=(12, 6))
pal = sns.color_palette("vlag", as_cmap=True)
# the scaled values are a bare array, so pass the labels explicitly:
# columns on x, cluster ids on y
sns.heatmap(profile_scaled, center=0, cmap=pal,
            xticklabels=profile.columns, yticklabels=profile.index)
plt.show()

In [None]:
# counts by cluster, listed in cluster-id order so the rows line up with the
# profile table; sort=False alone does not guarantee that order, hence
# the explicit sort_index().
spotify.k9_labs.value_counts(sort=False).sort_index()

In [None]:
# OBSERVATIONS:
# NOTE: cluster ids are arbitrary labels from the KMeans run these notes were written against;
#       re-check the ids below after re-fitting.
#1. profile 0 = appears to be longer songs
#2. profile 1 = higher likelihood of a live song
#3. profile 5 = only 1 song, and it's higher on instrumentalness
#4. profile 4 = appears to be a representation of "average" songs
#5. profile 8 = the largest cluster, tends to be high on the positivity measure, but lower on tempo

In [None]:
# CONSIDERATIONS
# - Look at DBSCAN
# - explore both distance metrics (euclidean used) and different linkage approaches
# - I was ok with 9 clusters because this task was to think about genre proxies.  
#   In some cases, a large number of clusters is not actionable for a firm because managing them is too 
#   resource intensive.  Because we are trying to derive genres, volume is not an issue
# - consider the features themselves.  Perhaps we don't need to use them all if some are correlated.
# - Dive deeper into profiling.  Look at the songs within the clusters to validate as best as you can.
# - Enrich the dataset by appending/annotating the true genre of each song. 
#   After clustering the data, profile by the true genres to get a sense of how the segments align.