In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nem.cleanup import filter_artists
from nem.util import load_artists

In [None]:
# Read the artist data through the project helper `load_artists`, using a
# relative path. Later cells iterate `artists.genres` and treat every entry as
# an iterable of genre names.
# NOTE(review): presumably a pandas DataFrame with one row per artist — confirm.
artists = load_artists('../artists.csv')

In [None]:
# Tally how often each genre name occurs across all entries of `artists.genres`.
# A plain dict keeps the genres in first-seen order, which the Series inherits.
# NOTE(review): every entry contributes one count per genre it lists, so if the
# entries are artists this is an artist count despite the 'song count' label —
# confirm against `load_artists`.
genre_tally = {}
for genre_list in artists.genres:
    for genre_name in genre_list:
        if genre_name in genre_tally:
            genre_tally[genre_name] += 1
        else:
            genre_tally[genre_name] = 1

# Index = genre name, value = number of occurrences.
unique_genres = pd.Series(genre_tally, name='song count')
n_genres = unique_genres.shape[0]
print(f'There are {n_genres} unique genres')
print('A small excerpt:', unique_genres.index[:30].tolist())

#### Genres with the most songs

In [None]:
# Rank genres by their count, largest first, and display the top 25.
unique_genres.sort_values(ascending=False).head(25)

#### Distribution

In [None]:
# Histogram of the per-genre counts in 15 bins. The y axis is logarithmic so
# that sparsely populated bins remain visible next to the dominant ones.
hist_ax = unique_genres.hist(bins=15)
hist_ax.set_yscale('log')

In [None]:
# Summary statistics of the per-genre counts before any filtering is applied.
unique_genres.describe()

In [None]:
# Keep only genres that occur more than 25 times. Everything built afterwards
# (index maps, co-occurrence matrix, clustering) sees just this reduced set.
is_frequent = unique_genres > 25
unique_genres = unique_genres[is_frequent]
n_genres = unique_genres.shape[0]
# Summary statistics of the counts that survived the filter.
unique_genres.describe()

In [None]:
# Two-way lookup between a genre name and its row/column position in the
# matrices built below. Positions follow the order of `unique_genres.index`.
idx_to_genre = dict(enumerate(unique_genres.index))
genre_to_idx = {genre: position for position, genre in idx_to_genre.items()}

In [None]:
# Co-occurrence counts: entry [i, j] is the number of times genre i and genre j
# are listed together in the same entry of `artists.genres`.
adjacency_matrix = np.zeros((n_genres, n_genres))

for artist_genres in artists.genres:
    # Genres removed by the frequency filter have no index. Drop them up front
    # instead of relying on a swallowed KeyError inside the pair loop.
    kept = [genre_to_idx[genre] for genre in artist_genres if genre in genre_to_idx]
    if not kept:
        continue
    # Add 1 for every ordered pair, deliberately including (g, g): the diagonal
    # then holds each genre's own occurrence count, which keeps every row sum
    # positive. `np.add.at` is unbuffered, so a genre listed twice in one entry
    # contributes once per occurrence, just like an explicit double loop.
    np.add.at(adjacency_matrix, np.ix_(kept, kept), 1)

plt.figure(figsize=(8,8))
plt.imshow(adjacency_matrix)
plt.show()

# Turn counts into a distance: frequent co-occurrence -> small distance. The +1
# avoids division by zero for pairs that never co-occur; the exponent is a
# tuning knob that is currently the identity.
distance_matrix = 1 / (adjacency_matrix+1)**1.0
plt.figure(figsize=(8,8))
plt.imshow(distance_matrix)

In [None]:
# Convert the raw co-occurrence counts into per-row shares, then symmetrise by
# averaging the matrix with its transpose.
# NOTE: this cell overwrites `adjacency_matrix`, so running it a second time
# normalises an already-normalised matrix — re-run the cell that builds the
# counts first.
row_totals = adjacency_matrix.sum(axis=1, keepdims=True)
row_shares = adjacency_matrix / row_totals
adjacency_matrix = (row_shares + row_shares.T) / 2
# Similarity -> distance. The +1 keeps the result finite for zero similarity and
# the exponent 1.0 is currently a no-op.
distance_matrix = 1 / (adjacency_matrix + 1) ** 1.0

In [None]:
import inspect

from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering

# scikit-learn 1.2 renamed the `affinity` argument of AgglomerativeClustering to
# `metric`, and 1.4 removed the old name. Pick whichever keyword the installed
# version accepts so this cell runs on both old and new releases.
_precomputed_kwarg = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering.__init__).parameters
    else 'affinity'
)

# Average-linkage agglomerative clustering on the precomputed genre distances.
# With n_clusters=None the number of clusters is determined by
# `distance_threshold`. A threshold of 0.6 was also tried; its result was
# overwritten by this fit before being used, so only this fit is run.
clustering = AgglomerativeClustering(
    n_clusters=None,
    linkage='average',
    distance_threshold=0.997,
    **{_precomputed_kwarg: 'precomputed'},
)
labels = clustering.fit_predict(distance_matrix)


# Alternative algorithms tried earlier, kept for reference:
#clustering = DBSCAN(eps= 0.2, metric='precomputed', min_samples=5)
#labels = clustering.fit_predict(distance_matrix)

#clustering = SpectralClustering(n_clusters=15)
#labels = clustering.fit_predict(distance_matrix)

In [None]:
from pprint import pprint

# One entry per clustered genre: index = genre name, value = cluster label.
genre_names = [idx_to_genre[position] for position in range(len(labels))]
clusters = pd.Series(dict(zip(genre_names, labels)))

# Group the genre names by their label and print them as {label: [genres, ...]}.
members_by_label = clusters.groupby(clusters).groups
pprint({label: list(members) for label, members in members_by_label.items()})

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted model and draw its dendrogram.

    Parameters
    ----------
    model
        Fitted hierarchical-clustering estimator. The function reads its
        ``children_``, ``distances_`` and ``labels_`` attributes.
    **kwargs
        Passed through unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    None
        The dendrogram is handed to SciPy; nothing is returned.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples underneath each merge node. Child ids below
    # `leaf_total` are leaves (size 1); larger ids refer to the earlier merge
    # at position `id - leaf_total`, whose size has already been filled in.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, children in enumerate(merges):
        subtree_sizes[step] = sum(
            1 if child < leaf_total else subtree_sizes[child - leaf_total]
            for child in children
        )

    # Columns: child a, child b, merge distance, sample count.
    linkage_matrix = np.column_stack(
        (merges, model.distances_, subtree_sizes)
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)


# Draw the merge tree of the fitted `clustering` model on a 15x15 inch canvas.
# `plot_dendrogram` reads `clustering.distances_`, so the model must have been
# fitted in a way that populates that attribute.
plt.figure(figsize=(15,15))
plot_dendrogram(clustering)

* coarse: 59.5% artists, 39.54% songs

* 1: 55.64% artists, 36.94% songs
* 2: 51.64% artists, 34.05% songs
* 3: 49.82% artists, 32.69% songs
* 4: 46.12% artists, 29.31% songs
* 5: 44.8% artists, 28.65% songs

* fine: 22.48% artists, 12.16% songs