In [2]:
# fetch data

from sortify import *
from graph import *

import pickle
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
from jupyterthemes import jtplot
jtplot.style(figsize=(15,10))
import seaborn as sns
import os
import glob

# files = glob.glob('cache/*')
# for f in files:
#     os.remove(f)

try:
    tracks = pd.read_pickle('cache/tracks.pkl')
except FileNotFoundError:
    print('Fetching user tracks...')
    tracks = get_user_tracks()
    tracks.to_pickle('cache/tracks.pkl')
    
try:
    artists = pd.read_pickle('cache/artists.pkl')
except FileNotFoundError:
    print('Fetching artist data...')
    artists = get_artist_data(tracks.artist.unique())
    artists.to_pickle('cache/artists.pkl')

try:
    G = nx.read_graphml('related_artists.graphml')
except FileNotFoundError:
    G = initialize_graph(artists.index)

In [2]:
i = 0
not_found = 0
skipped = 0
missing = 0
missing_artists = []

for artist_id in tqdm(all_artists):
    if artist_fetched(G, artist_id):
        skipped += 1
    else:
        related_list = sp.artist_related_artists(artist_id)['artists']
        if len(related_list) <= 1:
            missing += 1
            missing_artists.append(artist_id)
            continue
        source = add_vertex(G, artist_id)
        source['fetched'] = True
        source['alias'] = artist_objects[artist_id]['name']
        source['popularity'] = artist_popularity[artist_id]
        not_found += 1

        for related in related_list:
            artist_objects[related['id']] = related
            target = add_edge(G, artist_id, related['id'])[1]
            target['alias'] = related['name']
            target['popularity'] = related['popularity']
            if target['fetched'] is None:
                target['fetched'] = False

    i += 1
#     if i % 100 == 0:
#         print("%d/%d artists processed, %d fetched, %d skipped, %d missing" % (i, len(all_artists), not_found, skipped, missing))
print("All artists processed (%d fetched, %d skipped, %d missing)" % (not_found, skipped, missing))
        
for artist in missing_artists:
    del artist_tracks[artist]
    all_artists.remove(artist)
        
for artist in list(artist_tracks.keys()):
    try:
        G.vs.find(artist)
    except ValueError:
        del artist_tracks[artist]
        all_artists.remove(artist)

incomplete_artists = []
for artist_id in all_artists:
    artist = G.vs.find(artist_id)
    if not artist['alias'] or artist['popularity'] is None:
        try:
            artist['alias'] = artist_objects[artist['name']]['name']
            artist['popularity'] = artist_objects[artist['name']]['popularity']
        except KeyError:
            incomplete_artists.append(artist_id)
    elif np.isnan(artist['popularity']):
        artist['popularity'] = 0

print("%s artists with incomplete data found" % len(incomplete_artists))
if incomplete_artists:
    print("fetching data...")
for chunk in chunks(50, incomplete_artists):
    for artist in sp.artists(chunk)['artists']:
        artist_vertex = G.vs.find(artist['id'])
        artist_vertex['alias'] = artist['name']
        artist_vertex['popularity'] = artist['popularity']

ig.write(G, 'related_artists.graphml', format='graphml')

print("Done!")

Genres fetched


100%|█████████████████████████████████████| 2657/2657 [00:07<00:00, 358.01it/s]


All artists processed (0 fetched, 2548 skipped, 109 missing)
0 artists with incomplete data found
Done!


In [1]:
artist_names = dict()
for artist in artist_tracks:
    v = G.vs.find(artist)
    artist_names[artist] = v['alias']

In [3]:
components = G.components().subgraphs()
sizes = {}
for g in components:
    sizes[g] = len(g.vs)
disconnected = sorted(components, key=lambda x: sizes[x])[:-1]
print('%d disconnected components' % len(disconnected))
print(sizes)

# non_fetched = []
# for g in disconnected:
#     for v in g.vs:
#         if not v['fetched']:
#             non_fetched.append(v['name'])
# #         G.vs.find(v['name'])['fetched'] = False
# print(non_fetched)

0 disconnected components
{<igraph.Graph object at 0x00000232B58F84F8>: 222495}


In [None]:
components, points = G.biconnected_components(return_articulation_points=True)

In [None]:
for p in points:
    v = G.vs[p]
    print('%s pop=%d, deg=%d' % (v['alias'], v['popularity'], v.degree()))

In [None]:
erroneous = 0
i = 0
artists = G.vs.select(popularity=None)['name']
for chunk in chunks(50, artists):
    for artist in sp.artists(chunk)['artists']:
        try:
            G.vs.find(artist['id'])['popularity'] = artist['popularity']
            i += 1
        except ValueError:
            erroneous += 1

        if i % 10000 == 0:
            print('%d/%d artists processed, %d erroneous artists returned' % (i, len(artists), erroneous))
            ig.write(G, 'related_artists.graphml', format='graphml')
            
ig.write(G, 'related_artists.graphml', format='graphml')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('dark_background')

counts = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
# df = pd.DataFrame({'deg': G.vs.degree(), 'pop': G.vs['popularity']})
for deg in G.vs.degree():
    if deg <= 5:
        counts[deg] += 1
        
print(counts)

# sns.distplot(G.vs.select(name_in=all_artists)['popularity'], rug=True)
# plt.show()

In [None]:
# Expand graph
g_artists = G.vs['name']

print('Current graph: %d nodes, %d edges' % (len(G.vs), len(G.es)))
print('Processing %d artists in graph...' % len(g_artists))
processed = 0
fetched = 0
skipped = 0

for i in range(len(g_artists)):
    if g_artists[i] in fetched_artists:
        skipped += 1
        
    else:
        related_list = sp.artist_related_artists(g_artists[i])['artists']
        not_found += 1
        fetched_artists.add(g_artists[i])

        for related in related_list:
            artist_names[related['id']] = related['name']
            add_edge(G, g_artists[i], related['id'])
            
    if i % 1000 == 0:
        print("%d/%d artists processed, %d fetched, %d skipped" % (i, len(g_artists), not_found, skipped))
        
    if not_found % 1000 == 0:
        print("Saving graph...")
        ig.write(G, 'related_artists.graphml', format='graphml')
        with open('fetched_artists', 'wb') as file:
            pickle.dump(fetched_artists, file)
        with open('artist_names', 'wb') as file:
            pickle.dump(artist_names, file)
            
print('New graph: %d nodes, %d edges' % (len(G.vs), len(G.es)))
ig.write(G, 'related_artists.graphml', format='graphml')
with open('fetched_artists', 'wb') as file:
    pickle.dump(fetched_artists, file)
with open('artist_names', 'wb') as file:
    pickle.dump(artist_names, file)

In [None]:
degrees = list(G.nodes)
for i in range(len(degrees)):
    degree = 0
    artist = degrees[i]
    for neighbor in nx.neighbors(G, artist):
        if neighbor in artist_tracks:
            degree += len(artist_tracks[neighbor])
    degrees[i] = (artist, degree)

degrees.sort(key=lambda tup: tup[1], reverse=True)
degrees = [node for node in degrees if node[0] not in artist_tracks]
degrees = degrees[:100]
top_artists = [node[0] for node in degrees]

for chunk in chunks(50, top_artists):
    for artist in sp.artists(chunk)['artists']:
        id = artist['id']
        artist_genres[id] = artist['genres']

In [19]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from multiprocessing import Pool
from functools import partial

print("Graph nodes:", len(G.vs))
print("Graph edges:", len(G.es))
print("Graph components:", len(G.components()))

artists = list(artist_tracks.keys()) #+ top_artists
artists_set = set(artists)
distance = np.zeros((len(artists), len(artists)))

start = datetime.now()
# distance = G.shortest_paths(source=artists, target=artists)
%timeit G.shortest_paths(source=artists[2000])
print(datetime.now() - start)

Graph nodes: 222495
Graph edges: 1164028
Graph components: 1
65.2 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
0:00:05.274894


In [None]:
import sklearn.cluster
import sklearn.metrics

# i = 0
# for lengths in paths:
#     source = artists[i]
#     for j in range(len(artists)):
#         target = artists[j]
#         if target in lengths:
#             distance[i][j] = lengths[target]
#         else:
#             distance[i][j] = np.inf
#     iters += 1
#     i += 1
distance = np.array(distance)

# connectivity = nx.to_numpy_matrix(G, nodelist=artists)
similarity = np.exp(-distance / distance[ np.isfinite(distance) ].std())
# for n_clusters in range(2, 13):
# dist = np.copy(distance)
# max_value = dist[ np.isfinite(dist) ].max()
# print(max_value)
# for i in range(len(dist)):
#     for j in range(len(dist)):
#         if dist[i][j] == np.inf:
#             dist[i][j] = max_value * 2

#     clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage='average', compute_full_tree=True).fit_predict(dist)
# clustering = sklearn.cluster.DBSCAN(eps=3, min_samples=1, metric='precomputed', n_jobs=-1).fit_predict(dist)
#     print(np.unique(clustering))
#     print("%d cluster silhouette:" % len(np.unique(clustering)), sklearn.metrics.silhouette_score(dist, clustering, metric='precomputed'))
# print(sklearn.metrics.silhouette_score(dist, clustering, metric='precomputed'))

clustering = sklearn.cluster.SpectralClustering(n_clusters=6, affinity='precomputed').fit_predict(similarity)
# clustering = sklearn.cluster.AgglomerativeClustering(n_clusters=8, affinity='precomputed', linkage='average', compute_full_tree=True).fit_predict(dist)
print(np.unique(clustering))
artist_cluster = {}
cluster_artists = {}
for i in range(len(artists)):
    artist_cluster[artists[i]] = clustering[i]
    if clustering[i] not in cluster_artists:
        cluster_artists[clustering[i]] = [artists[i]]
    else:
        cluster_artists[clustering[i]].append(artists[i])
for clus in cluster_artists:
    print(clus, len(cluster_artists[clus]))

In [3]:
import networkx as nx

G = nx.read_graphml('related_artists.graphml')

In [7]:
import datetime

start = datetime.datetime.now()
nx.is_distance_regular(G)
print(datetime.datetime.now() - start)

0:00:00


In [4]:
import datetime
from concurrent.futures import ProcessPoolExecutor

start = datetime.datetime.now()

def print_mod(future):
    print('modularity:', datetime.datetime.now() - start)
def print_lpa(future):
    print('lpa:', datetime.datetime.now() - start)
def print_label(future):
    print('label:', datetime.datetime.now() - start)
def print_fluid(future):
    print('fluid:', datetime.datetime.now() - start)

futures = {}
with ProcessPoolExecutor() as executor:
#     futures['modularity'] = executor.submit(nx.algorithms.community.modularity_max.greedy_modularity_communities, G)
#     futures['lpa'] = executor.submit(nx.algorithms.community.label_propagation.asyn_lpa_communities, G)
    futures['label'] = executor.submit(nx.algorithms.community.label_propagation.label_propagation_communities, G)
    futures['fluid'] = executor.submit(nx.algorithms.community.asyn_fluid.asyn_fluidc, G, 8)
    
#     futures['modularity'].add_done_callback(print_mod)
#     futures['lpa'].add_done_callback(print_lpa)
    futures['label'].add_done_callback(print_label)
    futures['fluid'].add_done_callback(print_fluid)

label: 0:00:03.601370
fluid: 0:02:28.887838


In [None]:
import datetime

start = datetime.datetime.now()
# communities = nx.algorithms.community.label_propagation.label_propagation_communities(G)
communities = nx.algorithms.community.centrality.girvan_newman(G)
print(datetime.datetime.now() - start)

cluster_artists = {}
artist_clusters = {}

label = 1
for community in tqdm(communities):
    cluster_artists[label] = []
    for node in community:
        id = G.nodes[node]['name']
        cluster_artists[label].append(id)
        artist_clusters[id] = label
    label += 1

0:00:00


0it [00:00, ?it/s]

In [46]:
import sklearn.cluster
import sklearn.mixture
import pandas as pd

artists = list(artist_tracks.keys())
genres = set()
for genre_list in artist_genres.values():
    for genre in genre_list:
        genres.add(genre)
genres = list(genres)

print('Building dataset...')
X = np.zeros((len(artists), len(genres)))
for i in range(len(artists)):
    artist = artists[i]
    for genre in artist_genres[artist]:
        j = genres.index(genre)
        X[i][j] = 1

print('Clustering...')
clustering = sklearn.mixture.GaussianMixture(n_components=6, covariance_type='full', tol=0.00001).fit_predict(X)

artist_cluster = {}
cluster_artists = {}
for i in range(len(artists)):
    artist_cluster[artists[i]] = clustering[i]
    if clustering[i] not in cluster_artists:
        cluster_artists[clustering[i]] = [artists[i]]
    else:
        cluster_artists[clustering[i]].append(artists[i])
for clus in cluster_artists:
    print(clus, len(cluster_artists[clus]))

Building dataset...
Clustering...
1 169
0 555
3 258
5 67
2 1219
4 280


In [42]:
import pandas as pd

counts = {'cluster': [], 'genre': [], 'count': []}
for cluster in cluster_artists:
    genre_count = dict()
    total_tracks = 0
    for artist in cluster_artists[cluster]:
        if artist not in artist_tracks:
            continue
        for genre in artist_genres[artist]:
            if genre in genre_count:
                genre_count[genre] += len(artist_tracks[artist])
            else:
                genre_count[genre] = len(artist_tracks[artist])
        total_tracks += len(artist_tracks[artist])
    for genre, count in genre_count.items():
        counts['cluster'].append(cluster)
        counts['genre'].append(genre)
        counts['count'].append(count / total_tracks)

counts = pd.DataFrame(counts)
counts.sort_values('count', ascending=False, inplace=True)
original_counts = counts.copy()

clusters_to_name = set(counts['cluster'].unique())
playlists = {}
cluster_names = {}
while clusters_to_name:
    cluster = counts.iloc[0]['cluster']
    name = counts.iloc[0]['genre']
    if name == 'edm':
        name = 'EDM'
    else:
        name = name.title()
    playlists[name] = cluster
    cluster_names[cluster] = name
    counts = counts[ (counts['cluster']!=cluster) & (counts['genre']!=name) ]
    clusters_to_name.remove(cluster)
    
artist_names = dict()
for artist in artist_tracks:
    v = G.vs.find(artist)
    artist_names[artist] = v['alias']
    
playlist_tracks = dict()
cluster_counts = dict()
from collections import OrderedDict
for name in playlists:
    cluster_counts[name] = dict()
    playlist_tracks[name] = []
    cluster = playlists[name]
    for artist in cluster_artists[cluster]:
        if artist in artist_tracks:
            cluster_counts[name][artist_names[artist]] = len(artist_tracks[artist])
            playlist_tracks[name] += artist_tracks[artist]
  
for playlist, counts in cluster_counts.items():
    artists_sorted = sorted(counts.keys(), key=lambda x: counts[x], reverse=True)
    print(playlist, f'{len(playlist_tracks[playlist])} tracks', artists_sorted[:10])

New Rave 67 tracks ['Boys Noize', 'Jamie xx', 'SBTRKT', 'Twin Shadow', 'Hot Chip', 'DJ Koze', 'Moderat', 'WhoMadeWho', 'Mr. Oizo', 'Mount Kimbie']
EDM 1726 tracks ['deadmau5', 'Andrew Bayer', 'Porter Robinson', 'Chocolate Puma', 'DEAD BATTERY', 'Joran Van Pol', 'Killer Bee', 'ATTLAS', 'Lulu Rouge', 'Petit Biscuit']
Electronic Trap 1288 tracks ['What So Not', 'Skrillex', 'UZ', 'Zeds Dead', 'Excision', 'NGHTMRE', 'San Holo', 'RL Grime', 'DJ Snake', 'Alison Wonderland']
Pop 519 tracks ['Flume', 'Diplo', 'Isaiah Rashad', 'G-Eazy', 'The Chainsmokers', 'Stwo', 'Stephen', 'BURNS', 'Major Lazer', 'Justin Bieber']
Tech House 1774 tracks ['Matt Lange', 'Noisia', 'Iglooghost', 'Lewis Fautzi', 'Pleasurekraft', 'Jon Hopkins', 'Maceo Plex', 'Gesaffelstein', 'Eprom', 'Dusky']


In [6]:
token = util.prompt_for_user_token(username, 'playlist-modify-private', client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri)
sp = spotipy.Spotify(auth=token)

for name in playlist_tracks:
    id = sp.user_playlist_create(username, name+' [Sortify]', public=False)['id']
    for chunk in chunks(100, playlist_tracks[name]):
        sp.user_playlist_add_tracks(username, id, chunk)
    print('%s playlist created (%d tracks)' % (name, len(playlist_tracks[name])))



            User authentication requires interaction with your
            web browser. Once you enter your credentials and
            give authorization, you will be redirected to
            a url.  Paste that url you were directed to to
            complete the authorization.

        
Opened https://accounts.spotify.com/authorize?client_id=b69a9985fa8842deb0691b2d0e3f0b69&response_type=code&redirect_uri=http%3A%2F%2Flocalhost%2F&scope=playlist-modify-private in your browser


Enter the URL you were redirected to: http://localhost/?code=AQBRSjQCCdHhn28GaAUeAn4QdzVoo4kogiHWcRMB3YZ-iEdvfUQDdNdkB0Q9LcoZZPaPsQaaHn2RRCXRId70cylcSSE4Bc949aoqIoGdOTjl7kvk9Xy620lJWGcFfhcZqF97UketlgY9kGbJ5llNFdg51ZmIpJYebne5QTmEWw5Yh_YQuvGfJ8j87xbPEa9KrQ2OB6cg6ehKrZS9Q4AfnRy7


EDM playlist created (1874 tracks)
Pop playlist created (278 tracks)
Electronic trap playlist created (668 tracks)
Tech house playlist created (586 tracks)
Indietronica playlist created (198 tracks)
Electronic playlist created (1291

In [None]:
top_clustered = {}
for degree in degrees:
    artist = degree[0]
    cluster = artist_cluster[artist]
    if cluster not in top_clustered:
        top_clustered[cluster] = [degree]
    else:
        top_clustered[cluster].append(degree)
        
for cluster in top_clustered:
    print(cluster_names[cluster])
    for degree in top_clustered[cluster][:10]:
        print("\t%s: %d" % (artist_names[degree[0]], degree[1]))

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(playlists)