In [2]:
import json
import re

import pandas as pd
import numpy as np

from datetime import datetime
from typing import Dict, List
from tqdm import tqdm

# Embedding algorithm essentials
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [3]:
def parse_artist_ids(raw: str) -> List[str]:
    """Turn the stringified list stored in ``id_artists`` into a list of ids.

    The column holds text such as ``"['id1', 'id2']"``. A plain
    ``split(',')`` keeps the blank that follows each comma (``' id2'``) and
    maps an empty list ``"[]"`` to ``['']``; both break exact-match lookups
    against the artist table. Every item is therefore trimmed, unquoted and
    dropped when empty.

    Args:
        raw: The raw cell value, including the surrounding brackets.

    Returns:
        The artist ids without whitespace or quotes; ``[]`` when there are none.
    """
    cleaned = (part.strip().strip("'\"") for part in raw.strip()[1:-1].split(','))
    return [artist_id for artist_id in cleaned if artist_id]


df_db_tracks = pd.read_csv('../dataset/tracks.csv')
df_db_tracks['id_artists'] = df_db_tracks['id_artists'].apply(parse_artist_ids)
df_db_tracks.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1


In [4]:
def parse_genres(raw: str) -> List[str]:
    """Turn the stringified list stored in ``genres`` into a list of names.

    A plain ``split(',')`` keeps the blank after each comma (``' pop'``) and
    maps ``"[]"`` to ``['']``, so the same genre can show up under two
    spellings and genre-less artists get a bogus empty genre. Items are
    trimmed, stripped of their surrounding quotes (single or double) and
    dropped when empty. Apostrophes inside a name are kept.

    Args:
        raw: The raw cell value, including the surrounding brackets.

    Returns:
        The cleaned genre names; ``[]`` when the artist has none.
    """
    cleaned = (part.strip().strip("'\"").strip() for part in raw.strip()[1:-1].split(','))
    return [genre for genre in cleaned if genre]


df_db_artists = pd.read_csv('../dataset/artists.csv')
df_db_artists['genres'] = df_db_artists['genres'].apply(parse_genres)
df_db_artists.tail(2)

Unnamed: 0,id,followers,genres,name,popularity
1162093,5nvjpU3Y7L6Hpe54QuvDjy,2357.0,[black comedy],Donnell Rawlings,15
1162094,2bP2cNhNBdKXHC6AnqgyVp,40.0,[new comedy],Gabe Kea,8


In [5]:
# Lookup table: artist id -> dict of that artist's remaining columns
# (e.g. artist_information[artist_id]['genres']).
artist_information = (
    df_db_artists
    .set_index('id')
    .to_dict(orient='index')
)

In [6]:
def get_genres_for_artists(id_artists: List[str]) -> List[str]:
    """Collect the distinct genres of all known artists in ``id_artists``.

    Ids are stripped before the lookup because the parsed ``id_artists``
    lists may carry a leading blank on every id after the first; without
    that, co-credited artists never match a key of ``artist_information``.
    Genre names are stripped too and blank ones are skipped.

    Args:
        id_artists: Artist ids of one track.

    Returns:
        Unique genre names in first-seen order (deterministic, unlike a
        ``set`` round-trip). Artists missing from ``artist_information``
        contribute nothing; the result is ``[]`` when nothing is found.
    """
    # A dict is used as an insertion-ordered set.
    seen = {}
    for artist_id in id_artists:
        artist = artist_information.get(artist_id.strip())
        if artist is None:
            continue
        for genre in artist['genres']:
            genre = genre.strip()
            if genre:
                seen[genre] = None
    return list(seen)


def get_popularity_for_artists(id_artists: List[str]) -> List[int]:
    """Return the popularity of every known artist in ``id_artists``.

    Ids are stripped before the lookup because the parsed ``id_artists``
    lists may carry a leading blank on every id after the first; without
    that, co-credited artists never match a key of ``artist_information``.

    Args:
        id_artists: Artist ids of one track.

    Returns:
        One popularity value per artist found in ``artist_information``, in
        input order. Unknown ids are skipped, so the result can be shorter
        than the input.
    """
    popularity = []
    for artist_id in id_artists:
        artist = artist_information.get(artist_id.strip())
        if artist is not None:
            popularity.append(artist['popularity'])
    return popularity

In [7]:
# Give each track the combined genres of its artists.
df_db_tracks['genres'] = df_db_tracks['id_artists'].apply(get_genres_for_artists)
df_db_tracks.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,[]


#### Artist Representation

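Create the artist social network from the artist ids listed together on each track: artists credited on the same track are connected, and the edge weight counts how many tracks they share. We will then create embeddings for the artists in this network with node2vec (random walks fed to w2v).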

In [45]:
# Build the artist co-occurrence graph from the tracks: graph[a][b] counts
# how often artists a and b are listed on the same track.
df_db_tracks_artists = df_db_tracks[['id', 'id_artists']].to_dict('records')

graph = {}
for track in tqdm(df_db_tracks_artists):
    # Ids may carry stray whitespace from the list parsing, so normalise first.
    artist_ids = [artist_id.strip() for artist_id in track['id_artists']]

    # Visit every ordered pair of artists on the track (quadratic in the
    # number of artists per track); a pair of equal ids is ignored.
    for source in artist_ids:
        for target in artist_ids:
            if source != target:
                neighbours = graph.setdefault(source, {})
                neighbours[target] = neighbours.get(target, 0) + 1

100%|██████████| 586672/586672 [00:00<00:00, 601931.15it/s]


In [46]:
import networkx as nx
from node2vec import Node2Vec as n2v

# Turn the nested count dict into an undirected weighted graph. An edge that
# appears under both of its endpoints in `graph` is simply added twice; the
# second add_edge call overwrites the weight set by the first.
G = nx.Graph()

for source, neighbours in graph.items():
    for target, shared_tracks in neighbours.items():
        G.add_edge(source, target, weight=shared_tracks)

In [61]:
# Random-walk settings handed to the Node2Vec constructor.
# NOTE(review): no seed is passed anywhere, so walks and embeddings will
# presumably differ between runs - confirm whether reproducibility matters.
walk_params = dict(
    dimensions=16,
    walk_length=30,
    num_walks=20,
)

# Keyword arguments forwarded to fit(); vector_size repeats the
# 16 dimensions requested above.
fit_params = dict(
    vector_size=16,
    window=2,
    min_count=1,
)

g_emb = n2v(G, **walk_params)
mdl = g_emb.fit(**fit_params)

Computing transition probabilities:   0%|          | 0/54244 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 20/20 [03:32<00:00, 10.65s/it]


In [62]:
# Sanity check: print the 10 nodes whose embeddings are closest to one artist.
input_node = '1vCWHaC5f2uS3yhpwWbIA6'
nearest_nodes = mdl.wv.most_similar(input_node, topn=10)
for node_and_score in nearest_nodes:
    print(node_and_score)

('5baGk14NDI3ZDPbr6wJilc', 0.9921169281005859)
('0lHsjYcb3lGjkJQQqC6HVt', 0.991969883441925)
('7hOGhpa8RMSuDOWntGIAJt', 0.991741418838501)
('142TY556WknD1ZRV8XilJQ', 0.9915919303894043)
('7iLc4hrjOjQPfg1E3cCHg1', 0.9881303906440735)
('5lKZWd6HiSCLfnDGrq9RAm', 0.9554837942123413)
('71UZaAOfewFyTqIb0Kji9h', 0.9471638202667236)
('5ChF3i92IPZHduM7jN3dpg', 0.9407984018325806)
('1rw8ZTLnDHd74TWDDukjVi', 0.9373382925987244)
('6ydoSd3N2mwgwBHtF6K7eX', 0.9322147369384766)


In [58]:
# Direct neighbours (with edge weights) of the artist queried above, to
# compare against the nodes the embedding model ranks as most similar.
G.adj['1vCWHaC5f2uS3yhpwWbIA6']

AtlasView({'0SfsnGyD8FpIN4U4WCkBZ5': {'weight': 3}, '5YFS41yoX0YuFY39fq21oN': {'weight': 1}, '1C60viSZv6BoYtrnkZ44g5': {'weight': 1}, '1Cs0zKBU1kc0i8ypK3B9ai': {'weight': 1}, '7iLc4hrjOjQPfg1E3cCHg1': {'weight': 6}, '71UZaAOfewFyTqIb0Kji9h': {'weight': 3}, '5ChF3i92IPZHduM7jN3dpg': {'weight': 3}, '5JYo7gm2dkyLLlWHjxS7Dy': {'weight': 6}, '5CCwRZC6euC8Odo6y9X8jr': {'weight': 2}, '0id62QV2SZZfvBn9xpmuCl': {'weight': 1}, '5he5w2lnU9x7JFhnwcekXX': {'weight': 1}, '0lHsjYcb3lGjkJQQqC6HVt': {'weight': 1}, '5lKZWd6HiSCLfnDGrq9RAm': {'weight': 1}, '5gznATMVO85ZcLTkE9ULU7': {'weight': 1}, '1rw8ZTLnDHd74TWDDukjVi': {'weight': 1}, '4tZwfgrHOc3mvqYlEYSvVi': {'weight': 1}, '79DOwuMzV2h4es3em0t002': {'weight': 1}, '5baGk14NDI3ZDPbr6wJilc': {'weight': 1}, '5fahUm8t5c0GIdeTq0ZaG8': {'weight': 1}, '2fVW2ix4ANKiofDZIsy1XR': {'weight': 2}, '5ITI6SEoUZMIXXkzCfr4oE': {'weight': 1}, '6SsTlCsuCYleNza6xGwynu': {'weight': 1}, '142TY556WknD1ZRV8XilJQ': {'weight': 1}, '53XhwfbYqKCa1cC15pYq2q': {'weight': 1}, '7hOG

In [71]:
# Show the artist row behind a single id (here an id taken from a
# previously recorded most_similar result).
df_db_artists.loc[df_db_artists['id'] == '6ydoSd3N2mwgwBHtF6K7eX']

Unnamed: 0,id,followers,genres,name,popularity
122463,6ydoSd3N2mwgwBHtF6K7eX,3240686.0,"[indie cafe pop, pop, viral pop]",Calum Scott,76
