In [1]:
import json
import re

import pandas as pd
import numpy as np

from datetime import datetime
from typing import Dict, List
from tqdm import tqdm

# Embedding algorithm essentials
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [2]:
# Load the track catalogue. `id_artists` is read from the CSV as a string that
# looks like a Python list literal, so it is converted to a real list of ids.
df_db_tracks = pd.read_csv('../dataset/tracks.csv')
df_db_tracks['id_artists'] = df_db_tracks['id_artists'].apply(
    # Every fragment is stripped: a plain split(',') keeps the blank that
    # follows each comma, so any id after the first would carry a leading
    # space and fail exact-match lookups against the artist table.
    # Empty fragments (e.g. from "[]") are dropped rather than kept as ''.
    lambda x: [part.strip() for part in x[1:-1].replace("'", "").split(',') if part.strip()]
)
df_db_tracks.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1


In [3]:
# Load the artist catalogue. `genres` is read from the CSV as a string that
# looks like a Python list literal, so it is converted to a real list of names.
df_db_artists = pd.read_csv('../dataset/artists.csv')
df_db_artists['genres'] = df_db_artists['genres'].apply(
    # Every fragment is stripped: a plain split(',') keeps the blank that
    # follows each comma, which would make ' electro house' and
    # 'electro house' two different genres. Empty fragments (e.g. from "[]")
    # are dropped so artists without genres get [] instead of [''].
    lambda x: [part.strip() for part in x[1:-1].replace("'", "").split(',') if part.strip()]
)
df_db_artists.tail(2)

Unnamed: 0,id,followers,genres,name,popularity
1162093,5nvjpU3Y7L6Hpe54QuvDjy,2357.0,[black comedy],Donnell Rawlings,15
1162094,2bP2cNhNBdKXHC6AnqgyVp,40.0,[new comedy],Gabe Kea,8


In [4]:
# Lookup table keyed by artist id; each value is a dict of that artist's
# remaining columns (column name -> value).
artist_information = (
    df_db_artists
    .set_index('id')
    .to_dict('index')
)

In [5]:
def get_genres_for_artists(id_artists: List[str]) -> List[str]:
    """Collect the distinct genres of the given artists.

    Args:
        id_artists: Artist ids to look up in the module-level
            ``artist_information`` mapping. Surrounding whitespace on an id
            is ignored, matching how ids are normalised when the artist graph
            is built.

    Returns:
        The unique genres of all known artists, in first-seen order. Ids that
        are not present in ``artist_information`` contribute nothing.
    """
    # A dict is used as an ordered set so the result is deterministic across
    # runs (a plain set of strings has a hash-dependent iteration order).
    genres = {}
    for artist_id in id_artists:
        info = artist_information.get(artist_id.strip())
        if info is not None:
            genres.update(dict.fromkeys(info['genres']))
    return list(genres)


def get_popularity_for_artists(id_artists: List[str]) -> List[int]:
    """Return the popularity value of each known artist, in input order.

    Args:
        id_artists: Artist ids to look up in the module-level
            ``artist_information`` mapping. Surrounding whitespace on an id
            is ignored, matching how ids are normalised when the artist graph
            is built.

    Returns:
        One popularity value per id found in ``artist_information``; unknown
        ids are skipped, so the result may be shorter than the input.
    """
    popularity = []
    for artist_id in id_artists:
        info = artist_information.get(artist_id.strip())
        if info is not None:
            popularity.append(info['popularity'])
    return popularity

In [6]:
# Attach to every track the combined genres of all artists credited on it.
track_artist_ids = df_db_tracks['id_artists']
df_db_tracks['genres'] = track_artist_ids.apply(get_genres_for_artists)
df_db_tracks.head(1)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,[]


#### Artist Representation

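Create the artist social network: artists credited on the same track are linked, and each link is weighted by how often the pair appears together. Artist embeddings are then learned on this graph with node2vec (random walks over the graph, fed into a Word2Vec model).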

In [7]:
# Build the artist co-occurrence structure from track credits:
#   graph[artist][other_artist] -> number of times the ordered pair
#   (artist, other_artist) occurs together on a track.
df_db_tracks_artists = df_db_tracks[['id', 'id_artists']].to_dict('records')

graph = {}
for record in tqdm(df_db_tracks_artists):
    # Normalise ids first; entries of the parsed list may carry padding.
    artist_ids = [raw_id.strip() for raw_id in record['id_artists']]
    # Every ordered pair of distinct ids on the track is counted, so the work
    # per track is quadratic in its number of credited artists.
    for source_id in artist_ids:
        for target_id in artist_ids:
            if source_id != target_id:
                neighbours = graph.setdefault(source_id, {})
                neighbours[target_id] = neighbours.get(target_id, 0) + 1

100%|██████████| 586672/586672 [00:00<00:00, 604453.22it/s]


In [8]:
import networkx as nx
from node2vec import Node2Vec as n2v

# Convert the nested co-occurrence dict into an undirected graph. Each
# (artist, neighbour) entry becomes an edge whose `weight` attribute is the
# stored co-occurrence count.
G = nx.Graph()
G.add_weighted_edges_from(
    (artist, artist_to, count)
    for artist, neighbours in graph.items()
    for artist_to, count in neighbours.items()
)

In [9]:
# Size of each artist vector; the same value is given to the walk generator
# and to the model that is fitted on the walks.
EMBEDDING_DIM = 8

# Constructing the node2vec object precomputes transition probabilities and
# generates the random walks (100 walks of length 30, per the arguments).
# NOTE(review): the recorded output shows walk generation on a single CPU
# taking ~16 minutes - consider caching the result or parallelising.
# NOTE(review): no seed is passed here or to fit(), so results presumably
# differ between runs - confirm whether reproducibility is required.
g_emb = n2v(G, dimensions=EMBEDDING_DIM, walk_length=30, num_walks=100)

# Fit the embedding model on the generated walks.
mdl = g_emb.fit(vector_size=EMBEDDING_DIM, window=1, min_count=1, workers=4)

Computing transition probabilities:   0%|          | 0/54244 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 100/100 [15:59<00:00,  9.60s/it]


In [10]:
# Sanity check: print the ten artists whose vectors are most similar to the
# probe artist, as (artist_id, similarity) pairs.
input_node = '1vCWHaC5f2uS3yhpwWbIA6'
nearest_artists = mdl.wv.most_similar(input_node, topn=10)
for id_and_score in nearest_artists:
    print(id_and_score)

('4NHQUGzhtTLFvgF5SZesLK', 0.9930707216262817)
('69GGBxA162lTqCwzJG5jLp', 0.9918947219848633)
('01pKrlgPJhm5dB4lneYAqS', 0.9897546172142029)
('2REmOYHdJPfuEKXH1mSaHs', 0.9894715547561646)
('3XC57xz74X3xUi1hv4mge1', 0.9893032312393188)
('1QM03lnjHcx2l52sWaW5V5', 0.9890238642692566)
('6qaQDRYp95AylkA1FnEI3Q', 0.987949013710022)
('6jsjhAEteAlY0vCiLvMLBA', 0.9873182773590088)
('1VF81Q4Usmpf2fgjEV3HPJ', 0.9870807528495789)
('2qPxiZiD34NtmokWN6RoP2', 0.9860274791717529)


In [21]:
# Show the artist-table row for one of the ids returned by the similarity query.
df_db_artists.loc[df_db_artists['id'].eq('2qPxiZiD34NtmokWN6RoP2')]

Unnamed: 0,id,followers,genres,name,popularity
111654,2qPxiZiD34NtmokWN6RoP2,17797.0,"[deep tropical house, electro house, pop edm...",King Topher,56


In [22]:
# Write the trained embedding model to disk.
model_path = "../models/artist_embedding_v2.model"
mdl.save(model_path)