In [34]:
import json
import re

import pandas as pd
import numpy as np

from datetime import datetime
from typing import Dict, List
from tqdm.notebook import tqdm

# Embedding algorithm essentials
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [2]:
df_db_tracks = pd.read_csv('../dataset/tracks.csv')
df_db_tracks['id_artists'] = df_db_tracks['id_artists'].apply(lambda x: x[1:-1].strip().replace("'", "").split(','))
df_db_tracks.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1


In [3]:
df_db_tracks.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0
mean,27.570053,230051.2,0.044086,0.563594,0.542036,5.221603,-10.206067,0.658797,0.104864,0.449863,0.113451,0.213935,0.552292,118.464857,3.873382
std,18.370642,126526.1,0.205286,0.166103,0.251923,3.519423,5.089328,0.474114,0.179893,0.348837,0.266868,0.184326,0.257671,29.764108,0.473162
min,0.0,3344.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,175093.0,0.0,0.453,0.343,2.0,-12.891,0.0,0.034,0.0969,0.0,0.0983,0.346,95.6,4.0
50%,27.0,214893.0,0.0,0.577,0.549,5.0,-9.243,1.0,0.0443,0.422,2.4e-05,0.139,0.564,117.384,4.0
75%,41.0,263867.0,0.0,0.686,0.748,8.0,-6.482,1.0,0.0763,0.785,0.00955,0.278,0.769,136.321,4.0
max,100.0,5621218.0,1.0,0.991,1.0,11.0,5.376,1.0,0.971,0.996,1.0,1.0,1.0,246.381,5.0


In [4]:
df_db_artists = pd.read_csv('../dataset/artists.csv')
df_db_artists['genres'] = df_db_artists['genres'].apply(lambda x: x[1:-1].strip().replace("'", "").split(','))
df_db_artists.tail(5)

Unnamed: 0,id,followers,genres,name,popularity
1162090,3cOzi726Iav1toV2LRVEjp,4831.0,[black comedy],Ali Siddiq,34
1162091,6LogY6VMM3jgAE6fPzXeMl,46.0,[],Rodney Laney,2
1162092,19boQkDEIay9GaVAWkUhTa,257.0,[],Blake Wexler,10
1162093,5nvjpU3Y7L6Hpe54QuvDjy,2357.0,[black comedy],Donnell Rawlings,15
1162094,2bP2cNhNBdKXHC6AnqgyVp,40.0,[new comedy],Gabe Kea,8


Join Tracks table with artists table in order to bring the Genre information and Artist popularity, in the track schema

In [5]:
artist_information = df_db_artists.set_index('id').to_dict(orient='index')

In [6]:
def get_genres_for_artists(id_artists: List[str]) -> List[str]:
    genres = []
    for artist_id in id_artists:
        genres += artist_information[artist_id]['genres'] if artist_id in artist_information.keys() else []
    return list(set(genres))


def get_popularity_for_artists(id_artists: List[str]) -> List[int]:
    popularity = []
    for artist_id in id_artists:
        popularity += [artist_information[artist_id]['popularity']] if artist_id in artist_information.keys() else []
    return popularity


In [7]:
df_db_tracks['genres'] = df_db_tracks['id_artists'].apply(lambda x: get_genres_for_artists(x))

#### Genre Representation

* Word embeddings
* TF IDF

In [8]:
def process_genre(genre: str) -> str:
    new_genre = genre.replace('hip hop', 'hiphop')
    new_genre = genre.replace('r&b', 'rb')
    return new_genre.strip()

In [9]:
vocab = list(set([token for genres in df_db_tracks['genres'].to_list() for genre in genres for token in word_tokenize(process_genre(genre))]))
genres_text = [" ".join([process_genre(genre) for genre in track_genres]) for track_genres in df_db_tracks['genres'].to_list()]

genres_text_tokenized = []
for track_genres in df_db_tracks['genres'].to_list():
    track_genres_text = []
    for genre in track_genres:
        track_genres_text.extend(word_tokenize(process_genre(genre)))
    genres_text_tokenized.append(track_genres_text)

In [10]:
len(genres_text_tokenized)

586672

In [11]:
tfidf = TfidfVectorizer(tokenizer=word_tokenize)

tfidf = tfidf.fit(vocab)



In [12]:
# example
tfidf.transform(['easy pop folk pop pop guitar']).toarray().shape

(1, 2351)

In [13]:
w2v_features = 20

model_w2v = Word2Vec(
    sentences=genres_text_tokenized,
    vector_size=w2v_features,
    window=10,
    min_count=1,
    workers=8
)

model_w2v.train(genres_text_tokenized, total_examples=len(genres_text_tokenized), epochs=2)

(4333986, 6915880)

In [14]:
# example
model_w2v.wv.most_similar(positive='house')

[('complextro', 0.6763811707496643),
 ('techno', 0.6613274216651917),
 ('euro', 0.6371831297874451),
 ('edm', 0.6229261159896851),
 ('bailando', 0.6126195788383484),
 ('bassline', 0.5992982983589172),
 ('eurodance', 0.5941179990768433),
 ('polo', 0.5918890237808228),
 ('ghettotech', 0.5870377421379089),
 ('breaks', 0.5693063735961914)]

In [15]:
from numpy.typing import NDArray

def create_sentence_embedding(
    sentence_tokenized: List[str],
    weights_per_word: NDArray,
    tfidf_vocab
) -> NDArray:
    sentence_vector = np.zeros(w2v_features,)
    for word in sentence_tokenized:
        weight = weights_per_word[tfidf_vocab.index(word)]
        word_vector = model_w2v.wv[word]
        word_vector = word_vector * weight
        sentence_vector += word_vector
    return sentence_vector / len(sentence_tokenized) if len(sentence_tokenized) > 0 else np.zeros(w2v_features,)

In [30]:
del genres_text_tokenized
genres_text_tokenized = None

In [35]:
# w2v
track_genres_embeddings = [
    create_sentence_embedding(
        sentence_tokenized=word_tokenize(track_genre_text), 
        weights_per_word=tfidf.transform([track_genre_text]).toarray().reshape(-1,), 
        tfidf_vocab=list(tfidf.get_feature_names_out())
    ) 
        for track_genre_text in tqdm(genres_text, desc="track genres")
]

track genres:   0%|          | 0/586672 [00:00<?, ?it/s]

In [36]:
df_db_track_with_genres = pd.DataFrame(track_genres_embeddings, columns=[f"w{i+1}" for i in range(w2v_features)])
df_db_track_with_genres['id'] = df_db_tracks['id'].to_list()
df_db_track_with_genres.set_index('id', inplace=True)
df_db_track_with_genres

Unnamed: 0_level_0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,w14,w15,w16,w17,w18,w19,w20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35iwgR4jXetI318WEWsa1Q,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
021ht4sdgPcrDgSk7JTbKY,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
07A5yehtSnoedViJAZkNnc,0.457791,-4.223015,4.060837,4.507574,0.130734,-0.726367,-0.419549,-0.798410,-4.955767,1.249623,0.366326,-2.231257,0.051802,-3.592614,0.057358,1.198494,0.681954,0.883344,1.759210,-0.396757
08FmqUhxtyLTn6pAh6bk45,0.457791,-4.223015,4.060837,4.507574,0.130734,-0.726367,-0.419549,-0.798410,-4.955767,1.249623,0.366326,-2.231257,0.051802,-3.592614,0.057358,1.198494,0.681954,0.883344,1.759210,-0.396757
08y9GfoqCWfOGsKdwojr5e,-0.288087,-0.684354,-0.107649,-0.389103,-0.198719,-0.372447,0.950715,-0.658217,-0.286189,-0.580392,-0.067683,0.505792,-0.157257,0.192510,1.022010,-2.272481,-0.920461,-0.132006,-0.845449,-0.938256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5rgu12WBIHQtvej2MdHSH0,0.151706,-0.247853,-0.001868,0.464932,-0.649480,0.982235,1.018137,-1.974212,-0.582699,-0.117452,-1.252162,-0.182321,0.950308,1.244908,-0.476225,1.139155,1.066202,-1.391023,2.257735,-0.346963
0NuWgxEp51CutD2pJoF4OM,-0.439060,0.659596,-0.593329,0.237153,-0.107202,0.198366,0.079501,-0.585903,0.590404,-0.003172,-0.682298,-0.469110,0.321974,0.582032,-0.086415,0.638045,0.305289,-0.294539,0.622220,-0.821035
27Y1N4Q4U3EfDU5Ubw8ws2,-0.643048,0.718119,-0.543615,0.201523,-0.063478,0.501327,0.213093,-0.456675,0.602283,-0.103154,-0.597918,-0.561513,0.279113,0.699534,-0.117997,0.644036,0.339878,-0.634665,0.706240,-0.714964
45XJsGpFTyzbzeWK8VzR8S,-0.183473,0.339475,-0.490911,-0.446044,-0.263200,0.499438,0.069759,-0.726273,0.531243,0.046736,-0.722350,-0.027891,0.111241,0.747268,-0.078711,0.797373,0.386622,-0.527939,0.590261,-0.740747


In [37]:
df_db_track_with_genres.to_csv('../dataset/test_genre_embeddings.csv')