In [1]:
import json
import re

import pandas as pd
import numpy as np

from datetime import datetime
from typing import Dict, List
from tqdm.notebook import tqdm

# Embedding algorithm essentials
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


In [2]:
df_db_tracks = pd.read_csv('../dataset/tracks.csv')
df_db_tracks['id_artists'] = df_db_tracks['id_artists'].apply(lambda x: x[1:-1].strip().replace("'", "").split(','))
df_db_tracks.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1


In [3]:
df_db_tracks.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0,586672.0
mean,27.570053,230051.2,0.044086,0.563594,0.542036,5.221603,-10.206067,0.658797,0.104864,0.449863,0.113451,0.213935,0.552292,118.464857,3.873382
std,18.370642,126526.1,0.205286,0.166103,0.251923,3.519423,5.089328,0.474114,0.179893,0.348837,0.266868,0.184326,0.257671,29.764108,0.473162
min,0.0,3344.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13.0,175093.0,0.0,0.453,0.343,2.0,-12.891,0.0,0.034,0.0969,0.0,0.0983,0.346,95.6,4.0
50%,27.0,214893.0,0.0,0.577,0.549,5.0,-9.243,1.0,0.0443,0.422,2.4e-05,0.139,0.564,117.384,4.0
75%,41.0,263867.0,0.0,0.686,0.748,8.0,-6.482,1.0,0.0763,0.785,0.00955,0.278,0.769,136.321,4.0
max,100.0,5621218.0,1.0,0.991,1.0,11.0,5.376,1.0,0.971,0.996,1.0,1.0,1.0,246.381,5.0


In [4]:
df_db_artists = pd.read_csv('../dataset/artists.csv')
df_db_artists['genres'] = df_db_artists['genres'].apply(lambda x: x[1:-1].strip().replace("'", "").split(','))
df_db_artists.tail(5)

Unnamed: 0,id,followers,genres,name,popularity
1162090,3cOzi726Iav1toV2LRVEjp,4831.0,[black comedy],Ali Siddiq,34
1162091,6LogY6VMM3jgAE6fPzXeMl,46.0,[],Rodney Laney,2
1162092,19boQkDEIay9GaVAWkUhTa,257.0,[],Blake Wexler,10
1162093,5nvjpU3Y7L6Hpe54QuvDjy,2357.0,[black comedy],Donnell Rawlings,15
1162094,2bP2cNhNBdKXHC6AnqgyVp,40.0,[new comedy],Gabe Kea,8


Join Tracks table with artists table in order to bring the Genre information and Artist popularity, in the track schema

In [5]:
artist_information = df_db_artists.set_index('id').to_dict(orient='index')

In [6]:
def get_genres_for_artists(id_artists: List[str]) -> List[str]:
    genres = []
    for artist_id in id_artists:
        genres += artist_information[artist_id]['genres'] if artist_id in artist_information.keys() else []
    return list(set(genres))


def get_popularity_for_artists(id_artists: List[str]) -> List[int]:
    popularity = []
    for artist_id in id_artists:
        popularity += [artist_information[artist_id]['popularity']] if artist_id in artist_information.keys() else []
    return popularity


In [7]:
df_db_tracks['genres'] = df_db_tracks['id_artists'].apply(lambda x: get_genres_for_artists(x))

#### Genre Representation

* Word embeddings
* TF IDF

In [8]:
def process_genre(genre: str) -> str:
    new_genre = genre.replace('hip hop', 'hiphop')
    new_genre = genre.replace('r&b', 'rb')
    return new_genre.strip()

In [9]:
vocab = list(set([token for genres in df_db_tracks['genres'].to_list() for genre in genres for token in word_tokenize(process_genre(genre))]))
genres_text = [" ".join([process_genre(genre) for genre in track_genres]) for track_genres in df_db_tracks['genres'].to_list()]

genres_text_tokenized = []
for track_genres in df_db_tracks['genres'].to_list():
    track_genres_text = []
    for genre in track_genres:
        track_genres_text.extend(word_tokenize(process_genre(genre)))
    genres_text_tokenized.append(track_genres_text)

In [10]:
len(genres_text_tokenized)

586672

In [11]:
tfidf = TfidfVectorizer(tokenizer=word_tokenize)

tfidf = tfidf.fit(vocab)



In [12]:
# example
tfidf.transform(['easy pop folk pop pop guitar']).toarray().shape

(1, 2351)

In [27]:
w2v_features = 8

model_w2v = Word2Vec(
    sentences=genres_text_tokenized,
    vector_size=w2v_features,
    window=10,
    min_count=1,
    sg=1,
    hs=5,
    workers=8
)

model_w2v.train(genres_text_tokenized, total_examples=len(genres_text_tokenized), epochs=5)

(10833554, 17289700)

In [35]:
# example
model_w2v.wv.most_similar(positive='uk', topn=10)

[('freakbeat', 0.9382889866828918),
 ('merseybeat', 0.9158648252487183),
 ('beatlesque', 0.8998480439186096),
 ('neo-rockabilly', 0.8969053626060486),
 ('art', 0.8928782343864441),
 ('invasion', 0.8915444016456604),
 ('bubblegum', 0.8873463273048401),
 ('gold', 0.8843992352485657),
 ('mellow', 0.8814758658409119),
 ('sunshine', 0.8744872808456421)]

In [36]:
model_w2v.save("../models/genre_embeddings_v2.model")