In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

import plotly.express as px

In [2]:
# Load the processed lyrics table; the file uses ";" as its field separator.
lyrics_df = pd.read_csv(
    "../data/processed/lyrics.csv",
    sep=";",
)

In [3]:
# Restore the pickled song encodings (consumed later through `.values()`).
# NOTE: unpickling can run arbitrary code; only load files produced by this project.
with open("../data/processed/song_encodings.pkl", mode="rb") as encodings_file:
    song_encodings_map = pickle.load(encodings_file)

In [4]:
# Restore the previously saved concatenated song encodings.
# NOTE(review): `new_song_encodings` is rebuilt from scratch and written back to
# this same path further down in the notebook, so this load only succeeds when
# the file from an earlier run exists -- confirm it is still needed.
# NOTE: unpickling can run arbitrary code; only load files produced by this project.
with open("../data/processed/song_encodings_big.pkl", mode="rb") as big_encodings_file:
    new_song_encodings = pickle.load(big_encodings_file)

In [5]:
# Restore the per-sentence encodings (indexed below by row position in lyrics_df).
# NOTE: unpickling can run arbitrary code; only load files produced by this project.
with open("../data/processed/sentence_encodings.pkl", mode="rb") as sentences_file:
    sentence_encodings = pickle.load(sentences_file)

In [6]:
# For every song_idx, gather the row labels of its sentences in lyrics_df.
# reset_index() exposes those labels as an "index" column that can be grouped.
song_idxs = (
    lyrics_df
    .reset_index()
    .groupby("song_idx")["index"]
    .apply(list)
)

In [13]:
# # save sentences by song to improve performance
# for i, sidx in song_idxs.iteritems():
#     np.savez_compressed(
#         f"../data/processed/song_sentence_encodings/{i}", 
#         sentence_encodings[sidx]
#     )

---

Reduce and aggregate vectors

In [9]:
# Build one fixed-length encoding per song: reduce the song's sentence
# encodings to exactly `n_vectors` vectors and concatenate them end to end.
n_vectors = 10
new_song_encodings = []

# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for _, sidx in song_idxs.items():
    # All sentence encodings belonging to this song, one row per sentence.
    song_sentences = sentence_encodings[sidx]
    n = song_sentences.shape[0]

    if n < n_vectors:
        # Fewer sentences than slots: keep every sentence and pad the
        # remaining slots by repeating the last sentence encoding.
        new_encoding = list(song_sentences) + [song_sentences[-1]] * (n_vectors - n)
    else:
        # Split the song into `n_vectors` contiguous, near-equal chunks that
        # together cover every sentence, and average each chunk.
        # The previous code sliced `[i : i + step]`, i.e. overlapping windows
        # over only the first few sentences, so most of a long song was ignored.
        # Because n >= n_vectors here, no chunk is ever empty.
        new_encoding = [
            chunk.mean(axis=0)
            for chunk in np.array_split(song_sentences, n_vectors)
        ]

    new_song_encodings.append(np.concatenate(new_encoding))

# Stack into a 2-D array with one row per song.
new_song_encodings = np.array(new_song_encodings)

In [10]:
# Sanity check: one row per song, each row being n_vectors concatenated vectors.
new_song_encodings.shape

(1556, 1280)

In [11]:
# Persist the concatenated song encodings so later runs can load them directly.
with open("../data/processed/song_encodings_big.pkl", mode="wb") as big_encodings_file:
    pickle.dump(new_song_encodings, big_encodings_file)

---

Nearest Neighbors

In [65]:
# Unsupervised nearest-neighbour index; 5 is scikit-learn's default neighbour
# count, spelled out here so the size of the result rows is explicit.
nn = NearestNeighbors(n_neighbors=5)

In [66]:
# Fit on every song encoding, then query the same vectors. Because the query
# set equals the training set, each song's closest neighbour is itself.
song_vectors = list(song_encodings_map.values())
distances, indices = nn.fit(song_vectors).kneighbors(song_vectors)

In [68]:
# Draw a single random row to pick a song to inspect.
lyrics_df.sample(n=1)

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
12035,262,I watched with glee,Guns N' Roses,Sympathy For The Devil,Please allow me to introduce myself\nI'm a man...,2


In [70]:
# Neighbour positions for the vector fitted at position 262.
# NOTE(review): treating 262 as the song_idx sampled above assumes that the
# order of song_encodings_map matches song_idx -- confirm.
indices[262]

array([262,  82, 576, 284,  48])

In [71]:
# Distances to those neighbours; the leading 0 is the song matched with itself.
distances[262]

array([0.        , 2.69580373, 2.77703246, 2.92027145, 3.0496992 ])

In [72]:
# Titles of the songs whose song_idx appears among the neighbours of row 262.
is_neighbor = lyrics_df["song_idx"].isin(indices[262])
lyrics_df.loc[is_neighbor, "musica"].unique()

array(['Eraser', 'Love The Way You Lie (feat. Rihanna)',
       'Sympathy For The Devil', 'And I Love Her'], dtype=object)

---

In [73]:
# 2-D t-SNE projection of the song encodings.
# t-SNE is stochastic: without a fixed seed the coordinates (and the plot
# built from them) change on every run, so pin random_state for reproducibility.
# NOTE(review): perplexity=2 is kept from the original; it is a very small
# neighbourhood size -- confirm it is intentional.
tsne = TSNE(perplexity=2, random_state=42)

In [74]:
# Project every song encoding down to t-SNE coordinates.
song_embeddings = tsne.fit_transform(
    list(song_encodings_map.values())
)

In [75]:
# Pair each distinct song_idx (in order of first appearance in lyrics_df) with
# the first two embedding coordinates.
# NOTE(review): this relies on song_encodings_map being ordered the same way as
# lyrics_df["song_idx"].unique() -- confirm.
unique_song_ids = list(lyrics_df["song_idx"].unique())
x_coords = song_embeddings[:, 0]
y_coords = song_embeddings[:, 1]
song_df = pd.DataFrame(
    list(zip(unique_song_ids, x_coords, y_coords)),
    columns=["song_idx", "x", "y"],
)

In [76]:
# Attach the lyric rows to the coordinates: one output row per sentence, each
# carrying its song's x/y position.
# NOTE: this rebinds `song_df`, so re-running the cell merges a second time.
song_df = song_df.merge(lyrics_df, on="song_idx", how="inner")
song_df.head()

Unnamed: 0,song_idx,x,y,sentence,artista,musica,letra,lang
0,0,-30.573359,44.256809,[DaBaby:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
1,0,-30.573359,44.256809,"Billboard Baby, Dua Lipa make 'em dance when i...",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
2,0,-30.573359,44.256809,Everybody lookin' for a dance floor to run on,Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
3,0,-30.573359,44.256809,[Dua Lipa:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
4,0,-30.573359,44.256809,"If you wanna run away with me, I know a galaxy",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2


In [57]:
# song_df.to_csv("../data/processed/songs_with_coordinates.csv", sep=";", index=False)

---

In [77]:
# Collapse the per-sentence rows back to one row per distinct song record by
# keeping only song-level columns and dropping repeated rows.
song_level_columns = ["song_idx", "x", "y", "artista", "musica", "letra"]
unique_df = song_df.loc[:, song_level_columns].drop_duplicates()

In [78]:
# Preview the first five song-level rows.
unique_df.head(n=5)

Unnamed: 0,song_idx,x,y,artista,musica,letra
0,0,-30.573359,44.256809,Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d..."
97,1,-45.428459,6.612777,Dua Lipa,Don't Start Now,"If you don't wanna see me\n\nDid a full 180, c..."
147,2,69.441444,-1.723847,Dua Lipa,Break My Heart,I've always been the one to say the first good...
214,3,-30.329786,44.456078,Dua Lipa,Levitating,If you wanna run away with me\nI know a galaxy...
297,4,-53.102619,2.33693,Dua Lipa,New Rules,Talkin' in my sleep at night\nMakin' myself cr...


In [79]:
# Scatter the songs in embedding space, coloured by artist. Sorting by artist
# first puts the legend entries in alphabetical order.
songs_by_artist = unique_df.sort_values(by="artista")
fig = px.scatter(
    data_frame=songs_by_artist,
    x="x",
    y="y",
    color="artista",
    hover_data=["musica"],
    opacity=0.5,
)
fig.show()