In [141]:
import pickle

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

import plotly.express as px

In [2]:
# Read the processed lyrics table; the file uses ";" as its field separator.
lyrics_df = pd.read_csv(
    filepath_or_buffer="../data/processed/lyrics.csv",
    sep=";",
)

In [3]:
# Restore the pickled object stored in song_encodings.pkl.
# NOTE: unpickling can execute arbitrary code -- only open files this project produced.
with open("../data/processed/song_encodings.pkl", mode="rb") as pickle_file:
    song_encodings_map = pickle.load(pickle_file)

In [4]:
# Restore the aggregated per-song encodings from song_encodings_big.pkl, the file
# this notebook writes after the "Reduce and aggregate vectors" step.
# NOTE: unpickling can execute arbitrary code -- only open files this project produced.
with open("../data/processed/song_encodings_big.pkl", mode="rb") as pickle_file:
    new_song_encodings = pickle.load(pickle_file)

In [5]:
# Restore the pickled sentence encodings; later cells index this object with lists
# of lyrics_df row numbers.
# NOTE: unpickling can execute arbitrary code -- only open files this project produced.
with open("../data/processed/sentence_encodings.pkl", mode="rb") as pickle_file:
    sentence_encodings = pickle.load(pickle_file)

In [6]:
# For every song_idx, collect the list of lyrics_df row numbers that belong to it.
# reset_index() exposes the row number as an "index" column so it can be grouped.
song_idxs = (
    lyrics_df
    .reset_index()
    .groupby("song_idx")["index"]
    .apply(list)
)

In [13]:
# # Save sentences by song to improve performance (disabled cell, kept for reference).
# # NOTE(review): Series.iteritems() was removed in pandas 2.0; use .items() if re-enabled.
# for i, sidx in song_idxs.iteritems():
#     np.savez_compressed(
#         f"../data/processed/song_sentence_encodings/{i}",
#         sentence_encodings[sidx]
#     )

---

FAILED

Reduce and aggregate vectors

In [9]:
# Build one fixed-size encoding per song: split the song's sentence vectors into
# `n_vectors` consecutive chunks, average each chunk, and concatenate the means.
n_vectors = 10


def _aggregate_song_encoding(encodings, n_vectors):
    """Collapse a song's sentence vectors into one flat vector.

    Parameters
    ----------
    encodings : np.ndarray
        Sentence vectors of a single song, one row per sentence
        (at least one row is expected).
    n_vectors : int
        Number of pooled vectors to keep per song.

    Returns
    -------
    np.ndarray
        1-D array holding ``n_vectors`` pooled vectors laid end to end.

    Songs with fewer than ``n_vectors`` sentences are padded by repeating their
    last sentence vector. Longer songs are split into ``n_vectors`` contiguous,
    nearly equal-sized chunks and each chunk is averaged.
    """
    n = encodings.shape[0]
    if n < n_vectors:
        # Repeat the final sentence vector until the song has n_vectors rows.
        padding = np.repeat(encodings[-1:], n_vectors - n, axis=0)
        pooled = np.concatenate([encodings, padding], axis=0)
    else:
        # The previous implementation sliced [i:i+step], producing overlapping
        # windows over only the first few sentences. np.array_split instead yields
        # exactly n_vectors non-empty, non-overlapping chunks covering every row.
        pooled = np.stack(
            [chunk.mean(axis=0) for chunk in np.array_split(encodings, n_vectors)]
        )
    return pooled.reshape(-1)


# Series.items() is the supported spelling; iteritems() was removed in pandas 2.0.
new_song_encodings = np.array([
    _aggregate_song_encoding(sentence_encodings[sidx], n_vectors)
    for _, sidx in song_idxs.items()
])

In [10]:
# Sanity check: one row per song, each row being n_vectors pooled vectors concatenated.
new_song_encodings.shape

(1556, 1280)

In [11]:
# Persist the aggregated song encodings so later sessions can reload them
# instead of recomputing the aggregation.
with open("../data/processed/song_encodings_big.pkl", mode="wb") as pickle_file:
    pickle.dump(new_song_encodings, pickle_file)

---

TF-IDF

In [26]:
# Song-level table: keep the song identity and lyrics columns, drop the repeated
# rows, and renumber the rows from zero.
songs_df = (
    lyrics_df[["song_idx", "artista", "musica", "letra"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [142]:
# Song vectoriser: TF-IDF over 1- to 3-grams, reduced to 100 components with
# truncated SVD, then passed through Normalizer.
# random_state pins the SVD solver's randomness so repeated runs produce the same
# song vectors (and therefore the same neighbours and saved coordinates).
vectorizer = Pipeline([
    ('vec', TfidfVectorizer(ngram_range=(1, 3))),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
    ('norm', Normalizer())
])

In [143]:
# Fit the TfidfVectorizer -> TruncatedSVD -> Normalizer pipeline on the lyrics column
# and transform every song in the same call.
song_vecs = vectorizer.fit_transform(songs_df["letra"])

In [145]:
# Sanity check: one row per song, one column per SVD component (n_components=100).
song_vecs.shape

(1556, 100)

---

Nearest Neighbors

In [146]:
# Nearest-neighbour searcher created with scikit-learn's default settings.
nn = NearestNeighbors()

In [147]:
# Index the song vectors, then query with the same matrix. Because the queries are
# the indexed points themselves, each song comes back as its own first neighbour
# at distance 0 (visible in the recorded outputs for row 979).
nn.fit(song_vecs)
distances, indices = nn.kneighbors(song_vecs)

In [158]:
# Draw one random song to inspect. No random_state is passed, so a re-run picks a
# different row; the following cells hard-code row 979 from the recorded run.
songs_df.sample()

Unnamed: 0,song_idx,artista,musica,letra,x,y
979,1388,Avril Lavigne,Girlfriend,Hey hey you you \nI don't like your girlfriend...,-27.474003,-62.575726


In [160]:
# Row positions of the neighbours found for song 979 (the first entry is the song itself).
indices[979]

array([979, 436, 985,  12, 120])

In [161]:
# Distances returned alongside the neighbour indices of song 979.
distances[979]

array([0.        , 0.62037047, 0.62417973, 0.66745753, 0.68273186])

In [162]:
# Look up the neighbour rows by position to read their artist, title and lyrics.
songs_df.iloc[indices[979]]

Unnamed: 0,song_idx,artista,musica,letra,x,y
979,1388,Avril Lavigne,Girlfriend,Hey hey you you \nI don't like your girlfriend...,-27.474003,-62.575726
436,613,The Weeknd,Blinding Lights,I been tryna call\nI've been on my own for lon...,-28.711708,-62.503078
985,1394,Avril Lavigne,Rock N Roll,Let em know that we're still Rock n' Roll\n\nI...,-27.736774,-63.173401
12,12,Dua Lipa,Future Nostalgia,Future\n(Future nostalgia)\n(Future nostalgia)...,-26.718521,-61.385666
120,121,Imagine Dragons,West Coast,One more day we'll spend together\nLay your ey...,-28.535753,-63.887722


---

In [163]:
# t-SNE projector for the song map. random_state fixes the otherwise random layout
# so the coordinates saved to CSV and the scatter plot are reproducible across runs.
tsne = TSNE(perplexity=10, random_state=42)

In [164]:
# Project the song vectors with t-SNE; columns 0 and 1 of the result are used as
# the x/y plot coordinates in the next cell.
song_embeddings = tsne.fit_transform(song_vecs)

In [165]:
# Attach the first two embedding columns to the song table as plot coordinates.
songs_df["x"], songs_df["y"] = song_embeddings[:, 0], song_embeddings[:, 1]

In [170]:
# Export the song table, including the x/y coordinates, as a ";"-separated CSV
# without the row index.
songs_df.to_csv(
    path_or_buf="../data/processed/songs_with_coordinates.csv",
    sep=";",
    index=False,
)

---

In [168]:
# Interactive scatter of the song coordinates, coloured by artist. Sorting by
# artist first determines the order in which artists are passed to plotly.
fig = px.scatter(
    data_frame=songs_df.sort_values(by="artista"),
    x="x",
    y="y",
    color="artista",
    opacity=0.7,
    hover_data=["musica"],
)
fig.show()