In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances

from text_autoencoders import TrainedAutoEncoder

In [2]:
# Load the trained text autoencoder from the yelp/daae checkpoint directory.
ae = TrainedAutoEncoder("../src/text_autoencoders/checkpoints/yelp/daae/")

Model loaded.


---

TODO:
- [x] relate phrases with lyrics and artists (may have to rebuild the set)
- [x] encode phrases in lyrics
- [x] get the mean vector of each lyric
- [ ] get the mean vector of each artist
- [x] reduce dim for lyrics and artists
- [x] create map of lyric -> 2dim
- [ ] create map of artist -> 2dim
- [x] plot lyrics space
- [ ] plot artists space

In [13]:
# Raw song table; the file is semicolon-separated.
df = pd.read_csv("../data/data_raw.csv", sep=";")
# Break every lyric into its lines and keep only lines longer than 3 characters.
df["clean_lyrics"] = df["letra"].apply(
    lambda lyric: [line for line in lyric.split("\n") if len(line) > 3]
)
# Expose the row label as an "index" column so it can serve as a song id later on.
df = df.reset_index()

In [46]:
# Build one row per (song, sentence) for the songs with lang == 2.
selected_songs = df.query("lang == 2")[["index", "clean_lyrics"]]
sentence_rows = [
    (song_idx, sentence)
    for song_idx, sentences in selected_songs.values
    for sentence in sentences
]
lyrics_df = pd.DataFrame(sentence_rows, columns=["song_idx", "sentence"])
# Bring the song-level columns back in, then drop the helper columns that are no longer needed.
lyrics_df = lyrics_df.merge(df, left_on="song_idx", right_on="index")
lyrics_df = lyrics_df.drop(columns=["clean_lyrics", "index"])

In [47]:
lyrics_df.head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
0,0,[DaBaby:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
1,0,"Billboard Baby, Dua Lipa make 'em dance when i...",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
2,0,Everybody lookin' for a dance floor to run on,Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
3,0,[Dua Lipa:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
4,0,"If you wanna run away with me, I know a galaxy",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2


---

In [133]:
# Split each sentence on single spaces and hand the resulting token lists to the autoencoder.
# NOTE(review): assumes ae.encode returns one vector per input sentence, in input order —
# later cells index the result by lyrics_df row position. Confirm against TrainedAutoEncoder.
sentence_encodings = ae.encode(lyrics_df["sentence"].str.split(" ").values)

In [135]:
sentence_encodings.shape

(82579, 128)

In [136]:
# For each song, gather the lyrics_df row labels of its sentences into a list
# (Series indexed by song_idx, values are lists taken from the reset "index" column).
song_idxs = (
    lyrics_df
    .reset_index()
    .groupby("song_idx")["index"]
    .apply(list)
)

In [137]:
# song_idx -> mean of the encodings of that song's sentences.
song_encodings_map = {}

# Series.items() replaces Series.iteritems(), which was deprecated in pandas 1.5 and
# removed in pandas 2.0; items() is available on older versions as well.
for i, idxs in song_idxs.items():
    # idxs is the list of sentence rows for song i; average them along the row axis.
    song_encodings_map[i] = sentence_encodings[idxs].mean(axis=0)

In [138]:
# Persist the per-sentence encodings as a pickle file.
with open("../data/processed/sentence_encodings.pkl", mode="wb") as pkl_file:
    pickle.dump(sentence_encodings, pkl_file)

In [139]:
# Persist the song_idx -> mean-encoding dictionary as a pickle file.
with open("../data/processed/song_encodings.pkl", mode="wb") as pkl_file:
    pickle.dump(song_encodings_map, pkl_file)

In [140]:
# Save the sentence-level table as a semicolon-separated CSV, without the DataFrame index.
lyrics_df.to_csv("../data/processed/lyrics.csv", sep=";", index=False)

In [147]:
len(song_encodings_map)

1556

---

Song vector space

In [141]:
# t-SNE is stochastic: without a fixed seed the layout (and the coordinates written to CSV
# further down) changes on every run. n_components=2 is the default, spelled out because
# later cells read exactly two embedding columns.
tsne = TSNE(n_components=2, random_state=42)

In [142]:
# Project the per-song mean encodings with t-SNE. Rows follow the insertion order of
# song_encodings_map, so row k of the result belongs to the k-th key of the dict.
song_vectors = np.stack(list(song_encodings_map.values()))
song_embeddings = tsne.fit_transform(song_vectors)

In [156]:
# One row per song: its id plus the two t-SNE coordinates.
# Dict keys and embedding rows share the same order, so they can be zipped together.
song_ids = list(song_encodings_map)
coordinate_rows = zip(song_ids, song_embeddings[:, 0], song_embeddings[:, 1])
song_df = pd.DataFrame(coordinate_rows, columns=["song_idx", "x", "y"])

In [157]:
song_df.shape

(1556, 3)

In [165]:
song_df.head()

Unnamed: 0,song_idx,x,y
0,0,7.424465,-1.148654
1,1,-0.293809,0.497196
2,2,6.21529,17.715231
3,3,7.43025,-1.086189
4,4,1.919331,-0.682756


In [166]:
# Attach sentences and song metadata to the 2-D coordinates. The result has one row per
# sentence, with each song's (x, y) repeated on every one of its rows.
# validate="one_to_many" requires song_idx to be unique in song_df: because this cell
# reassigns song_df, re-running it would otherwise merge the already-merged frame again and
# silently multiply rows; with the check it raises pandas.errors.MergeError instead.
song_df = song_df.merge(lyrics_df, on="song_idx", validate="one_to_many")
song_df.head()

Unnamed: 0,song_idx,x,y,sentence,artista,musica,letra,lang
0,0,7.424465,-1.148654,[DaBaby:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
1,0,7.424465,-1.148654,"Billboard Baby, Dua Lipa make 'em dance when i...",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
2,0,7.424465,-1.148654,Everybody lookin' for a dance floor to run on,Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
3,0,7.424465,-1.148654,[Dua Lipa:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
4,0,7.424465,-1.148654,"If you wanna run away with me, I know a galaxy",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2


In [168]:
lyrics_df.shape

(82579, 6)

In [167]:
song_df.shape

(82579, 8)

In [169]:
# Save the merged coordinates table as a semicolon-separated CSV, without the DataFrame index.
# Note: after the merge with lyrics_df this frame holds one row per sentence, not per song.
song_df.to_csv("../data/processed/songs_with_coordinates.csv", sep=";", index=False)

---

In [180]:
lyrics_df.sample()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
47144,1156,We could to turn darkness into light,John Legend,Love Me Now,"Pulling me further, further than I've been bef...",2


In [197]:
# Cosine distance from the sentence at row 47144 to every encoded sentence (itself included).
query_vector = sentence_encodings[47144].reshape(1, -1)
distances = cosine_distances(query_vector, sentence_encodings)

In [203]:
# Row positions ordered from smallest to largest distance; show the closest sentences.
nearest_rows = distances.argsort()[0]
lyrics_df.iloc[nearest_rows].head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
47144,1156,We could to turn darkness into light,John Legend,Love Me Now,"Pulling me further, further than I've been bef...",2
74573,2103,"Shut the door, turn the light off",One Direction,Moments,"Shut the door, turn the light off\nI wanna be ...",2
37927,864,Pink when I turn out the light,Aerosmith,Pink,Pink it´s my new obsession\nPink it´s not even...,2
37918,864,Pink when I turn out the light,Aerosmith,Pink,Pink it´s my new obsession\nPink it´s not even...,2
18879,437,There will be niks when I'm done,Beyoncé,"My Power (With Tierra Whack, Moonchild Sanelly...","They'll never take my power, my power, my powe...",2


Distances seem to work

17043

In [219]:
# Pick a random sentence row and list the sentences whose encodings are closest to it.
random_idx = np.random.randint(0, lyrics_df.shape[0])
query_vector = sentence_encodings[random_idx].reshape(1, -1)
distances = cosine_distances(query_vector, sentence_encodings)
nearest_rows = distances.argsort()[0]
lyrics_df.iloc[nearest_rows].head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
58813,1556,"Come on listen, listen",John Lennon,Whatever Gets You Through The Night,Whatever gets you through the night 's alright...,2
55414,1407,You! You need to listen,Avril Lavigne,Losing Grip,"Are you aware of what you make me feel, baby?\...",2
58820,1556,"Come on listen, listen",John Lennon,Whatever Gets You Through The Night,Whatever gets you through the night 's alright...,2
55393,1407,"You, You need to listen",Avril Lavigne,Losing Grip,"Are you aware of what you make me feel, baby?\...",2
37654,858,But I just couldn't listen,Aerosmith,Amazing,I kept the right ones out \nAnd let the wrong ...,2
