In [8]:
import pickle
import string

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_distances

from text_autoencoders import TrainedAutoEncoder

In [3]:
ae = TrainedAutoEncoder("../src/text_autoencoders/checkpoints/lyrics/daae/")

Model loaded.


---

TODO:
- [x] relate phrases with lyrics and artitst (may have to rebuild the set)
- [x] encode phrases in lyrics
- [x] get the mean vector of each lyric
- [ ] get the mean vector of each artist
- [x] reduce dim for lyrics and artists
- [x] create map of lyric -> 2dim
- [ ] create map of artist -> 2dim
- [x] plot lyrics space
- [ ] plot artists space

In [4]:
df = pd.read_csv("../data/data_raw.csv", sep=";")
df["clean_lyrics"] = df["letra"].apply(lambda x: [phrase for phrase in x.split("\n") if len(phrase) > 3])
df = df.reset_index()

In [5]:
lyrics_df = pd.DataFrame([(i, l) for i, lyrics in df.query("lang == 2")[["index", "clean_lyrics"]].values for l in lyrics], columns=["song_idx", "sentence"])
lyrics_df = lyrics_df.merge(df, left_on="song_idx", right_on="index").drop(columns=["clean_lyrics", "index"])

In [7]:
lyrics_df.head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
0,0,[DaBaby:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
1,0,"Billboard Baby, Dua Lipa make 'em dance when i...",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
2,0,Everybody lookin' for a dance floor to run on,Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
3,0,[Dua Lipa:],Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2
4,0,"If you wanna run away with me, I know a galaxy",Dua Lipa,Levitating (feat. DaBaby),"[DaBaby:]\nBillboard Baby, Dua Lipa make 'em d...",2


In [9]:
def process_text(text):
    return "".join([f" {c} " if c in string.punctuation else c for c in text.lower().strip()])

---

In [29]:
lyrics_df["sentence"].apply(process_text).str.split(" ").values

array([list(['', '[', 'dababy', ':', '', ']', '']),
       list(['billboard', 'baby', ',', '', 'dua', 'lipa', 'make', '', "'", 'em', 'dance', 'when', 'it', 'come', 'on']),
       list(['everybody', 'lookin', "'", '', 'for', 'a', 'dance', 'floor', 'to', 'run', 'on']),
       ...,
       list(['just', 'call', 'me', 'angel', 'of', 'the', 'morning', ',', '', 'angel']),
       list(['then', 'slowly', 'turn', 'away', 'from', 'me']),
       list(['oh', 'baby', ',', '', 'i', 'love', 'you', 'baby', ',', '', 'oh', 'baby'])],
      dtype=object)

In [13]:
sentence_encodings = ae.encode(lyrics_df["sentence"].apply(process_text).str.split(" ").values)

In [14]:
sentence_encodings.shape

(82579, 128)

In [15]:
song_idxs = lyrics_df.reset_index().groupby("song_idx")["index"].apply(list)

In [16]:
song_encodings_map = {}

for i, idxs in song_idxs.iteritems():
    song_encodings_map[i] = sentence_encodings[idxs].mean(axis=0)

In [17]:
with open("../data/processed/sentence_encodings.pkl", "wb") as outfile:
    pickle.dump(sentence_encodings, outfile)

In [18]:
with open("../data/processed/song_encodings.pkl", "wb") as outfile:
    pickle.dump(song_encodings_map, outfile)

In [19]:
lyrics_df.to_csv("../data/processed/lyrics.csv", sep=";", index=False)

In [20]:
len(song_encodings_map)

1556

---

In [24]:
lyrics_df.sample()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
81990,2349,"Don't cry, cause on earth we wasn't meant to stay",Whitney Houston,My Love Is Your Love,If tomorrow is judgement day (sing mommy)\nAnd...,2


In [25]:
distances = cosine_distances(sentence_encodings[81990].reshape(1, -1), sentence_encodings)

In [26]:
lyrics_df.iloc[distances.argsort()[0]].head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
81990,2349,"Don't cry, cause on earth we wasn't meant to stay",Whitney Houston,My Love Is Your Love,If tomorrow is judgement day (sing mommy)\nAnd...,2
49640,1204,"Don't get me wrong, I wasn't out to deceive",Alicia Keys,Gramercy Park,Are you gonna see me when light gets dark?\nTh...,2
33585,748,"Don't say I didn't, say I didn't warn ya",Taylor Swift,Blank Space,"Nice to meet you, where you been?\nI could sho...",2
33587,748,"Don't say I didn't, say I didn't warn ya",Taylor Swift,Blank Space,"Nice to meet you, where you been?\nI could sho...",2
46277,1139,But I can't move on if we're still gonna talk,Shawn Mendes,If I Can't Have You,I can't write one song that's not about you\nC...,2


Distances seens to work

In [27]:
random_idx = np.random.randint(0, lyrics_df.shape[0])
distances = cosine_distances(sentence_encodings[random_idx].reshape(1, -1), sentence_encodings)
lyrics_df.iloc[distances.argsort()[0]].head()

Unnamed: 0,song_idx,sentence,artista,musica,letra,lang
61779,1635,Yesterday seems as though it never existed,Metallica,Fade To Black,"Life, it seems, will fade away\nDrifting furth...",2
51936,1245,Life goes on as it never ends,Backstreet Boys,Show Me The Meaning Of Being Lonely,Show me the meaning of being lonely\nSo many w...,2
52146,1250,Impossible as it may seem,Backstreet Boys,Quit Playing Games (With My Heart),"Baby...Oh...\n\nEven in my heart, I see\nYou'r...",2
43409,1035,True as it can be,Ariana Grande,Beauty and The Beast (Feat. John Legend),Tale as old as time\nTrue as it can be\nBarely...,2
47976,1172,True as it can be,John Legend,Beauty and The Beast,Tale as old as time\nTrue as it can be\nBarely...,2
