In [3]:
import numpy as np
import pandas as pd
import os
import nltk
import pickle
import re

from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

In [4]:
# Load the pickled train/test split and keep its second element as the songs
# data (the test set, per the original note on this cell).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../App/pickle_objects/train_test.pickle', 'rb') as split_file:
    l_train_test = pickle.load(split_file)

songs_set = l_train_test[1]

In [7]:
# uncomment to extract .csv data
# get the cleaned songs data
# songs_set = pd.read_csv('../songsdata/cleaned_data/cleaned_songs.csv')

In [9]:
# Preview the first five entries of the lyrics column
songs_set.loc[:, 'lyrics'].head(5)

70228    Wait for a Minute Lyrics[Intro: Tyga]\nYeah, w...
80689    ’Til A Tear Becomes A Rose LyricsDarling I can...
31384    Once You Hit the Road Lyrics3:45, you come hom...
89877    13 Lyrics[Instrumental Intro]\n\n[Verse 1]\nTh...
36590    Superstar Talking Blues LyricsYou say you've b...
Name: lyrics, dtype: object

In [10]:
# Lyrics column of the songs data, keyed by the same index as songs_set
lyrics_set = songs_set.loc[:, 'lyrics']

In [11]:
# Persist the songs data and its lyrics column together as a two-item list
l_pickle = [songs_set, lyrics_set]
with open('../App/pickle_objects/song_lyrics_set_test.pickle', 'wb') as out_file:
    pickle.dump(l_pickle, out_file)

In [12]:
# get embedding function
def embeddings_gen(lyrics_set, model_name = "all-distilroberta-v1", max_tokens = 512):
    """Encode every lyrics line of the given songs with a SentenceTransformer model.

    Each song's lyrics are split on newline characters into lines (blank lines
    are kept). A song is skipped entirely - none of its lines are encoded - when
    any single line has `max_tokens` or more NLTK word tokens. Entries that are
    not strings (e.g. NaN for missing lyrics) are skipped as well.

    Parameters
    ----------
    lyrics_set : pandas.Series
        Lyrics text per song; the index labels are used as song ids.
    model_name : str
        Model name handed to ``SentenceTransformer``.
    max_tokens : int
        Per-line word-token threshold; a song with any line at or above it is
        dropped. Defaults to 512, the value previously hard-coded here.

    Returns
    -------
    embeddings
        Output of ``model.encode(lines, convert_to_numpy=True)``, one entry per
        kept lyrics line.
    arr_song_idx : numpy.ndarray
        Song id of each kept line, aligned with ``embeddings``.
    arr_lyrics_idx : numpy.ndarray
        The kept lyrics lines themselves, aligned with ``embeddings``.
    """
    # Create model
    model = SentenceTransformer(model_name)

    # Bag of lyrics lines with their corresponding song ids
    l_lyrics_lines = []
    l_song_idx = []
    # .items() yields (label, value) pairs, so this stays correct even when
    # index labels repeat (label lookup would then return a Series, not a str).
    for idx, lyrics in lyrics_set.items():
        if not isinstance(lyrics, str):
            # Missing / non-text lyrics cannot be split into lines
            continue
        lyrics_lines = lyrics.split('\n')
        # NOTE: the count is NLTK word tokens, which is only a proxy for the
        # model tokenizer's own token count.
        if any(len(word_tokenize(x)) >= max_tokens for x in lyrics_lines):
            continue
        l_lyrics_lines.extend(lyrics_lines)
        l_song_idx.extend([idx] * len(lyrics_lines))

    # For inverted indexing: song id of every line, as an np array
    arr_song_idx = np.array(l_song_idx)
    # The lyrics lines themselves, as an np array
    arr_lyrics_idx = np.array(l_lyrics_lines)

    embeddings = model.encode(l_lyrics_lines, convert_to_numpy=True)

    return embeddings, arr_song_idx, arr_lyrics_idx


In [13]:
# uncomment if further down-sampling on test set is needed
#with open('../pickle_objects/song_lyrics_set.pickle', 'rb') as f:
    # l_pickle = pickle.load(f)

#downsample_size = 

#lyrics_set = l_pickle[1].sample(downsample_size, random_state=42)

In [14]:
# # transform lyrics to vector embedding
# # get the vector embeddings for the 1000-song sample of the 100k songs data (see preprocess_lyrics.ipynb)
# with open('../App/pickle_objects/embeddings_indices.obj', 'rb') as f:
#     embeddings, arr_song_idx, arr_lyrics_idx = pickle.load(f)

# with open('../App/pickle_objects/sample_song_lyrics_set.obj', 'rb') as f:
#     songs_set, lyrics_set = pickle.load(f)

# # getting songs index to be trained with bertopic
# songs_idx = sorted(list(set(arr_song_idx)))

# # getting songs in the index
# sample_songs_set = songs_set.loc[songs_set.index.isin(songs_idx)]
# lyrics_set = sample_songs_set['lyrics']

In [15]:
# Encode the lyrics lines with the default model of embeddings_gen
embeddings, arr_song_idx, arr_lyrics_idx = embeddings_gen(lyrics_set=lyrics_set)

In [16]:
# Persist the line embeddings returned by embeddings_gen
with open('../App/pickle_objects/embeddings_test.pickle', 'wb') as out_file:
    pickle.dump(embeddings, out_file)

In [18]:
# Persist the per-line song ids returned by embeddings_gen
with open('../App/pickle_objects/arr_song_idx_test.pickle', 'wb') as out_file:
    pickle.dump(arr_song_idx, out_file)

In [19]:
# Persist the lyrics lines returned by embeddings_gen
with open('../App/pickle_objects/arr_lyrics_idx_test.pickle', 'wb') as out_file:
    pickle.dump(arr_lyrics_idx, out_file)

In [23]:
# get embedding function
def embeddings_gen_clip(lyrics_set, model_name = "clip-ViT-B-32", max_tokens = 77):
    """Encode every lyrics line of the given songs with a CLIP SentenceTransformer model.

    Each song's lyrics are split on newline characters into lines (blank lines
    are kept). A song is skipped entirely - none of its lines are encoded - when
    any single line has `max_tokens` or more NLTK word tokens. Entries that are
    not strings (e.g. NaN for missing lyrics) are skipped as well.

    Parameters
    ----------
    lyrics_set : pandas.Series
        Lyrics text per song; the index labels are used as song ids.
    model_name : str
        Model name handed to ``SentenceTransformer``.
    max_tokens : int
        Per-line word-token threshold; a song with any line at or above it is
        dropped. Defaults to 77, the CLIP token limit previously hard-coded here.

    Returns
    -------
    embeddings
        Output of ``model.encode(lines, convert_to_numpy=True)``, one entry per
        kept lyrics line.
    arr_song_idx : numpy.ndarray
        Song id of each kept line, aligned with ``embeddings``.
    arr_lyrics_idx : numpy.ndarray
        The kept lyrics lines themselves, aligned with ``embeddings``.
    """
    # Create model
    model = SentenceTransformer(model_name)

    # Bag of lyrics lines with their corresponding song ids
    l_lyrics_lines = []
    l_song_idx = []
    # .items() yields (label, value) pairs, so this stays correct even when
    # index labels repeat (label lookup would then return a Series, not a str).
    for idx, lyrics in lyrics_set.items():
        if not isinstance(lyrics, str):
            # Missing / non-text lyrics cannot be split into lines
            continue
        lyrics_lines = lyrics.split('\n')
        # NOTE: the count is NLTK word tokens, which is only a proxy for the
        # CLIP tokenizer's own token count.
        if any(len(word_tokenize(x)) >= max_tokens for x in lyrics_lines):
            continue
        l_lyrics_lines.extend(lyrics_lines)
        l_song_idx.extend([idx] * len(lyrics_lines))

    # For inverted indexing: song id of every line, as an np array
    arr_song_idx = np.array(l_song_idx)
    # The lyrics lines themselves, as an np array
    arr_lyrics_idx = np.array(l_lyrics_lines)

    embeddings = model.encode(l_lyrics_lines, convert_to_numpy=True)

    return embeddings, arr_song_idx, arr_lyrics_idx

# Encode the lyrics lines with the default model of embeddings_gen_clip
embeddings_clip, arr_song_idx_clip, arr_lyrics_idx_clip = embeddings_gen_clip(lyrics_set=lyrics_set)

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [None]:
# Persist the line embeddings returned by embeddings_gen_clip
with open('../App/pickle_objects/embeddings_clip.pickle', 'wb') as out_file:
    pickle.dump(embeddings_clip, out_file)

In [None]:
# Persist the per-line song ids returned by embeddings_gen_clip
with open('../App/pickle_objects/arr_song_idx_clip.pickle', 'wb') as out_file:
    pickle.dump(arr_song_idx_clip, out_file)

In [None]:
# Persist the lyrics lines returned by embeddings_gen_clip
with open('../App/pickle_objects/arr_lyrics_idx_clip.pickle', 'wb') as out_file:
    pickle.dump(arr_lyrics_idx_clip, out_file)