In [1]:
import numpy as np
import pandas as pd
import os
import nltk
import pickle
import re

from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

In [2]:
# Read the cleaned songs table from the relative path '../cleaned_songs.csv'
songs_set = pd.read_csv(filepath_or_buffer='../cleaned_songs.csv')

In [3]:
# Select the 'lyrics' column of the songs table
lyrics_set = songs_set.loc[:, 'lyrics']

In [4]:
# Persist the songs table and its lyrics column together as one pickled list
l_pickle = [songs_set, lyrics_set]
with open('../App/pickle_objects/sample_song_lyrics_set.pickle', mode='wb') as out_file:
    pickle.dump(l_pickle, out_file)

In [5]:
# Input for the baseline-model embedding (per the original note on this cell):
# reload the pickled [songs_set, lyrics_set] list from disk
with open('../App/pickle_objects/sample_song_lyrics_set.pickle', mode='rb') as in_file:
    l_pickle = pickle.load(in_file)

# Element 1 is the lyrics set; keep a reproducible 1000-row sample (seed 42)
lyrics_set = l_pickle[1].sample(n=1000, random_state=42)

In [8]:
# get embedding function
def embeddings_gen(lyrics_set, model_name = "all-distilroberta-v1", max_tokens = 512):
    """Encode every lyrics line of every song with a SentenceTransformer model.

    Each song's lyrics are split on newline characters and every resulting
    line (blank lines included) is encoded. A song is skipped ENTIRELY when
    any single one of its lines has `max_tokens` or more NLTK word tokens, so
    none of that song's lines end up in the output.

    Parameters
    ----------
    lyrics_set : pandas.Series
        Lyrics text per song. The Series index labels are used as song ids.
    model_name : str
        Model name or local path handed to ``SentenceTransformer``.
    max_tokens : int
        Token count at which a line is considered too long (default 512,
        the value that was previously hard-coded).

    Returns
    -------
    embeddings
        Result of ``model.encode(lines, convert_to_tensor=True)``.
    arr_song_idx : numpy.ndarray
        For every encoded line, the index label of the song it came from.
    arr_lyrics_idx : numpy.ndarray
        The encoded lyrics lines themselves, in the same order.
    """
    # Create model
    model = SentenceTransformer(model_name)

    # Bag of lyrics lines with their corresponding song ids (parallel lists)
    l_lyrics_lines = []
    l_song_idx = []
    # .items() yields (label, value) once per row, so the lookup does not
    # depend on label-vs-position semantics of Series[...] and still returns
    # a single value when index labels are duplicated.
    for idx, lyrics in lyrics_set.items():
        # Non-string entries (e.g. NaN for a missing value) cannot be split
        # into lines; skip them instead of failing with a TypeError.
        if not isinstance(lyrics, str):
            continue
        lyrics_lines = lyrics.split('\n')
        # Drop the whole song if any one of its lines reaches the token limit
        if any(len(word_tokenize(line)) >= max_tokens for line in lyrics_lines):
            continue
        l_lyrics_lines.extend(lyrics_lines)
        l_song_idx.extend([idx] * len(lyrics_lines))

    # For inverted indexing: song id of each line, as an np array
    arr_song_idx = np.array(l_song_idx)
    # Lyrics lines as an np array, aligned with arr_song_idx
    arr_lyrics_idx = np.array(l_lyrics_lines)

    embeddings = model.encode(l_lyrics_lines, convert_to_tensor=True)

    return embeddings, arr_song_idx, arr_lyrics_idx


In [16]:
# transform lyrics to vector embedding
# Load the stored embeddings of the 1000-song sample taken from the 100k songs data
# (the original note points to preprocess_lyrics.ipynb for these).
# NOTE(review): a later cell in this notebook dumps to this same
# embeddings_ft_model.obj path -- confirm which version of the file is
# expected to exist here on a fresh run.
with open('../App/pickle_objects/embeddings_ft_model.obj', mode='rb') as in_file:
    embeddings, arr_song_idx, arr_lyrics_idx = pickle.load(in_file)

with open('../App/pickle_objects/sample_song_lyrics_set.pickle', mode='rb') as in_file:
    songs_set, lyrics_set = pickle.load(in_file)

# Sorted, de-duplicated song ids present in the embeddings (to be trained with bertopic)
songs_idx = sorted(set(arr_song_idx))

# Keep only the songs whose index label is among those ids, then take their lyrics
in_sample = songs_set.index.isin(songs_idx)
sample_songs_set = songs_set[in_sample]
lyrics_set = sample_songs_set['lyrics']

In [17]:
# Local location of the fine-tuned BERT model
ft_model = '../../BERT-Fine-Tuning/Data/ft_model'
# Regenerate the embeddings and both index arrays using that local model
embeddings_ft_model, arr_song_idx, arr_lyrics_idx = embeddings_gen(
    lyrics_set,
    model_name=ft_model,
)

In [18]:
# Bundle the fine-tuned embeddings with their index arrays and pickle them
# (original note: the pickle file is renamed as part of this step)
l_pickle = [embeddings_ft_model, arr_song_idx, arr_lyrics_idx]
with open('../App/pickle_objects/embeddings_ft_model.obj', mode='wb') as out_file:
    pickle.dump(l_pickle, out_file)