In [3]:
# Import packages
# Data manipulation
import re
import pickle
import math
import numpy as np
import pandas as pd
import nltk
# First-time usage: download additional packages from nltk first:
#nltk.download()
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

### Ranking Generation

In [4]:
# Helper functions to main ranking function

# Get closest lyrics lines matches from user text input
def text_get_similar_lyrics_lines(user_text_input, embeddings, lyrics_set, model_name = "all-distilroberta-v1", top_k = 100, lyrics_lines = None):
    """Find the lyrics lines whose embeddings are closest to a free-text query.

    The query is encoded with a SentenceTransformer model and compared against
    ``embeddings`` with cosine similarity via ``util.semantic_search``.

    Parameters
    ----------
    user_text_input : str
        Query text to encode.
    embeddings
        Corpus embeddings handed to ``util.semantic_search``.
    lyrics_set
        Currently not read by this function; kept so existing callers keep working.
    model_name : str
        Name passed to ``SentenceTransformer``.
    top_k : int
        Maximum number of hits requested from ``util.semantic_search``.
    lyrics_lines : indexable, optional
        Lookup indexed by lyrics id that yields the lyrics line for each hit.
        When omitted, the notebook-level global ``arr_lyrics_idx`` is used, so
        that variable must already be defined at call time.

    Returns
    -------
    pandas.DataFrame
        One row per hit of the query, with the ``corpus_id`` column renamed to
        ``lyrics_id`` and an added ``lyrics_line`` column.
    """
    if lyrics_lines is None:
        # Backward-compatible fallback: the original code read this global directly.
        lyrics_lines = arr_lyrics_idx

    model = _load_sentence_model(model_name)
    input_emb = model.encode(user_text_input, convert_to_tensor=True)
    res_cos_sim = util.semantic_search(input_emb, embeddings, score_function=util.cos_sim, top_k=top_k)

    # Only one query is encoded, so only the first result list is relevant.
    res_df = pd.DataFrame(res_cos_sim[0]).rename(columns={'corpus_id': 'lyrics_id'})
    res_df['lyrics_line'] = lyrics_lines[res_df['lyrics_id']]
    return res_df


# Models already constructed in this kernel session, keyed by model name.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, constructing it only on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]

# For inverted indexing // Look up ids of corresponding songs
def lyrics_id_mapping(res_df, arr_lyrics_idx):
    """Attach a ``song_idx`` column by looking each row's lyrics id up in an array.

    The ``lyrics_id`` column is cast to int and used to index ``arr_lyrics_idx``;
    the looked-up values become the ``song_idx`` column. ``res_df`` is modified
    in place and the same object is returned.
    """
    positions = res_df['lyrics_id'].to_numpy().astype(int)
    res_df['song_idx'] = arr_lyrics_idx[positions]
    return res_df

# Suppress utterances which have low similarity scores
def score_low_sim_weighting(df, threshold = 0.9, weight_low_sim = 1):
    """Add a ``score_weighted`` column that scales down low-similarity scores.

    Scores strictly below ``threshold`` are multiplied by ``weight_low_sim``;
    all other scores are copied unchanged. With the default ``weight_low_sim``
    of 1 the weighted score equals the raw score.

    ``df`` is modified in place (the column is added to it) and returned.
    """
    scores = df['score']
    # Vectorized replacement for the per-row lambda: swap in the scaled value
    # only where the score falls below the threshold.
    df['score_weighted'] = scores.mask(scores < threshold, scores * weight_low_sim)
    return df

# Re-rank on songs level based on average lyrics line scores
def songs_ranking(df_results_lyrics_mapped):
    """Rank songs by the mean ``score_weighted`` of their matched lyrics lines.

    Returns a Series indexed by ``song_idx`` holding the mean weighted score,
    ordered from highest to lowest.
    """
    per_song_scores = df_results_lyrics_mapped.groupby('song_idx')['score_weighted']
    return per_song_scores.mean().sort_values(ascending=False)

# Combine songs information to ranked songs
def combine_songs_info(s_songs_ranking, sample_artists_set, results_limit = 10):
    """Join ranked song scores with artist/title information.

    Parameters
    ----------
    s_songs_ranking : pandas.Series
        Scores indexed by the row labels of ``sample_artists_set``, already
        sorted in ranking order.
    sample_artists_set : pandas.DataFrame
        Song table; its ``artist`` and ``title`` columns are read.
    results_limit : int or None
        Maximum number of rows to return (``None`` returns every ranked song).

    Returns
    -------
    pandas.DataFrame
        Columns ``artist``, ``title`` and ``score`` for the first
        ``results_limit`` ranked songs that are present in ``sample_artists_set``.
    """
    # filter() keeps only the ranked labels that exist in the song table;
    # assign() aligns the scores on the index without mutating that result.
    df_songs_candidates = (
        sample_artists_set
        .filter(items=s_songs_ranking.index, axis=0)
        .assign(score=s_songs_ranking)
    )
    # Honour results_limit (the previous version always cut at 10 rows).
    return df_songs_candidates[['artist', 'title', 'score']].iloc[:results_limit]

In [5]:
# Overall function to generate songs ranking based on lyrics lines semantic textual similarity 
def similar_songs_ranked(user_input, embeddings, sample_artists_set, lyrics_set, arr_song_idx):
    """Run the full ranking pipeline for one text query.

    Steps: find the closest lyrics lines, map each line to its song, apply the
    low-similarity weighting, average per song, then attach artist/title info.

    Returns a tuple ``(songs_table, lyrics_table)``: the ranked songs and the
    per-line matches they were derived from.
    """
    lyrics_hits = text_get_similar_lyrics_lines(user_input, embeddings, lyrics_set)
    lyrics_hits = score_low_sim_weighting(lyrics_id_mapping(lyrics_hits, arr_song_idx))
    ranked_songs = combine_songs_info(songs_ranking(lyrics_hits), sample_artists_set)
    return ranked_songs, lyrics_hits

In [6]:
# Load the preprocessed song sample. PLEASE REFER TO preprocessing.ipynb FOR PREPROCESSING STEP
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project's own notebooks.
with open('./pickle_objects/sample_song_lyrics_set.obj', 'rb') as f:
    l_pickle = pickle.load(f)

# Element 0 is used downstream as the songs table (its 'artist' and 'title' columns are read).
sample_artists_set = l_pickle[0]
# Element 1 is passed into the ranking functions as lyrics_set.
lyrics_set = l_pickle[1]

# Load the embeddings and index arrays. PLEASE REFER TO get_embeddings.ipynb FOR EMBEDDINGS GENERATION STEP
# l_pickle is rebound here, so after this cell it refers to the second file's contents.
with open('./pickle_objects/embeddings_indices.obj', 'rb') as f:
    l_pickle = pickle.load(f)

# Corpus embeddings searched by util.semantic_search.
embeddings = l_pickle[0]
# Indexed by lyrics id to obtain the row label of the song each line belongs to.
arr_song_idx = l_pickle[1] 
# Indexed by lyrics id to fill the 'lyrics_line' column; text_get_similar_lyrics_lines
# reads this name as a global, so this cell must run before any ranking call.
arr_lyrics_idx = l_pickle[2] 

In [7]:
# Example free-text query to match against the lyrics lines.
user_input = "I am happy today"

# Rank songs for the query; the second result keeps the per-line matches that the
# lyrics-level output below is built from.
df_results_songs, df_results_lyrics_mapped = similar_songs_ranked(user_input, embeddings, sample_artists_set, lyrics_set, arr_song_idx)
# Last expression of the cell: display the ranked songs table.
df_results_songs

Unnamed: 0,artist,title,score
407,a-ha,Between Your Mama and Yourself,0.489485
405,a-ha,How Sweet It Was,0.46608
318,a-ha,And You Tell Me,0.434451
451,a-ha,Mother Nature Goes to Heaven,0.412798
415,a-ha,Thought That It Was You,0.406768
312,a-ha,Take on Me,0.399959
236,A$AP Ferg,Cocaine Castle,0.398855
418,a-ha,I Won't Forget Her,0.363266
392,a-ha,Slender Frame,0.362153
393,a-ha,East of the Sun,0.361501


In [8]:
# Helper function to support getting songs/ lyrics results

# Look up relevant lyrics lines and their similarity scores
def lyrics_scores_lookup(song_id, df_results_lyrics_mapped):
    """Return the matched lyrics lines of one song, best score first.

    Selects the rows whose ``song_idx`` equals ``song_id`` and returns their
    ``lyrics_line`` and ``score`` columns sorted by descending score.
    """
    belongs_to_song = df_results_lyrics_mapped['song_idx'] == song_id
    song_lines = df_results_lyrics_mapped[belongs_to_song][['lyrics_line', 'score']]
    return song_lines.sort_values(by=['score'], ascending=False)

In [9]:
# Generate output on both songs and lyrics level, as a list of dictionaries
def similar_songs_lyrics_ranked(df_results_song, df_results_lyrics_mapped):
    """Build the combined song-level and lyrics-level results as a list of dicts.

    Parameters
    ----------
    df_results_song : pandas.DataFrame
        Ranked songs with ``artist``, ``title`` and ``score`` columns; its index
        values are matched against ``song_idx`` in the lyrics table.
    df_results_lyrics_mapped : pandas.DataFrame
        Per-line matches, passed to ``lyrics_scores_lookup`` for each song.

    Returns
    -------
    list of dict
        One dict per song, in the order of ``df_results_song``, with keys
        ``"artist"``, ``"song title"``, ``"song_score"`` and ``"lyrics_scores"``
        (a mapping from lyrics line to score; identical lines within one song
        share a single entry because they are dict keys).
    """
    result_list = []

    # Iterate the frame passed in. The previous version read the notebook global
    # `df_results_songs` here and silently ignored this argument.
    for song_id in df_results_song.index:
        df_lyrics_scores = lyrics_scores_lookup(song_id, df_results_lyrics_mapped)
        result_list.append({
            "artist": df_results_song.loc[song_id, 'artist'],
            "song title": df_results_song.loc[song_id, 'title'],
            "song_score": df_results_song.loc[song_id, 'score'],
            "lyrics_scores": dict(zip(df_lyrics_scores['lyrics_line'], df_lyrics_scores['score'])),
        })

    return result_list

# Display the combined song- and lyrics-level results for the ranked songs computed earlier.
similar_songs_lyrics_ranked(df_results_songs, df_results_lyrics_mapped)

[{'artist': 'a-ha',
  'song title': 'Between Your Mama and Yourself',
  'song_score': 0.489484965801239,
  'lyrics_scores': {"And today's that day": 0.489484965801239}},
 {'artist': 'a-ha',
  'song title': 'How Sweet It Was',
  'song_score': 0.4660801738500595,
  'lyrics_scores': {'We were so happy now': 0.48340049386024475,
   'We were so happy': 0.44875985383987427}},
 {'artist': 'a-ha',
  'song title': 'And You Tell Me',
  'song_score': 0.4344506859779358,
  'lyrics_scores': {'And tomorrow is the day': 0.4344506859779358}},
 {'artist': 'a-ha',
  'song title': 'Mother Nature Goes to Heaven',
  'song_score': 0.41279757022857666,
  'lyrics_scores': {'And there will be no sadder day': 0.41279757022857666}},
 {'artist': 'a-ha',
  'song title': 'Thought That It Was You',
  'song_score': 0.4067680537700653,
  'lyrics_scores': {'You found me in your joy': 0.4067680537700653}},
 {'artist': 'a-ha',
  'song title': 'Take on Me',
  'song_score': 0.3999585807323456,
  'lyrics_scores': {"Today's 