In [249]:
# Import packages
# Data manipulation
import re
import math
import numpy as np
import pandas as pd
import nltk
# First-time usage: download the additional nltk packages first:
#nltk.download()
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

In [168]:
# Load the songs CSV and list the artists it contains, so that a couple of
# "well-known" artists can be picked as a sample.
# In this csv we can identify A$AP Ferg and a-ha
df = pd.read_csv(filepath_or_buffer="../songsdata/songsdata_1500.csv")
df.loc[:, 'artist'].unique()

array(['A Storm of Light', 'A Sunset Diary', 'Various Artists',
       'A Taste Of Honey', 'Guatauba', 'A Thorn For Every Heart',
       'A Thousand Times Repent', 'A Textbook Tragedy', 'A TiRO',
       'A Traitor Like Judas', 'A Tribe Called Quest',
       'Original Soundtrack', 'A Toys Orchestra',
       'A Trillion Barnacle Lapse', 'A Trunk Full of Dead Bodies',
       'A Vain Attempt', 'A Turma Do Balão Mágico', 'A Verse Unsung',
       'A Well Thought Tragedy', 'A Voice Like Rhetoric', 'A Weather',
       'A Week In July', 'A$AP Ferg', 'A Whisper in the Noise',
       'James Newton Howard', 'A Wilhelm Scream', 'A-bros', 'a-ha',
       'A$AP Rocky', 'Smoke DZA', 'Lloyd Banks', 'A-Wax',
       'A tirador láser'], dtype=object)

In [251]:
# Keep only the two hand-picked artists, then drop rows with missing values
# and exact duplicate rows.
sample_artists_set = (
    df.loc[df['artist'].isin(['A$AP Ferg', 'a-ha'])]
    .dropna()
    .drop_duplicates()
)

### preprocessing ideas

In [252]:
# remove lyrics that are not english
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default (short or ambiguous texts can be
# classified differently on each run). Pin the seed so the same rows are kept
# every time the notebook is re-executed.
DetectorFactory.seed = 0

# assign() builds a new frame instead of writing a column into a frame that may
# itself be a filtered selection (this cell overwrites its own input on re-run).
sample_artists_set = sample_artists_set.assign(
    lang=sample_artists_set['lyrics'].apply(detect)
)
sample_artists_set = sample_artists_set[sample_artists_set['lang']=='en']
sample_artists_set

Unnamed: 0,artist,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration,time_signature,lyrics,lang
225,A$AP Ferg,Let It Go,0.785,0.778,9,-6.003,0,0.1650,0.2470,0.00000,0.6520,0.769,119.976,282640,4,Let It Go Lyrics[Intro: A$AP Yams]\nTrap Lord ...,en
226,A$AP Ferg,Shabba (feat. A$AP Rocky),0.869,0.712,9,-6.136,1,0.1030,0.0715,0.00000,0.2720,0.435,120.000,275867,4,This Is A$AP Ferg Lyrics1. A$AP Ferg- Move Ya ...,en
227,A$AP Ferg,Lord (feat. Bone Thugs-n-Harmony),0.730,0.890,7,-6.410,1,0.1680,0.4030,0.00000,0.6110,0.446,125.405,317187,4,Lord Lyrics[Produced by: Ozhora Miyagi & Cryst...,en
228,A$AP Ferg,Hood Pope,0.731,0.631,9,-7.006,1,0.1550,0.1060,0.00000,0.2200,0.513,131.977,210747,4,"Hood Pope Lyrics[Chorus]\nOhh, let me sing my ...",en
229,A$AP Ferg,Fergivicious,0.859,0.842,6,-5.170,0,0.1960,0.1710,0.00000,0.1650,0.610,125.981,230520,4,"Fergivicious Lyrics[Intro]\nYeah, a lot of you...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,a-ha,Sunny Mystery,0.612,0.637,9,-8.012,0,0.0251,0.0189,0.22600,0.1010,0.273,135.005,210627,4,Sunny Mystery LyricsYou can run through every ...,en
453,a-ha,Start The Simulator,0.491,0.633,9,-6.352,1,0.0249,0.4080,0.00967,0.1100,0.172,95.945,317160,3,Start the Simulator LyricsStart the simulator\...,en
454,a-ha,Case Closed On Silver Shore,0.655,0.707,11,-8.707,1,0.0290,0.1350,0.06960,0.0647,0.515,100.069,268973,4,Case Closed On Silver Shore LyricsHere's a sta...,en
455,a-ha,Lifelines - Demo,0.632,0.400,7,-9.238,1,0.0252,0.2000,0.00291,0.1150,0.174,77.997,316987,4,Lifelines (Demo) LyricsOne time to know that i...,en


In [253]:
# Series of raw lyrics text; it keeps the row labels of sample_artists_set,
# which are used further down as song ids.
lyrics_set = sample_artists_set.loc[:, 'lyrics']
lyrics_set

225    Let It Go Lyrics[Intro: A$AP Yams]\nTrap Lord ...
226    This Is A$AP Ferg Lyrics1. A$AP Ferg- Move Ya ...
227    Lord Lyrics[Produced by: Ozhora Miyagi & Cryst...
228    Hood Pope Lyrics[Chorus]\nOhh, let me sing my ...
229    Fergivicious Lyrics[Intro]\nYeah, a lot of you...
                             ...                        
452    Sunny Mystery LyricsYou can run through every ...
453    Start the Simulator LyricsStart the simulator\...
454    Case Closed On Silver Shore LyricsHere's a sta...
455    Lifelines (Demo) LyricsOne time to know that i...
456    Summer Moved On (Remix) LyricsSummer moved on\...
Name: lyrics, Length: 120, dtype: object

### Embedding generation

In [250]:
def embeddings_gen(lyrics_set, model_name = "all-distilroberta-v1"):
    """Embed every lyrics line and record which song each line belongs to.

    Each song's lyrics are split on newline characters. A song is left out
    entirely (not just the offending line) as soon as any one of its lines
    tokenizes, via nltk ``word_tokenize``, to 512 tokens or more.

    Args:
        lyrics_set: pandas Series of lyrics strings; its index labels are
            used as song ids.
        model_name: model name handed to ``SentenceTransformer``.

    Returns:
        A tuple ``(embeddings, arr_song_idx, arr_lyrics_idx)`` where
        ``embeddings`` is the result of ``model.encode`` on all kept lines,
        ``arr_song_idx`` is a NumPy array holding the song id of each line and
        ``arr_lyrics_idx`` is a NumPy array holding the line texts. All three
        are ordered identically, one entry per kept lyrics line.
    """
    # Create model
    encoder = SentenceTransformer(model_name)

    # Flat bag of lyrics lines plus, in parallel, the id of the owning song
    all_lines = []
    owning_song_ids = []
    for song_id in lyrics_set.index:
        song_lines = re.split('\n', lyrics_set.loc[song_id])
        has_overlong_line = any(len(word_tokenize(line)) >= 512 for line in song_lines)
        if not has_overlong_line:
            all_lines += song_lines
            owning_song_ids += [song_id] * len(song_lines)

    # Inverted index: position of a line -> id of its song
    arr_song_idx = np.array(owning_song_ids)
    # Position of a line -> text of that line
    arr_lyrics_idx = np.array(all_lines)

    embeddings = encoder.encode(all_lines, convert_to_tensor=True)

    return embeddings, arr_song_idx, arr_lyrics_idx

# Build the line embeddings together with the parallel song-id and line-text arrays
embeddings, arr_song_idx, arr_lyrics_idx = embeddings_gen(lyrics_set=lyrics_set)

### Ranking Generation

In [216]:
# Helper functions to main ranking function

# Get closest lyrics lines matches from user text input
def text_get_similar_lyrics_lines(user_text_input, embeddings, lyrics_set, model_name = "all-distilroberta-v1", top_k = 100, lyrics_lines = None):
    """Return the lyrics lines most similar to a free-text query.

    Args:
        user_text_input: query text to embed.
        embeddings: corpus embeddings, one per lyrics line, to search in.
        lyrics_set: not used by this function; kept so existing calls keep
            working.
        model_name: model name handed to ``SentenceTransformer``. It should be
            the same model that produced ``embeddings``.
        top_k: maximum number of lines to return (previously hard-coded to 100).
        lyrics_lines: array of line texts ordered like ``embeddings``. When
            omitted, the notebook-level ``arr_lyrics_idx`` is used, which is
            what this function always did before.

    Returns:
        DataFrame with columns ``lyrics_id`` (position of the line in the
        corpus), ``score`` (cosine similarity) and ``lyrics_line`` (line text).
    """
    if lyrics_lines is None:
        # Backward-compatible fallback to the global produced by embeddings_gen().
        # Pass lyrics_lines explicitly to avoid depending on notebook state.
        lyrics_lines = arr_lyrics_idx

    model = SentenceTransformer(model_name)
    input_emb = model.encode(user_text_input, convert_to_tensor=True)
    res_cos_sim = util.semantic_search(input_emb, embeddings, score_function=util.cos_sim, top_k=top_k)

    # Naming the columns up front keeps the frame well-formed even when the
    # search yields no hits (otherwise 'lyrics_id' would be missing below).
    res_df = pd.DataFrame(res_cos_sim[0], columns=['corpus_id', 'score'])
    res_df = res_df.rename(columns={'corpus_id': 'lyrics_id'})
    line_positions = res_df['lyrics_id'].to_numpy(dtype=int)
    res_df['lyrics_line'] = np.asarray(lyrics_lines)[line_positions]
    return res_df

# For invert indexing // Look up ids of corresponding songs
def lyrics_id_mapping(res_df, arr_lyrics_idx):
    """Add a ``song_idx`` column by looking up each ``lyrics_id`` in an array.

    Args:
        res_df: DataFrame with a ``lyrics_id`` column holding positions into
            ``arr_lyrics_idx``. It is modified in place.
        arr_lyrics_idx: NumPy array indexed by lyrics-line position. Despite
            the name, the caller in this notebook passes the song-id array
            (``arr_song_idx``), so the looked-up values are song ids.

    Returns:
        The same ``res_df`` object, now carrying the ``song_idx`` column.
    """
    line_positions = res_df['lyrics_id'].to_numpy().astype(int)
    res_df['song_idx'] = arr_lyrics_idx[line_positions]
    return res_df

# Suppress utterances which have low similarity scores
def score_low_sim_weighting(df, threshold = 0.9, weight_low_sim = 1):
    """Add a ``score_weighted`` column that scales down low-similarity scores.

    Scores strictly below ``threshold`` are multiplied by ``weight_low_sim``;
    all other scores are copied unchanged. With the default weight of 1 the
    new column equals ``score``.

    Args:
        df: DataFrame with a numeric ``score`` column. It is modified in place.
        threshold: scores below this value are re-weighted.
        weight_low_sim: multiplier applied to the low scores.

    Returns:
        The same ``df`` object, now carrying the ``score_weighted`` column.
    """
    scores = df['score']
    df['score_weighted'] = scores.mask(scores < threshold, scores * weight_low_sim)
    return df

# Re-rank on songs level based on average lyrics line scores
def songs_ranking(df_results_lyrics_mapped):
    """Rank songs by the mean weighted score of their matched lyrics lines.

    Args:
        df_results_lyrics_mapped: DataFrame with ``song_idx`` and
            ``score_weighted`` columns, one row per matched lyrics line.

    Returns:
        Series indexed by ``song_idx`` holding the mean ``score_weighted`` per
        song, sorted from highest to lowest.
    """
    return (
        df_results_lyrics_mapped
        .groupby('song_idx')['score_weighted']
        .mean()
        .sort_values(ascending=False)
    )

# Combine songs information to ranked songs
def combine_songs_info(s_songs_ranking, sample_artists_set, results_limit = 10):
    """Attach artist and title to the ranked songs and keep the top results.

    Args:
        s_songs_ranking: Series of song scores indexed by song id, already
            sorted best-first.
        sample_artists_set: DataFrame of songs indexed by song id, with at
            least ``artist`` and ``title`` columns.
        results_limit: maximum number of songs to return. ``None`` returns
            every ranked song.

    Returns:
        DataFrame with columns ``artist``, ``title`` and ``score``, in ranking
        order, with at most ``results_limit`` rows.
    """
    # filter() selects the ranked songs in the order given by `items`;
    # assign() then aligns the scores on the song-id index and returns a new
    # frame, leaving the caller's frame untouched.
    df_songs_candidates = (
        sample_artists_set
        .filter(items=s_songs_ranking.index, axis=0)
        .assign(score=s_songs_ranking)
    )
    # Honour results_limit: the slice used to be hard-coded to 10.
    res_df = df_songs_candidates[['artist', 'title', 'score']].iloc[:results_limit]
    return res_df

In [219]:
# Overall function to generate songs ranking based on lyrics lines semantic textual similarity 
def similar_songs_ranked(user_input, embeddings, sample_artists_set, lyrics_set, arr_song_idx):
    """Run the full pipeline: query text -> similar lines -> ranked songs.

    Args:
        user_input: free-text query.
        embeddings: lyrics-line embeddings to search in.
        sample_artists_set: DataFrame of songs used to look up artist/title.
        lyrics_set: forwarded to ``text_get_similar_lyrics_lines``.
        arr_song_idx: array giving the song id of each lyrics line.

    Returns:
        A tuple ``(df_results_songs, df_results_lyrics_mapped)``: the ranked
        songs, and the per-line matches with their song ids and weighted scores.
    """
    # Line-level matches, each tagged with the song it belongs to
    line_hits = lyrics_id_mapping(
        text_get_similar_lyrics_lines(user_input, embeddings, lyrics_set),
        arr_song_idx,
    )
    line_hits = score_low_sim_weighting(line_hits)
    # Aggregate to song level and decorate with artist/title
    ranked_songs = combine_songs_info(songs_ranking(line_hits), sample_artists_set)
    return ranked_songs, line_hits

In [227]:
# Example query: rank the sample songs against a short free-text sentence
user_input = "I am happy today"

df_results_songs, df_results_lyrics_mapped = similar_songs_ranked(
    user_input=user_input,
    embeddings=embeddings,
    sample_artists_set=sample_artists_set,
    lyrics_set=lyrics_set,
    arr_song_idx=arr_song_idx,
)
df_results_songs

Unnamed: 0,artist,title,score
407,a-ha,Between Your Mama and Yourself,0.489485
405,a-ha,How Sweet It Was,0.46608
318,a-ha,And You Tell Me,0.434451
451,a-ha,Mother Nature Goes to Heaven,0.412798
415,a-ha,Thought That It Was You,0.406768
312,a-ha,Take on Me,0.399958
236,A$AP Ferg,Cocaine Castle,0.398855
418,a-ha,I Won't Forget Her,0.363266
392,a-ha,Slender Frame,0.362153
393,a-ha,East of the Sun,0.361501


In [226]:
# Helper function to support getting songs/ lyrics results

# Look up relevant lyrics lines and their similarity scores
def lyrics_scores_lookup(song_id, df_results_lyrics_mapped):
    """Return the matched lyrics lines of one song, best score first.

    Args:
        song_id: value to match against the ``song_idx`` column.
        df_results_lyrics_mapped: DataFrame with ``song_idx``, ``lyrics_line``
            and ``score`` columns.

    Returns:
        DataFrame with columns ``lyrics_line`` and ``score`` for the rows of
        that song, sorted by ``score`` in descending order.
    """
    belongs_to_song = df_results_lyrics_mapped['song_idx'] == song_id
    song_lines = df_results_lyrics_mapped.loc[belongs_to_song, ['lyrics_line', 'score']]
    return song_lines.sort_values(by=['score'], ascending=False)

In [228]:
# Generate output on both songs and lyrics level, as a list of dictionaries
def similar_songs_lyrics_ranked(df_results_song, df_results_lyrics_mapped):
    """Build a song-plus-lyrics result list, as a list of dictionaries.

    Args:
        df_results_song: DataFrame of ranked songs indexed by song id, with
            ``title``, ``artist`` and ``score`` columns.
        df_results_lyrics_mapped: DataFrame of matched lyrics lines, passed on
            to ``lyrics_scores_lookup`` for every song.

    Returns:
        A list with one dict per song, in the order of ``df_results_song``,
        with keys ``artist``, ``song title``, ``song_score`` and
        ``lyrics_scores`` (a dict mapping lyrics line -> similarity score).
    """
    result_list = []

    # Iterate over the frame that was passed in. The body used to read the
    # notebook-level `df_results_songs`, so the argument was silently ignored.
    for song_id in df_results_song.index:
        song_title = df_results_song.loc[song_id, 'title']
        song_artist = df_results_song.loc[song_id, 'artist']
        song_score = df_results_song.loc[song_id, 'score']
        df_lyrics_scores = lyrics_scores_lookup(song_id, df_results_lyrics_mapped)
        d_lyrics = dict(zip(df_lyrics_scores['lyrics_line'], df_lyrics_scores['score']))
        dict_object = {"artist":song_artist, "song title":song_title, "song_score":song_score, "lyrics_scores":d_lyrics}
        result_list.append(dict_object)

    return result_list

# Show the song-level ranking together with the matching lyrics lines per song
similar_songs_lyrics_ranked(
    df_results_song=df_results_songs,
    df_results_lyrics_mapped=df_results_lyrics_mapped,
)

[{'artist': 'a-ha',
  'song title': 'Between Your Mama and Yourself',
  'song_score': 0.489484965801239,
  'lyrics_scores': {"And today's that day": 0.489484965801239}},
 {'artist': 'a-ha',
  'song title': 'How Sweet It Was',
  'song_score': 0.46608029305934906,
  'lyrics_scores': {'We were so happy now': 0.48340049386024475,
   'We were so happy': 0.44876009225845337}},
 {'artist': 'a-ha',
  'song title': 'And You Tell Me',
  'song_score': 0.4344506859779358,
  'lyrics_scores': {'And tomorrow is the day': 0.4344506859779358}},
 {'artist': 'a-ha',
  'song title': 'Mother Nature Goes to Heaven',
  'song_score': 0.41279757022857666,
  'lyrics_scores': {'And there will be no sadder day': 0.41279757022857666}},
 {'artist': 'a-ha',
  'song title': 'Thought That It Was You',
  'song_score': 0.4067680537700653,
  'lyrics_scores': {'You found me in your joy': 0.4067680537700653}},
 {'artist': 'a-ha',
  'song title': 'Take on Me',
  'song_score': 0.39995843172073364,
  'lyrics_scores': {"Today'