In [1]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np

In [3]:
df_all_lyrics_with_lang = pd.read_csv('df_all_lyrics_with_lang.csv', sep='|')
df_all_lyrics_with_lang.tail()

Unnamed: 0,Artist,Title,Lyric,lang
5606,Khalid,Young dumb,so you're still thinking of me just like i kno...,en
5607,Khalid,Khalid - Vertigo (Tradução Português),será que é melhor apenas acreditar nas teorias...,pt
5608,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...,en
5609,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...,en
5610,Khalid,Better (Rennie! Remix),love to see you shine in the night like the di...,en


In [4]:
! head -5 df_all_lyrics_with_lang.csv

Artist|Title|Lyric|lang
Dua Lipa|New Rules|one one one one one   talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times  refrain my love he makes me feel like nobody else nobody else but my love he doesn't love me so i tell myself i tell myself  pre one don't pick up the phone you know he's only callin' 'cause he's drunk and alone two don't let him in you'll have to kick him out again three don't be his friend you know you're gonna wake up in his bed in the morning and if you're under him you ain't gettin' over him   i got new rules i count 'em i got new rules i count 'em i gotta tell them to myself i got new rules i count 'em i gotta tell them to myself   i keep pushin' forwards but he keeps pullin' me backwards nowhere to turn no way nowhere to turn no now i'm standin' back from it i finally see the pattern i never learn i never learn  refrain but my love he doesn't love me so 

In [5]:
df_all_lyrics_english = df_all_lyrics_with_lang[df_all_lyrics_with_lang['lang']=='en'].reset_index(drop=True)
df_all_lyrics_english.tail()

Unnamed: 0,Artist,Title,Lyric,lang
4679,Khalid,Better (noclue? Remix),love to see you shine in the night like the di...,en
4680,Khalid,Young dumb,so you're still thinking of me just like i kno...,en
4681,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...,en
4682,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...,en
4683,Khalid,Better (Rennie! Remix),love to see you shine in the night like the di...,en


In [6]:
lyrics = list(df_all_lyrics_english['Lyric'])
song_names = list(df_all_lyrics_english['Title'])

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)


In [8]:
def generate_batch(lst, batch_size):
  for i in range(0, len(lst), batch_size):
    yield lst[i : i + batch_size]

In [9]:
def get_batch_embedding(lyrics_batch, tokenizer, model, mean_pooling):
  # Tokenize sentences
  encoded_input = tokenizer(lyrics_batch, padding=True, truncation=True, return_tensors='pt')

  # Compute token embeddings
  with torch.no_grad():
      model_output = model(**encoded_input)

  # Perform pooling
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # Normalize embeddings
  return F.normalize(sentence_embeddings, p=2, dim=1)

In [10]:
batch_size = 50
input_batches = generate_batch(lyrics, batch_size)
first_batch = next(input_batches)
embeddings = get_batch_embedding(first_batch, tokenizer, model, mean_pooling)

In [11]:
embeddings.size()

torch.Size([50, 384])

In [12]:
for batch in input_batches:
  batch_embeddings = get_batch_embedding(batch, tokenizer, model, mean_pooling)
  embeddings = torch.cat((batch_embeddings, embeddings), 0)


In [13]:
embeddings.size()

torch.Size([4684, 384])

In [14]:
def nxn_cos_sim(A, dim=1, eps=1e-8):
  numerator = A @ A.T
  A_l2 = torch.mul(A, A).sum(axis=dim)
  denominator = torch.max(torch.sqrt(torch.outer(A_l2, A_l2)), torch.tensor(eps))
  return torch.div(numerator, denominator)

cos_sim = nxn_cos_sim(embeddings)
similar_songs_indices = torch.argsort(cos_sim, dim=1, descending=True)

In [15]:
def get_most_similar_top_n_songs(similar_songs_indices, song_names, song_index, top_n):
  top_n_most_similar_songs_indices = similar_songs_indices[song_index][:(top_n+1)]
  songs = [song_names[index] for index in top_n_most_similar_songs_indices]
  return songs[0], songs[1:]


In [16]:
song_index = 580
top_n=5
get_most_similar_top_n_songs(similar_songs_indices, song_names, song_index, top_n)

('Cameras',
 ['Toosie Slide',
  'Charged Up',
  'Do It Now',
  'Successful',
  'Open Letter Critiquing Sexism'])

In [17]:
df_all_lyrics_english['similar_songs_indices'] = similar_songs_indices.numpy().tolist()

In [25]:
df_all_lyrics_english.to_parquet('df_similar_lyrics.parquet.gzip',compression='gzip')  

In [26]:
df_similar_lyrics = pd.read_parquet('df_similar_lyrics.parquet.gzip')
df_similar_lyrics.tail()

Unnamed: 0,Artist,Title,Lyric,lang,similar_songs_indices
4679,Khalid,Better (noclue? Remix),love to see you shine in the night like the di...,en,"[4679, 253, 154, 3992, 167, 3614, 2452, 3395, ..."
4680,Khalid,Young dumb,so you're still thinking of me just like i kno...,en,"[4680, 4630, 4657, 4509, 1811, 2247, 4292, 340..."
4681,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...,en,"[4681, 4556, 2799, 3078, 4271, 2067, 4193, 441..."
4682,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...,en,"[4682, 4622, 1831, 4628, 4668, 2389, 4417, 458..."
4683,Khalid,Better (Rennie! Remix),love to see you shine in the night like the di...,en,"[4683, 0, 3340, 4288, 4456, 4069, 4292, 4325, ..."
