In [1]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling that respects the attention mask, so padded positions do not skew the average
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out tokens.

    Args:
        model_output: Indexable model output whose first element holds all
            token embeddings.
        attention_mask: Mask tensor with one entry per token; it is given a
            trailing dimension and expanded to the embeddings' shape.

    Returns:
        The masked sum of the token embeddings over dimension 1, divided by
        the per-feature mask total over that same dimension.
    """
    embeddings = model_output[0]

    # Stretch the mask to the embeddings' shape so it can be applied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    masked_total = (embeddings * mask).sum(1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_count


# Example sentences to embed (kept under the name `lyrics`, which later cells use)
lyrics = [
    'That is a happy person',
    'That is a happy dog',
    'That is a very happy person',
    'Today is a sunny day',
]

# HuggingFace Hub identifier of the pretrained checkpoint
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"

# Fetch the tokenizer and the model for that checkpoint from the Hub
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Turn the sentences into padded / truncated PyTorch tensors
encoded_input = tokenizer(
    lyrics,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass with gradient tracking switched off
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling, then L2-normalise each pooled vector along dim 1
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)



In [4]:
# Convert the embeddings tensor to a NumPy array in one call instead of letting
# np.vstack iterate over the tensor row by row. detach()/cpu() keep the conversion
# valid for tensors that track gradients or live on another device.
matrices_1d = sentence_embeddings.detach().cpu().numpy()

# L2 norm of every row
norm_vec = np.linalg.norm(matrices_1d, ord=2, axis=1)

# Pairwise cosine similarity: row dot products divided by the product of the row norms
cos_sim = matrices_1d.dot(matrices_1d.T) / np.outer(norm_vec, norm_vec)
cos_sim

array([[1.        , 0.69457746, 0.94291484, 0.25687602],
       [0.69457746, 1.0000001 , 0.6210502 , 0.24906288],
       [0.94291484, 0.6210502 , 1.        , 0.2106153 ],
       [0.25687602, 0.24906288, 0.2106153 , 1.0000002 ]], dtype=float32)

In [5]:
# Display the input sentences (presumably to read the rows/columns of cos_sim against them)
lyrics

['That is a happy person',
 'That is a happy dog',
 'That is a very happy person',
 'Today is a sunny day']

In [6]:
# Position in `lyrics` of the query entry passed to get_most_similar_top_n_songs below
song_index = 0
def get_most_similar_top_n_songs(cosine_similarity_matrix, lyrics, song_index, top_n):
  """Return the query song together with the ``top_n`` songs most similar to it.

  The query song is taken directly from ``lyrics[song_index]`` and is excluded
  from the neighbours by its index. It is not assumed to rank first, because
  duplicate lyrics or floating-point rounding can make another entry score
  at least as high as the query's self-similarity.

  Args:
    cosine_similarity_matrix: Indexable matrix whose row ``song_index`` holds
      the similarity of the query song to every song in ``lyrics``.
    lyrics: Sequence of songs, indexed consistently with the matrix.
    song_index: Index of the query song (negative indices count from the end).
    top_n: Maximum number of similar songs to return; values below 1 yield
      an empty list.

  Returns:
    A tuple ``(song, similar_songs)`` where ``similar_songs`` is a list of at
    most ``top_n`` entries ordered from most to least similar.
  """
  song = lyrics[song_index]
  similarity_array = np.asarray(cosine_similarity_matrix[song_index])

  # Resolve a negative index so it can be compared against argsort positions.
  song_count = len(similarity_array)
  query_index = song_index + song_count if song_index < 0 else song_index

  # Rank every song from most to least similar, then drop the query itself.
  ranked_indices = np.argsort(similarity_array)[::-1]
  neighbour_indices = [int(index) for index in ranked_indices if index != query_index]

  similar_songs = [lyrics[index] for index in neighbour_indices[:max(top_n, 0)]]
  return song, similar_songs

# Look up the single most similar entry (top_n=1) for the entry at song_index
get_most_similar_top_n_songs(cosine_similarity_matrix=cos_sim, lyrics=lyrics, song_index=song_index, top_n=1)

('That is a happy person', ['That is a very happy person'])