In [1]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.0 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 56.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 31.6 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 53.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, sentencepiece
Successfully installed huggingface-hub-0.9.1 sentencepiece-0.1.97 tokenizers

In [2]:
import pandas as pd
import numpy as np

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings (summed over dimension 1 here, i.e. the token axis).
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension, so positions with value 0 contribute nothing.

    Returns:
        One pooled embedding per sequence: the mask-weighted sum of the token
        embeddings divided by the mask total (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the per-token mask across the embedding dimension
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_sum / mask_total


# Hugging Face Hub identifier of the pretrained checkpoint; the same id is used for
# both the tokenizer and the model so that they match each other.
checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
# Load tokenizer and model from the Hugging Face Hub
# (files are downloaded on first use, as the progress bars in the cell output show)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [4]:
# Input data: tab-separated file with (at least) the columns 'Lyrics' and 'Song'.
# The file can be found in the directory `resources/intermediate`; it is read here
# from the current working directory.
lyrics_df = pd.read_csv('sample_songs_df.tsv', delimiter='\t')
# Parallel lists: lyrics[i] is the text of the song named song_names[i]
lyrics, song_names = (list(lyrics_df[column]) for column in ('Lyrics', 'Song'))

In [5]:
## Optional smoke test: uncomment the lines below to embed these sample sentences instead of the song lyrics
# lyrics = ['That is a happy person', 'That is a happy dog', 'That is a very happy person', 'Today is a sunny day']
# song_names = lyrics

In [6]:
# Tokenize all lyrics as one padded batch of PyTorch tensors.
# NOTE: truncation=True silently cuts lyrics that exceed the tokenizer's length limit.
encoded_input = tokenizer(lyrics, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking (inference only)
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling over tokens, followed by L2 normalization of each embedding
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

In [7]:
# Stack the per-song embedding rows into a single 2-D NumPy array (one row per song)
matrices_1d = np.vstack(sentence_embeddings)
# L2 norm of every row. The embeddings were already L2-normalized with F.normalize in the
# previous cell, so these norms are ~1 and the division below acts as a safeguard.
norm_vec = np.linalg.norm(matrices_1d , ord=2, axis=1)
# Pairwise cosine similarity: dot product of every pair of rows divided by the product of their norms
cos_sim = matrices_1d .dot(matrices_1d .T) / np.outer(norm_vec ,norm_vec)
# Display the square, symmetric matrix; entry [i, j] compares song i with song j
cos_sim

array([[1.        , 0.535689  , 0.57074857, 0.23440118, 0.4339943 ,
        0.5735296 ],
       [0.535689  , 0.9999999 , 0.64875335, 0.28850615, 0.40718693,
        0.599264  ],
       [0.57074857, 0.64875335, 1.0000001 , 0.28119418, 0.46055537,
        0.60056293],
       [0.23440118, 0.28850615, 0.28119418, 1.        , 0.18446171,
        0.28805023],
       [0.4339943 , 0.40718693, 0.46055537, 0.18446171, 1.0000001 ,
        0.38801754],
       [0.5735296 , 0.599264  , 0.60056293, 0.28805023, 0.38801754,
        0.99999994]], dtype=float32)

In [8]:
# Song titles in data order; position i corresponds to row/column i of cos_sim
song_names

['I Was Made For Loving You',
 'Africa',
 'Bitter Sweet Symphony',
 'Wishlist',
 'We Are All Made Of Stars',
 'Everything I Do I Do It For You']

In [9]:
def get_most_similar_top_n_songs(cosine_similarity_matrix, song_names, lyrics, song_index, top_n):
  """Return the query song's title and the titles of its `top_n` most similar songs.

  The query song is excluded from the neighbours by its index rather than by
  assuming it sorts first: floating-point rounding can leave its self-similarity
  slightly below 1, so a song with (near-)identical lyrics may rank above it.

  Args:
    cosine_similarity_matrix: Square matrix; row i holds the similarity of
      song i to every song.
    song_names: Titles, in the same order as the matrix rows/columns.
    lyrics: Unused; kept so existing callers that pass it keep working.
    song_index: Index of the query song (negative indices count from the end).
    top_n: Number of neighbours to return; values <= 0 give an empty list, and
      values larger than the number of other songs return all of them.

  Returns:
    Tuple `(query_title, neighbour_titles)` with the neighbours ordered from
    most to least similar.
  """
  similarity_array = np.asarray(cosine_similarity_matrix[song_index])
  # Normalize negative indices so the self-exclusion mask below matches the right position
  query_index = song_index % len(similarity_array)
  # All song indices ordered from most to least similar
  ranked_indices = np.argsort(similarity_array)[::-1]
  neighbour_indices = ranked_indices[ranked_indices != query_index][:max(top_n, 0)]
  return song_names[query_index], [song_names[index] for index in neighbour_indices]


In [10]:
# Query song, given as its position in song_names
song_index = 3
# Show the query title together with its three closest songs by lyric similarity
get_most_similar_top_n_songs(
    cosine_similarity_matrix=cos_sim,
    song_names=song_names,
    lyrics=lyrics,
    song_index=song_index,
    top_n=3,
)

('Wishlist',
 ['Africa', 'Everything I Do I Do It For You', 'Bitter Sweet Symphony'])