## Setup Environment

In [12]:
# # we can remove this later 
# !pip install pandas \
#             nltk \
#             gensim \
#             scikit-learn \
#             numpy

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.2 threadpoolctl-3.2.0


In [2]:
import pandas as pd
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
import numpy as np

## Load dataset of songs

Dataset: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs, an open source dataset on Kaggle. It provides nearly 1.2 million of songs in Spotify. Those songs were retreived by using Spotify API.

In [78]:
file_path = '../tracks_features.csv'
df = pd.read_csv(file_path)
print(df.head())

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

## Preprocessing data

We want to perform some operations to select the numeric audio features we want, and also convert those categorical values into numeric one to create the vector embeddings.
The selected features includes:
- id (not sure if we need this?)
- name
- artists
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- duration_ms
- time_signature
- year (do we want this?)

In [79]:
selected_features = df.drop(columns=["album", "album_id", "artist_ids", "track_number", "disc_number", "explicit", "release_date"])
print(selected_features.head())

                       id                   name  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire   

                        artists  danceability  energy  key  loudness  mode  \
0  ['Rage Against The Machine']         0.470   0.978    7    -5.399     1   
1  ['Rage Against The Machine']         0.599   0.957   11    -5.764     1   
2  ['Rage Against The Machine']         0.315   0.970    7    -5.424     1   
3  ['Rage Against The Machine']         0.440   0.967   11    -5.830     0   
4  ['Rage Against The Machine']         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.

In [98]:
# check if our filted features contains any missing value
selected_features.isna().any()

id                  False
name                False
artists             False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
year                False
dtype: bool

In [81]:
print("Shape before drop NA: ", selected_features.shape)
selected_features = selected_features.dropna()
print("Shape before after NA: ", selected_features.shape)

Shape before drop NA:  (1204025, 17)
Shape before after NA:  (1204022, 17)


Some songs have multiple artists, we want to convert them from a list to string.
Example: ['Pietro Locatelli', 'Capella Istropolitana', 'Jaroslav Krcek'] to 'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [99]:
def convert_artists_name(artists_list):
    items_list = artists_list.strip("[]").replace("'", "").split(", ")
    return ", ".join(items_list)

selected_features["artists"] = selected_features["artists"].apply(convert_artists_name)
selected_features.iloc[1184]["artists"]

'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [100]:
# remove duplicated rows by song name and artists name
print("Shape before duplicated removal: ", selected_features.shape)
selected_features = selected_features.drop_duplicates(subset=['name', 'artists'])
print("Shape after duplicated removal: ", selected_features.shape)

Shape before duplicated removal:  (1141552, 17)
Shape after duplicated removal:  (1141552, 17)


In [101]:
print(selected_features.head())
print(selected_features.tail())

                       id                   name                   artists  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  Rage Against The Machine   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  Rage Against The Machine   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  Rage Against The Machine   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  Rage Against The Machine   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  Rage Against The Machine   

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.470   0.978    7    -5.399     1       0.0727       0.02610   
1         0.599   0.957   11    -5.764     1       0.1880       0.01290   
2         0.315   0.970    7    -5.424     1       0.4830       0.02340   
3         0.440   0.967   11    -5.830     0       0.2370       0.16300   
4         0.426   0.929    2    -6.729     1       0.0701       0.00162   

   instrumentalness  liveness  valence    tempo  duration_ms  time_signature  \


## Create vectors/embeddings

We first need to convert those song and artists name into vector. The converted vector representation will have length of 14, so we can combine these with 14 numeric column values. We will combine the song name with artists name to one column for better tokenize

In [102]:
selected_features['string_summary'] = selected_features['name'] + ' - ' + selected_features['artists']
selected_features['string_summary'] = selected_features['string_summary'].astype(str)

# Drop the original 'name' and 'artists' columns
selected_features.drop(['name', 'artists'], axis=1, inplace=True)
print(selected_features.head())

                       id  danceability  energy  key  loudness  mode  \
0  7lmeHLHBe4nmXzuXc0HDjk         0.470   0.978    7    -5.399     1   
1  1wsRitfRRtWyEapl0q22o8         0.599   0.957   11    -5.764     1   
2  1hR0fIFK2qRG3f3RF70pb7         0.315   0.970    7    -5.424     1   
3  2lbASgTSoDO7MTuLAXlTW0         0.440   0.967   11    -5.830     0   
4  1MQTmpYOZ6fcMQc56Hdo7T         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.489  103.680   
2       0.4830       0.02340          0.000002    0.1220    0.370  149.749   
3       0.2370       0.16300          0.000004    0.1210    0.574   96.752   
4       0.0701       0.00162          0.105000    0.0789    0.539  127.059   

   duration_ms  time_signature  year  \
0       210133             4.0  1999   
1       206200    

In [103]:
# Convert string summaries to lowercase and then tokenize
selected_features['tokenized_summary'] = selected_features['string_summary'].apply(lambda x: word_tokenize(x.lower()))

In [104]:
# Define Word2Vec model parameters (may adjust later)
vector_size = 14
window_size = 5
min_count = 1

# Train Word2Vec model
word2vec_model = Word2Vec(selected_features['tokenized_summary'], vector_size=vector_size, window=window_size, min_count=min_count)

In [105]:
# Convert string summaries to vectors
def get_summary_vector(summary, model):
    summary_vector = [model.wv[word] for word in summary if word in model.wv]
    return sum(summary_vector) / len(summary_vector) if summary_vector else [0] * vector_size

summary_vector = selected_features['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))

In [106]:
selected_features.drop(['string_summary', 'tokenized_summary'], axis=1, inplace=True)
print(summary_vector[0])

[ 0.8646738   1.4013695  -2.0540395   0.9210277   1.6148752   0.6943056
  0.37184477  1.6948727  -1.8480023   0.04738571 -0.07262218  0.3477416
 -2.3059337   1.9146067 ]


The numerical columns are audio characteristics of the song, and we want to scale all the values to make it become the embeddings.

In [29]:
# Extract the numeric columns (excluding 'id' and 'summary_vector')
numeric_columns = selected_features.drop(['id'], axis=1)

# Standardize the numeric columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_columns)

# Display the resulting DataFrame
print(scaled_data[0])

[-0.11476022  1.59348117  0.51038804  0.9231026   0.70123154 -0.10405628
 -1.0974239  -0.76136317  0.8647423   0.28528598  0.01103847 -0.23857505
  0.30032139 -0.70179981]


Finally, we want to merge those summary vector (name & artisits) with scaled vector (audio charactersitcs) to make the embeddings for each song.

In [30]:
song_embeddings = [
    np.concatenate([summary_row, scaled_row])
    for summary_row, scaled_row in zip(summary_vector, scaled_data)
]
print(song_embeddings[0])
print(len(song_embeddings), ", ", len(song_embeddings[0]))

[ 0.72331452  0.47999406 -1.67081606 -1.37377441  1.07354558  1.32752478
 -0.41009608  0.37724844 -0.14633438 -1.82648313  0.62498635 -0.54044133
 -2.98192334  2.15660644 -0.11476022  1.59348117  0.51038804  0.9231026
  0.70123154 -0.10405628 -1.0974239  -0.76136317  0.8647423   0.28528598
  0.01103847 -0.23857505  0.30032139 -0.70179981]
1141555 ,  28


Combining those things into our final table for uploading to Pinecone. The table should have two columns, one is id, and another one is song embeddings representation.

In [57]:
embedded_features = selected_features[["id"]].copy()
embedded_features.loc[:, "values"] = song_embeddings
print(embedded_features.head())
print(embedded_features.shape)

                       id                                             values
0  7lmeHLHBe4nmXzuXc0HDjk  [0.7233145236968994, 0.4799940586090088, -1.67...
1  1wsRitfRRtWyEapl0q22o8  [0.7651256918907166, 0.7351270318031311, -1.51...
2  1hR0fIFK2qRG3f3RF70pb7  [1.1690152883529663, 1.0357924699783325, -1.99...
3  2lbASgTSoDO7MTuLAXlTW0  [0.9501161575317383, 0.7967706918716431, -1.63...
4  1MQTmpYOZ6fcMQc56Hdo7T  [1.0759998559951782, 0.8795831799507141, -1.78...
(1141555, 2)


In [59]:
len(embedded_features['values'][0])

28

## Store embeddings to Pinecone

In [33]:
!pip install -qU \
  "pinecone-client[grpc]"==2.2.1

In [34]:
import os
import pinecone
import time

  from tqdm.autonotebook import tqdm


In [45]:
# PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or 'YOUR_API_KEY'
# PINECONE_ENV = os.environ.get('PINECONE_ENVIRONMENT') or 'YOUR_ENV'

In [51]:
PINECONE_API_KEY = '03367330-5730-4400-ac60-9ab695a047c0'
PINECONE_ENV = 'us-east-1-aws'

In [52]:
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

In [70]:
index_name = 'music-recommender-test'
dim = len(embedded_features['values'][0])

In [71]:
# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='cosine'
    )
    # wait a moment for the index to be fully initialized
    time.sleep(1)

# now connect to the index
index = pinecone.GRPCIndex(index_name)

In [72]:
index.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [77]:
test_data = embedded_features[:10]
store_data = embedded_features[10:]

In [78]:
index.upsert_from_dataframe(store_data, batch_size=1000)

sending upsert requests:   0%|          | 0/1141545 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141545

In [79]:
index.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141545}},
 'total_vector_count': 1141545}

## Search for similar songs

1. Our personal favofite song (feed 1 get top 10)
2. Our listening history (feed 10 get top 10)
3. Spotify 2023 top 100 song (most streamed 1 get 10 top)
4. Spotify 2023 top 100 song (feed 10 get top 10)

Pinecone search metric 

### Prepare Spotify top 100 song data

Get the most streamed songs in 2023 (datasets: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023/data, https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks)

In [3]:
# We are missing loudness information in here, so we need to use another dataset info
file_path_top_songs = '../spotify-2023.csv'
top_songs = pd.read_csv(file_path_top_songs, encoding='latin-1')
list(top_songs.columns)

['track_name',
 'artist(s)_name',
 'artist_count',
 'released_year',
 'released_month',
 'released_day',
 'in_spotify_playlists',
 'in_spotify_charts',
 'streams',
 'in_apple_playlists',
 'in_apple_charts',
 'in_deezer_playlists',
 'in_deezer_charts',
 'in_shazam_charts',
 'bpm',
 'key',
 'mode',
 'danceability_%',
 'valence_%',
 'energy_%',
 'acousticness_%',
 'instrumentalness_%',
 'liveness_%',
 'speechiness_%']

In [60]:
# get top 10 hits songs in 2023, need released year after 2014 and before 2023, so it will not in our original dataset
filtered_songs = top_songs[(top_songs['released_year'] > 2014) & (top_songs['released_year'] < 2023)]
top_10_songs = filtered_songs.sort_values(by = "streams", ascending = False).iloc[:10,:]

# Remove the last song from top_10_songs as the last one did not in other dataset
top_10_songs = top_10_songs.iloc[:-1, :]

# Get the next song in the sorted order
next_song = filtered_songs.sort_values(by="streams", ascending=False).iloc[10:11, :]

# Concatenate top_10_songs and next_song
top_10_songs = pd.concat([top_10_songs, next_song], ignore_index=True)
top_10_songs

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Anti-Hero,Taylor Swift,1,2022,10,21,9082,56,999748277,242,...,97,E,Major,64,51,63,12,0,19,5
1,Arcade,Duncan Laurence,1,2019,3,7,6646,0,991336132,107,...,72,A,Minor,45,27,33,82,0,14,4
2,Glimpse of Us,Joji,1,2022,6,10,6330,6,988515741,109,...,170,G#,Major,44,27,32,89,0,14,5
3,Seek & Destroy,SZA,1,2022,12,9,1007,0,98709329,5,...,152,C#,Major,65,35,65,44,18,21,7
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson,1,2022,7,12,367,0,97610446,28,...,145,G,Major,56,43,53,24,0,12,4
5,Where Are You Now,"Lost Frequencies, Calum Scott",2,2021,7,30,10565,44,972509632,238,...,121,F#,Minor,67,26,64,52,0,17,10
6,Alone,Burna Boy,1,2022,11,4,782,2,96007391,27,...,90,E,Minor,61,32,67,15,0,11,5
7,No Lie,"Sean Paul, Dua Lipa",2,2016,11,18,7370,0,956865266,92,...,102,G,Major,74,45,89,5,0,26,13
8,HEARTBREAK ANNIVERSARY,Giveon,1,2020,2,21,5398,4,951637566,111,...,129,,Major,61,59,46,56,0,13,5
9,Used (feat. Don Toliver),"SZA, Don Toliver",2,2022,12,8,1042,0,94005786,7,...,150,A#,Minor,73,71,69,53,0,32,9


In [73]:
# extract top 10 songs name to search in all songs dataset
top_10_songs_to_search = top_10_songs[['track_name', 'artist(s)_name']]
top_10_songs_to_search = top_10_songs_to_search.rename(columns={'track_name': 'track_name', 'artist(s)_name': 'artist_name'})

# Split 'artists_name' and keep only the first part, because the another dataset only keep one artist
top_10_songs_to_search['artist_name'] = top_10_songs_to_search['artist_name'].str.split(',').str[0]

top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,HEARTBREAK ANNIVERSARY,Giveon
9,Used (feat. Don Toliver),SZA


In [74]:
# manually exchange song name value as they did not same across the dataset
top_10_songs_to_search.loc[4, "track_name"] = "Come Back Home"
top_10_songs_to_search.loc[8, "track_name"] = "Heartbreak Anniversary"
top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,Come Back Home,Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,Heartbreak Anniversary,Giveon
9,Used (feat. Don Toliver),SZA


In [86]:
file_path_all_songs = '../spotify_data.csv'
all_songs = pd.read_csv(file_path_all_songs, index_col = 0)
print(all_songs.head())

     artist_name        track_name                track_id  popularity  year  \
0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6          68  2012   
1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218          50  2012   
2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F          57  2012   
3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz          58  2012   
4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8          54  2012   

      genre  danceability  energy  key  loudness  mode  speechiness  \
0  acoustic         0.483   0.303    4   -10.058     1       0.0429   
1  acoustic         0.572   0.454    3   -10.286     1       0.0258   
2  acoustic         0.409   0.234    3   -13.711     1       0.0323   
3  acoustic         0.392   0.251   10    -9.845     1       0.0363   
4  acoustic         0.430   0.791    6    -5.419     0       0.0302   

   acousticness  instrumentalness  liveness  valence    tempo  duration_ms  \
0        0.694

In [111]:
# check if all songs is in
selected_10_songs = pd.merge(all_songs, top_10_songs_to_search, on=['track_name', 'artist_name'], how='inner')
selected_10_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Sean Paul,No Lie,1Vb4HQnN2kZ5Y2KgYF5TDV,57,2016,dance,0.742,0.882,7,-2.862,1,0.117,0.0466,0.0,0.206,0.463,102.04,221176,4
1,Duncan Laurence,Arcade,1Xi84slp6FryDSCbzq4UCD,77,2019,pop,0.45,0.329,9,-12.603,0,0.0441,0.818,0.00109,0.135,0.266,71.884,183624,3
2,Giveon,Heartbreak Anniversary,3FAJ6O0NOHQV8Mc5Ri6ENp,79,2020,pop,0.449,0.465,0,-8.964,1,0.0791,0.524,1e-06,0.303,0.543,89.087,198371,3
3,Lost Frequencies,Where Are You Now,3uUuGVFu1V7jTQL60S1r8z,84,2021,dance,0.671,0.636,6,-8.117,0,0.103,0.515,0.000411,0.172,0.262,120.966,148197,4
4,Burna Boy,Alone,0AoBY2Y3qs6dtGgOD6c91N,77,2022,dance,0.6,0.659,4,-7.264,0,0.0542,0.176,0.0,0.111,0.307,89.955,221747,4
5,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
6,SZA,Seek & Destroy,6eT2V7nKXyMf47TwPbtgAD,79,2022,pop,0.651,0.647,1,-5.415,1,0.0654,0.437,0.175,0.205,0.345,152.069,203733,4
7,Joji,Glimpse of Us,6xGruZOHLs39ZbVccQTuPZ,85,2022,pop,0.44,0.317,8,-9.258,1,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3
8,SZA,Used (feat. Don Toliver),1TweDM3JC49LNeelLVg3yX,76,2022,pop,0.734,0.689,10,-6.454,0,0.0871,0.532,8.5e-05,0.322,0.705,149.579,70160,4
9,Sofia Carson,Come Back Home,1I4dwH7C0jBAEtz5DjlJgQ,73,2022,pop,0.552,0.531,7,-7.732,1,0.0421,0.241,1.2e-05,0.122,0.438,144.946,176859,4


In [112]:
# format spotify top 10 songs to make sure it have same data format
selected_10_songs = selected_10_songs.drop(columns=["track_id", "popularity", "genre"])
moved_column = selected_10_songs.pop("year")
selected_10_songs["year"] = moved_column

# add the song summary
selected_10_songs['string_summary'] = selected_10_songs['track_name'] + ' - ' + selected_10_songs['artist_name']
selected_10_songs['string_summary'] = selected_10_songs['string_summary'].astype(str)

# Drop the original 'name' and 'artists' columns
selected_10_songs.drop(['track_name', 'artist_name'], axis=1, inplace=True)

selected_10_songs

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,string_summary
0,0.742,0.882,7,-2.862,1,0.117,0.0466,0.0,0.206,0.463,102.04,221176,4,2016,No Lie - Sean Paul
1,0.45,0.329,9,-12.603,0,0.0441,0.818,0.00109,0.135,0.266,71.884,183624,3,2019,Arcade - Duncan Laurence
2,0.449,0.465,0,-8.964,1,0.0791,0.524,1e-06,0.303,0.543,89.087,198371,3,2020,Heartbreak Anniversary - Giveon
3,0.671,0.636,6,-8.117,0,0.103,0.515,0.000411,0.172,0.262,120.966,148197,4,2021,Where Are You Now - Lost Frequencies
4,0.6,0.659,4,-7.264,0,0.0542,0.176,0.0,0.111,0.307,89.955,221747,4,2022,Alone - Burna Boy
5,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4,2022,Anti-Hero - Taylor Swift
6,0.651,0.647,1,-5.415,1,0.0654,0.437,0.175,0.205,0.345,152.069,203733,4,2022,Seek & Destroy - SZA
7,0.44,0.317,8,-9.258,1,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3,2022,Glimpse of Us - Joji
8,0.734,0.689,10,-6.454,0,0.0871,0.532,8.5e-05,0.322,0.705,149.579,70160,4,2022,Used (feat. Don Toliver) - SZA
9,0.552,0.531,7,-7.732,1,0.0421,0.241,1.2e-05,0.122,0.438,144.946,176859,4,2022,Come Back Home - Sofia Carson


In [113]:
selected_10_songs['tokenized_summary'] = selected_10_songs['string_summary'].apply(lambda x: word_tokenize(x.lower()))

In [114]:
top_10_summary_vector = selected_10_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))

In [115]:
selected_10_songs.drop(['string_summary', 'tokenized_summary'], axis=1, inplace=True)
print(top_10_summary_vector[0])

[ 3.255381   -0.21609983 -3.0687718  -1.0209198  -2.378174   -0.44674015
  0.35360897  2.0732138  -1.2961953   0.8578722   0.5954691   1.7431103
 -0.3678921   2.0115974 ]


In [117]:
top_10_songs_scaled = scaler.fit_transform(selected_10_songs)

# Display the resulting DataFrame
print(top_10_songs_scaled[0])

[ 1.3650332   1.84352071  0.4463037   1.91004076  0.81649658  1.93012033
 -1.42607907 -0.33672787  0.28590593  0.35268685 -0.52754349  0.78345199
  0.65465367 -2.54399491]


In [118]:
top_10_song_embeddings = [
    np.concatenate([summary_row, scaled_row])
    for summary_row, scaled_row in zip(top_10_summary_vector, top_10_songs_scaled)
]
print(top_10_song_embeddings[0])
print(len(top_10_song_embeddings), ", ", len(top_10_song_embeddings[0]))

[ 3.25538111 -0.21609983 -3.06877184 -1.0209198  -2.37817407 -0.44674015
  0.35360897  2.07321382 -1.29619527  0.85787219  0.59546912  1.7431103
 -0.36789209  2.01159739  1.3650332   1.84352071  0.4463037   1.91004076
  0.81649658  1.93012033 -1.42607907 -0.33672787  0.28590593  0.35268685
 -0.52754349  0.78345199  0.65465367 -2.54399491]
10 ,  28


In [119]:
# mean aggregation method
mean_top_10_song_embeddings = np.mean(top_10_song_embeddings, axis = 0)

# sum & normalize aggregation method
sum_top_10_song_embeddings = np.sum(top_10_song_embeddings, axis = 0)
sum_top_10_song_embeddings /= np.linalg.norm(sum_top_10_song_embeddings)

### Prepare individual personal song data

Seanna's top 10 favorite song has various genre and style:
1. Teeth - 5 Seconds of Summer
2. I WANNA BE YOUR SLAVE - Måneskin
3. Enemy - from the series Arcane League of Legends - Imagine Dragons
4. Say Something - A Great Big World
5. Marry You - Bruno Mars
6. Gotta Have You - The Weepies
7. 100 Degrees - Rich Brian
8. The Monster - Eminem
9. You Belong With Me - Taylor Swift
10. Bailando - Spanish Version - Enrique Iglesias

In [187]:
seanna_data = {
    'track_name': [
        'Teeth',
        'I WANNA BE YOUR SLAVE',
        'Enemy - from the series Arcane League of Legends',
        'Say Something',
        'Marry You',
        'Gotta Have You',
        '100 Degrees',
        'The Monster',
        'You Belong With Me',
        'Bailando - Spanish Version'
    ],
    'artist_name': [
        '5 Seconds of Summer',
        'Måneskin',
        'Imagine Dragons',
        'A Great Big World',
        'Bruno Mars',
        'The Weepies',
        'Rich Brian',
        'Eminem',
        'Taylor Swift',
        'Enrique Iglesias'
    ]
}

# Create DataFrame
seanna_favorite_song = pd.DataFrame(seanna_data)
seanna_favorite_song

Unnamed: 0,track_name,artist_name
0,Teeth,5 Seconds of Summer
1,I WANNA BE YOUR SLAVE,Måneskin
2,Enemy - from the series Arcane League of Legends,Imagine Dragons
3,Say Something,A Great Big World
4,Marry You,Bruno Mars
5,Gotta Have You,The Weepies
6,100 Degrees,Rich Brian
7,The Monster,Eminem
8,You Belong With Me,Taylor Swift
9,Bailando - Spanish Version,Enrique Iglesias


In [188]:
seanna_favorite_song = pd.merge(all_songs, seanna_favorite_song, on=['track_name', 'artist_name'], how='inner')
seanna_favorite_song

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Eminem,The Monster,48RrDBpOSSl1aLVCalGl5C,78,2013,hip-hop,0.781,0.853,1,-3.68,0,0.0715,0.0525,0.0,0.12,0.624,110.049,250189,4
1,A Great Big World,Say Something,78TKtlSLWK8pZAKKW3MyQL,56,2013,piano,0.453,0.146,2,-8.976,1,0.0343,0.867,3e-06,0.0945,0.0915,137.905,229400,3
2,Enrique Iglesias,Bailando - Spanish Version,32lm3769IRfcnrQV11LO4E,67,2014,pop,0.723,0.777,7,-3.503,1,0.108,0.0426,4e-06,0.0451,0.961,91.017,243413,4
3,5 Seconds of Summer,Teeth,26wLOs3ZuHJa2Ihhx6QIE6,76,2019,dance,0.756,0.448,3,-2.993,0,0.0404,0.0508,4e-06,0.11,0.431,139.031,204887,4
4,Rich Brian,100 Degrees,2ZDpSQfBdgkooeXw6oj3Uz,57,2019,hip-hop,0.756,0.648,0,-5.287,1,0.0731,0.118,0.0,0.515,0.657,80.979,166146,4
5,Måneskin,I WANNA BE YOUR SLAVE,4pt5fDVTg5GhEvEtlz9dKk,81,2021,indie-pop,0.75,0.608,1,-4.008,1,0.0387,0.00165,0.0,0.178,0.958,132.507,173347,4
6,Imagine Dragons,Enemy - from the series Arcane League of Legends,45lFaFCHXmpCiiMDvtihIv,1,2023,rock,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011,173381,4
7,The Weepies,Gotta Have You,1YjMWOorkBaP4MdKkKtp4y,50,2005,acoustic,0.678,0.363,11,-10.9,1,0.0318,0.872,0.000101,0.0798,0.543,75.004,199787,5
8,Taylor Swift,You Belong With Me,3GCL1PydwsLodcpv0Ll1ch,68,2008,pop,0.687,0.783,6,-4.44,1,0.0386,0.162,1.3e-05,0.114,0.443,129.964,231133,4
9,Bruno Mars,Marry You,22PMfvdz35fFKYnJyMn077,74,2010,dance,0.621,0.82,10,-4.865,1,0.0367,0.332,0.0,0.104,0.452,144.905,230192,4


In [189]:
# format spotify top 10 songs to make sure it have same data format
seanna_favorite_song = seanna_favorite_song.drop(columns=["track_id", "popularity", "genre"])
moved_column = seanna_favorite_song.pop("year")
seanna_favorite_song["year"] = moved_column

# add the song summary
seanna_favorite_song['string_summary'] = seanna_favorite_song['track_name'] + ' - ' + seanna_favorite_song['artist_name']
seanna_favorite_song['string_summary'] = seanna_favorite_song['string_summary'].astype(str)

# Drop the original 'name' and 'artists' columns
seanna_favorite_song.drop(['track_name', 'artist_name'], axis=1, inplace=True)

seanna_favorite_song['tokenized_summary'] = seanna_favorite_song['string_summary'].apply(lambda x: word_tokenize(x.lower()))

In [191]:
seanna_summary_vector = seanna_favorite_song['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
seanna_favorite_song.drop(['string_summary', 'tokenized_summary'], axis=1, inplace=True)
seanna_songs_scaled = scaler.fit_transform(seanna_favorite_song)

seanna_favorite_song_embeddings = [
    np.concatenate([summary_row, scaled_row])
    for summary_row, scaled_row in zip(seanna_summary_vector, seanna_songs_scaled)
]
print(seanna_favorite_song_embeddings[0])
print(len(seanna_favorite_song_embeddings), ", ", len(seanna_favorite_song_embeddings[0]))

[ 1.10363603  0.62966162 -1.85561883  0.80391067  0.601749    0.14951998
 -0.07459654  1.21377826 -1.28332782  0.16437174  0.17598364  0.71812558
 -2.21516895  1.33831322  0.95631581  1.03607778 -1.01388955  0.6677343
 -1.52752523 -0.03537872 -0.70704936 -0.42204769 -0.39115239  0.21535434
 -0.06651864  1.35618345  0.         -0.26832816]
10 ,  28


In [192]:
# mean aggregation method
mean_seanna_song_embeddings = np.mean(seanna_favorite_song_embeddings, axis = 0)

# sum & normalize aggregation method
sum_seanna_song_embeddings = np.sum(seanna_favorite_song_embeddings, axis = 0)
sum_seanna_song_embeddings /= np.linalg.norm(sum_seanna_song_embeddings)

Yuhan's top 10 favorite song has similar genre and style:
1. Anti-Hero - Taylor Swift
2. Lover - Taylor Swift
3. Question...? - Taylor Swift
4. deja vu - Olivia Rodrigo
5. RADIO - HENRY
6. Wonderful U - AGA
7. Forever Young - Eve Ai
8. Something's Wrong with the Morning - Margo Guryan
9. The Most Beautiful Thing - Bruno Major
10. At My Worst - Pink Sweat$

In [194]:
yuhan_data = {
    'track_name': [
        'Anti-Hero',
        'Lover',
        'Question...?',
        'deja vu',
        'RADIO',
        'Wonderful U',
        'Forever Young',
        "Something's Wrong with the Morning",
        'The Most Beautiful Thing',
        'At My Worst'
    ],
    'artist_name': [
        'Taylor Swift',
        'Taylor Swift',
        'Taylor Swift',
        'Olivia Rodrigo',
        'HENRY',
        'AGA',
        'Eve Ai',
        'Margo Guryan',
        'Bruno Major',
        'Pink Sweat$'
    ]
}

# Create DataFrame
yuhan_favorite_song = pd.DataFrame(yuhan_data)
yuhan_favorite_song = pd.merge(all_songs, yuhan_favorite_song, on=['track_name', 'artist_name'], how='inner')
yuhan_favorite_song

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Margo Guryan,Something's Wrong with the Morning,0IqQoCYYaSeM2ThWKPGoXX,52,2014,pop,0.656,0.567,2,-8.128,0,0.0352,0.682,0.000315,0.106,0.71,133.558,105573,4
1,AGA,Wonderful U,2eSNpIOFoi1Q8wxw6CycXJ,47,2016,cantopop,0.557,0.436,6,-8.569,1,0.0676,0.809,0.0,0.151,0.246,179.997,248551,3
2,Eve Ai,Forever Young,25sQT3yCEgd1uE6LC9ivcs,51,2018,singer-songwriter,0.304,0.226,0,-10.707,1,0.0329,0.929,0.0,0.161,0.323,139.593,313907,4
3,Taylor Swift,Lover,1dGr1c8CrMLDpV6mPbImSI,83,2019,pop,0.359,0.543,7,-7.582,1,0.0919,0.492,1.6e-05,0.118,0.453,68.534,221307,4
4,Pink Sweat$,At My Worst,0ri0Han4IRJhzvERHOZTMr,71,2020,chill,0.813,0.415,0,-5.926,1,0.0349,0.777,0.0,0.131,0.667,91.921,170345,4
5,HENRY,RADIO,4Dyb1oDEx4togM79cHL8UK,48,2020,k-pop,0.761,0.766,0,-5.414,1,0.143,0.118,0.0,0.111,0.266,146.879,191985,4
6,Bruno Major,The Most Beautiful Thing,07koEqsKHZTlGVMC9eoEjO,67,2020,pop,0.806,0.362,7,-10.386,1,0.0344,0.541,0.0489,0.111,0.418,127.498,235427,4
7,Olivia Rodrigo,deja vu,6HU7h9RYOaPRFeh0R3UeAr,83,2021,pop,0.442,0.612,2,-7.222,1,0.112,0.584,6e-06,0.37,0.178,180.917,215507,4
8,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
9,Taylor Swift,Question...?,0heeNYlwOGuUSe7TgUD27B,74,2022,pop,0.751,0.502,7,-8.763,1,0.167,0.2,0.0,0.296,0.106,108.943,210557,4


In [195]:
# format spotify top 10 songs to make sure it have same data format
yuhan_favorite_song = yuhan_favorite_song.drop(columns=["track_id", "popularity", "genre"])
moved_column = yuhan_favorite_song.pop("year")
yuhan_favorite_song["year"] = moved_column

# add the song summary
yuhan_favorite_song['string_summary'] = yuhan_favorite_song['track_name'] + ' - ' + yuhan_favorite_song['artist_name']
yuhan_favorite_song['string_summary'] = yuhan_favorite_song['string_summary'].astype(str)

# Drop the original 'name' and 'artists' columns
yuhan_favorite_song.drop(['track_name', 'artist_name'], axis=1, inplace=True)

yuhan_favorite_song['tokenized_summary'] = yuhan_favorite_song['string_summary'].apply(lambda x: word_tokenize(x.lower()))

In [196]:
yuhan_summary_vector = yuhan_favorite_song['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
yuhan_favorite_song.drop(['string_summary', 'tokenized_summary'], axis=1, inplace=True)
yuhan_songs_scaled = scaler.fit_transform(yuhan_favorite_song)

yuhan_favorite_song_embeddings = [
    np.concatenate([summary_row, scaled_row])
    for summary_row, scaled_row in zip(yuhan_summary_vector, yuhan_songs_scaled)
]
print(yuhan_favorite_song_embeddings[0])
print(len(yuhan_favorite_song_embeddings), ", ", len(yuhan_favorite_song_embeddings[0]))

[ 1.94422388  0.56498098 -2.01481271  1.00435936 -0.18755628  1.02973306
 -0.66433185  2.19081354 -0.60373712 -0.10503092 -0.36114064  0.2264642
 -2.22639585  2.45293164  0.26800525  0.40781972 -0.51601569 -0.12088828
 -3.         -0.89490988  0.5636146  -0.31440234 -0.74871726  1.66240608
  0.17398382 -2.07684355  0.33333333 -2.13000299]
10 ,  28


In [197]:
# mean aggregation method
mean_yuhan_song_embeddings = np.mean(yuhan_favorite_song_embeddings, axis = 0)

# sum & normalize aggregation method
sum_yuhan_song_embeddings = np.sum(yuhan_favorite_song_embeddings, axis = 0)
sum_yuhan_song_embeddings /= np.linalg.norm(sum_yuhan_song_embeddings)

### Individual Song Vector as Separate Queries

In [80]:
test_data

Unnamed: 0,id,values
0,7lmeHLHBe4nmXzuXc0HDjk,"[0.7233145236968994, 0.4799940586090088, -1.67..."
1,1wsRitfRRtWyEapl0q22o8,"[0.7651256918907166, 0.7351270318031311, -1.51..."
2,1hR0fIFK2qRG3f3RF70pb7,"[1.1690152883529663, 1.0357924699783325, -1.99..."
3,2lbASgTSoDO7MTuLAXlTW0,"[0.9501161575317383, 0.7967706918716431, -1.63..."
4,1MQTmpYOZ6fcMQc56Hdo7T,"[1.0759998559951782, 0.8795831799507141, -1.78..."
5,2LXPNLSMAauNJfnC58lSqY,"[1.2189991474151611, 0.6400238275527954, -2.23..."
6,3moeHk8eIajvUEzVocXukf,"[0.1430703103542328, 0.9378200173377991, -2.30..."
7,4llunZfVXv3NvUzXVB3VVL,"[0.6131278872489929, -0.008440256118774414, -0..."
8,21Mq0NzFoVRvOmLTOnJjng,"[1.0829685926437378, 0.1389971375465393, -0.98..."
9,6s2FgJbnnMwFTpWJZzvb6z,"[0.38173824548721313, 0.20188066363334656, -1...."


In [81]:
# query with song "7lmeHLHBe4nmXzuXc0HDjk"

# create the query vector
xq = test_data['values'][0]

# now query
xc = index.query(xq, top_k=5, include_metadata=True)
xc

{'matches': [{'id': '5S4fFQBvN1DigjO6XqRM16',
              'metadata': {},
              'score': 0.962494,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '4ARUrlst7m0CLVMOyFg6XZ',
              'metadata': {},
              'score': 0.9586928,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '5mhW5HmdaH1c2Eyor47W80',
              'metadata': {},
              'score': 0.9574417,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '3yjyLEq4lSr0CbZuUc2uZr',
              'metadata': {},
              'score': 0.9555039,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '0paR0kCOxCUKqmtNDHjPaL',
              'metadata': {},
              'score': 0.94905937,
              'sparse_values': {'indices': [], 'values': []},
              'values': 

In [85]:
df[df['id'] == '5S4fFQBvN1DigjO6XqRM16']

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
802610,5S4fFQBvN1DigjO6XqRM16,Projecting Power,Emo Diaries - Chapter Ten - The Hope I Hide In...,13W3r4Kq7uMOA94IXEoEYk,['The Holiday Plan'],['2Y7RpEHJ35w7FjeLZIefGd'],3,1,False,0.454,...,0.0525,0.103,0.0247,0.43,0.539,110.937,210853,4.0,2004,2004-04-27


In [86]:
df[df['id'] == '7lmeHLHBe4nmXzuXc0HDjk']

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02


### Combined Song Vector as Single Query