## Setup Environment

In [1]:
# Required package for this project 
# !pip install pandas \
#             nltk \
#             gensim \
#             scikit-learn \
#             numpy

In [2]:
import ast

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
# nltk.download('punkt') # for the first time need to download this for tokenization
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

## Load dataset of songs

Dataset: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs, an open source dataset on Kaggle. It provides nearly 1.2 million songs from Spotify, retrieved using the Spotify API.

In [3]:
# Path is relative to the notebook's working directory.
file_path = '../tracks_features.csv'
# Full song catalogue; kept unmodified because the query helpers later look up
# `name` / `artists` in it by `id`.
songs_df = pd.read_csv(file_path)
# Preview the first rows to confirm which columns were loaded.
print(songs_df.head())

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

## Preprocessing data

We select the numeric audio features we need and convert the categorical values into numeric form so that every song can be represented as a vector embedding.
The selected features are a unique id, 2 categorical features (name and artists), and 14 numeric audio features:
- id (unique index)
- name
- artists
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- duration_ms
- time_signature
- year

In [4]:
# Drop the album/track bookkeeping columns. What remains is `id`, `name`, `artists`
# and the 14 numeric audio features listed above. `songs_df` itself is left untouched.
selected_features_df = songs_df.drop(columns=["album", "album_id", "artist_ids", "track_number", "disc_number", "explicit", "release_date"])
print(selected_features_df.head())

                       id                   name  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire   

                        artists  danceability  energy  key  loudness  mode  \
0  ['Rage Against The Machine']         0.470   0.978    7    -5.399     1   
1  ['Rage Against The Machine']         0.599   0.957   11    -5.764     1   
2  ['Rage Against The Machine']         0.315   0.970    7    -5.424     1   
3  ['Rage Against The Machine']         0.440   0.967   11    -5.830     0   
4  ['Rage Against The Machine']         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.

In [5]:
# check if our filtered features contain any missing value
selected_features_df.isna().any()

id                  False
name                 True
artists             False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
year                False
dtype: bool

In [6]:
# Drop every row that has a missing value in any of the selected columns.
# The shapes are printed before and after so the number of removed rows is visible.
print("Shape before drop missing value: ", selected_features_df.shape)
selected_features_df = selected_features_df.dropna()
print("Shape after drop missing value: ", selected_features_df.shape)

Shape before drop missing value:  (1204025, 17)
Shape after drop missing value:  (1204022, 17)


In [7]:
# Some rows have year == 0, which is not a valid release year; filter those rows out too.
# Note: the filtered frame keeps its original row labels, so the index is no longer contiguous.
selected_features_df = selected_features_df[selected_features_df['year'] != 0]
print("Shape after drop invalid year: ", selected_features_df.shape)

Shape after drop invalid year:  (1204012, 17)


Some songs have multiple artists. The `artists` column stores them as a stringified list, which we convert into a single comma-separated string.
Example: ['Pietro Locatelli', 'Capella Istropolitana', 'Jaroslav Krcek'] to 'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [8]:
def convert_artists_name(artists_list):
    """Turn a stringified list of artists into one comma-separated string.

    Example: "['Pietro Locatelli', 'Capella Istropolitana']" becomes
    'Pietro Locatelli, Capella Istropolitana'.

    The value is parsed as a Python literal so that apostrophes inside a name
    (e.g. "Destiny's Child") are preserved and double-quoted entries do not
    keep their quote characters.

    Args:
        artists_list: string holding the printed form of a list of names.

    Returns:
        The artist names joined with ", ". If the text cannot be parsed as a
        list/tuple literal, the brackets and single quotes are simply stripped.
    """
    try:
        parsed = ast.literal_eval(artists_list)
    except (ValueError, SyntaxError):
        # Not a valid Python literal; handled by the textual fallback below.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(artist) for artist in parsed)

    # Fallback: plain textual clean-up for values that are not list literals.
    return artists_list.strip("[]").replace("'", "")

# Normalise every row's artists value to the comma-separated form.
selected_features_df["artists"] = selected_features_df["artists"].apply(convert_artists_name)
# Spot-check one row by position (the 1185th remaining row) to see a converted value.
selected_features_df.iloc[1184]["artists"]

'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [9]:
# Treat rows with the same (song name, artists string) pair as the same song and keep one of them.
# This runs after the artists column was normalised, so the comparison is on the joined string.
selected_features_df = selected_features_df.drop_duplicates(subset=['name', 'artists'])
print("Shape after duplicated removal: ", selected_features_df.shape)

Shape after duplicated removal:  (1141542, 17)


## Create vector embeddings model

### Create categorical feature vector embeddings

We first need to convert the song and artist names into a vector. The resulting vector has length 14, matching the 14 numeric audio features it will be combined with. We merge the song name and the artist names into a single text column so that they can be tokenized together.

In [10]:
def create_tokenized_summary(df, name_col, artist_col):
    """Add a tokenized "<song> - <artists>" summary to ``df`` in place.

    After the call ``df`` no longer has ``name_col`` / ``artist_col`` and has
    two new columns instead:

    * ``string_summary``: the two text columns joined with ' - ', cast to str
    * ``tokenized_summary``: the lowercased summary split by ``word_tokenize``

    Args:
        df: DataFrame containing ``name_col`` and ``artist_col``; it is mutated.
        name_col: label of the song-name column.
        artist_col: label of the artists column.

    Returns:
        None. Callers rely on the in-place modification of ``df``.
    """
    # Build the combined text once and force it to str so every cell can be lowercased.
    df['string_summary'] = (df[name_col] + ' - ' + df[artist_col]).astype(str)

    # The source text columns are redundant once the summary exists.
    df.drop(columns=[name_col, artist_col], inplace=True)

    # Lowercase before tokenizing so the tokens are case-insensitive.
    df['tokenized_summary'] = df['string_summary'].map(
        lambda text: word_tokenize(text.lower())
    )

In [11]:
def get_summary_vector(summary, model):
    """Average the word vectors of the tokens in ``summary``.

    Args:
        summary: iterable of string tokens (one tokenized summary).
        model: trained Word2Vec-style model; only ``model.wv`` is used
            (membership test, item lookup and ``vector_size``).

    Returns:
        1-D numpy array of length ``model.wv.vector_size``: the mean of the
        vectors of all tokens known to the model, or all zeros when none of
        the tokens is in the vocabulary.
    """
    known_vectors = [model.wv[word] for word in summary if word in model.wv]
    if not known_vectors:
        # Take the width from the model itself rather than a notebook global, so the
        # fallback always matches the model and does not depend on cell execution order.
        return np.zeros(model.wv.vector_size)
    # Same arithmetic as before (element-wise sum, then divide) to keep results unchanged.
    return sum(known_vectors) / len(known_vectors)

In [12]:
def clean_tokenized_summary(df):
    """Remove the ``string_summary`` and ``tokenized_summary`` columns from ``df`` in place.

    Meant to be called once the summary vectors have been computed, so that
    only the remaining (non-text) columns are left in ``df``. Returns None.
    """
    helper_columns = ['string_summary', 'tokenized_summary']
    df.drop(columns=helper_columns, inplace=True)

In [13]:
create_tokenized_summary(selected_features_df, 'name', 'artists')

In [14]:
# Define Word2Vec model parameters (may adjust later)
# 14 equals the number of numeric audio features, so the text half and the audio half
# of each final embedding have the same width (14 + 14 = 28).
vector_size = 14
# Forwarded below as Word2Vec's `window` and `min_count` arguments.
window_size = 5
min_count = 1

# Train Word2Vec model on the tokenized "<song> - <artists>" summaries of the catalogue.
# NOTE(review): no seed/worker settings are passed, so the learned vectors may differ
# between runs; confirm against the gensim docs. Vectors already stored in Pinecone
# are only comparable with queries embedded by the same trained model.
word2vec_model = Word2Vec(selected_features_df['tokenized_summary'], vector_size=vector_size, window=window_size, min_count=min_count)

In [15]:
# One averaged word vector per song, aligned with the rows of selected_features_df.
summary_vector = selected_features_df['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
# The helper text columns are no longer needed once the vectors exist.
clean_tokenized_summary(selected_features_df)
# Use positional access: rows were filtered above, so the index labels are not
# contiguous and a label lookup of 0 would fail if that row had been dropped.
print(summary_vector.iloc[0])

[ 1.4143943   0.4900008  -2.089417    1.5587121   2.7189271   0.4879061
 -0.39689735  1.626835   -0.9530144  -0.2434497  -0.10700735 -0.37622285
 -2.0893767   1.1830873 ]


### Create numerical features vector embeddings

The numerical columns are audio characteristics of the song. We standardize them so that all features are on a comparable scale, and use the scaled values as the numeric part of the embedding.

In [16]:
# Extract the numeric columns (excluding 'id'); the text columns were already removed above.
numeric_columns = selected_features_df.drop(['id'], axis=1)
# Fit the scaler once on the full catalogue. The same fitted `scaler` is reused
# later to transform the query songs.
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(numeric_columns)
# Show the scaled feature row of the first song
print(scaled_columns[0])

[-0.11474546  1.59347817  0.5103876   0.92309668  0.70122776 -0.1040544
 -1.09743536 -0.76135497  0.86473578  0.28529403  0.01103821 -0.23857829
  0.30032326 -0.80939027]


### Merged vector embeddings to create final one

Finally, we merge the summary vectors (name & artists) with the scaled vectors (audio characteristics) to form the embedding for each song.

In [17]:
def merged_embeddings(summary_vector, scaled_columns):
    """Concatenate each song's text embedding with its scaled audio features.

    Args:
        summary_vector: iterable of 1-D vectors, one per song (text part).
        scaled_columns: iterable of 1-D vectors, one per song (audio part),
            in the same row order as ``summary_vector``.

    Returns:
        list of 1-D numpy arrays; element ``i`` is the text vector of song
        ``i`` followed by its audio vector.

    Raises:
        ValueError: if the two inputs do not contain the same number of rows.
            Pairing them anyway would silently drop rows and misalign songs.
    """
    summary_rows = list(summary_vector)
    scaled_rows = list(scaled_columns)
    if len(summary_rows) != len(scaled_rows):
        raise ValueError(
            "summary_vector and scaled_columns must have the same number of rows, "
            f"got {len(summary_rows)} and {len(scaled_rows)}"
        )

    song_embeddings = [
        np.concatenate([summary_row, scaled_row])
        for summary_row, scaled_row in zip(summary_rows, scaled_rows)
    ]

    # Guard the preview so an empty input returns [] instead of raising IndexError.
    if song_embeddings:
        print("First song's embedding: ", song_embeddings[0])
    embedding_width = len(song_embeddings[0]) if song_embeddings else 0
    print("Size for entire dataset: ", len(song_embeddings), ", ", embedding_width)
    return song_embeddings

In [18]:
song_embeddings = merged_embeddings(summary_vector, scaled_columns)

First song's embedding:  [ 1.41439426  0.49000081 -2.08941698  1.55871212  2.71892715  0.4879061
 -0.39689735  1.62683499 -0.95301437 -0.2434497  -0.10700735 -0.37622285
 -2.08937669  1.18308735 -0.11474546  1.59347817  0.5103876   0.92309668
  0.70122776 -0.1040544  -1.09743536 -0.76135497  0.86473578  0.28529403
  0.01103821 -0.23857829  0.30032326 -0.80939027]
Size for entire dataset:  1141542 ,  28


In [19]:
# Build the final table to upload to Pinecone. It has two columns:
# `id` (the song id) and `values` (the song's embedding vector).
# `.copy()` gives an independent frame, so adding a column does not touch selected_features_df.
embedded_features = selected_features_df[["id"]].copy()
# song_embeddings is a plain list in the same row order as selected_features_df.
embedded_features.loc[:, "values"] = song_embeddings
print(embedded_features.head())
print(embedded_features.shape)

                       id                                             values
0  7lmeHLHBe4nmXzuXc0HDjk  [1.4143942594528198, 0.49000081419944763, -2.0...
1  1wsRitfRRtWyEapl0q22o8  [2.1823840141296387, -0.17278139293193817, -1....
2  1hR0fIFK2qRG3f3RF70pb7  [1.8902316093444824, 0.7664139866828918, -2.60...
3  2lbASgTSoDO7MTuLAXlTW0  [1.4215434789657593, 0.40790510177612305, -1.9...
4  1MQTmpYOZ6fcMQc56Hdo7T  [2.0814242362976074, 0.12002609670162201, -2.7...
(1141542, 2)


## Prepare dataset for searching similar songs

Two different search strategies:
1. Combine all history songs into one embedding for the query and get the top 10 recommendations
2. Convert each individual song into its own embedding and run 10 queries, taking the top 1 recommendation for each

Two query sources:
1. Personal favorite song & listening history
2. Spotify 2023 top hit 100 songs

Pinecone search metrics:
1. Cosine
2. Euclidean
3. Dotproduct

### Prepare Spotify top 100 song data

Get the most streamed songs in 2023 (datasets: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023/data, https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks)

In [20]:
# This chart dataset has no loudness column (see the column list below), so it is only
# used to pick the songs; their audio features are taken from another dataset later.
file_path_top_songs = '../spotify-2023.csv'
# The file is read with an explicit latin-1 encoding.
top_songs = pd.read_csv(file_path_top_songs, encoding='latin-1')
list(top_songs.columns)

['track_name',
 'artist(s)_name',
 'artist_count',
 'released_year',
 'released_month',
 'released_day',
 'in_spotify_playlists',
 'in_spotify_charts',
 'streams',
 'in_apple_playlists',
 'in_apple_charts',
 'in_deezer_playlists',
 'in_deezer_charts',
 'in_shazam_charts',
 'bpm',
 'key',
 'mode',
 'danceability_%',
 'valence_%',
 'energy_%',
 'acousticness_%',
 'instrumentalness_%',
 'liveness_%',
 'speechiness_%']

In [21]:
# Pick the 10 most-streamed chart songs that were released from 2015 through 2022.
# NOTE(review): the displayed ordering (e.g. 988515741 ranked above 98709329) indicates
# that `streams` is being compared as text rather than as numbers, so this is not a true
# ranking by stream count. Converting the column with pd.to_numeric would change which
# songs are selected; the manual name fixes applied later by row label would then need
# to be revisited as well.
in_release_window = (top_songs['released_year'] > 2014) & (top_songs['released_year'] < 2023)
filtered_songs = top_songs[in_release_window]
ranked_songs = filtered_songs.sort_values(by="streams", ascending=False)

# The song ranked 10th is not present in the all-songs dataset, so it is replaced
# by the song ranked 11th: keep ranks 1-9 and append rank 11.
next_song = ranked_songs.iloc[10:11, :]
top_10_songs = pd.concat([ranked_songs.iloc[:10, :].iloc[:-1, :], next_song], ignore_index=True)
top_10_songs

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Anti-Hero,Taylor Swift,1,2022,10,21,9082,56,999748277,242,...,97,E,Major,64,51,63,12,0,19,5
1,Arcade,Duncan Laurence,1,2019,3,7,6646,0,991336132,107,...,72,A,Minor,45,27,33,82,0,14,4
2,Glimpse of Us,Joji,1,2022,6,10,6330,6,988515741,109,...,170,G#,Major,44,27,32,89,0,14,5
3,Seek & Destroy,SZA,1,2022,12,9,1007,0,98709329,5,...,152,C#,Major,65,35,65,44,18,21,7
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson,1,2022,7,12,367,0,97610446,28,...,145,G,Major,56,43,53,24,0,12,4
5,Where Are You Now,"Lost Frequencies, Calum Scott",2,2021,7,30,10565,44,972509632,238,...,121,F#,Minor,67,26,64,52,0,17,10
6,Alone,Burna Boy,1,2022,11,4,782,2,96007391,27,...,90,E,Minor,61,32,67,15,0,11,5
7,No Lie,"Sean Paul, Dua Lipa",2,2016,11,18,7370,0,956865266,92,...,102,G,Major,74,45,89,5,0,26,13
8,HEARTBREAK ANNIVERSARY,Giveon,1,2020,2,21,5398,4,951637566,111,...,129,,Major,61,59,46,56,0,13,5
9,Used (feat. Don Toliver),"SZA, Don Toliver",2,2022,12,8,1042,0,94005786,7,...,150,A#,Minor,73,71,69,53,0,32,9


In [22]:
# Build the (track_name, artist_name) lookup keys used to find these songs in the
# all-songs dataset. That dataset stores a single artist per track, so only the
# first name of a comma-separated artist list is kept.
top_10_songs_to_search = (
    top_10_songs[['track_name', 'artist(s)_name']]
    .rename(columns={'artist(s)_name': 'artist_name'})
    .assign(artist_name=lambda keys: keys['artist_name'].str.split(',').str[0])
)

top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,HEARTBREAK ANNIVERSARY,Giveon
9,Used (feat. Don Toliver),SZA


In [23]:
# Two titles are spelled differently in the all-songs dataset, so they are replaced by
# hand to make the exact-match merge below succeed.
# NOTE: rows are addressed by label (4 and 8), so these edits are tied to the exact
# ordering produced by the selection cell above.
top_10_songs_to_search.loc[4, "track_name"] = "Come Back Home"
top_10_songs_to_search.loc[8, "track_name"] = "Heartbreak Anniversary"
top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,Come Back Home,Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,Heartbreak Anniversary,Giveon
9,Used (feat. Don Toliver),SZA


In [24]:
# Second song dataset; unlike the chart data it carries the audio features
# (including loudness) needed to build query embeddings.
file_path_all_songs = '../spotify_data.csv'
# The first CSV column is used as the row index.
all_songs = pd.read_csv(file_path_all_songs, index_col = 0)
print(all_songs.head())

     artist_name        track_name                track_id  popularity  year  \
0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6          68  2012   
1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218          50  2012   
2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F          57  2012   
3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz          58  2012   
4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8          54  2012   

      genre  danceability  energy  key  loudness  mode  speechiness  \
0  acoustic         0.483   0.303    4   -10.058     1       0.0429   
1  acoustic         0.572   0.454    3   -10.286     1       0.0258   
2  acoustic         0.409   0.234    3   -13.711     1       0.0323   
3  acoustic         0.392   0.251   10    -9.845     1       0.0363   
4  acoustic         0.430   0.791    6    -5.419     0       0.0302   

   acousticness  instrumentalness  liveness  valence    tempo  duration_ms  \
0        0.694

In [25]:
# Look up the full audio-feature rows of the chosen songs with an exact match on
# (track_name, artist_name). With how='inner', keys that have no match are dropped.
# NOTE(review): if all_songs holds several rows for the same key, each of them would
# appear in the result; check that the output still has exactly 10 rows.
selected_10_songs = pd.merge(all_songs, top_10_songs_to_search, on=['track_name', 'artist_name'], how='inner')
selected_10_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Sean Paul,No Lie,1Vb4HQnN2kZ5Y2KgYF5TDV,57,2016,dance,0.742,0.882,7,-2.862,1,0.117,0.0466,0.0,0.206,0.463,102.04,221176,4
1,Duncan Laurence,Arcade,1Xi84slp6FryDSCbzq4UCD,77,2019,pop,0.45,0.329,9,-12.603,0,0.0441,0.818,0.00109,0.135,0.266,71.884,183624,3
2,Giveon,Heartbreak Anniversary,3FAJ6O0NOHQV8Mc5Ri6ENp,79,2020,pop,0.449,0.465,0,-8.964,1,0.0791,0.524,1e-06,0.303,0.543,89.087,198371,3
3,Lost Frequencies,Where Are You Now,3uUuGVFu1V7jTQL60S1r8z,84,2021,dance,0.671,0.636,6,-8.117,0,0.103,0.515,0.000411,0.172,0.262,120.966,148197,4
4,Burna Boy,Alone,0AoBY2Y3qs6dtGgOD6c91N,77,2022,dance,0.6,0.659,4,-7.264,0,0.0542,0.176,0.0,0.111,0.307,89.955,221747,4
5,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
6,SZA,Seek & Destroy,6eT2V7nKXyMf47TwPbtgAD,79,2022,pop,0.651,0.647,1,-5.415,1,0.0654,0.437,0.175,0.205,0.345,152.069,203733,4
7,Joji,Glimpse of Us,6xGruZOHLs39ZbVccQTuPZ,85,2022,pop,0.44,0.317,8,-9.258,1,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3
8,SZA,Used (feat. Don Toliver),1TweDM3JC49LNeelLVg3yX,76,2022,pop,0.734,0.689,10,-6.454,0,0.0871,0.532,8.5e-05,0.322,0.705,149.579,70160,4
9,Sofia Carson,Come Back Home,1I4dwH7C0jBAEtz5DjlJgQ,73,2022,pop,0.552,0.531,7,-7.732,1,0.0421,0.241,1.2e-05,0.122,0.438,144.946,176859,4


In [26]:
# Re-derive the lookup keys from the merged frame so that their row order matches the
# order of the embeddings computed from selected_10_songs below.
top_10_songs_to_search = selected_10_songs[['artist_name', 'track_name']]
top_10_songs_to_search

Unnamed: 0,artist_name,track_name
0,Sean Paul,No Lie
1,Duncan Laurence,Arcade
2,Giveon,Heartbreak Anniversary
3,Lost Frequencies,Where Are You Now
4,Burna Boy,Alone
5,Taylor Swift,Anti-Hero
6,SZA,Seek & Destroy
7,Joji,Glimpse of Us
8,SZA,Used (feat. Don Toliver)
9,Sofia Carson,Come Back Home


In [27]:
def format_dataset(df):
    """Return a copy of ``df`` laid out like the catalogue feature table.

    The ``track_id``, ``popularity`` and ``genre`` columns are removed and
    ``year`` is moved to the last position; all other columns keep their
    relative order. The input frame is not modified.
    """
    year_values = df["year"]
    # Drop "year" together with the unused columns, then append it again so it ends up last.
    without_extras = df.drop(columns=["track_id", "popularity", "genre", "year"])
    return without_extras.assign(year=year_values)

In [28]:
# Run the chart songs through the same steps as the catalogue: align the columns,
# build the tokenized summary, embed it with the already-trained word2vec_model,
# then drop the helper text columns.
# NOTE: this cell reassigns and mutates selected_10_songs, so it is meant to be run
# once per kernel session, right after the merge cell.
selected_10_songs = format_dataset(selected_10_songs)
create_tokenized_summary(selected_10_songs, 'track_name', 'artist_name')
top_10_summary_vector = selected_10_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(selected_10_songs)
print(top_10_summary_vector[0])

[ 3.3443604  -1.5928901  -2.3530564  -0.71844226 -1.1618369   0.4698829
  1.4303792   2.9331908  -0.91425055 -1.1251037   0.10982971  1.5624715
 -2.142384    0.51317865]


In [29]:
# Reuse the scaler fitted on the catalogue (transform only, no re-fit) so the query
# songs are scaled exactly like the stored vectors.
top_10_songs_scaled = scaler.transform(selected_10_songs)
# Show the scaled feature row of the first query song
print(top_10_songs_scaled[0])

[ 1.31791498  1.26844471  0.5103876   1.28543019  0.70122776  0.27467875
 -1.04434575 -0.76138381  0.02881781  0.13741314 -0.50136249 -0.17135925
  0.30032326  0.82074786]


In [30]:
top_10_song_embeddings = merged_embeddings(top_10_summary_vector, top_10_songs_scaled)

First song's embedding:  [ 3.34436035 -1.59289014 -2.35305643 -0.71844226 -1.16183686  0.46988291
  1.43037915  2.93319082 -0.91425055 -1.12510371  0.10982971  1.56247151
 -2.14238405  0.51317865  1.31791498  1.26844471  0.5103876   1.28543019
  0.70122776  0.27467875 -1.04434575 -0.76138381  0.02881781  0.13741314
 -0.50136249 -0.17135925  0.30032326  0.82074786]
Size for entire dataset:  10 ,  28


In [31]:
# Mean aggregation: average the 10 song embeddings element-wise (axis 0) into one
# vector, used for the "combined" single-query strategy.
mean_top_10_song_embeddings = np.mean(top_10_song_embeddings, axis = 0)

### Prepare individual personal song data

Seanna's top 10 favorite songs span various genres and styles:
1. Teeth - 5 Seconds of Summer
2. I WANNA BE YOUR SLAVE - Måneskin
3. Enemy - from the series Arcane League of Legends - Imagine Dragons
4. Say Something - A Great Big World
5. Marry You - Bruno Mars
6. Gotta Have You - The Weepies
7. 100 Degrees - Rich Brian
8. The Monster - Eminem
9. You Belong With Me - Taylor Swift
10. Bailando - Spanish Version - Enrique Iglesias

In [32]:
seanna_data = {
    'track_name': [
        'Teeth',
        'I WANNA BE YOUR SLAVE',
        'Enemy - from the series Arcane League of Legends',
        'Say Something',
        'Marry You',
        'Gotta Have You',
        '100 Degrees',
        'The Monster',
        'You Belong With Me',
        'Bailando - Spanish Version'
    ],
    'artist_name': [
        '5 Seconds of Summer',
        'Måneskin',
        'Imagine Dragons',
        'A Great Big World',
        'Bruno Mars',
        'The Weepies',
        'Rich Brian',
        'Eminem',
        'Taylor Swift',
        'Enrique Iglesias'
    ]
}

# Create DataFrame
seanna_favorite_songs = pd.DataFrame(seanna_data)

In [33]:
# Replace the hand-written (track_name, artist_name) list with the matching full rows
# from all_songs. The inner join keeps only exact matches on both keys.
seanna_favorite_songs = pd.merge(all_songs, seanna_favorite_songs, on=['track_name', 'artist_name'], how='inner')
seanna_favorite_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Eminem,The Monster,48RrDBpOSSl1aLVCalGl5C,78,2013,hip-hop,0.781,0.853,1,-3.68,0,0.0715,0.0525,0.0,0.12,0.624,110.049,250189,4
1,A Great Big World,Say Something,78TKtlSLWK8pZAKKW3MyQL,56,2013,piano,0.453,0.146,2,-8.976,1,0.0343,0.867,3e-06,0.0945,0.0915,137.905,229400,3
2,Enrique Iglesias,Bailando - Spanish Version,32lm3769IRfcnrQV11LO4E,67,2014,pop,0.723,0.777,7,-3.503,1,0.108,0.0426,4e-06,0.0451,0.961,91.017,243413,4
3,5 Seconds of Summer,Teeth,26wLOs3ZuHJa2Ihhx6QIE6,76,2019,dance,0.756,0.448,3,-2.993,0,0.0404,0.0508,4e-06,0.11,0.431,139.031,204887,4
4,Rich Brian,100 Degrees,2ZDpSQfBdgkooeXw6oj3Uz,57,2019,hip-hop,0.756,0.648,0,-5.287,1,0.0731,0.118,0.0,0.515,0.657,80.979,166146,4
5,Måneskin,I WANNA BE YOUR SLAVE,4pt5fDVTg5GhEvEtlz9dKk,81,2021,indie-pop,0.75,0.608,1,-4.008,1,0.0387,0.00165,0.0,0.178,0.958,132.507,173347,4
6,Imagine Dragons,Enemy - from the series Arcane League of Legends,45lFaFCHXmpCiiMDvtihIv,1,2023,rock,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011,173381,4
7,The Weepies,Gotta Have You,1YjMWOorkBaP4MdKkKtp4y,50,2005,acoustic,0.678,0.363,11,-10.9,1,0.0318,0.872,0.000101,0.0798,0.543,75.004,199787,5
8,Taylor Swift,You Belong With Me,3GCL1PydwsLodcpv0Ll1ch,68,2008,pop,0.687,0.783,6,-4.44,1,0.0386,0.162,1.3e-05,0.114,0.443,129.964,231133,4
9,Bruno Mars,Marry You,22PMfvdz35fFKYnJyMn077,74,2010,dance,0.621,0.82,10,-4.865,1,0.0367,0.332,0.0,0.104,0.452,144.905,230192,4


In [34]:
seanna_favorite_songs_to_search = seanna_favorite_songs[['artist_name', 'track_name']]
seanna_favorite_songs_to_search

Unnamed: 0,artist_name,track_name
0,Eminem,The Monster
1,A Great Big World,Say Something
2,Enrique Iglesias,Bailando - Spanish Version
3,5 Seconds of Summer,Teeth
4,Rich Brian,100 Degrees
5,Måneskin,I WANNA BE YOUR SLAVE
6,Imagine Dragons,Enemy - from the series Arcane League of Legends
7,The Weepies,Gotta Have You
8,Taylor Swift,You Belong With Me
9,Bruno Mars,Marry You


In [35]:
# Text part: same steps as for the catalogue, using the already-trained word2vec_model.
# NOTE: seanna_favorite_songs is reassigned and mutated here, so this cell is meant to
# be run once per kernel session, right after the merge cell.
seanna_favorite_songs = format_dataset(seanna_favorite_songs)
create_tokenized_summary(seanna_favorite_songs, 'track_name', 'artist_name')
seanna_summary_vector = seanna_favorite_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(seanna_favorite_songs)

# Audio part: scale with the scaler fitted on the catalogue, then join both halves.
seanna_songs_scaled = scaler.transform(seanna_favorite_songs)
seanna_songs_embeddings = merged_embeddings(seanna_summary_vector, seanna_songs_scaled)
# Element-wise mean of the 10 embeddings, used for the combined single-query strategy.
mean_seanna_song_embeddings = np.mean(seanna_songs_embeddings, axis = 0)

First song's embedding:  [ 1.23613584 -0.18252549 -1.61758268  0.95301306  1.38821602  0.26498181
 -0.60538548  1.81439257 -0.67097795 -0.545286    0.02993587  0.16141915
 -2.32821345  0.60970014  1.5233332   1.17025752 -1.18626933  1.1686037
 -1.42607018 -0.11431354 -1.02906631 -0.76138381 -0.45044182  0.73263371
 -0.24270768  0.00524366  0.30032326  0.53307642]
Size for entire dataset:  10 ,  28


Yuhan's top 10 favorite songs share a similar genre and style:
1. Anti-Hero - Taylor Swift
2. Lover - Taylor Swift
3. Question...? - Taylor Swift
4. deja vu - Olivia Rodrigo
5. RADIO - HENRY
6. Wonderful U - AGA
7. Forever Young - Eve Ai
8. Something's Wrong with the Morning - Margo Guryan
9. The Most Beautiful Thing - Bruno Major
10. At My Worst - Pink Sweat$

In [36]:
yuhan_data = {
    'track_name': [
        'Anti-Hero',
        'Lover',
        'Question...?',
        'deja vu',
        'RADIO',
        'Wonderful U',
        'Forever Young',
        "Something's Wrong with the Morning",
        'The Most Beautiful Thing',
        'At My Worst'
    ],
    'artist_name': [
        'Taylor Swift',
        'Taylor Swift',
        'Taylor Swift',
        'Olivia Rodrigo',
        'HENRY',
        'AGA',
        'Eve Ai',
        'Margo Guryan',
        'Bruno Major',
        'Pink Sweat$'
    ]
}

# Create DataFrame of (track_name, artist_name) keys from the hand-written list
yuhan_favorite_songs = pd.DataFrame(yuhan_data)
# Replace the keys with the matching full rows from all_songs. The inner join keeps
# only exact matches on both keys.
yuhan_favorite_songs = pd.merge(all_songs, yuhan_favorite_songs, on=['track_name', 'artist_name'], how='inner')
yuhan_favorite_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Margo Guryan,Something's Wrong with the Morning,0IqQoCYYaSeM2ThWKPGoXX,52,2014,pop,0.656,0.567,2,-8.128,0,0.0352,0.682,0.000315,0.106,0.71,133.558,105573,4
1,AGA,Wonderful U,2eSNpIOFoi1Q8wxw6CycXJ,47,2016,cantopop,0.557,0.436,6,-8.569,1,0.0676,0.809,0.0,0.151,0.246,179.997,248551,3
2,Eve Ai,Forever Young,25sQT3yCEgd1uE6LC9ivcs,51,2018,singer-songwriter,0.304,0.226,0,-10.707,1,0.0329,0.929,0.0,0.161,0.323,139.593,313907,4
3,Taylor Swift,Lover,1dGr1c8CrMLDpV6mPbImSI,83,2019,pop,0.359,0.543,7,-7.582,1,0.0919,0.492,1.6e-05,0.118,0.453,68.534,221307,4
4,Pink Sweat$,At My Worst,0ri0Han4IRJhzvERHOZTMr,71,2020,chill,0.813,0.415,0,-5.926,1,0.0349,0.777,0.0,0.131,0.667,91.921,170345,4
5,HENRY,RADIO,4Dyb1oDEx4togM79cHL8UK,48,2020,k-pop,0.761,0.766,0,-5.414,1,0.143,0.118,0.0,0.111,0.266,146.879,191985,4
6,Bruno Major,The Most Beautiful Thing,07koEqsKHZTlGVMC9eoEjO,67,2020,pop,0.806,0.362,7,-10.386,1,0.0344,0.541,0.0489,0.111,0.418,127.498,235427,4
7,Olivia Rodrigo,deja vu,6HU7h9RYOaPRFeh0R3UeAr,83,2021,pop,0.442,0.612,2,-7.222,1,0.112,0.584,6e-06,0.37,0.178,180.917,215507,4
8,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
9,Taylor Swift,Question...?,0heeNYlwOGuUSe7TgUD27B,74,2022,pop,0.751,0.502,7,-8.763,1,0.167,0.2,0.0,0.296,0.106,108.943,210557,4


In [37]:
yuhan_favorite_songs_to_search = yuhan_favorite_songs[['artist_name', 'track_name']]
yuhan_favorite_songs_to_search

Unnamed: 0,artist_name,track_name
0,Margo Guryan,Something's Wrong with the Morning
1,AGA,Wonderful U
2,Eve Ai,Forever Young
3,Taylor Swift,Lover
4,Pink Sweat$,At My Worst
5,HENRY,RADIO
6,Bruno Major,The Most Beautiful Thing
7,Olivia Rodrigo,deja vu
8,Taylor Swift,Anti-Hero
9,Taylor Swift,Question...?


In [38]:
# Text part: same steps as for the catalogue, using the already-trained word2vec_model.
# NOTE: yuhan_favorite_songs is reassigned and mutated here, so this cell is meant to
# be run once per kernel session, right after the merge cell.
yuhan_favorite_songs = format_dataset(yuhan_favorite_songs)
create_tokenized_summary(yuhan_favorite_songs, 'track_name', 'artist_name')
yuhan_summary_vector = yuhan_favorite_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(yuhan_favorite_songs)

# Audio part: scale with the scaler fitted on the catalogue, then join both halves.
yuhan_songs_scaled = scaler.transform(yuhan_favorite_songs)
yuhan_songs_embeddings = merged_embeddings(yuhan_summary_vector, yuhan_songs_scaled)
# Element-wise mean of the 10 embeddings, used for the combined single-query strategy.
mean_yuhan_song_embeddings = np.mean(yuhan_songs_embeddings, axis = 0)

First song's embedding:  [ 2.75204897  0.25129673 -1.84239984  1.85605049  0.67711145  0.92590892
 -0.48102432  2.19697952 -0.26205739 -0.4420675  -0.58080137  1.06051302
 -2.29001856  0.75146335  0.86494145  0.20192866 -0.90349317  0.53334179
 -1.42607018 -0.42465243  0.60117314 -0.76055037 -0.52846083  1.05057762
  0.51652767 -0.87503781  0.30032326  0.6289669 ]
Size for entire dataset:  10 ,  28


## Store Embeddings to Pinecone

In [39]:
# !pip install -qU \
#   "pinecone-client[grpc]"==2.2.1

In [40]:
import os
import pinecone
import time

  from tqdm.autonotebook import tqdm


In [41]:
# Credentials must not be stored in the notebook. Read them from the environment, e.g.
#   export PINECONE_API_KEY=...   (set before starting Jupyter)
# Any key that was previously committed in this cell should be revoked and rotated.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
# The environment name is not a secret; keep the previous value as the default.
PINECONE_ENV = os.environ.get('PINECONE_ENV', 'us-east-1-aws')

if not PINECONE_API_KEY:
    # Fail here with a clear message instead of a less obvious error from the client later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it in the environment before running this notebook."
    )

In [42]:
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

### Store embeddings to Pinecone - Cosine

In [43]:
index_name = 'music-recommender-cosine'
# Embedding width, read from the first row by position. The frame's index labels are
# not contiguous after filtering, so a label lookup of 0 is not guaranteed to exist.
dim = len(embedded_features['values'].iloc[0])

In [45]:
# only create index if it doesn't exist (makes the cell safe to re-run)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='cosine'
    )
    # wait a moment for the index to be fully initialized
    # NOTE(review): this polls once per second with no timeout, so the cell blocks
    # indefinitely if the index never reports ready.
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [44]:
# now connect to the index
index_c = pinecone.GRPCIndex(index_name)
index_c.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

In [48]:
index_c.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [49]:
index_c.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

### Store embeddings to Pinecone - Euclidean

In [45]:
index_name = 'music-recommender-euclidean'
# Embedding width, read from the first row by position. The frame's index labels are
# not contiguous after filtering, so a label lookup of 0 is not guaranteed to exist.
dim = len(embedded_features['values'].iloc[0])

In [46]:
# only create index if it doesn't exist (makes the cell safe to re-run)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='euclidean'
    )
    # wait a moment for the index to be fully initialized
    # NOTE(review): this polls once per second with no timeout, so the cell blocks
    # indefinitely if the index never reports ready.
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [47]:
# now connect to the index
index_e = pinecone.GRPCIndex(index_name)
index_e.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

In [54]:
index_e.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [55]:
index_e.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

### Store embeddings to Pinecone - Dotproduct

In [48]:
index_name = 'music-recommender-dotproduct'
# Embedding width, read from the first row by position. The frame's index labels are
# not contiguous after filtering, so a label lookup of 0 is not guaranteed to exist.
dim = len(embedded_features['values'].iloc[0])

In [49]:
# only create index if it doesn't exist (makes the cell safe to re-run)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='dotproduct'
    )
    # wait a moment for the index to be fully initialized
    # NOTE(review): this polls once per second with no timeout, so the cell blocks
    # indefinitely if the index never reports ready.
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [50]:
# now connect to the index
index_d = pinecone.GRPCIndex(index_name)
index_d.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

In [59]:
index_d.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [60]:
index_d.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1140000}},
 'total_vector_count': 1140000}

## Query

- We will generate recommendations for 3 sets of data (Seanna’s top 10 favorite songs, Yuhan’s top 10 favorite songs, and the top 10 most streamed songs in 2023) across 3 different similarity metrics.
- For each similarity metric, we will generate recommendation for each set of data by either providing 1 combined embedding of 10 songs or a list of 10 embeddings.
- If the query song exists in our database, the most similar song will be itself. Thus, we take its 2nd most similar song as the recommendation.

In [64]:
def combined_query_result(index, song_embeddings):
    """Query ``index`` with one combined embedding and tabulate the top 10 matches.

    Args:
        index: Pinecone index handle to query.
        song_embeddings: a single query vector (e.g. the mean of several
            song embeddings).

    Returns:
        DataFrame with columns ``song_name``, ``artists`` and
        ``similarity_score`` (the score as a string), one row per match in the
        order returned by the index.

    Each returned id is resolved against the notebook-level ``songs_df`` and
    is expected to match exactly one row there.
    """
    response = index.query(song_embeddings, top_k=10, include_metadata=True)

    rows = []
    for match in response['matches']:
        # Map the stored vector id back to the catalogue row to get readable names.
        track = songs_df[songs_df['id'] == match['id']]
        rows.append([track['name'].item(), track['artists'].item(), str(match['score'])])

    return pd.DataFrame(rows, columns=['song_name', 'artists', 'similarity_score'])

# get one recommendation per query song, each based on that song's own embedding vector
# if any given song is contained in the database, take its 2nd most similar song as recommendation
def individual_query_result(index, query_songs, query_embeddings):
    """Recommend one song for each query song by querying `index` song by song.

    For every query song the two nearest matches are requested. When the top
    match is the query song itself (same name, case-insensitive, and the query
    artist appears in the match's artists string), the second match is used
    instead. If the index returned only one match, that match is kept as the
    fallback rather than failing; if it returned none, the song is skipped.

    Parameters
    ----------
    index
        Object exposing ``query(vector, top_k=..., include_metadata=...)`` whose
        response supports ``response['matches']``.
    query_songs
        Table with ``track_name`` and ``artist_name`` columns.
    query_embeddings
        Indexable by position ``i``; ``query_embeddings[i]`` is the embedding of
        the i-th row of `query_songs`.

    Returns
    -------
    pandas.DataFrame
        Columns ``fav_song``, ``artists``, ``match_name``, ``match_artists`` and
        ``similarity_score`` (score kept as a string).

    Raises
    ------
    ValueError
        If a returned id does not resolve to exactly one row of the global
        ``songs_df`` (raised by ``Series.item()``).
    """
    def _lookup(match):
        # resolve a match id to its (name, artists) pair in the song catalog
        song = songs_df[songs_df['id'] == match['id']]
        return song['name'].item(), song['artists'].item()

    result = []
    fav_song_names = query_songs['track_name'].tolist()
    fav_song_artists = query_songs['artist_name'].tolist()

    # get recommendation for each song
    for i, (fav_name, fav_artist) in enumerate(zip(fav_song_names, fav_song_artists)):
        matches = index.query(query_embeddings[i], top_k=2, include_metadata=True)['matches']
        if not matches:
            # nothing to recommend for this song
            continue

        chosen = matches[0]
        match_name, match_artists = _lookup(chosen)

        # the top hit is the query song itself -> prefer the 2nd most similar song,
        # but only when the index actually returned one
        is_query_song = (fav_name.lower() == match_name.lower()
                         and fav_artist.lower() in match_artists.lower())
        if is_query_song and len(matches) > 1:
            chosen = matches[1]
            match_name, match_artists = _lookup(chosen)

        result.append([fav_name, fav_artist, match_name, match_artists, str(chosen['score'])])

    return pd.DataFrame(result, columns=['fav_song', 'artists', 'match_name', 'match_artists', 'similarity_score'])

### Query - Dotproduct

#### Combined Song Vector as a Single Query

In [66]:
# personal listening history, top 10 averaged into one query vector - Yuhan (dotproduct index)
dp_yuhan_10 = combined_query_result(index_d, mean_yuhan_song_embeddings)
dp_yuhan_10

Unnamed: 0,song_name,artists,similarity_score
0,Can't Get Mad At You,['Downchild Blues Band'],30.636154
1,Can't Take It with You,['Pilot'],30.395498
2,Can't Stop Me,['Dub'],30.091648
3,You Can't Take It Away,['Odyssey'],30.082003
4,Can't Get It Out,['Brand New'],29.852036
5,Can't Go Back,['Dirty Streets'],29.802073
6,Just Can't Take It,['Restless'],29.496151
7,Don't Let It Get You Down,['Aloud'],29.462996
8,Can't Take It With You When You Go,['Mike Love'],29.450842
9,Can't Get Enough,['Bad Company'],29.27934


In [67]:
# personal listening history, top 10 averaged into one query vector - Seanna (dotproduct index)
dp_seanna_10 = combined_query_result(index_d, mean_seanna_song_embeddings)
dp_seanna_10

Unnamed: 0,song_name,artists,similarity_score
0,Don't You Want It,['Five'],39.277
1,You Can't Take It Away,['Odyssey'],39.27087
2,Just Can't Take It,['Restless'],38.98331
3,Can't Do,['Everything Everything'],38.88067
4,Can't Let You Go,['Freestyle'],38.267757
5,Can't Get Mad At You,['Downchild Blues Band'],38.164177
6,Just Can't Do It,['Quictamac'],38.051643
7,Can't Get It Out,['Brand New'],38.01498
8,Can't Go Back,['Dirty Streets'],37.997864
9,Don't Let It Go Out,['GUM'],37.810455


In [68]:
# Spotify top 10 songs averaged into one query vector (dotproduct index)
dp_spotify_10 = combined_query_result(index_d, mean_top_10_song_embeddings)
dp_spotify_10

Unnamed: 0,song_name,artists,similarity_score
0,Can't Get Mad At You,['Downchild Blues Band'],32.61857
1,Can't Get It Out,['Brand New'],32.59065
2,Can't Stop Me,['Dub'],32.083508
3,Don't Stop - 2018 Remaster,['Fleetwood Mac'],32.040375
4,Can't See at All,['Woods'],32.038857
5,Can't Let You Go,['Freestyle'],32.036858
6,You Can't Take It Away,['Odyssey'],32.03457
7,Can't Let You Go,['The Hound'],31.781559
8,Can't Go Back,['Dirty Streets'],31.532362
9,Can't Help It,['Money Man'],31.51015


In [74]:
# pool the scores of the three combined-vector result tables (dotproduct);
# the tables store scores as strings, so each one is cast back to float
dp_10_scores = [
    float(score)
    for table in (dp_yuhan_10, dp_seanna_10, dp_spotify_10)
    for score in table['similarity_score'].tolist()
]

#### Individual Song Vector as a Single Query

In [75]:
# personal listening history, top 10 queried one song at a time - Yuhan (dotproduct index)
dp_yuhan_1 = individual_query_result(index_d, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)
dp_yuhan_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,Who Let The Dogs Out,['The Hit Crew'],42.341652
1,Wonderful U,AGA,I CAN'T,['NNAMDÏ'],46.416317
2,Forever Young,Eve Ai,Don't Get to Know Me,['Should'],51.835213
3,Lover,Taylor Swift,You Can't Take It,['Linda Jones'],39.14606
4,At My Worst,Pink Sweat$,Just Can't Take It,['Restless'],43.84322
5,RADIO,HENRY,Respect (with The Royal Philharmonic Orchestra),"['Aretha Franklin', 'Royal Philharmonic Orches...",58.218643
6,The Most Beautiful Thing,Bruno Major,Major Major Major,['The Jac'],97.77532
7,deja vu,Olivia Rodrigo,La Banda Del Ganador,['Biper Y Sus Amigos'],26.763449
8,Anti-Hero,Taylor Swift,Down & Dirty,['Little Mix'],30.854706
9,Question...?,Taylor Swift,Can't Help It,['Money Man'],39.04671


In [76]:
# personal listening history, top 10 queried one song at a time - Seanna (dotproduct index)
dp_seanna_1 = individual_query_result(index_d, seanna_favorite_songs_to_search, seanna_songs_embeddings)
dp_seanna_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Midnight Crew,['The Dirty City Brothers'],35.21749
1,Say Something,A Great Big World,You Can't Do It (So Give Up Now),['Strong Bad'],62.341515
2,Bailando - Spanish Version,Enrique Iglesias,El Boom,['Cumbia Latin Band'],34.864376
3,Teeth,5 Seconds of Summer,New New New New New,['The Tripwires'],40.78542
4,100 Degrees,Rich Brian,Something for Nothing,['Tom Scott'],39.8871
5,I WANNA BE YOUR SLAVE,Måneskin,Can't Do,['Everything Everything'],76.80456
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,Demons From Beyond,['The Hit Crew'],40.849136
7,Gotta Have You,The Weepies,Don't You Want It,['Five'],71.76749
8,You Belong With Me,Taylor Swift,Don't Let It Get You Down,['America'],69.60925
9,Marry You,Bruno Mars,You Can't Do That,['Bob Welch'],45.31166


In [77]:
# Spotify top 10 songs queried one song at a time (dotproduct index)
dp_spotify_1 = individual_query_result(index_d, top_10_songs_to_search, top_10_song_embeddings)
dp_spotify_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,. . . So We . . .,['Illuminandi'],54.125313
1,Arcade,Duncan Laurence,Armstrong Street,['Armstrong Street'],47.438744
2,Heartbreak Anniversary,Giveon,Defense,['The Hit Crew'],29.742437
3,Where Are You Now,Lost Frequencies,Can't Do,['Everything Everything'],75.01314
4,Alone,Burna Boy,Roll It Up,['Lil B'],42.53032
5,Anti-Hero,Taylor Swift,Down & Dirty,['Little Mix'],30.854706
6,Seek & Destroy,SZA,On & On & On,['Caamp'],35.736588
7,Glimpse of Us,Joji,Four,['Sleeping At Last'],36.92406
8,Used (feat. Don Toliver),SZA,Thug,['Slim Thug'],50.24612
9,Come Back Home,Sofia Carson,Never Gonna Let You Go,['Wake Me'],57.373844


In [84]:
# pool the scores of the three one-song-at-a-time result tables (dotproduct);
# the tables store scores as strings, so each one is cast back to float
dp_1_scores = [
    float(score)
    for table in (dp_yuhan_1, dp_seanna_1, dp_spotify_1)
    for score in table['similarity_score'].tolist()
]

In [85]:
# get stats of recommended songs' similarity scores (max, min, median)
# first line: combined-vector queries; second line: one-song-at-a-time queries
# `median` imported here is reused by the later stats cells
from statistics import median
print(max(dp_10_scores), min(dp_10_scores), median(dp_10_scores))
print(max(dp_1_scores), min(dp_1_scores), median(dp_1_scores))

39.277 29.27934 32.0378575
97.77532 26.763449 43.18677


### Query - Euclidean

#### Combined Song Vector as a Single Query

In [86]:
# personal listening history, top 10 averaged into one query vector - Yuhan (euclidean index)
eu_yuhan_10 = combined_query_result(index_e, mean_yuhan_song_embeddings)
eu_yuhan_10

Unnamed: 0,song_name,artists,similarity_score
0,Slow Dancing,['Tayla Parx'],4.1396484
1,Slow Mover,['Angie McMahon'],4.588917
2,Speechless - The Vocals,['Dan + Shay'],4.755533
3,Press Record,['Proper Einstein'],4.8064327
4,ok,['Mating Ritual'],4.8520107
5,Haunted,['Zoe Boekbinder'],4.8989677
6,Lately,['Wet'],4.915951
7,Wasted,['NNAMDÏ'],4.9482365
8,Legendary,['DJLC'],5.038601
9,Super Powers Enable Me to Blend in with Machinery,['Dave Hause'],5.0402527


In [87]:
# personal listening history, top 10 averaged into one query vector - Seanna (euclidean index)
eu_seanna_10 = combined_query_result(index_e, mean_seanna_song_embeddings)
eu_seanna_10

Unnamed: 0,song_name,artists,similarity_score
0,Blueprint,['Stray Kids'],6.4315033
1,Stick Around,['AC/DC'],6.764824
2,Bad Talker,['Action Camp'],6.782444
3,Horseshoe Crabs,['Hop Along'],6.8797264
4,Remain,['Action Camp'],7.104904
5,Rock or Bust,['AC/DC'],7.1814003
6,Horsefly,['Dirty Heads'],7.404831
7,Intuition,['Hot October'],7.426361
8,Chronosaurus,['Stray Kids'],7.473938
9,Dry Bones,['April Verch'],7.660988


In [88]:
# Spotify top 10 songs averaged into one query vector (euclidean index)
eu_spotify_10 = combined_query_result(index_e, mean_top_10_song_embeddings)
eu_spotify_10

Unnamed: 0,song_name,artists,similarity_score
0,Dry Bones,['April Verch'],5.923153
1,Tequila - The Vocals,['Dan + Shay'],6.210661
2,Horseshoe Crabs,['Hop Along'],6.668228
3,Anxious - Demo,['Hippo Campus'],6.8448544
4,The Cure,['Smileyface'],7.152439
5,Slow Mover,['Angie McMahon'],7.208351
6,Shehecheyanu,['Sheldon Low'],7.271639
7,Failure to Communicate,['Pluto Gang'],7.2978554
8,Super Powers Enable Me to Blend in with Machinery,['Dave Hause'],7.30233
9,Slow Dancing,['Tayla Parx'],7.3190994


In [89]:
# pool the scores of the three combined-vector result tables (euclidean);
# the tables store scores as strings, so each one is cast back to float
eu_10_scores = [
    float(score)
    for table in (eu_yuhan_10, eu_seanna_10, eu_spotify_10)
    for score in table['similarity_score'].tolist()
]

#### Individual Song Vector as a Single Query

In [90]:
# personal listening history, top 10 queried one song at a time - Yuhan (euclidean index)
eu_yuhan_1 = individual_query_result(index_e, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)
eu_yuhan_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,Break the Ice at Parties,['Tesco Bombers'],6.318424
1,Wonderful U,AGA,Ya Cai,['Arpex'],9.998951
2,Forever Young,Eve Ai,Out For Kicks,['Super XX Man'],11.661575
3,Lover,Taylor Swift,Coat Off,['Annie Barker'],5.1978416
4,At My Worst,Pink Sweat$,Like a Gorilla,['Virtual Gorilla'],11.94331
5,RADIO,HENRY,Epic,['ARTHUR'],28.15828
6,The Most Beautiful Thing,Bruno Major,Social Club,['The Minor Leagues'],12.079643
7,deja vu,Olivia Rodrigo,Cartas Amarillas,['Nino Segarra'],7.075741
8,Anti-Hero,Taylor Swift,Tcr,['Jack Adaptor'],4.4171257
9,Question...?,Taylor Swift,run dry,['ilham'],8.631794


In [91]:
# personal listening history, top 10 queried one song at a time - Seanna (euclidean index)
eu_seanna_1 = individual_query_result(index_e, seanna_favorite_songs_to_search, seanna_songs_embeddings)
eu_seanna_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Dimension,['AZRA'],5.866844
1,Say Something,A Great Big World,Too Much,['The Hit Crew'],12.268929
2,Bailando - Spanish Version,Enrique Iglesias,Venezia,['Ryuichi Sakamoto'],16.432762
3,Teeth,5 Seconds of Summer,Molasses in January,['Team Dresch'],12.159313
4,100 Degrees,Rich Brian,Past to Present,"['DJ Jean Maron', 'Kool G Rap']",11.2381935
5,I WANNA BE YOUR SLAVE,Måneskin,Jesus Ain't Dead,['Gasoline Lollipops'],14.716843
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,Series,['Infernal Noise Brigade'],17.169746
7,Gotta Have You,The Weepies,Don't Worry/ Be Happy,['The Hit Crew'],14.480202
8,You Belong With Me,Taylor Swift,Trying To Live Up,['Dolly Varden'],5.2338104
9,Marry You,Bruno Mars,Ik Drink Aan Jou,['Johnny Trash'],8.232044


In [92]:
# Spotify top 10 songs queried one song at a time (euclidean index)
eu_spotify_1 = individual_query_result(index_e, top_10_songs_to_search, top_10_song_embeddings)
eu_spotify_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,Wrong #,['Frank Carillo and the Bandoleros'],12.130318
1,Arcade,Duncan Laurence,Again,['Vance Gilbert'],5.990246
2,Heartbreak Anniversary,Giveon,Chili Packet,['Merce Lemon'],4.911289
3,Where Are You Now,Lost Frequencies,Nothing Left At All,['Jelly Roll'],13.326378
4,Alone,Burna Boy,HOP OUT,['BLCKK'],4.602253
5,Anti-Hero,Taylor Swift,Tcr,['Jack Adaptor'],4.4171257
6,Seek & Destroy,SZA,Messy,['Fifth Harmony'],10.340511
7,Glimpse of Us,Joji,New Candy Necklace,['Tandy'],11.979988
8,Used (feat. Don Toliver),SZA,Seven,['Lil Boom'],25.420288
9,Come Back Home,Sofia Carson,Wake Up Early Bc,['bc likes you'],6.3315735


In [93]:
# pool the scores of the three one-song-at-a-time result tables (euclidean);
# the tables store scores as strings, so each one is cast back to float
eu_1_scores = [
    float(score)
    for table in (eu_yuhan_1, eu_seanna_1, eu_spotify_1)
    for score in table['similarity_score'].tolist()
]

In [95]:
# get stats of recommended songs' similarity scores (max, min, median)
# first line: combined-vector queries; second line: one-song-at-a-time queries
# NOTE(review): the euclidean result tables above list scores in ascending order, so this
# score looks like a distance (lower = closer match) — confirm against Pinecone's metric docs
# before comparing these numbers with the dotproduct / cosine stats
# `median` comes from the `from statistics import median` in the dotproduct stats cell
print(max(eu_10_scores), min(eu_10_scores), median(eu_10_scores))
print(max(eu_1_scores), min(eu_1_scores), median(eu_1_scores))

7.660988 4.1396484 6.7736339999999995
28.15828 4.4171257 10.78935225


### Query - Cosine

#### Combined Song Vector as a Single Query

In [96]:
# personal listening history, top 10 averaged into one query vector - Yuhan (cosine index)
cos_yuhan_10 = combined_query_result(index_c, mean_yuhan_song_embeddings)
cos_yuhan_10

Unnamed: 0,song_name,artists,similarity_score
0,Love In Store - 2018 Remaster,['Fleetwood Mac'],0.8972813
1,Red Shoes - from the Dumplin' Original Motion ...,['Dolly Parton'],0.8866133
2,Push and Pull - from the Dumplin' Original Mot...,['Dolly Parton'],0.88485247
3,I Know - Recorded at Spotify Studios NYC,['Shovels & Rope'],0.88435715
4,Bad Bitches Beat Heartbreak,['Dylan'],0.88037485
5,Slow Dancing,['Tayla Parx'],0.8800815
6,Super Powers Enable Me to Blend in with Machinery,['Dave Hause'],0.8797626
7,Inner World,['Dirty Projectors'],0.8796431
8,Mad At Me,['Quinn XCII'],0.8792892
9,Intuition,['Hot October'],0.87899476


In [97]:
# personal listening history, top 10 averaged into one query vector - Seanna (cosine index)
cos_seanna_10 = combined_query_result(index_c, mean_seanna_song_embeddings)
cos_seanna_10

Unnamed: 0,song_name,artists,similarity_score
0,Mad At It,['Dirty Heads'],0.881808
1,Stick Around,['AC/DC'],0.8796844
2,Blueprint,['Stray Kids'],0.8768941
3,Rock or Bust,['AC/DC'],0.87512755
4,Drop Me Off at the Honky Tonk,['Railbenders'],0.86163086
5,Dirty Elvis,['Upchurch'],0.8608472
6,Zoot Suit Riot,['The Hit Crew'],0.85932213
7,Bad Talker,['Action Camp'],0.8590816
8,Hella Bags,['Nine Up'],0.8586471
9,Gimme Action,['Wild Machine'],0.8557467


In [98]:
# Spotify top 10 songs averaged into one query vector (cosine index)
cos_spotify_10 = combined_query_result(index_c, mean_top_10_song_embeddings)
cos_spotify_10

Unnamed: 0,song_name,artists,similarity_score
0,Down on Rodeo - 2018 Remaster,['Lindsey Buckingham'],0.8771248
1,Up and Down - 2016 Remaster,['The Cars'],0.8730664
2,Love In Store - 2018 Remaster,['Fleetwood Mac'],0.8686049
3,From the Ground Up - Single Version,['Dan + Shay'],0.86332536
4,Arrested for Driving While Blind - 2019 Remaster,['ZZ Top'],0.86052436
5,Driveway to Driveway - 2019 Acoustic,['Superchunk'],0.85797894
6,Woke up with Wood - 2019 Remaster,['ZZ Top'],0.85746956
7,Dry Bones,['April Verch'],0.85599846
8,Beer Drinkers & Hell Raisers - 2019 Remaster,['ZZ Top'],0.85244715
9,To the Funky Beat,['Flying Pooh'],0.8520445


In [99]:
# pool the scores of the three combined-vector result tables (cosine);
# the tables store scores as strings, so each one is cast back to float
cos_10_scores = [
    float(score)
    for table in (cos_yuhan_10, cos_seanna_10, cos_spotify_10)
    for score in table['similarity_score'].tolist()
]

#### Individual Song Vector as a Single Query

In [100]:
# personal listening history, top 10 queried one song at a time - Yuhan (cosine index)
cos_yuhan_1 = individual_query_result(index_c, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)
cos_yuhan_1 

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,Break the Ice at Parties,['Tesco Bombers'],0.90890646
1,Wonderful U,AGA,Ya Cai,['Arpex'],0.82988304
2,Forever Young,Eve Ai,Out For Kicks,['Super XX Man'],0.8752552
3,Lover,Taylor Swift,Coat Off,['Annie Barker'],0.91068643
4,At My Worst,Pink Sweat$,Like a Gorilla,['Virtual Gorilla'],0.8690864
5,RADIO,HENRY,MVA,['Joseph of Mercury'],0.80956733
6,The Most Beautiful Thing,Bruno Major,Damage Control,['Ursa Minor'],0.8768651
7,deja vu,Olivia Rodrigo,Hindi Laro Ang Ibigin Ka,['Anthony Castelo'],0.8227607
8,Anti-Hero,Taylor Swift,Beautiful,['Taylor Dayne'],0.9101022
9,Question...?,Taylor Swift,Pill Talking,['Jelly Roll'],0.8745187


In [101]:
# personal listening history, top 10 queried one song at a time - Seanna (cosine index)
cos_seanna_1 = individual_query_result(index_c, seanna_favorite_songs_to_search, seanna_songs_embeddings)
cos_seanna_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Dimension,['AZRA'],0.89257246
1,Say Something,A Great Big World,Too Much,['The Hit Crew'],0.8908658
2,Bailando - Spanish Version,Enrique Iglesias,Venezia,['Ryuichi Sakamoto'],0.7707542
3,Teeth,5 Seconds of Summer,Molasses in January,['Team Dresch'],0.84874594
4,100 Degrees,Rich Brian,Past to Present,"['DJ Jean Maron', 'Kool G Rap']",0.84929955
5,I WANNA BE YOUR SLAVE,Måneskin,Volám Do Nebe,['Funky Chicken'],0.89763546
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,Series,['Infernal Noise Brigade'],0.80447817
7,Gotta Have You,The Weepies,Don't Worry/ Be Happy,['The Hit Crew'],0.89034337
8,You Belong With Me,Taylor Swift,Trying To Live Up,['Dolly Varden'],0.9472602
9,Marry You,Bruno Mars,Can't Stop Rockin' - 2019 Remaster,['ZZ Top'],0.8733809


In [102]:
# Spotify top 10 songs queried one song at a time (cosine index)
cos_spotify_1 = individual_query_result(index_c, top_10_songs_to_search, top_10_song_embeddings)
cos_spotify_1

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,Wrong #,['Frank Carillo and the Bandoleros'],0.87217396
1,Arcade,Duncan Laurence,Siberia,['Calling Blue Jay'],0.894874
2,Heartbreak Anniversary,Giveon,Chili Packet,['Merce Lemon'],0.8372519
3,Where Are You Now,Lost Frequencies,Nothing Left At All,['Jelly Roll'],0.8989225
4,Alone,Burna Boy,HOP OUT,['BLCKK'],0.92556196
5,Anti-Hero,Taylor Swift,Beautiful,['Taylor Dayne'],0.9101022
6,Seek & Destroy,SZA,On The Outside,['Tech N9ne'],0.8233894
7,Glimpse of Us,Joji,New Candy Necklace,['Tandy'],0.80437165
8,Used (feat. Don Toliver),SZA,Seven,['Lil Boom'],0.762704
9,Come Back Home,Sofia Carson,Wake Up Early Bc,['bc likes you'],0.91820276


In [103]:
# pool the scores of the three one-song-at-a-time result tables (cosine);
# the tables store scores as strings, so each one is cast back to float
cos_1_scores = [
    float(score)
    for table in (cos_yuhan_1, cos_seanna_1, cos_spotify_1)
    for score in table['similarity_score'].tolist()
]

In [104]:
# get stats of recommended songs' similarity scores (max, min, median)
# first line: combined-vector queries; second line: one-song-at-a-time queries
# `median` comes from the `from statistics import median` in the dotproduct stats cell
print(max(cos_10_scores), min(cos_10_scores), median(cos_10_scores))
print(max(cos_1_scores), min(cos_1_scores), median(cos_1_scores))

0.8972813 0.8520445 0.874096975
0.9472602 0.762704 0.87488695
