## Setup Environment

In [1]:
# Required package for this project 
# !pip install pandas \
#             nltk \
#             gensim \
#             scikit-learn \
#             numpy

In [2]:
import pandas as pd
import nltk
# nltk.download('punkt') # for the first time need to download this for tokenization
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
import numpy as np

## Load dataset of songs

Dataset: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs, an open source dataset on Kaggle. It provides nearly 1.2 million of songs in Spotify. Those songs were retreived by using Spotify API.

In [3]:
file_path = '../tracks_features.csv'
songs_df = pd.read_csv(file_path)
print(songs_df.head())

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

## Preprocessing data

We want to perform some operations to select the numeric audio features we want, and also convert those categorical values into numeric one to create the vector embeddings.
The selected features include 2 categorical features (name + artists), and 14 numeric audio features:
- id (unique index)
- name
- artists
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- duration_ms
- time_signature
- year

In [4]:
selected_features_df = songs_df.drop(columns=["album", "album_id", "artist_ids", "track_number", "disc_number", "explicit", "release_date"])
print(selected_features_df.head())

                       id                   name  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire   

                        artists  danceability  energy  key  loudness  mode  \
0  ['Rage Against The Machine']         0.470   0.978    7    -5.399     1   
1  ['Rage Against The Machine']         0.599   0.957   11    -5.764     1   
2  ['Rage Against The Machine']         0.315   0.970    7    -5.424     1   
3  ['Rage Against The Machine']         0.440   0.967   11    -5.830     0   
4  ['Rage Against The Machine']         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.

In [5]:
# check if our filtered features contain any missing value
selected_features_df.isna().any()

id                  False
name                 True
artists             False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
year                False
dtype: bool

In [6]:
# remove those missing value rows
print("Shape before drop missing value: ", selected_features_df.shape)
selected_features_df = selected_features_df.dropna()
print("Shape after drop missing value: ", selected_features_df.shape)

Shape before drop missing value:  (1204025, 17)
Shape after drop missing value:  (1204022, 17)


In [7]:
# some rows contain 0 value for year, we want to filter those row out as well
selected_features_df = selected_features_df[selected_features_df['year'] != 0] 
print("Shape after drop invalid year: ", selected_features_df.shape)

Shape after drop invalid year:  (1204012, 17)


Some songs have multiple artists, we want to convert them from a list to string.
Example: ['Pietro Locatelli', 'Capella Istropolitana', 'Jaroslav Krcek'] to 'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [8]:
def convert_artists_name(artists_list):
    items_list = artists_list.strip("[]").replace("'", "").split(", ")
    return ", ".join(items_list)

selected_features_df["artists"] = selected_features_df["artists"].apply(convert_artists_name)
selected_features_df.iloc[1184]["artists"]

'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [9]:
# remove duplicated rows by song name and artists name
selected_features_df = selected_features_df.drop_duplicates(subset=['name', 'artists'])
print("Shape after duplicated removal: ", selected_features_df.shape)

Shape after duplicated removal:  (1141542, 17)


## Create vector embeddings model

### Create categorical feature vector embeddings

We first need to convert those song and artists name into vector. The converted vector representation will have length of 14, so we can combine these with 14 numeric column values. We will combine the song name with artists name to one column for better tokenize

In [10]:
# perform tokenization operation on the song name and artist columns
def create_tokenized_summary(df, name_col, artist_col):
    # Combine song name and artists columns into a new 'string_summary' column
    df['string_summary'] = df[name_col] + ' - ' + df[artist_col]
    df['string_summary'] = df['string_summary'].astype(str)

    # Drop the original 'name' and 'artists' columns
    df.drop([name_col, artist_col], axis=1, inplace=True)

    # Convert string summaries to lowercase and then tokenize
    df['tokenized_summary'] = df['string_summary'].apply(lambda x: word_tokenize(x.lower()))

In [11]:
# Convert string (tokenized) summaries to vectors
def get_summary_vector(summary, model):
    summary_vector = [model.wv[word] for word in summary if word in model.wv]
    return sum(summary_vector) / len(summary_vector) if summary_vector else [0] * vector_size

In [12]:
def clean_tokenized_summary(df):
    df.drop(['string_summary', 'tokenized_summary'], axis=1, inplace=True)

In [13]:
create_tokenized_summary(selected_features_df, 'name', 'artists')

In [14]:
# Define Word2Vec model parameters (may adjust later)
vector_size = 14
window_size = 5
min_count = 1

# Train Word2Vec model
word2vec_model = Word2Vec(selected_features_df['tokenized_summary'], vector_size=vector_size, window=window_size, min_count=min_count)

In [15]:
summary_vector = selected_features_df['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(selected_features_df)
print(summary_vector[0])

[ 2.746404    0.58580625 -1.5455157   1.8050575   0.5681629  -0.7359647
 -0.6501735   2.0985696   0.1496685  -0.36264658  0.01724947  0.70348424
 -2.1843083   0.6678187 ]


### Create numerical features vector embeddings

The numerical columns are audio characteristics of the song, and we want to scale all the values to make it become the embeddings.

In [90]:
# Extract the numeric columns (excluding 'id')
numeric_columns = selected_features_df.drop(['id'], axis=1)
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(numeric_columns)
# Display the resulting DataFrame
print(scaled_columns[0])

[-0.11474546  1.59347817  0.5103876   0.92309668  0.70122776 -0.1040544
 -1.09743536 -0.76135497  0.86473578  0.28529403  0.01103821 -0.23857829
  0.30032326 -0.80939027]


### Merged vector embeddings to create final one

Finally, we want to merge those summary vector (name & artisits) with scaled vector (audio charactersitcs) to make the embeddings for each song.

In [91]:
def merged_embeddings(summary_vector, scaled_columns):
    song_embeddings = [
        np.concatenate([summary_row, scaled_row])
        for summary_row, scaled_row in zip(summary_vector, scaled_columns)
    ]
    print("First song's embedding: ", song_embeddings[0])
    print("Size for entire dataset: ", len(song_embeddings), ", ", len(song_embeddings[0]))
    return song_embeddings

In [92]:
song_embeddings = merged_embeddings(summary_vector, scaled_columns)

First song's embedding:  [ 2.74640393  0.58580625 -1.54551566  1.80505753  0.56816292 -0.73596472
 -0.65017349  2.09856963  0.1496685  -0.36264658  0.01724947  0.70348424
 -2.18430829  0.66781873 -0.11474546  1.59347817  0.5103876   0.92309668
  0.70122776 -0.1040544  -1.09743536 -0.76135497  0.86473578  0.28529403
  0.01103821 -0.23857829  0.30032326 -0.80939027]
Size for entire dataset:  1141542 ,  28


In [93]:
# Combining those things into our final table for uploading to Pinecone. The table should have two columns, one is id, and another one is song embeddings representation.
embedded_features = selected_features_df[["id"]].copy()
embedded_features.loc[:, "values"] = song_embeddings
print(embedded_features.head())
print(embedded_features.shape)

                       id                                             values
0  7lmeHLHBe4nmXzuXc0HDjk  [2.746403932571411, 0.5858062505722046, -1.545...
1  1wsRitfRRtWyEapl0q22o8  [2.894444704055786, 0.8064025044441223, -1.256...
2  1hR0fIFK2qRG3f3RF70pb7  [3.7159016132354736, 0.9189034700393677, -1.91...
3  2lbASgTSoDO7MTuLAXlTW0  [2.688814878463745, 0.6879237294197083, -1.393...
4  1MQTmpYOZ6fcMQc56Hdo7T  [3.273085832595825, 0.5289910435676575, -1.764...
(1141542, 2)


## Prepare dataset for searching similar songs

Two different search strategies:
1. Combined all history songs into one embedding for query, get top 10 recommendations
2. Convert each individual into one embedding, perform 10 queries to get the top 1 recommendation for each one

Two query sources:
1. Personal favorite song & listening history
2. Spotify 2023 top hit 100 songs

Pinecone search metrics:
1. Cosine
2. Euclidean
3. Dotproduct

### Prepare Spotify top 100 song data

Get the most streamed songs in 2023 (datasets: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023/data, https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks)

In [94]:
# We are missing loudness information in here, so we need to use another dataset info
file_path_top_songs = '../spotify-2023.csv'
top_songs = pd.read_csv(file_path_top_songs, encoding='latin-1')
list(top_songs.columns)

['track_name',
 'artist(s)_name',
 'artist_count',
 'released_year',
 'released_month',
 'released_day',
 'in_spotify_playlists',
 'in_spotify_charts',
 'streams',
 'in_apple_playlists',
 'in_apple_charts',
 'in_deezer_playlists',
 'in_deezer_charts',
 'in_shazam_charts',
 'bpm',
 'key',
 'mode',
 'danceability_%',
 'valence_%',
 'energy_%',
 'acousticness_%',
 'instrumentalness_%',
 'liveness_%',
 'speechiness_%']

In [95]:
# get top 10 hits songs in 2023 that released in recent 10 years
filtered_songs = top_songs[(top_songs['released_year'] > 2014) & (top_songs['released_year'] < 2023)]
top_10_songs = filtered_songs.sort_values(by = "streams", ascending = False).iloc[:10,:]

# Remove the last song from top_10_songs as the last one did not in all songs dataset
top_10_songs = top_10_songs.iloc[:-1, :]

# Get the next song in the sorted order
next_song = filtered_songs.sort_values(by="streams", ascending=False).iloc[10:11, :]

# Concatenate top_10_songs and next_song
top_10_songs = pd.concat([top_10_songs, next_song], ignore_index=True)
top_10_songs

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Anti-Hero,Taylor Swift,1,2022,10,21,9082,56,999748277,242,...,97,E,Major,64,51,63,12,0,19,5
1,Arcade,Duncan Laurence,1,2019,3,7,6646,0,991336132,107,...,72,A,Minor,45,27,33,82,0,14,4
2,Glimpse of Us,Joji,1,2022,6,10,6330,6,988515741,109,...,170,G#,Major,44,27,32,89,0,14,5
3,Seek & Destroy,SZA,1,2022,12,9,1007,0,98709329,5,...,152,C#,Major,65,35,65,44,18,21,7
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson,1,2022,7,12,367,0,97610446,28,...,145,G,Major,56,43,53,24,0,12,4
5,Where Are You Now,"Lost Frequencies, Calum Scott",2,2021,7,30,10565,44,972509632,238,...,121,F#,Minor,67,26,64,52,0,17,10
6,Alone,Burna Boy,1,2022,11,4,782,2,96007391,27,...,90,E,Minor,61,32,67,15,0,11,5
7,No Lie,"Sean Paul, Dua Lipa",2,2016,11,18,7370,0,956865266,92,...,102,G,Major,74,45,89,5,0,26,13
8,HEARTBREAK ANNIVERSARY,Giveon,1,2020,2,21,5398,4,951637566,111,...,129,,Major,61,59,46,56,0,13,5
9,Used (feat. Don Toliver),"SZA, Don Toliver",2,2022,12,8,1042,0,94005786,7,...,150,A#,Minor,73,71,69,53,0,32,9


In [96]:
# extract top 10 songs name to search in all songs dataset
top_10_songs_to_search = top_10_songs[['track_name', 'artist(s)_name']]
top_10_songs_to_search = top_10_songs_to_search.rename(columns={'track_name': 'track_name', 'artist(s)_name': 'artist_name'})

# Split 'artists_name' and keep only the first part, because the another dataset only keep one artist
top_10_songs_to_search['artist_name'] = top_10_songs_to_search['artist_name'].str.split(',').str[0]

top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,"Come Back Home - From ""Purple Hearts""",Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,HEARTBREAK ANNIVERSARY,Giveon
9,Used (feat. Don Toliver),SZA


In [97]:
# manually exchange song name values as they are not the same across the dataset
top_10_songs_to_search.loc[4, "track_name"] = "Come Back Home"
top_10_songs_to_search.loc[8, "track_name"] = "Heartbreak Anniversary"
top_10_songs_to_search

Unnamed: 0,track_name,artist_name
0,Anti-Hero,Taylor Swift
1,Arcade,Duncan Laurence
2,Glimpse of Us,Joji
3,Seek & Destroy,SZA
4,Come Back Home,Sofia Carson
5,Where Are You Now,Lost Frequencies
6,Alone,Burna Boy
7,No Lie,Sean Paul
8,Heartbreak Anniversary,Giveon
9,Used (feat. Don Toliver),SZA


In [98]:
file_path_all_songs = '../spotify_data.csv'
all_songs = pd.read_csv(file_path_all_songs, index_col = 0)
print(all_songs.head())

     artist_name        track_name                track_id  popularity  year  \
0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6          68  2012   
1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218          50  2012   
2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F          57  2012   
3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz          58  2012   
4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8          54  2012   

      genre  danceability  energy  key  loudness  mode  speechiness  \
0  acoustic         0.483   0.303    4   -10.058     1       0.0429   
1  acoustic         0.572   0.454    3   -10.286     1       0.0258   
2  acoustic         0.409   0.234    3   -13.711     1       0.0323   
3  acoustic         0.392   0.251   10    -9.845     1       0.0363   
4  acoustic         0.430   0.791    6    -5.419     0       0.0302   

   acousticness  instrumentalness  liveness  valence    tempo  duration_ms  \
0        0.694

In [99]:
# Get all the top songs completed information
selected_10_songs = pd.merge(all_songs, top_10_songs_to_search, on=['track_name', 'artist_name'], how='inner')
selected_10_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Sean Paul,No Lie,1Vb4HQnN2kZ5Y2KgYF5TDV,57,2016,dance,0.742,0.882,7,-2.862,1,0.117,0.0466,0.0,0.206,0.463,102.04,221176,4
1,Duncan Laurence,Arcade,1Xi84slp6FryDSCbzq4UCD,77,2019,pop,0.45,0.329,9,-12.603,0,0.0441,0.818,0.00109,0.135,0.266,71.884,183624,3
2,Giveon,Heartbreak Anniversary,3FAJ6O0NOHQV8Mc5Ri6ENp,79,2020,pop,0.449,0.465,0,-8.964,1,0.0791,0.524,1e-06,0.303,0.543,89.087,198371,3
3,Lost Frequencies,Where Are You Now,3uUuGVFu1V7jTQL60S1r8z,84,2021,dance,0.671,0.636,6,-8.117,0,0.103,0.515,0.000411,0.172,0.262,120.966,148197,4
4,Burna Boy,Alone,0AoBY2Y3qs6dtGgOD6c91N,77,2022,dance,0.6,0.659,4,-7.264,0,0.0542,0.176,0.0,0.111,0.307,89.955,221747,4
5,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
6,SZA,Seek & Destroy,6eT2V7nKXyMf47TwPbtgAD,79,2022,pop,0.651,0.647,1,-5.415,1,0.0654,0.437,0.175,0.205,0.345,152.069,203733,4
7,Joji,Glimpse of Us,6xGruZOHLs39ZbVccQTuPZ,85,2022,pop,0.44,0.317,8,-9.258,1,0.0531,0.891,5e-06,0.141,0.268,169.914,233456,3
8,SZA,Used (feat. Don Toliver),1TweDM3JC49LNeelLVg3yX,76,2022,pop,0.734,0.689,10,-6.454,0,0.0871,0.532,8.5e-05,0.322,0.705,149.579,70160,4
9,Sofia Carson,Come Back Home,1I4dwH7C0jBAEtz5DjlJgQ,73,2022,pop,0.552,0.531,7,-7.732,1,0.0421,0.241,1.2e-05,0.122,0.438,144.946,176859,4


In [100]:
top_10_songs_to_search = selected_10_songs[['artist_name', 'track_name']]
top_10_songs_to_search

Unnamed: 0,artist_name,track_name
0,Sean Paul,No Lie
1,Duncan Laurence,Arcade
2,Giveon,Heartbreak Anniversary
3,Lost Frequencies,Where Are You Now
4,Burna Boy,Alone
5,Taylor Swift,Anti-Hero
6,SZA,Seek & Destroy
7,Joji,Glimpse of Us
8,SZA,Used (feat. Don Toliver)
9,Sofia Carson,Come Back Home


In [101]:
# format dataset to make sure it has same data format
def format_dataset(df):
    df = df.drop(columns=["track_id", "popularity", "genre"])
    moved_column = df.pop("year")
    df["year"] = moved_column
    return df

In [102]:
selected_10_songs = format_dataset(selected_10_songs)
create_tokenized_summary(selected_10_songs, 'track_name', 'artist_name')
top_10_summary_vector = selected_10_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(selected_10_songs)
print(top_10_summary_vector[0])

[ 2.3160021 -0.2143867 -1.2958574 -0.3584495 -2.334128   0.6242531
  2.2698038  1.9013895  0.7554226 -1.120798   1.0283349  2.695714
 -1.2928238  2.043832 ]


In [103]:
top_10_songs_scaled = scaler.transform(selected_10_songs)
# Display the resulting DataFrame
print(top_10_songs_scaled[0])

[ 1.31791498  1.26844471  0.5103876   1.28543019  0.70122776  0.27467875
 -1.04434575 -0.76138381  0.02881781  0.13741314 -0.50136249 -0.17135925
  0.30032326  0.82074786]


In [104]:
top_10_song_embeddings = merged_embeddings(top_10_summary_vector, top_10_songs_scaled)

First song's embedding:  [ 2.31600213 -0.2143867  -1.29585743 -0.35844949 -2.3341279   0.62425309
  2.26980376  1.90138948  0.75542259 -1.12079799  1.02833486  2.695714
 -1.29282379  2.04383206  1.31791498  1.26844471  0.5103876   1.28543019
  0.70122776  0.27467875 -1.04434575 -0.76138381  0.02881781  0.13741314
 -0.50136249 -0.17135925  0.30032326  0.82074786]
Size for entire dataset:  10 ,  28


In [105]:
# mean aggregation method
mean_top_10_song_embeddings = np.mean(top_10_song_embeddings, axis = 0)

### Prepare individual personal song data

Seanna's top 10 favorite song has various genre and style:
1. Teeth - 5 Seconds of Summer
2. I WANNA BE YOUR SLAVE - Måneskin
3. Enemy - from the series Arcane League of Legends - Imagine Dragons
4. Say Something - A Great Big World
5. Marry You - Bruno Mars
6. Gotta Have You - The Weepies
7. 100 Degrees - Rich Brian
8. The Monster - Eminem
9. You Belong With Me - Taylor Swift
10. Bailando - Spanish Version - Enrique Iglesias

In [106]:
seanna_data = {
    'track_name': [
        'Teeth',
        'I WANNA BE YOUR SLAVE',
        'Enemy - from the series Arcane League of Legends',
        'Say Something',
        'Marry You',
        'Gotta Have You',
        '100 Degrees',
        'The Monster',
        'You Belong With Me',
        'Bailando - Spanish Version'
    ],
    'artist_name': [
        '5 Seconds of Summer',
        'Måneskin',
        'Imagine Dragons',
        'A Great Big World',
        'Bruno Mars',
        'The Weepies',
        'Rich Brian',
        'Eminem',
        'Taylor Swift',
        'Enrique Iglesias'
    ]
}

# Create DataFrame
seanna_favorite_songs = pd.DataFrame(seanna_data)

In [107]:
seanna_favorite_songs = pd.merge(all_songs, seanna_favorite_songs, on=['track_name', 'artist_name'], how='inner')
seanna_favorite_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Eminem,The Monster,48RrDBpOSSl1aLVCalGl5C,78,2013,hip-hop,0.781,0.853,1,-3.68,0,0.0715,0.0525,0.0,0.12,0.624,110.049,250189,4
1,A Great Big World,Say Something,78TKtlSLWK8pZAKKW3MyQL,56,2013,piano,0.453,0.146,2,-8.976,1,0.0343,0.867,3e-06,0.0945,0.0915,137.905,229400,3
2,Enrique Iglesias,Bailando - Spanish Version,32lm3769IRfcnrQV11LO4E,67,2014,pop,0.723,0.777,7,-3.503,1,0.108,0.0426,4e-06,0.0451,0.961,91.017,243413,4
3,5 Seconds of Summer,Teeth,26wLOs3ZuHJa2Ihhx6QIE6,76,2019,dance,0.756,0.448,3,-2.993,0,0.0404,0.0508,4e-06,0.11,0.431,139.031,204887,4
4,Rich Brian,100 Degrees,2ZDpSQfBdgkooeXw6oj3Uz,57,2019,hip-hop,0.756,0.648,0,-5.287,1,0.0731,0.118,0.0,0.515,0.657,80.979,166146,4
5,Måneskin,I WANNA BE YOUR SLAVE,4pt5fDVTg5GhEvEtlz9dKk,81,2021,indie-pop,0.75,0.608,1,-4.008,1,0.0387,0.00165,0.0,0.178,0.958,132.507,173347,4
6,Imagine Dragons,Enemy - from the series Arcane League of Legends,45lFaFCHXmpCiiMDvtihIv,1,2023,rock,0.728,0.783,11,-4.424,0,0.266,0.237,0.0,0.434,0.555,77.011,173381,4
7,The Weepies,Gotta Have You,1YjMWOorkBaP4MdKkKtp4y,50,2005,acoustic,0.678,0.363,11,-10.9,1,0.0318,0.872,0.000101,0.0798,0.543,75.004,199787,5
8,Taylor Swift,You Belong With Me,3GCL1PydwsLodcpv0Ll1ch,68,2008,pop,0.687,0.783,6,-4.44,1,0.0386,0.162,1.3e-05,0.114,0.443,129.964,231133,4
9,Bruno Mars,Marry You,22PMfvdz35fFKYnJyMn077,74,2010,dance,0.621,0.82,10,-4.865,1,0.0367,0.332,0.0,0.104,0.452,144.905,230192,4


In [108]:
seanna_favorite_songs_to_search = seanna_favorite_songs[['artist_name', 'track_name']]
seanna_favorite_songs_to_search

Unnamed: 0,artist_name,track_name
0,Eminem,The Monster
1,A Great Big World,Say Something
2,Enrique Iglesias,Bailando - Spanish Version
3,5 Seconds of Summer,Teeth
4,Rich Brian,100 Degrees
5,Måneskin,I WANNA BE YOUR SLAVE
6,Imagine Dragons,Enemy - from the series Arcane League of Legends
7,The Weepies,Gotta Have You
8,Taylor Swift,You Belong With Me
9,Bruno Mars,Marry You


In [109]:
seanna_favorite_songs = format_dataset(seanna_favorite_songs)
create_tokenized_summary(seanna_favorite_songs, 'track_name', 'artist_name')
seanna_summary_vector = seanna_favorite_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(seanna_favorite_songs)

seanna_songs_scaled = scaler.transform(seanna_favorite_songs)
seanna_songs_embeddings = merged_embeddings(seanna_summary_vector, seanna_songs_scaled)
mean_seanna_song_embeddings = np.mean(seanna_songs_embeddings, axis = 0)

First song's embedding:  [ 2.18169737  0.22475466 -1.17476308  1.14897251 -0.20428205 -0.55098599
 -0.45541486  1.33938098  0.45629466 -0.69557673  0.78355527  1.24850774
 -2.04820204  0.08276778  1.5233332   1.17025752 -1.18626933  1.1686037
 -1.42607018 -0.11431354 -1.02906631 -0.76138381 -0.45044182  0.73263371
 -0.24270768  0.00524366  0.30032326  0.53307642]
Size for entire dataset:  10 ,  28


Yuhan's top 10 favorite song has similar genre and style:
1. Anti-Hero - Taylor Swift
2. Lover - Taylor Swift
3. Question...? - Taylor Swift
4. deja vu - Olivia Rodrigo
5. RADIO - HENRY
6. Wonderful U - AGA
7. Forever Young - Eve Ai
8. Something's Wrong with the Morning - Margo Guryan
9. The Most Beautiful Thing - Bruno Major
10. At My Worst - Pink Sweat$

In [119]:
yuhan_data = {
    'track_name': [
        'Anti-Hero',
        'Lover',
        'Question...?',
        'deja vu',
        'RADIO',
        'Wonderful U',
        'Forever Young',
        "Something's Wrong with the Morning",
        'The Most Beautiful Thing',
        'At My Worst'
    ],
    'artist_name': [
        'Taylor Swift',
        'Taylor Swift',
        'Taylor Swift',
        'Olivia Rodrigo',
        'HENRY',
        'AGA',
        'Eve Ai',
        'Margo Guryan',
        'Bruno Major',
        'Pink Sweat$'
    ]
}

# Create DataFrame
yuhan_favorite_songs = pd.DataFrame(yuhan_data)
yuhan_favorite_songs = pd.merge(all_songs, yuhan_favorite_songs, on=['track_name', 'artist_name'], how='inner')
yuhan_favorite_songs

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Margo Guryan,Something's Wrong with the Morning,0IqQoCYYaSeM2ThWKPGoXX,52,2014,pop,0.656,0.567,2,-8.128,0,0.0352,0.682,0.000315,0.106,0.71,133.558,105573,4
1,AGA,Wonderful U,2eSNpIOFoi1Q8wxw6CycXJ,47,2016,cantopop,0.557,0.436,6,-8.569,1,0.0676,0.809,0.0,0.151,0.246,179.997,248551,3
2,Eve Ai,Forever Young,25sQT3yCEgd1uE6LC9ivcs,51,2018,singer-songwriter,0.304,0.226,0,-10.707,1,0.0329,0.929,0.0,0.161,0.323,139.593,313907,4
3,Taylor Swift,Lover,1dGr1c8CrMLDpV6mPbImSI,83,2019,pop,0.359,0.543,7,-7.582,1,0.0919,0.492,1.6e-05,0.118,0.453,68.534,221307,4
4,Pink Sweat$,At My Worst,0ri0Han4IRJhzvERHOZTMr,71,2020,chill,0.813,0.415,0,-5.926,1,0.0349,0.777,0.0,0.131,0.667,91.921,170345,4
5,HENRY,RADIO,4Dyb1oDEx4togM79cHL8UK,48,2020,k-pop,0.761,0.766,0,-5.414,1,0.143,0.118,0.0,0.111,0.266,146.879,191985,4
6,Bruno Major,The Most Beautiful Thing,07koEqsKHZTlGVMC9eoEjO,67,2020,pop,0.806,0.362,7,-10.386,1,0.0344,0.541,0.0489,0.111,0.418,127.498,235427,4
7,Olivia Rodrigo,deja vu,6HU7h9RYOaPRFeh0R3UeAr,83,2021,pop,0.442,0.612,2,-7.222,1,0.112,0.584,6e-06,0.37,0.178,180.917,215507,4
8,Taylor Swift,Anti-Hero,0V3wPSX9ygBnCm8psDIegu,92,2022,pop,0.637,0.643,4,-6.571,1,0.0519,0.13,2e-06,0.142,0.533,97.008,200690,4
9,Taylor Swift,Question...?,0heeNYlwOGuUSe7TgUD27B,74,2022,pop,0.751,0.502,7,-8.763,1,0.167,0.2,0.0,0.296,0.106,108.943,210557,4


In [120]:
yuhan_favorite_songs_to_search = yuhan_favorite_songs[['artist_name', 'track_name']]
yuhan_favorite_songs_to_search

Unnamed: 0,artist_name,track_name
0,Margo Guryan,Something's Wrong with the Morning
1,AGA,Wonderful U
2,Eve Ai,Forever Young
3,Taylor Swift,Lover
4,Pink Sweat$,At My Worst
5,HENRY,RADIO
6,Bruno Major,The Most Beautiful Thing
7,Olivia Rodrigo,deja vu
8,Taylor Swift,Anti-Hero
9,Taylor Swift,Question...?


In [121]:
yuhan_favorite_songs = format_dataset(yuhan_favorite_songs)
create_tokenized_summary(yuhan_favorite_songs, 'track_name', 'artist_name')
yuhan_summary_vector = yuhan_favorite_songs['tokenized_summary'].apply(lambda x: get_summary_vector(x, word2vec_model))
clean_tokenized_summary(yuhan_favorite_songs)

yuhan_songs_scaled = scaler.transform(yuhan_favorite_songs)
yuhan_songs_embeddings = merged_embeddings(yuhan_summary_vector, yuhan_songs_scaled)
mean_yuhan_song_embeddings = np.mean(yuhan_songs_embeddings, axis = 0)

First song's embedding:  [ 3.79312015  0.443066   -1.0267359   1.2555654  -0.88453609 -0.05224168
  0.15272658  0.52212608  2.03030443 -1.15595007  0.75490409  1.32537699
 -2.10083055 -0.04597534  0.86494145  0.20192866 -0.90349317  0.53334179
 -1.42607018 -0.42465243  0.60117314 -0.76055037 -0.52846083  1.05057762
  0.51652767 -0.87503781  0.30032326  0.6289669 ]
Size for entire dataset:  10 ,  28


## Store Embeddings to Pinecone

In [40]:
# !pip install -qU \
#   "pinecone-client[grpc]"==2.2.1

In [41]:
import os
import pinecone
import time

  from tqdm.autonotebook import tqdm


In [42]:
PINECONE_API_KEY = '03367330-5730-4400-ac60-9ab695a047c0'
PINECONE_ENV = 'us-east-1-aws'

In [43]:
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

### Store embeddings to Pinecone - Cosine

In [44]:
index_name = 'music-recommender-cosine'
dim = len(embedded_features['values'][0])

In [45]:
# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='cosine'
    )
    # wait a moment for the index to be fully initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [47]:
# now connect to the index
index_c = pinecone.GRPCIndex(index_name)
index_c.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [48]:
index_c.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [49]:
index_c.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

### Store embeddings to Pinecone - Euclidean

In [50]:
index_name = 'music-recommender-euclidean'
dim = len(embedded_features['values'][0])

In [52]:
# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='euclidean'
    )
    # wait a moment for the index to be fully initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [53]:
# now connect to the index
index_e = pinecone.GRPCIndex(index_name)
index_e.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [54]:
index_e.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [55]:
index_e.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1141542}},
 'total_vector_count': 1141542}

### Store embeddings to Pinecone - Dotproduct

In [56]:
index_name = 'music-recommender-dotproduct'
dim = len(embedded_features['values'][0])

In [57]:
# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=dim,
        metric='dotproduct'
    )
    # wait a moment for the index to be fully initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [58]:
# now connect to the index
index_d = pinecone.GRPCIndex(index_name)
index_d.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [59]:
index_d.upsert_from_dataframe(embedded_features, batch_size=1000)

sending upsert requests:   0%|          | 0/1141542 [00:00<?, ?it/s]

collecting async responses:   0%|          | 0/1142 [00:00<?, ?it/s]

upserted_count: 1141542

In [60]:
index_d.describe_index_stats()

{'dimension': 28,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 1140000}},
 'total_vector_count': 1140000}

## Query

In [122]:
# display query results in dataframe with recommended songs and similarity scores
def combined_query_result(index, song_embeddings):
    query_response = index.query(song_embeddings, top_k=10, include_metadata=True)
    result_songs = []
    matches = query_response['matches']
    for e in matches:
        id = e['id']
        song = songs_df[songs_df['id'] == id]
        result_songs.append([song['name'].item(), song['artists'].item(), str(e['score'])])
    return pd.DataFrame(result_songs, columns=['song_name', 'artists', 'similarity_score'])
    
def individual_query_result(index, query_songs, query_embeddings):
    result = []
    fav_song_names = query_songs['track_name'].tolist()
    fav_song_artists = query_songs['artist_name'].tolist()
    
    for i in range(len(fav_song_names)):
        xc = index.query(query_embeddings[i], top_k=2, include_metadata=True)
        id = xc['matches'][0]['id']
        score = xc['matches'][0]['score']
        song = songs_df[songs_df['id'] == id]
        song_name = song['name'].item()
        song_artists = song['artists'].item()
        if fav_song_names[i] == song_name and fav_song_artists[i] in song_artists:
            id = xc['matches'][1]['id']
            score = xc['matches'][1]['score']
            song = songs_df[songs_df['id'] == id]
        result.append([fav_song_names[i], fav_song_artists[i], song['name'].item(), song['artists'].item(), str(score)])
    
    return pd.DataFrame(result, columns=['fav_song', 'artists', 'match_name', 'match_artists', 'similarity_score'])

### Query - Dotproduct

#### Combined Song Vector as a Single Query

In [123]:
# personal listening histroy top 10 averaged - Yuhan
combined_query_result(index_d, mean_yuhan_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,What You Are,['We Ride'],35.791786
1,Can't Take It With You When You Go,['Mike Love'],35.692028
2,Don't You Want To Know,['Oh So'],35.629147
3,Life's What You Make It,['Baby'],35.595047
4,Love Is What You Make It,['Detour'],35.16781
5,You're All I Want,['Prince'],35.15793
6,Don't Want You to Go,['Angel'],35.106518
7,You Can't (Make Somebody Love You),['Music Band'],34.95245
8,I Need You More Than You Want Me,['Fly By Midnight'],34.93988
9,Love You Too,['SISTERS'],34.874126


In [124]:
# personal listening histroy top 10 averaged - Seanna
combined_query_result(index_d, mean_seanna_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Don't You Want It,['Five'],47.863876
1,Life's What You Make It,['Baby'],46.080036
2,What You Are,['We Ride'],45.944195
3,Do You Don't You,['Haywyre'],45.940643
4,Don't You Want It,['Lovers'],45.695755
5,What Do You Want,['HER'],45.514782
6,Don't You Want To Know,['Oh So'],45.386955
7,You Can't Take What You Don't Have (You Don't ...,['Whiteheart'],45.296844
8,Do What I Say Don't Do What I Do,['Bill Blue'],45.09262
9,I Don't Know What You Got,['Tiffany'],44.93443


In [125]:
# spotify top 10 averaged
combined_query_result(index_d, mean_top_10_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Life's What You Make It,['Baby'],36.660213
1,Don't Want You to Go,['Angel'],36.223186
2,Don't You Want To Know,['Oh So'],35.984177
3,Can't Take It With You When You Go,['Mike Love'],35.924175
4,What You See Is What You Get,['Daryl Hall & John Oates'],35.61452
5,What You Are,['We Ride'],35.46794
6,Can't Be Friends,['J. Morgan'],35.356186
7,It Ain't You It's Me,['Kane Brown'],35.34243
8,Get It Right,['Baby'],35.33868
9,You Can't (Make Somebody Love You),['Music Band'],35.327198


#### Individual Song Vector as a Single Query

In [126]:
# personal listening histroy top 10 1by1 - Yuhan
individual_query_result(index_d, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,Life's What You Make It,['Baby'],54.92357
1,Wonderful U,AGA,You Don't Know Me,['Eddie Money'],49.40794
2,Forever Young,Eve Ai,How Come You Do Me Like You Do Do Do,['Debby Moore'],64.69143
3,Lover,Taylor Swift,You Can't Take It,['Linda Jones'],40.181194
4,At My Worst,Pink Sweat$,Don't Let It Get You Down,['Chuck Foster'],51.926258
5,RADIO,HENRY,Conversation,['Peter Peter'],80.43664
6,The Most Beautiful Thing,Bruno Major,Major Major Major,['The Jac'],101.47213
7,deja vu,Olivia Rodrigo,Por Tu Amor - En Vivo,['Marimba Orquesta Maya Excelsior'],33.24936
8,Anti-Hero,Taylor Swift,Sam Hall,['Kevin Evans'],30.645512
9,Question...?,Taylor Swift,Don't You Want To Know,['Oh So'],43.074165


In [127]:
# personal listening histroy top 10 1by1 - Seanna
individual_query_result(index_d, seanna_favorite_songs_to_search, seanna_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Who Let The Dogs Out,['The Hit Crew'],37.866177
1,Say Something,A Great Big World,It Is Not Zen,['It It'],69.66454
2,Bailando - Spanish Version,Enrique Iglesias,Disco light (feat. LA LE) [prod. by CAKEBOY] (...,"['IROH', 'LA LE']",48.740467
3,Teeth,5 Seconds of Summer,1 For 2 For 1,['D-Styles'],56.754463
4,100 Degrees,Rich Brian,Better Than,['John Butler Trio'],42.906075
5,I WANNA BE YOUR SLAVE,Måneskin,Do What You Say You're Gonna Do,['SaraBeth'],84.269714
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,The First Present,"[""It's a Death Metal X-mas""]",53.738186
7,Gotta Have You,The Weepies,Do What You Say You're Gonna Do,['SaraBeth'],82.51027
8,You Belong With Me,Taylor Swift,You Can't Always Get What You Want,['Aretha Franklin'],71.76399
9,Marry You,Bruno Mars,Get It Right,['Baby'],45.0694


In [128]:
# spotify top 10 1by1
individual_query_result(index_d, top_10_songs_to_search, top_10_song_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,. . . So We . . .,['Illuminandi'],59.50638
1,Arcade,Duncan Laurence,Growl!,"['John Morris Russell', 'Cincinnati Pops Orche...",45.365906
2,Heartbreak Anniversary,Giveon,You Can't Do It (So Give Up Now),['Strong Bad'],34.810604
3,Where Are You Now,Lost Frequencies,Don't You Want It,['Lovers'],81.949745
4,Alone,Burna Boy,Don't You Want It,['Lovers'],49.171337
5,Anti-Hero,Taylor Swift,Sam Hall,['Kevin Evans'],30.645512
6,Seek & Destroy,SZA,On & On & On,['Caamp'],40.427998
7,Glimpse of Us,Joji,No One is Alone (Glee Cast Version),['Glee Cast'],44.311615
8,Used (feat. Don Toliver),SZA,The Goonies (Bonus Track) [feat. J. Padron],"['Tom P', 'J. Padron']",59.944088
9,Come Back Home,Sofia Carson,Get It Right,['Baby'],62.000214


### Combined Song Vector as Single Query - Euclidean

#### Combined Song Vector as a Single Query

In [131]:
# personal listening histroy top 10 averaged - Yuhan
combined_query_result(index_e, mean_yuhan_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Erase Me,"['Said the Sky', 'NÉONHÈART']",2.3954582
1,Say The Word,"['tyDi', 'JES']",2.694992
2,Best Shot,"['Birdy', 'Jaymes Young']",2.7018433
3,What We Asked For,"['Goodnight Moonshine', 'Molly Venter', 'Roose...",3.304924
4,Without You,"['Tinashe', 'Brennin Hunt']",3.3830605
5,Forever,['Elisha'],3.3859558
6,Kissing Other People,['Lennon Stella'],3.3881912
7,Sweet Dreams,['Staci Griesbach'],3.4248085
8,This Feeling,"['The Chainsmokers', 'Kelsea Ballerini']",3.591381
9,The Most,['Miley Cyrus'],3.68919


In [132]:
# personal listening histroy top 10 averaged - Seanna
combined_query_result(index_e, mean_seanna_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Despacito x Shape Of You,['Pentatonix'],2.476059
1,But I Am A Good Girl - Burlesque Original Moti...,['Christina Aguilera'],3.4907494
2,Wild One,['Layla Zoe'],3.491005
3,Honky Tonk Angels (Don't Happen Overnight),['Eleven Hundred Springs'],3.8464317
4,Torment In Me,['Leaving Dionysus'],3.8556976
5,One Essex Girl,['Tullycraft'],4.2352867
6,Another Drink for Me,['Louie Austen'],4.2571297
7,Warmer Love,['The Sheepdogs'],4.293274
8,Mystic in My Soul,['Ana Velinova'],4.3316803
9,Some People's Kids,['Karac Hendriks'],4.3457108


In [133]:
# spotify top 10 averaged
combined_query_result(index_e, mean_top_10_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Erase Me,"['Said the Sky', 'NÉONHÈART']",3.1773796
1,The Most,['Miley Cyrus'],3.6792183
2,Best Shot,"['Birdy', 'Jaymes Young']",3.968975
3,Dance You Off - Galavant Remix,"['Benjamin Ingrosso', 'Galavant']",4.129841
4,Truth or Consequences,['Sumaia Jackson'],4.1859436
5,Naming Your Town,['Berens & Greuel'],4.4264984
6,Feel the Pinch,['Marshall Okell'],4.44503
7,Again,['RiverKinn'],4.5892677
8,The Change,['JoJo'],4.6350746
9,Take Me Away,"['UNEQUAL', 'Mondo Cozmo']",4.6677475


#### Individual Song Vector as a Single Query

In [134]:
# personal listening histroy top 10 1by1 - Yuhan
individual_query_result(index_e, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,"Take A Picture, Lose Your Soul",['The Steel Wheels'],3.461029
1,Wonderful U,AGA,Why?,['Devin Townsend'],7.072899
2,Forever Young,Eve Ai,Wait for Me,['Cas Haley'],11.241112
3,Lover,Taylor Swift,Love Chain,['Taylor Dayne'],4.4339523
4,At My Worst,Pink Sweat$,1-800,['Bad Bad Hats'],6.6018066
5,RADIO,HENRY,Triumph,['Kevin M. Thomas'],18.850098
6,The Most Beautiful Thing,Bruno Major,This Feeling Tonight,['Major Murphy'],8.4700165
7,deja vu,Olivia Rodrigo,Carta Branca,['Gusttavo Lima'],5.7066345
8,Anti-Hero,Taylor Swift,Stingy,['Elijah Blake'],3.0680656
9,Question...?,Taylor Swift,I Pray - 2005,['Mariah Carey'],5.6261673


In [135]:
# personal listening histroy top 10 1by1 - Seanna
individual_query_result(index_e, seanna_favorite_songs_to_search, seanna_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Fever,['The Eskies'],1.6237526
1,Say Something,A Great Big World,a Squandering,['People Get Ready'],5.348503
2,Bailando - Spanish Version,Enrique Iglesias,Déjame Pasar Compare (Spanish Rumba),['Raya Real'],6.814022
3,Teeth,5 Seconds of Summer,Kingdom of Avarice,['Fiction 8'],4.667015
4,100 Degrees,Rich Brian,Nothing but the Blood (feat. Eric Mccallister),"['Shai Linne', 'Eric Mccallister']",8.269501
5,I WANNA BE YOUR SLAVE,Måneskin,I Don't Wanna Be Your Mother,['Kendel Carson'],5.167679
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,Success Story,['Carnage The Executioner'],5.681183
7,Gotta Have You,The Weepies,I'm Gonna Sit Right Down And Write Myself A Le...,['Wendee Glick'],7.6156693
8,You Belong With Me,Taylor Swift,Call Me Maybe,['Caitlin Hart'],3.0619736
9,Marry You,Bruno Mars,Everything Anyhow,['Freddy Monday'],4.8248863


In [136]:
# spotify top 10 1by1
individual_query_result(index_e, top_10_songs_to_search, top_10_song_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,"Just a Verse, Pt. I","['Undagawds', 'Thelonious Coltrane', 'Peter Ma...",8.31768
1,Arcade,Duncan Laurence,magnetic,"['Kelvin Jones', 'Sara Hartman']",4.7637367
2,Heartbreak Anniversary,Giveon,HEARTBREAK ANNIVERSARY,['Giveon'],0.0017089844
3,Where Are You Now,Lost Frequencies,Where Were You,['Migos'],4.0823975
4,Alone,Burna Boy,Best,['Young Mister'],3.685669
5,Anti-Hero,Taylor Swift,Stingy,['Elijah Blake'],3.0680656
6,Seek & Destroy,SZA,mirror,['Else & Poki'],3.1970863
7,Glimpse of Us,Joji,Alright in the End,['Kalyn Fay'],4.1178894
8,Used (feat. Don Toliver),SZA,You Niggaz Pussy (feat. V Slash),"['Juicy J', 'V Slash']",4.5147476
9,Come Back Home,Sofia Carson,Come Back,['Romarzs'],4.4620895


### Combined Song Vector as Single Query - Cosine

#### Combined Song Vector as a Single Query

In [137]:
# personal listening histroy top 10 averaged - Yuhan
combined_query_result(index_c, mean_yuhan_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Erase Me,"['Said the Sky', 'NÉONHÈART']",0.94418687
1,made up love song #43,['Lewis Watson'],0.9370434
2,Fight for You,['Shawn McDonald'],0.9355886
3,Say The Word,"['tyDi', 'JES']",0.9352302
4,Happy Is,['Eric Brace & Last Train Home'],0.9314263
5,Best Shot,"['Birdy', 'Jaymes Young']",0.9302498
6,Shift,"['Who TF Is Justin Time?', 'Big Murph']",0.9296707
7,Cowboy's Say Goodbye,['Midwest Avenue'],0.92966914
8,Love Is Magic,['Dave Heffner'],0.9275577
9,Without You,"['Tinashe', 'Brennin Hunt']",0.92729306


In [138]:
# personal listening histroy top 10 averaged - Seanna
combined_query_result(index_c, mean_seanna_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,All That I Want Is You - 2016 Version,['The Magic Gang'],0.95490974
1,Despacito x Shape Of You,['Pentatonix'],0.9506609
2,Love You Like a Burrito,['The Doubleclicks'],0.94879264
3,It's All Right,['The Apples In Stereo'],0.9482256
4,I Got You,['The Remedy Club'],0.94510305
5,I Want to Be a Real Cowboy Girl,['The Sweetback Sisters'],0.94418865
6,Take a Walk With Me,['Tom Joad'],0.94132423
7,But I Am A Good Girl - Burlesque Original Moti...,['Christina Aguilera'],0.9412315
8,You're the One,['The Prozacs'],0.9402188
9,Always a Failure With You,['Kiosk'],0.9392031


In [139]:
# spotify top 10 averaged
combined_query_result(index_c, mean_top_10_song_embeddings)

Unnamed: 0,song_name,artists,similarity_score
0,Comes Your Way (Inception),['Lucas & Steve'],0.9349553
1,Without Love You Can Save The World (feat. Rac...,"['Crazy Ex-Girlfriend Cast', 'Rachel Bloom']",0.9335497
2,All Out of Love,['H & Claire'],0.9276386
3,Fight for You,['Shawn McDonald'],0.92641795
4,Erase Me,"['Said the Sky', 'NÉONHÈART']",0.9254003
5,You're a Friend of Mine,['Sonny & Cher'],0.9232466
6,Where Have You Gone (Anywhere),['Lucas & Steve'],0.92308205
7,Up All Night,['Baker Grace'],0.9210635
8,Love Me Tender (Love Me Black and Blue),['Nikki & the Phantom Callers'],0.91886127
9,You Me And Gravity,"['Koven', 'Crystal Skies']",0.9164757


#### Individual Song Vector as a Single Query

In [140]:
# personal listening histroy top 10 1by1 - Yuhan
individual_query_result(index_c, yuhan_favorite_songs_to_search, yuhan_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,Something's Wrong with the Morning,Margo Guryan,A Love of Mine,['The Vandelettes'],0.9562929
1,Wonderful U,AGA,Need You,['Fireboy DML'],0.89424056
2,Forever Young,Eve Ai,Wait for Me,['Cas Haley'],0.8937622
3,Lover,Taylor Swift,Love Chain,['Taylor Dayne'],0.9457879
4,At My Worst,Pink Sweat$,1-800,['Bad Bad Hats'],0.92910254
5,RADIO,HENRY,Youth Leagues,['Robert Pollard'],0.905766
6,The Most Beautiful Thing,Bruno Major,This Feeling Tonight,['Major Murphy'],0.9269985
7,deja vu,Olivia Rodrigo,Carta Branca,['Gusttavo Lima'],0.8687294
8,Anti-Hero,Taylor Swift,Beautiful,['Taylor Dayne'],0.933288
9,Question...?,Taylor Swift,GO,['The Good Perry'],0.9144734


In [141]:
# personal listening histroy top 10 1by1 - Seanna
individual_query_result(index_c, seanna_favorite_songs_to_search, seanna_songs_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,The Monster,Eminem,Brat Pack,['The Rocket Summer'],0.97155994
1,Say Something,A Great Big World,Find Me a Movie,['Gillwire'],0.9554636
2,Bailando - Spanish Version,Enrique Iglesias,Déjame Pasar Compare (Spanish Rumba),['Raya Real'],0.9163792
3,Teeth,5 Seconds of Summer,Kingdom of Avarice,['Fiction 8'],0.94852877
4,100 Degrees,Rich Brian,Man,['Gene Evaro Jr.'],0.89748394
5,I WANNA BE YOUR SLAVE,Måneskin,I Don't Wanna Be Your Mother,['Kendel Carson'],0.9659412
6,Enemy - from the series Arcane League of Legends,Imagine Dragons,Success Story,['Carnage The Executioner'],0.94134754
7,Gotta Have You,The Weepies,I'm Gonna Sit Right Down And Write Myself A Le...,['Wendee Glick'],0.9504755
8,You Belong With Me,Taylor Swift,Call Me Maybe,['Caitlin Hart'],0.9710896
9,Marry You,Bruno Mars,Everything Anyhow,['Freddy Monday'],0.91044796


In [142]:
# spotify top 10 1by1
individual_query_result(index_c, top_10_songs_to_search, top_10_song_embeddings)

Unnamed: 0,fav_song,artists,match_name,match_artists,similarity_score
0,No Lie,Sean Paul,"Just a Verse, Pt. I","['Undagawds', 'Thelonious Coltrane', 'Peter Ma...",0.9066723
1,Arcade,Duncan Laurence,Gifts,"['Edgar Leslie Bainton', 'Christopher Gillett'...",0.9149907
2,Heartbreak Anniversary,Giveon,HEARTBREAK ANNIVERSARY,['Giveon'],0.99994606
3,Where Are You Now,Lost Frequencies,Where Were You,['Migos'],0.96842015
4,Alone,Burna Boy,Best,['Young Mister'],0.951568
5,Anti-Hero,Taylor Swift,Beautiful,['Taylor Dayne'],0.933288
6,Seek & Destroy,SZA,mirror,['Else & Poki'],0.94596386
7,Glimpse of Us,Joji,Alright in the End,['Kalyn Fay'],0.93341416
8,Used (feat. Don Toliver),SZA,Spaceship (feat. Sheck Wes),"['Don Toliver', 'Sheck Wes']",0.96831095
9,Come Back Home,Sofia Carson,Come Back,['Romarzs'],0.9485333
