In [17]:
import ast

import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

### Load dataset of songs

Dataset: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs, an open-source dataset on Kaggle. It provides nearly 1.2 million songs from Spotify. The songs were retrieved using the Spotify API.

In [2]:
# Relative path to the dataset CSV
file_path = '../tracks_features.csv'
# Read the full track table and preview its first rows
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

### Preprocessing data

We want to select the numeric audio features we need and convert the categorical (text) values into numeric ones so that we can create the vector embeddings.
The selected features include:
- id (not sure if we need this?)
- name
- artists
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- duration_ms
- time_signature
- year (do we want this?)

In [3]:
# Columns that are not kept for the embeddings
unused_columns = [
    "album",
    "album_id",
    "artist_ids",
    "track_number",
    "disc_number",
    "explicit",
    "release_date",
]
selected_features = df.drop(labels=unused_columns, axis="columns")
print(selected_features.head())

                       id                   name  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire   

                        artists  danceability  energy  key  loudness  mode  \
0  ['Rage Against The Machine']         0.470   0.978    7    -5.399     1   
1  ['Rage Against The Machine']         0.599   0.957   11    -5.764     1   
2  ['Rage Against The Machine']         0.315   0.970    7    -5.424     1   
3  ['Rage Against The Machine']         0.440   0.967   11    -5.830     0   
4  ['Rage Against The Machine']         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.

In [4]:
# Report, column by column, whether the filtered features contain any missing value
selected_features.isnull().any(axis=0)

id                  False
name                 True
artists             False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
year                False
dtype: bool

Some songs have multiple artists, so we want to convert them from a list to a string.
Example: ['Pietro Locatelli', 'Capella Istropolitana', 'Jaroslav Krcek'] to 'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [5]:
def convert_artists_name(artists_list):
    """Turn the textual list of artists into a comma-separated string.

    The ``artists`` column stores a Python list literal as text, e.g.
    ``"['Pietro Locatelli', 'Capella Istropolitana']"``. The text is parsed
    as a literal so that apostrophes inside a name (``"Guns N' Roses"``) are
    kept and the surrounding quote characters, single or double, are removed
    correctly.

    Parameters
    ----------
    artists_list : str
        Text of a list literal holding the artist names.

    Returns
    -------
    str
        The artist names joined with ``", "``. Text that is not wrapped in
        square brackets is returned unchanged (apart from surrounding
        whitespace), so applying the function twice is harmless.
    """
    text = artists_list.strip()
    if not (text.startswith("[") and text.endswith("]")):
        # Not a list literal (e.g. already converted): leave the names alone.
        return text

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(artist) for artist in parsed)

    # Malformed list text: fall back to the plain-text cleanup.
    return ", ".join(text.strip("[]").replace("'", "").split(", "))

# Rewrite every artists entry as a plain comma-separated string
selected_features["artists"] = selected_features["artists"].map(convert_artists_name)
# Spot-check a row that has several artists
selected_features["artists"].iloc[1184]

'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [6]:
# Keep only the first row for each (song name, artists) pair
print("Shape before duplicated removal: ", selected_features.shape)
selected_features = selected_features.drop_duplicates(
    subset=['name', 'artists'],
    keep='first',
)
print("Shape after duplicated removal: ", selected_features.shape)

Shape before duplicated removal:  (1204025, 17)
Shape after duplicated removal:  (1141555, 17)


In [7]:
# Preview both ends of the table
print(selected_features.head(), selected_features.tail(), sep="\n")

                       id                   name                   artists  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  Rage Against The Machine   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  Rage Against The Machine   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  Rage Against The Machine   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  Rage Against The Machine   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  Rage Against The Machine   

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.470   0.978    7    -5.399     1       0.0727       0.02610   
1         0.599   0.957   11    -5.764     1       0.1880       0.01290   
2         0.315   0.970    7    -5.424     1       0.4830       0.02340   
3         0.440   0.967   11    -5.830     0       0.2370       0.16300   
4         0.426   0.929    2    -6.729     1       0.0701       0.00162   

   instrumentalness  liveness  valence    tempo  duration_ms  time_signature  \


### Create vectors/embeddings

We first need to convert the song and artist names into a vector. The resulting vector representation has a length of 14, so that it can be combined with the 14 numeric column values. We combine the song name and the artist names into one column so that they can be tokenized together.

In [8]:
# `name` contains missing values (see the isna check above). NaN + str stays
# NaN, and astype(str) would then store the literal text "nan" and lose the
# artists as well, so missing names are replaced by an empty string first.
selected_features['string_summary'] = (
    selected_features['name'].fillna('') + ' - ' + selected_features['artists']
).astype(str)

# Drop the original 'name' and 'artists' columns
selected_features = selected_features.drop(columns=['name', 'artists'])
print(selected_features.head())

                       id  danceability  energy  key  loudness  mode  \
0  7lmeHLHBe4nmXzuXc0HDjk         0.470   0.978    7    -5.399     1   
1  1wsRitfRRtWyEapl0q22o8         0.599   0.957   11    -5.764     1   
2  1hR0fIFK2qRG3f3RF70pb7         0.315   0.970    7    -5.424     1   
3  2lbASgTSoDO7MTuLAXlTW0         0.440   0.967   11    -5.830     0   
4  1MQTmpYOZ6fcMQc56Hdo7T         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.489  103.680   
2       0.4830       0.02340          0.000002    0.1220    0.370  149.749   
3       0.2370       0.16300          0.000004    0.1210    0.574   96.752   
4       0.0701       0.00162          0.105000    0.0789    0.539  127.059   

   duration_ms  time_signature  year  \
0       210133             4.0  1999   
1       206200    

In [9]:
# Lowercase every summary, then split it into word tokens
lowercase_summary = selected_features['string_summary'].str.lower()
selected_features['tokenized_summary'] = lowercase_summary.map(word_tokenize)

In [10]:
# Word2Vec hyperparameters (may adjust later)
vector_size, window_size, min_count = 14, 5, 1

# Fit the Word2Vec model on the tokenized summaries
word2vec_model = Word2Vec(
    sentences=selected_features['tokenized_summary'],
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
)

In [11]:
# Convert string summaries to vectors
def get_summary_vector(summary, model):
    """Average the word vectors of the tokens in ``summary``.

    Parameters
    ----------
    summary : iterable of str
        Tokens of one song summary.
    model : object exposing ``wv``
        ``model.wv`` must support ``token in model.wv``, ``model.wv[token]``
        and provide ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in ``model.wv``,
        or a zero vector of length ``model.wv.vector_size`` when no token is
        found.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[word] for word in summary if word in keyed_vectors]
    if not known_vectors:
        # Take the size from the model itself instead of a notebook-level
        # variable, and return an array so both branches have the same type.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged vector per song; the trained model is passed as a keyword argument
summary_vector = selected_features['tokenized_summary'].apply(
    get_summary_vector, model=word2vec_model
)

In [12]:
# The text helper columns are no longer needed once the vectors exist
selected_features.drop(columns=['string_summary', 'tokenized_summary'], inplace=True)
# Show the vector of the row labelled 0
print(summary_vector.loc[0])

[-1.4271774  -0.03569482 -1.4224687  -0.30498803  1.6186962   1.0922139
 -0.29048076  0.2500786  -0.44903776 -1.1465993  -0.6184953   0.39508137
 -3.07173     2.404003  ]


The numerical columns are audio characteristics of the song, and we want to scale all of these values so that they can become part of the embeddings.

In [13]:
# Extract the numeric columns (everything except 'id'; the summary vectors are held separately in `summary_vector`)
numeric_columns = selected_features.drop(columns='id')

# Standardize the numeric columns: fit the scaler, then transform the same data
scaler = StandardScaler().fit(numeric_columns)
scaled_data = scaler.transform(numeric_columns)

# Display the scaled values of the first row
first_scaled_row = scaled_data[0]
print(first_scaled_row)

[-0.11476022  1.59348117  0.51038804  0.9231026   0.70123154 -0.10405628
 -1.0974239  -0.76136317  0.8647423   0.28528598  0.01103847 -0.23857505
  0.30032139 -0.70179981]


Finally, we want to merge the summary vectors (name & artists) with the scaled vectors (audio characteristics) to make the embedding for each song.

In [25]:
# Pair each summary vector with its scaled audio features and join them end to end
song_embeddings = [
    np.concatenate(vector_pair)
    for vector_pair in zip(summary_vector, scaled_data)
]
print(song_embeddings[0])
# Number of songs, then the length of one embedding
print(len(song_embeddings), ", ", len(song_embeddings[0]))

[-1.42717743 -0.03569482 -1.42246866 -0.30498803  1.61869621  1.09221387
 -0.29048076  0.25007859 -0.44903776 -1.14659929 -0.61849529  0.39508137
 -3.0717299   2.4040029  -0.11476022  1.59348117  0.51038804  0.9231026
  0.70123154 -0.10405628 -1.0974239  -0.76136317  0.8647423   0.28528598
  0.01103847 -0.23857505  0.30032139 -0.70179981]
1141555 ,  28


Combine everything into our final table for uploading to Pinecone. The table should have two columns: one is the id, and the other is the song embedding representation.

In [30]:
# Start from a copy of the id column, then attach one embedding per row
embedded_features = selected_features[["id"]].copy()
embedded_features["song_embeddings"] = song_embeddings
print(embedded_features.head())
print(embedded_features.shape)

                       id                                    song_embeddings
0  7lmeHLHBe4nmXzuXc0HDjk  [-1.4271774291992188, -0.03569481894373894, -1...
1  1wsRitfRRtWyEapl0q22o8  [-1.0679484605789185, -0.0033303499221801758, ...
2  1hR0fIFK2qRG3f3RF70pb7  [-1.103868007659912, 0.22797372937202454, -1.9...
3  2lbASgTSoDO7MTuLAXlTW0  [-1.2852662801742554, -0.1682312786579132, -1....
4  1MQTmpYOZ6fcMQc56Hdo7T  [-1.3810672760009766, -0.5420541763305664, -1....
(1141555, 2)


### Store embeddings to Pinecone

### Search for similar songs

1. Our personal favorite song (feed 1, get top 10)
2. Our listening history (feed 10, get top 10)
3. Spotify 2023 top 100 songs (most streamed 1, get top 10)
4. Spotify 2023 top 100 songs (feed 10, get top 10)

Pinecone search metric 