In [2]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Load dataset of songs

Dataset: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs, an open-source dataset on Kaggle. It contains nearly 1.2 million Spotify songs, which were retrieved using the Spotify API.

In [17]:
# Location of the dataset CSV, resolved against the current working directory.
file_path = "../tracks_features.csv"
# Read the entire CSV into a single in-memory DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)
# Show the first five rows as a quick sanity check of the load.
print(df.head(n=5))

                       id                   name                      album  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  The Battle Of Los Angeles   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  The Battle Of Los Angeles   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  The Battle Of Los Angeles   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  The Battle Of Los Angeles   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  The Battle Of Los Angeles   

                 album_id                       artists  \
0  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
1  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
2  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
3  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   
4  2eia0myWFgoHuttJytCxgX  ['Rage Against The Machine']   

                   artist_ids  track_number  disc_number  explicit  \
0  ['2d0hyoQ5ynDBnkvAbJKORj']             1            1     False   
1  ['2d0hyoQ5ynDBnkvAbJKORj'] 

### Preprocessing data

We want to select the numeric audio features we need and convert any categorical values into numeric ones so that we can build the vector embeddings.
The selected features include:
- id (not sure if we need this?)
- name
- artists
- danceability
- energy
- key
- loudness
- mode
- speechiness
- acousticness
- instrumentalness
- liveness
- valence
- tempo
- duration_ms
- time_signature
- year (do we want this?)

In [24]:
# Columns that are not carried forward into the feature table.
columns_to_drop = [
    "album",
    "album_id",
    "artist_ids",
    "track_number",
    "disc_number",
    "explicit",
    "release_date",
]
# `drop` returns a new frame, so the raw `df` stays untouched.
selected_features = df.drop(columns=columns_to_drop)
print(selected_features.head(n=5))

                       id                   name  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire   

                        artists  danceability  energy  key  loudness  mode  \
0  ['Rage Against The Machine']         0.470   0.978    7    -5.399     1   
1  ['Rage Against The Machine']         0.599   0.957   11    -5.764     1   
2  ['Rage Against The Machine']         0.315   0.970    7    -5.424     1   
3  ['Rage Against The Machine']         0.440   0.967   11    -5.830     0   
4  ['Rage Against The Machine']         0.426   0.929    2    -6.729     1   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0727       0.02610          0.000011    0.3560    0.503  117.906   
1       0.1880       0.01290          0.000071    0.1550    0.

In [25]:
# Check whether any of the filtered feature columns contains a missing value.
# The result holds one boolean per column: True means at least one NaN/None.
selected_features.isna().any(axis=0)

id                  False
name                False
artists             False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
year                False
dtype: bool

Some songs have multiple artists, so we convert the `artists` value from a stringified list to a plain comma-separated string.
Example: ['Pietro Locatelli', 'Capella Istropolitana', 'Jaroslav Krcek'] to 'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [26]:
def convert_artists_name(artists_list):
    """Turn a stringified list of artist names into a comma-separated string.

    Example:
        "['Pietro Locatelli', 'Capella Istropolitana']"
        -> 'Pietro Locatelli, Capella Istropolitana'

    The input is parsed as a Python list literal, so names that contain an
    apostrophe (written with double quotes in the literal, e.g.
    ``["Guns N' Roses"]``) keep the apostrophe and do not leak the
    surrounding double quotes into the result.

    Args:
        artists_list: String holding a Python-style list literal of names.

    Returns:
        The names joined with ", ". A string that is not wrapped in square
        brackets is returned unchanged, so re-running the conversion on
        already-converted values is a no-op.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import ast

    try:
        parsed = ast.literal_eval(artists_list)
    except (ValueError, SyntaxError):
        # Not a valid Python literal; handled by the fallbacks below.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(name) for name in parsed)

    # Bracketed but malformed literal: fall back to plain text stripping.
    if artists_list.startswith("[") and artists_list.endswith("]"):
        items_list = artists_list.strip("[]").replace("'", "").split(", ")
        return ", ".join(items_list)

    # Already a plain string (e.g. the cell was executed a second time).
    return artists_list

# Flatten every row's stringified artist list into a comma-separated string.
artists_joined = selected_features["artists"].apply(convert_artists_name)
selected_features["artists"] = artists_joined
# Spot-check the row at position 1184 (the multi-artist example shown in the output).
selected_features["artists"].iloc[1184]

'Pietro Locatelli, Capella Istropolitana, Jaroslav Krcek'

In [27]:
# Remove duplicate rows, treating rows with the same song name and the same
# artists string as one song; the first occurrence is the one that is kept.
duplicate_keys = ['name', 'artists']
print("Shape before duplicated removal: ", selected_features.shape)
selected_features = selected_features.drop_duplicates(subset=duplicate_keys, keep="first")
print("Shape after duplicated removal: ", selected_features.shape)

Shape before duplicated removal:  (1204025, 17)
Shape after duplicated removal:  (1141555, 17)


In [28]:
# Show the first and last five rows of the de-duplicated feature table.
for preview in (selected_features.head(n=5), selected_features.tail(n=5)):
    print(preview)

                       id                   name                   artists  \
0  7lmeHLHBe4nmXzuXc0HDjk                Testify  Rage Against The Machine   
1  1wsRitfRRtWyEapl0q22o8        Guerrilla Radio  Rage Against The Machine   
2  1hR0fIFK2qRG3f3RF70pb7       Calm Like a Bomb  Rage Against The Machine   
3  2lbASgTSoDO7MTuLAXlTW0              Mic Check  Rage Against The Machine   
4  1MQTmpYOZ6fcMQc56Hdo7T  Sleep Now In the Fire  Rage Against The Machine   

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.470   0.978    7    -5.399     1       0.0727       0.02610   
1         0.599   0.957   11    -5.764     1       0.1880       0.01290   
2         0.315   0.970    7    -5.424     1       0.4830       0.02340   
3         0.440   0.967   11    -5.830     0       0.2370       0.16300   
4         0.426   0.929    2    -6.729     1       0.0701       0.00162   

   instrumentalness  liveness  valence    tempo  duration_ms  time_signature  \


### Create vectors/embeddings

### Store embeddings to Pinecone

### Search for similar songs