# Embedding Generation
## We are generating and saving embeddings in 10D

In [None]:
# Uncomment to install scikit-learn into the kernel's environment if it is missing:
# %pip install scikit-learn

In [3]:
import pandas as pd

# Read the cleaned Spotify table from parquet and report its (rows, columns) shape.
spotify_filtered = pd.read_parquet(path="data/spotify_clean.parquet")
print("Loaded cleaned dataset:", spotify_filtered.shape)


Loaded cleaned dataset: (169776, 19)


In [6]:
# Display the column names of the loaded table to see which fields are available.
spotify_filtered.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

#### 10D embeddings

In [2]:
# Creating the raw embeddings

import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Columns that make up the embedding, one vector component per column.
# The order of this list fixes the order of the components in each vector.
feature_cols = [
    'danceability',
    'energy',
    'valence',
    'tempo',
    'acousticness',
    'instrumentalness',
    'liveness',
    'speechiness',
    'loudness',
    'popularity',
]

# Raw 10D embeddings: take the selected columns as a NumPy matrix (one row per
# DataFrame row) and convert to float32. No scaling or projection is applied here.
embeddings_10d = (
    spotify_filtered[feature_cols]
    .to_numpy()
    .astype('float32')
)

print("Raw embeddings shape:", embeddings_10d.shape)

Raw embeddings shape: (169776, 10)


In [None]:
# Persist the embedding matrix to disk in NumPy's .npy format.
np.save("data/spotify_vectors_10d.npy", embeddings_10d)

# Map track IDs to indices.
# The index is the row's position from enumerate (0, 1, 2, ...), not the
# DataFrame's index label. If an ID appears more than once, the dict keeps the
# position of its last occurrence.
id_to_index = {track_id: idx for idx, track_id in enumerate(spotify_filtered['id'])}

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle for the garbage collector.
with open("data/id_to_index.json", "w") as mapping_file:
    json.dump(id_to_index, mapping_file)

print("Saved 10D embeddings - data/spotify_vectors_10d.npy")
print("Saved ID-to-index mapping - data/id_to_index.json")