# Cell 1 - Import & path

In [26]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [27]:
# Folder with the processed CSV files, relative to the notebook's working directory.
DATA_PROCESSED_DIR = os.path.join("..", "data", "processed")
# Full path of the clustered Spotify dataset.
data_path = os.path.join(DATA_PROCESSED_DIR, "spotify_dataset_clustered.csv")

# Read the whole CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=data_path)
print("Dataset loaded:", df.shape)

# Last expression of the cell: rich preview of the first five rows.
df.head(n=5)

Dataset loaded: (169909, 18)


Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,duration_ms,macro_cluster,subcluster,subcluster_label
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0,1928,0.995,0.708,0.195,0.563,0.151,-12.428,0.0506,118.469,0.779,158648,1,1_1,Warm Emotional Calm
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0,1928,0.994,0.379,0.0135,0.901,0.0763,-28.454,0.0462,83.972,0.0767,282133,1,1_0,Deep Calm & Minimal
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0,1928,0.604,0.749,0.22,0.0,0.119,-19.924,0.929,107.177,0.88,104300,0,0_0,Short Spoken Calm
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0,1928,0.995,0.781,0.13,0.887,0.111,-14.734,0.0926,108.003,0.72,180760,1,1_1,Warm Emotional Calm
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",1,1928,0.99,0.21,0.204,0.908,0.098,-16.829,0.0424,62.149,0.0693,687733,1,1_0,Deep Calm & Minimal


In [29]:
# Keep only the rows that carry a subcluster assignment (drop NaN in that column).
df = df.dropna(subset=["subcluster"]).copy()
print("Righe con subcluster:", df.shape)

Righe con subcluster: (169909, 18)


# Cell 2 - Selecting Audio features from the dataset

In [30]:
# Audio descriptors used as model inputs (column order defines the feature order).
feature_cols = [
    "acousticness", "danceability", "energy", "instrumentalness", "liveness",
    "loudness", "speechiness", "tempo", "valence", "duration_ms",
]

# Feature matrix: one row per DataFrame row, one column per entry of feature_cols.
X = df[feature_cols].values

# Target: the subcluster identifier as a string (e.g. "2_5").
y_str = df["subcluster"].astype(str).values

# Encode the subcluster strings as integer class ids 0..K-1.
le = LabelEncoder().fit(y_str)
y = le.transform(y_str)

# K = number of distinct subclusters seen by the encoder.
num_classes = len(le.classes_)
print("Numero di subcluster (classi):", num_classes)
print("Classi:", le.classes_)


Numero di subcluster (classi): 11
Classi: ['0_0' '0_1' '1_0' '1_1' '1_2' '2_0' '2_1' '2_2' '2_3' '2_4' '2_5']


# Cell 3 - Train/val/test split + scaler

In [31]:
# Stage 1: hold out 20% of the samples, stratified on the class labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Stage 2: cut the held-out part in half -> validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# Standardisation statistics are estimated on the training split only and then applied
# unchanged to all three splits, so validation/test data never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

Train: (135927, 10) Val: (16991, 10) Test: (16991, 10)


# Cell 4 - Dataset & DataLoader PyTorch

In [32]:
class SpotifyClusterDataset(Dataset):
    """In-memory dataset pairing a feature matrix with integer class labels.

    Both inputs are copied into tensors once, at construction time:
    ``X`` as float32 and ``y`` as int64.
    """

    def __init__(self, X, y):
        # int64 ("long") targets are what nn.CrossEntropyLoss expects for class indices.
        self.y = torch.tensor(y, dtype=torch.long)
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        # Number of samples = size of the first dimension of the feature tensor.
        return self.X.shape[0]

    def __getitem__(self, idx):
        # Return the (features, label) pair stored at position idx.
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Wrap each (scaled features, labels) split in a tensor-backed dataset.
train_ds, val_ds, test_ds = (
    SpotifyClusterDataset(features, labels)
    for features, labels in (
        (X_train_scaled, y_train),
        (X_val_scaled, y_val),
        (X_test_scaled, y_test),
    )
)

# Mini-batch size shared by the three loaders.
BATCH_SIZE = 256

# Only the training loader shuffles; validation and test keep a fixed sample order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)


# Cell 5 - Definition of the MLP model

In [33]:
# Width of each hidden layer (128 is the alternative that was used previously).
hidden_dim = 64
# One network input per selected audio feature.
input_dim = len(feature_cols)

class MLPCluster(nn.Module):
    """Two-hidden-layer MLP that maps a feature vector to ``num_classes`` logits.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    The output is raw logits (no softmax), which is what ``nn.CrossEntropyLoss``
    consumes.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of both hidden layers.
        num_classes: Number of output classes (size of the logit vector).
        dropout: Drop probability applied after each hidden activation.
            Defaults to 0.2, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, dropout=0.2):
        super().__init__()
        # The layer order (and therefore the "net.<index>" state-dict keys) is kept
        # unchanged so previously saved checkpoints still load.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),  # one logit per class
        )

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        return self.net(x)

# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
# getattr guards against PyTorch builds that do not ship the torch.backends.mps module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# NOTE(review): no PyTorch RNG seed is set in the visible notebook, so weight
# initialisation differs between runs; consider torch.manual_seed(...) in the config cell.
model = MLPCluster(input_dim, hidden_dim, num_classes).to(device)

# Multi-class objective on raw logits, optimised with Adam at learning rate 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Cell 6 - Training and validation loop (one pass of each per epoch)

In [38]:
def run_epoch(loader, model, criterion, optimizer=None, device=None):
    """Run one full pass over ``loader`` and return ``(avg_loss, accuracy)``.

    With an ``optimizer`` the model is put in train mode and its weights are updated
    after every batch. Without one the model is put in eval mode and the pass runs
    with gradient tracking disabled, so no autograd graph is built during validation.

    Args:
        loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        model: The ``nn.Module`` to train or evaluate.
        criterion: Loss callable ``criterion(logits, y_batch)``. The per-sample
            weighting below assumes it returns the batch mean, which is the default
            for ``nn.CrossEntropyLoss``.
        optimizer: Optimizer used for the update step, or ``None`` for evaluation.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple ``(avg_loss, acc)``: the sample-weighted mean loss and the fraction of
        correctly classified samples. In training mode both are measured on the
        train-mode forward pass (e.g. with dropout active, if the model has any).

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    total = 0
    correct = 0

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight the batch-mean loss by the batch size so a smaller last batch
            # does not skew the epoch average.
            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            total += batch_size

            preds = torch.argmax(logits, dim=1)
            correct += (preds == y_batch).sum().item()

    if total == 0:
        raise ValueError("run_epoch received an empty loader: no samples to average over")

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc


EPOCHS = 20

# Best validation metrics seen so far, plus a snapshot of the weights that produced them.
best_val_loss = float("inf")
best_val_acc = 0.0
best_state_dict = None

# NOTE(review): the loop trains whatever weights `model` currently holds, so re-running
# this cell resumes training rather than starting over. Re-run the model-definition cell
# first when a fresh run is intended.
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = run_epoch(train_loader, model, criterion, optimizer)
    val_loss, val_acc = run_epoch(val_loader, model, criterion, optimizer=None)

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"train loss: {train_loss:.4f}, acc: {train_acc:.3f} | "
        f"val loss: {val_loss:.4f}, acc: {val_acc:.3f}"
    )

    # Model selection: validation loss is the main criterion; on a (near-)tie the epoch
    # with the higher validation accuracy wins.
    is_better = (val_loss < best_val_loss) or (
        np.isclose(val_loss, best_val_loss) and val_acc > best_val_acc
    )
    if is_better:
        best_val_loss = val_loss
        best_val_acc = val_acc
        # state_dict() hands back references to the live parameter tensors, which later
        # optimizer steps overwrite in place. Clone them (on CPU) so the snapshot really
        # stays frozen at this epoch instead of silently tracking the last one.
        best_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# After training, restore the best weights found (load_state_dict copies them back onto
# the model's own device).
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(
        f"Loaded best model: val_loss={best_val_loss:.4f}, "
        f"val_acc={best_val_acc:.3f}"
    )
else:
    print("Warning: nessun best_state_dict salvato (controlla il loop di training).")


Epoch 1/20 | train loss: 0.1732, acc: 0.933 | val loss: 0.1274, acc: 0.952
Epoch 2/20 | train loss: 0.1697, acc: 0.934 | val loss: 0.1276, acc: 0.951
Epoch 3/20 | train loss: 0.1696, acc: 0.934 | val loss: 0.1267, acc: 0.951
Epoch 4/20 | train loss: 0.1704, acc: 0.934 | val loss: 0.1255, acc: 0.951
Epoch 5/20 | train loss: 0.1697, acc: 0.933 | val loss: 0.1259, acc: 0.951
Epoch 6/20 | train loss: 0.1688, acc: 0.934 | val loss: 0.1244, acc: 0.951
Epoch 7/20 | train loss: 0.1669, acc: 0.935 | val loss: 0.1255, acc: 0.951
Epoch 8/20 | train loss: 0.1674, acc: 0.935 | val loss: 0.1247, acc: 0.952
Epoch 9/20 | train loss: 0.1663, acc: 0.936 | val loss: 0.1256, acc: 0.950
Epoch 10/20 | train loss: 0.1648, acc: 0.936 | val loss: 0.1228, acc: 0.951
Epoch 11/20 | train loss: 0.1653, acc: 0.936 | val loss: 0.1237, acc: 0.952
Epoch 12/20 | train loss: 0.1655, acc: 0.935 | val loss: 0.1229, acc: 0.952
Epoch 13/20 | train loss: 0.1640, acc: 0.936 | val loss: 0.1230, acc: 0.953
Epoch 14/20 | train l

# Cell 7 - Evaluation on the test set

In [39]:
# Inference on the held-out test split: eval mode, no gradient tracking.
model.eval()
pred_chunks, true_chunks = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Predicted class = index of the largest logit in each row.
        batch_logits = model(X_batch.to(device))
        pred_chunks.append(batch_logits.argmax(dim=1).cpu().numpy())
        true_chunks.append(y_batch.cpu().numpy())

# Stitch the per-batch arrays into one flat array each.
all_preds = np.concatenate(pred_chunks)
all_true = np.concatenate(true_chunks)

test_accuracy = (all_preds == all_true).mean()
print("Test accuracy:", test_accuracy)

# Per-class precision / recall / F1, labelled with the original subcluster names.
print("\nClassification report:")
print(classification_report(all_true, all_preds, target_names=le.classes_))


Test accuracy: 0.9501500794538285

Classification report:
              precision    recall  f1-score   support

         0_0       0.98      0.98      0.98       258
         0_1       0.92      0.95      0.94       129
         1_0       0.95      0.96      0.96      1337
         1_1       0.96      0.94      0.95      1278
         1_2       0.94      0.95      0.95       717
         2_0       0.95      0.93      0.94      2190
         2_1       0.94      0.96      0.95      2714
         2_2       0.96      0.96      0.96      3470
         2_3       0.95      0.96      0.96      1962
         2_4       0.93      0.92      0.92      1974
         2_5       0.95      0.97      0.96       962

    accuracy                           0.95     16991
   macro avg       0.95      0.95      0.95     16991
weighted avg       0.95      0.95      0.95     16991



# Cell 8 - Saving Model & Scaler

In [41]:
MODEL_DIR = os.path.join("..", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Save CPU copies of the weights so the checkpoint does not carry the training device's
# tag (e.g. "mps") and can be loaded on any machine without a map_location argument.
cpu_state_dict = {
    name: tensor.detach().cpu() for name, tensor in model.state_dict().items()
}
torch.save(cpu_state_dict, os.path.join(MODEL_DIR, "mlp_subcluster.pth"))

# Standardisation statistics needed to reproduce the training-time scaling at inference.
# NOTE(review): the feature order (feature_cols) is not saved; consumers must apply these
# arrays to the same columns in the same order.
np.save(os.path.join(MODEL_DIR, "scaler_mean.npy"), scaler.mean_)
np.save(os.path.join(MODEL_DIR, "scaler_scale.npy"), scaler.scale_)

# le.classes_ may be an object-dtype array (the encoder was fitted on Python strings),
# which np.save would pickle and np.load would refuse without allow_pickle=True.
# Casting to a fixed-width string dtype keeps the file pickle-free.
np.save(
    os.path.join(MODEL_DIR, "label_encoder_classes.npy"),
    np.asarray(le.classes_, dtype=str),
)


In [None]:
# NOTE(review): second definition of run_epoch in this notebook; once this cell runs it
# shadows the one defined in Cell 6. Kept behaviourally aligned with it.
def run_epoch(loader, model, criterion, optimizer=None, device=None):
    """Run one full pass over ``loader`` and return ``(avg_loss, accuracy)``.

    With an ``optimizer`` the model is put in train mode and its weights are updated
    after every batch. Without one the model is put in eval mode and the pass runs
    with gradient tracking disabled, so no autograd graph is built during validation.

    Args:
        loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        model: The ``nn.Module`` to train or evaluate.
        criterion: Loss callable ``criterion(logits, y_batch)``. The per-sample
            weighting below assumes it returns the batch mean, which is the default
            for ``nn.CrossEntropyLoss``.
        optimizer: Optimizer used for the update step, or ``None`` for evaluation.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Tuple ``(avg_loss, acc)``: the sample-weighted mean loss and the fraction of
        correctly classified samples. In training mode both are measured on the
        train-mode forward pass (e.g. with dropout active, if the model has any).

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    if device is None:
        # Follow the model instead of relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    total = 0
    correct = 0

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight the batch-mean loss by the batch size so a smaller last batch
            # does not skew the epoch average.
            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            total += batch_size

            preds = torch.argmax(logits, dim=1)
            correct += (preds == y_batch).sum().item()

    if total == 0:
        raise ValueError("run_epoch received an empty loader: no samples to average over")

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc

# NOTE(review): this cell repeats the Cell 6 training loop without best-model tracking and
# sits after the save cell, so on a top-to-bottom run it trains the already-saved model
# for 20 more epochs whose weights are never written to disk. It looks like a leftover
# from an earlier iteration - confirm whether it can be removed.
EPOCHS = 20

for epoch in range(1, EPOCHS + 1):
    # One optimisation pass over the training data, then a pass over the validation data
    # (no optimizer is passed, so that pass performs no weight updates).
    train_loss, train_acc = run_epoch(train_loader, model, criterion, optimizer=optimizer)
    val_loss, val_acc = run_epoch(val_loader, model, criterion)

    epoch_summary = (
        f"Epoch {epoch}/{EPOCHS} | "
        f"train loss: {train_loss:.4f}, acc: {train_acc:.3f} | "
        f"val loss: {val_loss:.4f}, acc: {val_acc:.3f}"
    )
    print(epoch_summary)


Epoch 1/20 | train loss: 0.5928, acc: 0.804 | val loss: 0.2279, acc: 0.917
Epoch 2/20 | train loss: 0.2929, acc: 0.887 | val loss: 0.1929, acc: 0.928
Epoch 3/20 | train loss: 0.2603, acc: 0.900 | val loss: 0.1790, acc: 0.932
Epoch 4/20 | train loss: 0.2403, acc: 0.907 | val loss: 0.1660, acc: 0.938
Epoch 5/20 | train loss: 0.2249, acc: 0.913 | val loss: 0.1580, acc: 0.936
Epoch 6/20 | train loss: 0.2177, acc: 0.915 | val loss: 0.1524, acc: 0.941
Epoch 7/20 | train loss: 0.2091, acc: 0.919 | val loss: 0.1474, acc: 0.943
Epoch 8/20 | train loss: 0.2032, acc: 0.921 | val loss: 0.1417, acc: 0.945
Epoch 9/20 | train loss: 0.1974, acc: 0.923 | val loss: 0.1419, acc: 0.945
Epoch 10/20 | train loss: 0.1948, acc: 0.925 | val loss: 0.1395, acc: 0.947
Epoch 11/20 | train loss: 0.1927, acc: 0.925 | val loss: 0.1370, acc: 0.948
Epoch 12/20 | train loss: 0.1887, acc: 0.926 | val loss: 0.1358, acc: 0.947
Epoch 13/20 | train loss: 0.1854, acc: 0.927 | val loss: 0.1343, acc: 0.948
Epoch 14/20 | train l