# Cell 1 - Imports & Data Loading

In [64]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [65]:
# Where the clustered Spotify table lives; the path is relative to the current
# working directory of the kernel.
DATA_PROCESSED_DIR = os.path.join("..", "data", "processed")
data_path = os.path.join(DATA_PROCESSED_DIR, "spotify_dataset_clustered.csv")

# Read the whole CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(data_path)
loaded_shape = df.shape
print("Dataset loaded:", loaded_shape)

# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Dataset loaded: (169909, 24)


Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,...,duration_ms,macro_cluster,subcluster,subcluster_label,is_kids,is_christmas,is_nursery,is_religious,language_raw,main_language
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0,1928,0.995,0.708,0.195,0.563,0.151,...,158648,1,1_1,Warm Emotional Acoustic,False,False,False,False,de,de
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0,1928,0.994,0.379,0.0135,0.901,0.0763,...,282133,1,1_0,Deep Minimal Calm,False,False,False,False,de,de
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0,1928,0.604,0.749,0.22,0.0,0.119,...,104300,0,0_0,Spoken Chill & Emotional,False,False,False,False,pl,other
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0,1928,0.995,0.781,0.13,0.887,0.111,...,180760,1,1_1,Warm Emotional Acoustic,False,False,False,False,pt,pt
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",1,1928,0.99,0.21,0.204,0.908,0.098,...,687733,1,1_0,Deep Minimal Calm,False,False,False,False,fr,fr


In [66]:
# Keep only the rows that have a subcluster value; the column is used as the
# classification target further down. `.copy()` detaches the result from the
# original frame.
has_subcluster = df["subcluster"].notna()
df = df.loc[has_subcluster].copy()
print("Righe con subcluster:", df.shape)

Righe con subcluster: (169909, 24)


# Cell 2 - Selecting Audio features from the dataset

In [68]:
# Audio descriptors fed to the classifier. The order of this list fixes the
# column order of X (and therefore of the scaler statistics saved later).
feature_cols = [
    "acousticness", "danceability", "energy", "instrumentalness", "liveness",
    "loudness", "speechiness", "tempo", "valence", "duration_ms",
]

# Feature matrix: one row per track, one column per entry of feature_cols.
X = df[feature_cols].to_numpy()

# Target: the subcluster identifier as a string (e.g. "2_5").
subcluster_as_text = df["subcluster"].astype(str)
y_str = subcluster_as_text.values

# Encode the subcluster strings as integer ids 0..K-1.
le = LabelEncoder()
y = le.fit_transform(y_str)

# K is the number of distinct subclusters seen by the encoder.
num_classes = len(le.classes_)
print("Numero di subcluster (classi):", num_classes)
print("Classi:", le.classes_)


Numero di subcluster (classi): 10
Classi: ['0_0' '1_0' '1_1' '1_2' '2_0' '2_1' '2_2' '2_3' '2_4' '2_5']


# Cell 3 - Train/val/test split + scaler

In [69]:
# Step 1: hold out 20% of the rows, stratified on the class ids.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: cut the hold-out in half (again stratified) to get validation and
# test sets, i.e. roughly 80/10/10 overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# Standardisation statistics are learned on the training rows only and then
# applied unchanged to the validation and test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Train: (135927, 10) Val: (16991, 10) Test: (16991, 10)


# Cell 4 - Dataset & DataLoader PyTorch

In [70]:
class SpotifyClusterDataset(Dataset):
    """In-memory dataset pairing a feature row with its class id.

    Args:
        X: array-like of features; stored as a ``float32`` tensor.
        y: array-like of integer class ids; stored as an ``int64`` (long)
            tensor, the target dtype used with ``CrossEntropyLoss``.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.long)
        self.X = features
        self.y = labels

    def __len__(self):
        # The number of samples is the length of the feature tensor.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        # Return the (features, label) pair stored at position ``idx``.
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Mini-batch size shared by all three loaders.
BATCH_SIZE = 256

# Wrap each scaled split, together with its labels, in a Dataset.
train_ds = SpotifyClusterDataset(X_train_scaled, y_train)
val_ds = SpotifyClusterDataset(X_val_scaled, y_val)
test_ds = SpotifyClusterDataset(X_test_scaled, y_test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)


# Cell 5 - Definition of the MLP model

In [71]:
# Width of the two hidden layers (64 here; 128 is the alternative noted by the author).
hidden_dim = 64
# One network input per selected audio feature.
input_dim = len(feature_cols)

class MLPCluster(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers with dropout, then a linear head.

    Args:
        input_dim: number of input features.
        hidden_dim: width of both hidden layers.
        num_classes: number of output logits (K classes).
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Layers are created in forward order and kept inside a Sequential
        # named ``net``, so parameter names are ``net.<position>.*``.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes),  # one logit per class
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        logits = self.net(x)
        return logits

# Seed PyTorch's global RNG before any weights are created so that weight
# initialisation, dropout and the default DataLoader shuffling start from the
# same state on every "Restart & Run All". (Bitwise-identical results on GPU
# backends are still not guaranteed.)
SEED = 42
torch.manual_seed(SEED)

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
# `torch.backends.mps` does not exist on older torch builds, hence the getattr guard.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = MLPCluster(input_dim, hidden_dim, num_classes).to(device)

# Multi-class cross-entropy on raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


# Cell 6 - Training loop (train/validation pass per epoch)

In [72]:
def run_epoch(loader, model, criterion, optimizer=None):
    """Run one full pass over ``loader`` and return ``(avg_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and the pass runs
    with autograd disabled, so no graph is built and no gradients are stored.

    Args:
        loader: iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        model: ``nn.Module`` mapping ``X_batch`` to per-class logits.
        criterion: loss callable taking ``(logits, y_batch)``; its result is
            treated as the batch mean and re-weighted by batch size.
        optimizer: optimizer for ``model``'s parameters, or ``None`` for a
            pure evaluation pass.

    Returns:
        Tuple ``(avg_loss, acc)``: sample-weighted mean loss and the fraction
        of samples whose arg-max logit equals the target.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level global. A parameter-less model leaves batches untouched.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total = 0
    correct = 0

    # Autograd is only needed when we are going to call backward().
    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in loader:
            if target_device is not None:
                X_batch = X_batch.to(target_device)
                y_batch = y_batch.to(target_device)

            logits = model(X_batch)
            loss = criterion(logits, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight the batch loss by its size so a smaller last batch does
            # not skew the epoch average.
            batch_size = y_batch.size(0)
            total_loss += loss.item() * batch_size
            total += batch_size

            preds = torch.argmax(logits, dim=1)
            correct += (preds == y_batch).sum().item()

    if total == 0:
        raise ValueError("run_epoch received an empty loader: no samples to average over.")

    avg_loss = total_loss / total
    acc = correct / total
    return avg_loss, acc


EPOCHS = 20

# Best-so-far bookkeeping, judged on the validation split.
best_val_loss = float("inf")
best_val_acc = 0.0
best_state_dict = None

for epoch in range(1, EPOCHS + 1):
    # One optimisation pass over the training data, then one evaluation pass
    # over the validation data (no optimizer => no weight updates).
    train_loss, train_acc = run_epoch(train_loader, model, criterion, optimizer)
    val_loss, val_acc = run_epoch(val_loader, model, criterion, optimizer=None)

    print(
        f"Epoch {epoch}/{EPOCHS} | "
        f"train loss: {train_loss:.4f}, acc: {train_acc:.3f} | "
        f"val loss: {val_loss:.4f}, acc: {val_acc:.3f}"
    )

    # Update the best model: validation loss is the main criterion; on a
    # (near-)tie the epoch with the higher validation accuracy wins.
    if (val_loss < best_val_loss) or (
        np.isclose(val_loss, best_val_loss) and val_acc > best_val_acc
    ):
        best_val_loss = val_loss
        best_val_acc = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them, otherwise the
        # "best" snapshot silently becomes the last-epoch weights.
        best_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# After training, restore the best weights found.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print(
        f"Loaded best model: val_loss={best_val_loss:.4f}, "
        f"val_acc={best_val_acc:.3f}"
    )
else:
    print("Warning: nessun best_state_dict salvato (controlla il loop di training).")


Epoch 1/20 | train loss: 0.5838, acc: 0.809 | val loss: 0.2167, acc: 0.921
Epoch 2/20 | train loss: 0.2830, acc: 0.892 | val loss: 0.1841, acc: 0.930
Epoch 3/20 | train loss: 0.2514, acc: 0.903 | val loss: 0.1706, acc: 0.932
Epoch 4/20 | train loss: 0.2308, acc: 0.910 | val loss: 0.1604, acc: 0.937
Epoch 5/20 | train loss: 0.2178, acc: 0.915 | val loss: 0.1552, acc: 0.939
Epoch 6/20 | train loss: 0.2081, acc: 0.919 | val loss: 0.1505, acc: 0.940
Epoch 7/20 | train loss: 0.1979, acc: 0.923 | val loss: 0.1438, acc: 0.943
Epoch 8/20 | train loss: 0.1938, acc: 0.925 | val loss: 0.1410, acc: 0.943
Epoch 9/20 | train loss: 0.1884, acc: 0.927 | val loss: 0.1370, acc: 0.944
Epoch 10/20 | train loss: 0.1840, acc: 0.928 | val loss: 0.1330, acc: 0.947
Epoch 11/20 | train loss: 0.1809, acc: 0.931 | val loss: 0.1327, acc: 0.948
Epoch 12/20 | train loss: 0.1782, acc: 0.932 | val loss: 0.1308, acc: 0.949
Epoch 13/20 | train loss: 0.1749, acc: 0.932 | val loss: 0.1283, acc: 0.949
Epoch 14/20 | train l

# Cell 7 - Evaluation on the test set

In [73]:
# Evaluation mode (dropout off) and no autograd bookkeeping while predicting.
model.eval()
pred_chunks = []
true_chunks = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        batch_logits = model(X_batch.to(device))
        # The predicted class is the index of the largest logit in each row.
        batch_preds = batch_logits.argmax(dim=1)

        pred_chunks.append(batch_preds.cpu().numpy())
        true_chunks.append(y_batch.cpu().numpy())

# Stitch the per-batch arrays into one flat array each.
all_preds = np.concatenate(pred_chunks)
all_true = np.concatenate(true_chunks)

print("Test accuracy:", (all_preds == all_true).mean())

# Per-class precision / recall / F1, labelled with the original subcluster names.
print("\nClassification report:")
print(classification_report(all_true, all_preds, target_names=le.classes_))


Test accuracy: 0.9489729856983109

Classification report:
              precision    recall  f1-score   support

         0_0       0.99      0.98      0.98       387
         1_0       0.95      0.97      0.96      1337
         1_1       0.95      0.94      0.95      1278
         1_2       0.93      0.96      0.94       717
         2_0       0.93      0.95      0.94      2190
         2_1       0.95      0.95      0.95      2714
         2_2       0.96      0.96      0.96      3470
         2_3       0.97      0.94      0.95      1962
         2_4       0.92      0.92      0.92      1974
         2_5       0.94      0.96      0.95       962

    accuracy                           0.95     16991
   macro avg       0.95      0.95      0.95     16991
weighted avg       0.95      0.95      0.95     16991



# Cell 8 - Saving Model & Scaler

In [76]:
MODEL_DIR = os.path.join("..", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Save CPU copies of the weights. Tensors are serialised with their device
# tag, so a checkpoint written from an accelerator (e.g. MPS) cannot be
# loaded on a machine without that backend unless `map_location` is passed.
cpu_state_dict = {
    name: tensor.detach().cpu() for name, tensor in model.state_dict().items()
}
torch.save(cpu_state_dict, os.path.join(MODEL_DIR, "mlp_subcluster.pth"))

# StandardScaler statistics needed to reproduce the input scaling at inference.
np.save(os.path.join(MODEL_DIR, "scaler_mean.npy"), scaler.mean_)
np.save(os.path.join(MODEL_DIR, "scaler_scale.npy"), scaler.scale_)

# Class names in encoder order (index i is class id i). Stored as a fixed-width
# unicode array so np.load works without allow_pickle=True.
# NOTE(review): le.classes_ is presumably an object-dtype array here (built
# from pandas strings), which np.save would otherwise pickle; confirm.
np.save(
    os.path.join(MODEL_DIR, "label_encoder_classes.npy"),
    np.asarray(le.classes_, dtype=str),
)
