# Cell 1 - Import & Data loading

In [15]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch import nn
import torch.nn.functional as F

In [16]:
# Cartelle
DATA_PROCESSED_DIR = os.path.join("..", "data", "processed")
MODEL_DIR = os.path.join("..", "models")

# File
data_path = os.path.join(DATA_PROCESSED_DIR, "spotify_dataset_clustered.csv")
model_path = os.path.join(MODEL_DIR, "mlp_subcluster.pth")
scaler_mean_path = os.path.join(MODEL_DIR, "scaler_mean.npy")
scaler_scale_path = os.path.join(MODEL_DIR, "scaler_scale.npy")
le_classes_path = os.path.join(MODEL_DIR, "label_encoder_classes.npy")

# Audio-features usate dal modello
feature_cols = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
    "duration_ms",
]

print("DATA_PROCESSED_DIR:", DATA_PROCESSED_DIR)
print("MODEL_DIR:", MODEL_DIR)


DATA_PROCESSED_DIR: ../data/processed
MODEL_DIR: ../models


In [17]:
df = pd.read_csv(data_path)
print("Shape dataset:", df.shape)
df.head()

Shape dataset: (169909, 25)


Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,...,macro_cluster,subcluster,subcluster_label,is_kids,is_christmas,is_nursery,is_religious,is_soundtrack,language_raw,main_language
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0,1928,0.995,0.708,0.195,0.563,0.151,...,1,1_1,Warm Emotional Acoustic,False,False,False,False,False,de,de
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0,1928,0.994,0.379,0.0135,0.901,0.0763,...,1,1_0,Deep Minimal Calm,False,False,False,False,True,de,de
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0,1928,0.604,0.749,0.22,0.0,0.119,...,0,0_0,Spoken Chill & Emotional,False,False,False,False,False,pl,other
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0,1928,0.995,0.781,0.13,0.887,0.111,...,1,1_1,Warm Emotional Acoustic,False,False,False,False,False,pt,pt
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",1,1928,0.99,0.21,0.204,0.908,0.098,...,1,1_0,Deep Minimal Calm,False,False,False,False,False,en,en


# Cell 2 - Loading scaler, label encoder & defining MLP model

In [18]:
# Scaler
scaler_mean = np.load(scaler_mean_path)
scaler_scale = np.load(scaler_scale_path)

scaler = StandardScaler()
scaler.mean_ = scaler_mean
scaler.scale_ = scaler_scale

# Classi del LabelEncoder
le_classes = np.load(le_classes_path, allow_pickle=True).astype(str)
num_classes = len(le_classes)

# Modello
input_dim = len(feature_cols)
hidden_dim = 64


class MLPCluster(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        return self.net(x)


device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

model = MLPCluster(input_dim, hidden_dim, num_classes).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

print("Model loaded. Num classes:", num_classes)
print("Classes:", le_classes)


Using device: mps
Model loaded. Num classes: 10
Classes: ['0_0' '1_0' '1_1' '1_2' '2_0' '2_1' '2_2' '2_3' '2_4' '2_5']


# Cell 3 - Precompute: feature matrix & subcluster summary

In [19]:
# Matrice feature globale
X_raw = df[feature_cols].values
X_scaled = scaler.transform(X_raw)

print("X_raw shape:", X_raw.shape)
print("X_scaled shape:", X_scaled.shape)

subcluster_summary = (
    df.groupby("subcluster")[feature_cols]
      .mean()
      .sort_index()
)

subcluster_summary.head()


X_raw shape: (169909, 10)
X_scaled shape: (169909, 10)


Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,duration_ms
subcluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0_0,0.467771,0.671075,0.255682,0.00513,0.330862,-18.688232,0.91486,107.482467,0.544188,182201.38668
1_0,0.905267,0.308582,0.156997,0.800893,0.167886,-20.604054,0.048183,96.757403,0.178934,326022.307802
1_1,0.901308,0.543354,0.330438,0.747177,0.20097,-14.205328,0.075352,117.093294,0.641392,187394.628187
1_2,0.123278,0.531765,0.705924,0.676251,0.195493,-9.490165,0.058513,123.730674,0.556025,262283.212502
2_0,0.792156,0.621782,0.371402,0.019055,0.183122,-11.839087,0.089119,113.027914,0.713708,174914.838836


# Cell 4 - Problematic genre flags

In [20]:
# Flag problematici nel dataset
PROBLEMATIC_FLAG_COLS = [
    "is_kids",
    "is_christmas",
    "is_nursery",
    "is_religious",
    "is_soundtrack",
]
# Mappa mood → categorie da includere
MOOD_FLAG_MAP = {
    "kids": ["is_kids", "is_nursery"],
    "children": ["is_kids", "is_nursery"],
    "nursery": ["is_nursery"],
    "christmas": ["is_christmas"],
    "xmas": ["is_christmas"],
    "holiday": ["is_christmas"],
    "religious": ["is_religious"],
    "gospel": ["is_religious"],
}


# Cell 5 - Helpers for profile targeting (mood, activity, weather, age)

In [21]:
# === 6. Costruzione profilo utente + range temporale (15–30 anni) ===

def build_target_profile(mood: str,
                         activity: str,
                         weather: str,
                         part_of_day: str,
                         age: int,
                         explorer: bool,
                         df_global: pd.DataFrame):
    """
    Costruisce un vettore 'base' nelle coordinate delle audio-features
    (acousticness, danceability, ...), usando euristiche su:
    - mood
    - activity
    - weather
    - part_of_day

    Inoltre, calcola:
      year_low  = anno quando l'utente aveva ~10–15 anni
      year_high = anno quando l'utente avrà ~30 anni
      year_pref = media (centro del range)

    Le feature sono centrate così (indicativo):
      - acousticness ↑ per input più tranquilli
      - danceability ↑ per input più tranquilli
      - energy ↑ per input più aggressivi/energetici
      - instrumentalness ↑ sia per tranquilli che per energetici
      - liveness ↑ per input più aggressivi
      - loudness ↑ per input più energetici
      - speechiness ↑ per input più tranquilli
      - tempo ↑ per input più aggressivi/energetici
      - valence: 0 ≈ sad, 0.5 ≈ angry, 1 ≈ happy
      - duration_ms: più lunga per mood/attività tranquille, più corta per gym/party

    Restituisce:
      base (dict delle feature target),
      (year_pref, year_low, year_high)
    """

    # --- Valori "base" neutri ---
    base = {
        "acousticness": 0.5,
        "danceability": 0.5,
        "energy": 0.5,
        "instrumentalness": 0.1,
        "liveness": 0.2,
        "loudness": -10.0,   # dBFS (valori più alti = meno negativi = più forti)
        "speechiness": 0.05,
        "tempo": 120.0,
        "valence": 0.5,
        "duration_ms": df_global["duration_ms"].median()
        if "duration_ms" in df_global.columns else 210_000,
    }

    m = (mood or "").lower().strip()
    a = (activity or "").lower().strip()
    w = (weather or "").lower().strip()
    d = (part_of_day or "").lower().strip()

    # ------------------------------------------------------------------
    # 1) Costruiamo dei punteggi astratti:
    #    - calm_level   → quanto l'input è "tranquillo"
    #    - energy_level → quanto è "energetico"
    #    - agg_level    → quanto è "aggressivo"
    #    Questi livelli poi guidano TUTTE le feature.
    # ------------------------------------------------------------------
    calm_level = 0.0
    energy_level = 0.0
    agg_level = 0.0

    # -----------------------
    # 1a. Mood → valence + livelli
    # -----------------------
    # Valence ancorata ai valori richiesti:
    #   sad   → ~0.1–0.2
    #   angry → 0.5
    #   happy → ~0.9
    #   relaxed → intermedio "felice ma soft" (~0.7)
    if m == "happy":
        base["valence"] = 0.9
        energy_level += 1.0
    elif m == "sad":
        base["valence"] = 0.15
        calm_level += 1.0
    elif m in ["relaxed", "calm", "chill"]:
        base["valence"] = 0.7
        calm_level += 1.5
    elif m in ["angry", "aggressive"]:
        base["valence"] = 0.5
        agg_level += 1.5
        energy_level += 1.0
    # Mood speciali (anche per filtraggio "problematic tracks")
    elif m in ["kids", "children", "nursery"]:
        base["valence"] = 0.9
        calm_level += 1.0
    elif m in ["christmas", "xmas", "holiday"]:
        base["valence"] = 0.8
        calm_level += 1.0
    elif m in ["religious", "gospel"]:
        base["valence"] = 0.6
        calm_level += 1.0
    else:
        # mood generico
        base["valence"] = 0.5

    # -----------------------
    # 1b. Activity → livelli
    # -----------------------
    calm_activities = ["study", "focus", "work", "reading", "chill", "chilling", "commute", "travel"]
    energetic_activities = ["gym", "workout", "run", "running", "party", "dancing", "dance"]

    if a in calm_activities:
        calm_level += 1.0
    if a in energetic_activities:
        energy_level += 1.5
        if a in ["party", "dancing", "dance"]:
            agg_level += 0.5  # party un po' più "spinto"

    # -----------------------
    # 1c. Weather & part_of_day come piccoli aggiustamenti
    # -----------------------
    # Meteo:
    if w in ["sunny", "clear"]:
        base["valence"] += 0.05
        energy_level += 0.3
    elif w in ["rainy", "storm", "stormy"]:
        base["valence"] -= 0.05
        calm_level += 0.3
    elif w in ["snow", "snowy"]:
        calm_level += 0.2

    # Fascia oraria:
    if d == "morning":
        energy_level += 0.2
    elif d in ["evening"]:
        # sera leggermente più "energetica" ma anche adatta a chill
        energy_level += 0.1
        calm_level += 0.1
    elif d in ["night", "late night"]:
        calm_level += 0.5
        energy_level -= 0.2

    # Clamp livelli a [0, 2] per evitare eccessi
    calm_level = float(np.clip(calm_level, 0.0, 2.0))
    energy_level = float(np.clip(energy_level, 0.0, 2.0))
    agg_level = float(np.clip(agg_level, 0.0, 2.0))

    # ------------------------------------------------------------------
    # 2) Mappiamo questi livelli sulle singole feature
    # ------------------------------------------------------------------

    # acousticness: ↑ con calma, ↓ con energia
    base["acousticness"] += 0.20 * calm_level - 0.10 * energy_level

    # danceability: richiesta ↑ per input più tranquilli
    base["danceability"] += 0.20 * calm_level - 0.05 * agg_level

    # energy: ↑ con energia/aggressività, ↓ con calma
    base["energy"] += 0.25 * energy_level + 0.20 * agg_level - 0.20 * calm_level

    # instrumentalness: ↑ sia per tranquilli che per energetici
    base["instrumentalness"] += 0.10 * calm_level + 0.10 * energy_level

    # liveness: ↑ per aggressivi / live-feel
    base["liveness"] += 0.15 * agg_level + 0.05 * energy_level

    # loudness (dBFS): valori più alti = meno negativi → più forti
    base["loudness"] += 3.0 * energy_level + 2.0 * agg_level - 2.0 * calm_level

    # speechiness: ↑ per input più tranquilli
    base["speechiness"] += 0.15 * calm_level - 0.10 * agg_level

    # tempo (BPM): ↑ per input aggressivi/energetici, ↓ per molto tranquilli
    base["tempo"] += 10.0 * energy_level + 8.0 * agg_level - 6.0 * calm_level

    # duration_ms:
    #   - più lunga per mood/attività tranquille
    #   - più corta per gym/party
    dur = base["duration_ms"]
    dur += 30_000 * calm_level   # +30s per livello di calma
    dur -= 20_000 * energy_level  # -20s per livello di energia
    base["duration_ms"] = max(90_000, float(dur))  # almeno 90s

    # ------------------------------------------------------------------
    # 3) SPECIAL STEERING per subcluster rari
    #    (stesse logiche di prima, ma sopra abbiamo dato un profilo sensato)
    #    Questi preset sovrascrivono il profilo quando il pattern di input matcha.
    # ------------------------------------------------------------------

    # 0_0 – Short Spoken Calm
    # relaxed + (study/work/reading) + evening/night
    if (
        m == "relaxed"
        and a in ["reading", "study", "work"]
        and d in ["evening", "night"]
    ):
        base = {
            "acousticness": 0.467771,
            "danceability": 0.671075,
            "energy": 0.255682,
            "instrumentalness": 0.005130,
            "liveness": 0.330862,
            "loudness": -18.688232,
            "speechiness": 0.914860,
            "tempo": 107.482467,
            "valence": 0.544188,
            "duration_ms": 182_201.0,
        }

    # 1_0 – Deep Calm & Minimal
    # sad + (study/work/reading) + evening/night
    if (
        m == "sad"
        and a in ["reading", "study", "work"]
        and d in ["evening", "night"]
    ):
        base = {
            "acousticness": 0.905267,
            "danceability": 0.308582,
            "energy": 0.156997,
            "instrumentalness": 0.800893,
            "liveness": 0.167886,
            "loudness": -20.604054,
            "speechiness": 0.048183,
            "tempo": 96.757403,
            "valence": 0.178934,
            "duration_ms": 326_022.0,
        }

    # 1_2 – Epic Intense
    # angry/happy + (gym/run/party) + evening/night
    if (
        m in ["angry", "happy"]
        and a in ["gym", "workout", "run", "running", "party", "dancing", "dance"]
        and d in ["evening", "night"]
    ):
        base = {
            "acousticness": 0.123278,
            "danceability": 0.531765,
            "energy": 0.705924,
            "instrumentalness": 0.676251,
            "liveness": 0.195493,
            "loudness": -9.490165,
            "speechiness": 0.058513,
            "tempo": 123.730674,
            "valence": 0.556025,
            "duration_ms": 262_283.0,
        }

    # 2_3 – Soft Sad Calm
    # sad/relaxed + (chill/commute) + evening/night + meteo non "super soleggiato"
    if (
        m in ["sad", "relaxed"]
        and a in ["chill", "chilling", "commute", "travel"]
        and d in ["evening", "night"]
        and w in ["rainy", "snow", "snowy", "cloudy", "stormy"]
    ):
        base = {
            "acousticness": 0.866599,
            "danceability": 0.406124,
            "energy": 0.196410,
            "instrumentalness": 0.024760,
            "liveness": 0.163935,
            "loudness": -15.543252,
            "speechiness": 0.044752,
            "tempo": 96.954149,
            "valence": 0.285893,
            "duration_ms": 209_920.0,
        }

    # 2_5 – Energetic Live Mood
    # happy/relaxed + party/gym + evening/night + meteo buono
    if (
        m in ["happy", "relaxed"]
        and a in ["party", "gym", "workout", "run", "running", "dancing", "dance"]
        and d in ["evening", "night"]
        and w in ["sunny", "clear"]
    ):
        base = {
            "acousticness": 0.430542,
            "danceability": 0.502758,
            "energy": 0.602284,
            "instrumentalness": 0.029864,
            "liveness": 0.718388,
            "loudness": -9.929079,
            "speechiness": 0.101848,
            "tempo": 118.597994,
            "valence": 0.546971,
            "duration_ms": 254_856.0,
        }

    # ------------------------------------------------------------------
    # 4) Clamp delle feature [0,1] dove ha senso
    # ------------------------------------------------------------------
    for k in ["acousticness", "danceability", "energy",
              "instrumentalness", "liveness",
              "speechiness", "valence"]:
        base[k] = float(np.clip(base[k], 0.0, 1.0))

    # --- Range temporale: 15–30 anni di vita dell'utente ---
    age_clipped = int(np.clip(age, 15, 70))
    current_year = 2025

    year_low = current_year - (age_clipped - 10)
    year_high = current_year - (age_clipped - 30)
    if year_low > year_high:
        year_low, year_high = year_high, year_low
    year_pref = int((year_low + year_high) / 2)

    # Limitiamo agli anni effettivamente presenti nel dataset
    if "year" in df_global.columns and df_global["year"].notna().any():
        yrs = df_global["year"].dropna().values
        year_low = max(year_low, int(yrs.min()))
        year_high = min(year_high, int(yrs.max()))
        year_pref = int(np.clip(year_pref, yrs.min(), yrs.max()))

    return base, (year_pref, year_low, year_high)

# Cell 6 - Temporal score & Popularity score

In [22]:
# === 6. Funzioni di scoring ===

def temporal_score(years, year_pref, year_low, year_high, explorer: bool = False):
    years = np.asarray(years, dtype=float)

    score = np.ones_like(years, dtype=float)
    core_mask = (years >= year_low) & (years <= year_high)

    dist = np.zeros_like(years, dtype=float)
    left_mask = years < year_low
    right_mask = years > year_high

    dist[left_mask] = year_low - years[left_mask]
    dist[right_mask] = years[right_mask] - year_high
    base_decay = 0.05 if explorer else 0.25
    score[~core_mask] = np.exp(-base_decay * dist[~core_mask])

    return score


def popularity_score(pops, explorer: bool = False):
    pops = np.asarray(pops, dtype=float)
    if np.isnan(pops).all():
        return np.ones_like(pops) * 0.5

    p_norm = (pops - pops.min()) / (pops.max() - pops.min() + 1e-8)

    if not explorer:
        return np.power(p_norm, 1.5)
    else:
        return np.sqrt(p_norm)


def compute_weather_score(df_local: pd.DataFrame, weather: str):
    w = weather.lower().strip()
    val = df_local["valence"].values if "valence" in df_local.columns else np.zeros(len(df_local))
    en = df_local["energy"].values if "energy" in df_local.columns else np.zeros(len(df_local))
    ac = df_local["acousticness"].values if "acousticness" in df_local.columns else np.zeros(len(df_local))

    if w in ["sunny", "clear"]:
        score = 0.6 * val + 0.4 * en
    elif w in ["rainy", "storm", "stormy"]:
        score = 0.6 * ac + 0.4 * (1 - val)
    elif w in ["snow", "snowy"]:
        score = 0.5 * ac + 0.5 * val
    else:
        score = 0.5 * val + 0.5 * en

    score = (score - score.min()) / (score.max() - score.min() + 1e-8)
    return score


def compute_part_of_day_score(df_local: pd.DataFrame, part_of_day: str):
    d = part_of_day.lower().strip()
    tempo = df_local["tempo"].values if "tempo" in df_local.columns else np.zeros(len(df_local))
    en = df_local["energy"].values if "energy" in df_local.columns else np.zeros(len(df_local))
    ac = df_local["acousticness"].values if "acousticness" in df_local.columns else np.zeros(len(df_local))

    if d == "morning":
        score = 0.5 * en + 0.5 * (tempo / (tempo.max() + 1e-8))
    elif d in ["evening"]:
        score = 0.5 * en + 0.5 * (1 - ac)
    elif d in ["night", "late night"]:
        score = 0.7 * ac + 0.3 * (1 - tempo / (tempo.max() + 1e-8))
    else:
        score = 0.5 * en + 0.5 * (tempo / (tempo.max() + 1e-8))

    score = (score - score.min()) / (score.max() - score.min() + 1e-8)
    return score


def compute_user_taste_score(df_local: pd.DataFrame,
                             fav_artists=None,
                             explorer: bool = False):
    if fav_artists is None or len(fav_artists) == 0:
        return np.ones(len(df_local)) * 0.5

    fav_clean = [a.strip().lower() for a in fav_artists if a.strip() != ""]
    if len(fav_clean) == 0:
        return np.ones(len(df_local)) * 0.5

    if "artist_name" not in df_local.columns:
        return np.ones(len(df_local)) * 0.5

    artists = df_local["artist_name"].astype(str).str.lower().values
    score = np.zeros(len(df_local), dtype=float)

    for i, art in enumerate(artists):
        if any(fa == art for fa in fav_clean):
            score[i] = 1.0
        elif any(fa in art for fa in fav_clean):
            score[i] = 0.7
        else:
            score[i] = 0.2

    if explorer:
        score = 0.3 + 0.7 * score

    score = (score - score.min()) / (score.max() - score.min() + 1e-8)
    return score

## Cell 8 - Building profile vector with MLP

In [23]:
def build_full_feature_vector_from_profile(profile_dict: dict):
    return np.array([profile_dict[c] for c in feature_cols], dtype=float)


def predict_subcluster_from_profile(profile_dict: dict):
    """
    Usa l'MLP addestrato per predire il subcluster più probabile
    dato il profilo utente.

    Qui applichiamo anche un piccolo bias ai logits per dare
    una chance in più ai subcluster rari.
    """
    x = build_full_feature_vector_from_profile(profile_dict).reshape(1, -1)
    x_scaled = scaler.transform(x)

    with torch.no_grad():
        logits = model(torch.tensor(x_scaled, dtype=torch.float32, device=device))

        # Bias morbido per cluster rari
        cluster_prior = {
            "0_0": 0.4,
            "1_0": 0.5,
            "1_2": 0.6,
            "2_3": 0.5,
            "2_5": 0.8,
        }
        prior = torch.zeros_like(logits)
        for idx, label in enumerate(le_classes):
            bias = cluster_prior.get(str(label), 0.0)
            if bias != 0.0:
                prior[0, idx] = bias

        logits = logits + prior
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    pred_idx = int(np.argmax(probs))
    subcluster_pred = le_classes[pred_idx]
    return subcluster_pred, probs


def find_neighbour_subclusters(profile_dict: dict, top_k: int = 3):
    x = np.array([profile_dict[c] for c in feature_cols], dtype=float).reshape(1, -1)
    x_scaled = scaler.transform(x)

    centers = subcluster_summary[feature_cols].values
    centers_scaled = scaler.transform(centers)

    sims = cosine_similarity(x_scaled, centers_scaled)[0]
    order = np.argsort(-sims)

    neigh_subclusters = subcluster_summary.index[order[:top_k]].tolist()
    return neigh_subclusters, sims[order[:top_k]], order[:top_k]


## Cell 9 - Checking for Favourite Artists

In [24]:
def _is_fav_artist_series(artist_series: pd.Series, fav_set):
    if not fav_set:
        return pd.Series(False, index=artist_series.index)

    lower = artist_series.astype(str).str.lower()

    def check_name(name: str) -> bool:
        return any(fav in name for fav in fav_set)

    return lower.apply(check_name)

# Cell 10 - Main

In [25]:
def recommend_playlist(mood: str,
                       activity: str,
                       part_of_day: str,
                       weather: str,
                       age: int,
                       explorer: bool,
                       n: int = 10,
                       fav_artists=None,
                       language_prefs=None):

    if fav_artists is None:
        fav_artists = []
    if language_prefs is None:
        language_prefs = []

    profile, (year_pref, year_low, year_high) = build_target_profile(
        mood=mood,
        activity=activity,
        weather=weather,
        part_of_day=part_of_day,
        age=age,
        explorer=explorer,
        df_global=df
    )

    df_local = df.copy()

    mood_clean = (mood or "").strip().lower()
    is_special_mood = mood_clean in MOOD_FLAG_MAP

    if any(col in df_local.columns for col in PROBLEMATIC_FLAG_COLS):
        if is_special_mood:
            cols = [c for c in MOOD_FLAG_MAP[mood_clean] if c in df_local.columns]
            if cols:
                mask_special = np.zeros(len(df_local), dtype=bool)
                for c in cols:
                    mask_special |= df_local[c].fillna(False).to_numpy().astype(bool)
                df_local = df_local[mask_special].copy()
        else:
            mask_keep = np.ones(len(df_local), dtype=bool)
            for c in PROBLEMATIC_FLAG_COLS:
                if c in df_local.columns:
                    mask_keep &= ~df_local[c].fillna(False).to_numpy().astype(bool)
            df_local = df_local[mask_keep].copy()

    if df_local.empty:
        df_local = df.copy()
        if "is_kids" in df_local.columns:
            df_local = df_local[df_local["is_kids"] == False].copy()

    if mood_clean not in ["kids", "children", "nursery"]:
        if "is_kids" in df_local.columns:
            df_local = df_local[df_local["is_kids"] == False].copy()

    langs_clean = {str(l).strip().lower() for l in language_prefs if l and str(l).strip() != ""}
    if langs_clean and ("main_language" in df_local.columns):
        mask_lang = df_local["main_language"].astype(str).str.lower().isin(langs_clean)
        df_lang_filtered = df_local[mask_lang].copy()
        if not df_lang_filtered.empty:
            df_local = df_lang_filtered
        else:
            print(f"⚠️ Nessun brano trovato per le lingue richieste {langs_clean}. "
                  f"Ignoro il filtro lingua e uso tutte le lingue disponibili.")

    mask_local = df.index.isin(df_local.index)
    X_local = X_scaled[mask_local]

    subcluster_pred, probs = predict_subcluster_from_profile(profile)

    neighbour_subclusters, sim_sub, idx_order = find_neighbour_subclusters(profile, top_k=3)
    if subcluster_pred not in neighbour_subclusters:
        neighbour_subclusters = [subcluster_pred] + neighbour_subclusters[:-1]

    x_target_raw = build_full_feature_vector_from_profile(profile).reshape(1, -1)
    x_target_scaled = scaler.transform(x_target_raw)
    mood_sim = cosine_similarity(x_target_scaled, X_local)[0]

    sub = df_local["subcluster"].astype(str)
    macro = df_local["macro_cluster"]

    macro_pred = int(str(subcluster_pred).split("_")[0])

    if not explorer:
        same_sub = 1.0
        neighbour = 0.8
        same_macro = 0.4
        other_macro = 0.1
    else:
        same_sub = 1.0
        neighbour = 0.9
        same_macro = 0.6
        other_macro = 0.3

    cluster_bonus = np.zeros(len(df_local), dtype=float)
    cluster_bonus[sub == subcluster_pred] = same_sub
    cluster_bonus[(sub.isin(neighbour_subclusters)) & (sub != subcluster_pred)] = neighbour
    cluster_bonus[(~sub.isin(neighbour_subclusters)) & (macro == macro_pred)] = same_macro
    cluster_bonus[(macro != macro_pred)] = other_macro

    if "year" in df_local.columns:
        years_local = df_local["year"].fillna(year_pref).values
    else:
        years_local = np.ones(len(df_local)) * year_pref

    if "popularity" in df_local.columns:
        pops_local = df_local["popularity"].fillna(df_local["popularity"].mean()).values
    else:
        pops_local = np.ones(len(df_local)) * 50

    if is_special_mood:
        time_score_raw = np.ones(len(df_local), dtype=float)
    else:
        time_score_raw = temporal_score(years_local, year_pref, year_low, year_high, explorer)

    pop_score_raw = popularity_score(pops_local, explorer=explorer)
    weather_score_raw = compute_weather_score(df_local, weather)
    day_score_raw = compute_part_of_day_score(df_local, part_of_day)
    user_taste_raw = compute_user_taste_score(df_local, fav_artists=fav_artists, explorer=explorer)

    def _norm(arr):
        arr = np.asarray(arr, dtype=float)
        return (arr - arr.min()) / (arr.max() - arr.min() + 1e-8)

    mood_sim_norm = _norm(mood_sim)
    cluster_bonus_norm = _norm(cluster_bonus)
    time_score_norm = _norm(time_score_raw)
    pop_score_norm = _norm(pop_score_raw)
    weather_score_norm = _norm(weather_score_raw)
    day_score_norm = _norm(day_score_raw)
    user_taste_norm = _norm(user_taste_raw)

    mood_cluster_score = 0.6 * mood_sim_norm + 0.4 * cluster_bonus_norm

    w_taste = 0.35
    w_mood_cluster = 0.25
    w_time = 0.20
    w_pop = 0.10
    w_weather = 0.05
    w_day = 0.05

    final_score = (
        w_taste * user_taste_norm +
        w_mood_cluster * mood_cluster_score +
        w_time * time_score_norm +
        w_pop * pop_score_norm +
        w_weather * weather_score_norm +
        w_day * day_score_norm
    )

    result = df_local.copy()
    result["score"] = final_score
    result["user_taste_score"] = user_taste_norm
    result["mood_cluster_score"] = mood_cluster_score
    result["time_score"] = time_score_norm
    result["pop_score"] = pop_score_norm
    result["weather_score"] = weather_score_norm
    result["day_score"] = day_score_norm

    if (not explorer) and (not is_special_mood) and "year" in result.columns and "popularity" in result.columns:
        years_col = result["year"].fillna(year_pref)
        pops_col = result["popularity"].fillna(result["popularity"].mean())

        margin = 3
        low = max(year_low - margin, years_col.min())
        high = min(year_high + margin, years_col.max())

        mask = (
            (years_col >= low) &
            (years_col <= high) &
            (pops_col >= 30)
        )
        result = result[mask].copy()

    if "track_id" in result.columns:
        result = result.drop_duplicates("track_id")

    if "artist_name" in result.columns:
        result["artist_rank"] = result.groupby("artist_name").cumcount()
        result = result[result["artist_rank"] < 3].drop(columns=["artist_rank"])

    result_sorted = result.sort_values("score", ascending=False)

    if ("artist_name" in result_sorted.columns) and fav_artists:
        fav_set = {a.strip().lower() for a in fav_artists if a and a.strip() != ""}

        mask_fav = _is_fav_artist_series(result_sorted["artist_name"], fav_set)

        df_fav = result_sorted[mask_fav]
        df_other = result_sorted[~mask_fav]

        if explorer:
            min_other_ratio = 0.5
        else:
            min_other_ratio = 0.3

        target_other = int(np.ceil(min_other_ratio * n))

        n_other = min(target_other, len(df_other))
        pick_other = df_other.head(n_other)

        remaining_slots = n - len(pick_other)
        pick_fav = df_fav.head(remaining_slots)

        already_idx = set(pick_other.index) | set(pick_fav.index)
        leftover = result_sorted[~result_sorted.index.isin(already_idx)].head(
            n - len(pick_other) - len(pick_fav)
        )

        top_result = pd.concat([pick_other, pick_fav, leftover]).head(n)

    else:
        top_result = result_sorted.head(n)

    cols_show = [
        "track_id", "track_name", "artist_name", "genre", "year", "popularity",
        "macro_cluster", "subcluster", "subcluster_label",
        "score", "user_taste_score", "mood_cluster_score",
        "time_score", "pop_score", "weather_score", "day_score"
    ]
    cols_exist = [c for c in cols_show if c in top_result.columns]
    top_result = top_result[cols_exist]

    print(
        f"User input → mood='{mood}', activity='{activity}', part_of_day='{part_of_day}', "
        f"weather='{weather}', age={age}, explorer={explorer}, "
        f"fav_artists={fav_artists}, language_prefs={language_prefs}"
    )
    print(f"Predicted subcluster: {subcluster_pred}")
    print("Neighbour subclusters:", neighbour_subclusters)
    print(f"Preferred year center: {year_pref}, range=[{year_low}, {year_high}]")
    print("Candidate pool size (after mood/problematic/lingua filters):", len(result_sorted))

    if ("artist_name" in top_result.columns) and fav_artists:
        fav_set = {a.strip().lower() for a in fav_artists if a and a.strip() != ""}
        mask_fav_pl = _is_fav_artist_series(top_result["artist_name"], fav_set)
        n_fav_final = mask_fav_pl.sum()
        print(
            f"In playlist: {n_fav_final}/{n} brani di favourite artists "
            f"({n - n_fav_final} di altri artisti)."
        )
    return top_result

# EXAMPLES

In [26]:
# Esempio 1: giovane, happy party, sera, bel tempo, con gusti forti
playlist1 = recommend_playlist(
    mood="happy",
    activity="party",
    part_of_day="evening",
    weather="sunny",
    age=23,
    explorer=True,
    n=10,
    fav_artists=['Drake', 'the weeknd', 'justin bieber'],
    language_prefs=['en', 'it'] 
)
playlist1

User input → mood='happy', activity='party', part_of_day='evening', weather='sunny', age=23, explorer=True, fav_artists=['Drake', 'the weeknd', 'justin bieber'], language_prefs=['en', 'it']
Predicted subcluster: 2_5
Neighbour subclusters: ['2_5', '2_1', '0_0']
Preferred year center: 2020, range=[2012, 2020]
Candidate pool size (after mood/problematic/lingua filters): 38357
In playlist: 5/10 brani di favourite artists (5 di altri artisti).


Unnamed: 0,track_id,track_name,artist_name,year,popularity,macro_cluster,subcluster,subcluster_label,score,user_taste_score,mood_cluster_score,time_score,pop_score,weather_score,day_score
135033,2fWSwWmKRuyioqIzOzuQGo,Beast - Southpaw Remix,"['Rob Bailey & The Hustle Standard', 'Busta Rh...",2015,64,2,2_5,Energetic Live Mood,0.606119,0.0,0.966894,1.0,0.8,0.726623,0.961293
107125,6VrCmhRBFnuGKmtNfk4jDs,Spotlight,"['Marshmello', 'Lil Peep']",2018,76,2,2_5,Energetic Live Mood,0.606032,0.0,0.952154,1.0,0.87178,0.712332,0.903983
87365,2bJvI42r8EF3wxjOuDav4r,Time of Our Lives,"['Pitbull', 'Ne-Yo']",2014,77,2,2_5,Energetic Live Mood,0.605462,0.0,0.945778,1.0,0.877496,0.770314,0.855046
7640,7I80zTBTDNnbOiJQ0JLjZr,Set it Off,"['Dove Cameron', 'Sofia Carson', 'Cameron Boyc...",2015,59,2,2_5,Energetic Live Mood,0.605057,0.0,0.968742,1.0,0.768115,0.771131,0.950062
47638,5jlsg7onynIipVy5krMaRZ,Hair (feat. Sean Paul),"['Little Mix', 'Sean Paul']",2016,65,2,2_5,Energetic Live Mood,0.603856,0.0,0.951042,1.0,0.806226,0.794814,0.914648
125482,2xtIAFJIMkzHGHiCrJ9iwd,Lolly,"['Maejor', 'Juicy J', 'Justin Bieber']",2013,62,2,2_5,Energetic Live Mood,0.951221,1.0,0.929594,1.0,0.787401,0.875664,0.925994
97088,3Jv1XgMPVjdN5xGfnRJ20s,Pop That,"['French Montana', 'Rick Ross', 'Drake', 'Lil ...",2013,62,2,2_5,Energetic Live Mood,0.936082,1.0,0.899637,1.0,0.787401,0.73479,0.91386
87122,608a1wIsSd5KzMEqm1O7w3,I'm On One,"['DJ Khaled', 'Drake', 'Rick Ross', 'Lil Wayne']",2011,67,2,2_5,Energetic Live Mood,0.924795,1.0,0.916267,0.950709,0.818535,0.597795,0.876857
115771,1QBwk6GTCxVdC2hoSw9tlM,Lord Knows,"['Drake', 'Rick Ross']",2011,57,2,2_5,Energetic Live Mood,0.922288,1.0,0.93615,0.950709,0.754983,0.547775,0.904434
86984,1kMuU3TNQvHbqvXCWBodmP,Right Above It,"['Lil Wayne', 'Drake']",2010,68,2,2_5,Energetic Live Mood,0.91124,1.0,0.886864,0.903821,0.824621,0.626991,0.898969


In [27]:
# Esempio 2: studio, mood calm, notte, pioggia, esplorativo, senza fav_artists
playlist2 = recommend_playlist(
    mood="christmas",
    activity="dancing",
    part_of_day="morning",
    weather="snowy",
    age=50,
    explorer=False,
    n=5,
    fav_artists=None,
    language_prefs=None
)
playlist2


User input → mood='christmas', activity='dancing', part_of_day='morning', weather='snowy', age=50, explorer=False, fav_artists=[], language_prefs=[]
Predicted subcluster: 1_2
Neighbour subclusters: ['2_2', '1_2', '2_5']
Preferred year center: 1995, range=[1985, 2005]
Candidate pool size (after mood/problematic/lingua filters): 309


Unnamed: 0,track_id,track_name,artist_name,year,popularity,macro_cluster,subcluster,subcluster_label,score,user_taste_score,mood_cluster_score,time_score,pop_score,weather_score,day_score
133677,0oPdaY4dXtc3ZsaG17V972,Feliz Navidad,['José Feliciano'],2002,46,2,2_2,Happy Energetic Dance,0.28482,0.0,0.576347,0.0,0.595342,0.748745,0.875223
141883,3sBnSBnzpEYKpJYGsAIbup,Christmas Eve / Sarajevo 12/24 - Instrumental,['Trans-Siberian Orchestra'],1996,42,1,1_2,Epic Intense Instrumental,0.276312,0.0,0.722161,0.0,0.519402,0.060165,0.816467
160582,2HBo8LhUxBJBsxWaP6ePL3,Si La Ves (feat. Noel Schajris & Leonel García),"['Franco De Vita', 'Noel Schajris', 'Leonel Ga...",2011,63,2,2_5,Energetic Live Mood,0.270081,0.0,0.496654,0.0,0.954203,0.381344,0.628608
151587,6wn2nmFn3wDuiMldRiuRuL,Last Christmas - Remastered,['Wham!'],2006,56,2,2_2,Happy Energetic Dance,0.266212,0.0,0.526787,0.0,0.799672,0.516453,0.574514
135220,1QKDXegQxJBzSs8ZBNtGbT,A GHETTO CHRISTMAS CAROL,['XXXTENTACION'],2017,65,2,2_2,Happy Energetic Dance,0.260811,0.0,0.469582,0.0,1.0,0.092757,0.775556


In [28]:
# Esempio 2: studio, mood calm, notte, pioggia, esplorativo, senza fav_artists
playlist2 = recommend_playlist(
    mood="upbeat",
    activity="workout",
    part_of_day="morning",
    weather="sunny",
    age=23,
    explorer=False,
    n=20,
    fav_artists= ['Dua lipa', 'Taylor swift ', 'billie eilish', 'Lady gaga'],
    language_prefs= ['en']
)
playlist2


User input → mood='upbeat', activity='workout', part_of_day='morning', weather='sunny', age=23, explorer=False, fav_artists=['Dua lipa', 'Taylor swift ', 'billie eilish', 'Lady gaga'], language_prefs=['en']
Predicted subcluster: 1_2
Neighbour subclusters: ['2_1', '1_2', '2_2']
Preferred year center: 2020, range=[2012, 2020]
Candidate pool size (after mood/problematic/lingua filters): 9818
In playlist: 14/20 brani di favourite artists (6 di altri artisti).


Unnamed: 0,track_id,track_name,artist_name,year,popularity,macro_cluster,subcluster,subcluster_label,score,user_taste_score,mood_cluster_score,time_score,pop_score,weather_score,day_score
152631,4Clmg1g8n2yBYG13Z1oSdG,Baila Conmigo (feat. Kelly Ruiz),"['Dayvi', 'Victor Cardenas', 'Kelly Ruíz']",2018,82,1,1_2,Epic Intense Instrumental,0.593197,0.0,0.969311,1.0,0.742542,0.737444,0.794858
126108,5yAu4njFSdM47dfsTkQPZ2,Runnin' Down A Dream,['Tom Petty'],2019,72,1,1_2,Epic Intense Instrumental,0.584115,0.0,0.950976,1.0,0.61094,0.828297,0.877244
97370,3hWfKBt3n7j1xqIy6LA5ve,Astronomia,"['Vicetone', 'Tony Igy']",2016,81,1,1_2,Epic Intense Instrumental,0.576681,0.0,0.942756,1.0,0.729,0.585545,0.7763
152624,6ho0GyrWZN3mhi9zVRW7xi,Losing It,['FISHER'],2018,78,1,1_2,Epic Intense Instrumental,0.575194,0.0,0.910929,1.0,0.688877,0.787464,0.78402
15826,665AK0ovB11McZRwfa852G,Astronomia,"['Vicetone', 'Tony Igy']",2016,77,1,1_2,Epic Intense Instrumental,0.571349,0.0,0.942756,1.0,0.675672,0.585545,0.7763
97641,64lsIF5pw0sJY0gV5kz0RN,Can I Call You Tonight?,['Dayglow'],2019,80,1,1_2,Epic Intense Instrumental,0.568204,0.0,0.910576,1.0,0.715542,0.650061,0.730051
135468,21J2oC0K22hqWShv5XveuX,Break My Heart - Moon Boots Remix,"['Dua Lipa', 'Moon Boots']",2020,33,1,1_2,Epic Intense Instrumental,0.873917,1.0,0.925661,1.0,0.189571,0.761127,0.70976
87952,24ySl2hOPGCDcxBxFIqWBu,Rain On Me (with Ariana Grande),"['Lady Gaga', 'Ariana Grande']",2020,94,2,2_1,Intense High-Energy,0.852298,1.0,0.551308,1.0,0.911364,0.744794,0.72189
88035,7ju97lgwC2rKQ6wwsf9no9,Rain On Me (with Ariana Grande),"['Lady Gaga', 'Ariana Grande']",2020,87,2,2_1,Intense High-Energy,0.842309,1.0,0.551308,1.0,0.811482,0.744794,0.72189
88018,6R6ZoHTypt5lt68MWbzZXv,Sour Candy (with BLACKPINK),"['Lady Gaga', 'BLACKPINK']",2020,87,2,2_5,Energetic Live Mood,0.839283,1.0,0.520808,1.0,0.811482,0.835443,0.72322
