# Cell 1 - import vari

In [2]:
import os
import joblib
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
MODEL_DIR = os.path.join("..", "models")
scaler_mean_path = os.path.join(MODEL_DIR, "scaler_mean.npy")
scaler_scale_path = os.path.join(MODEL_DIR, "scaler_scale.npy")
label_encoder_path = os.path.join(MODEL_DIR, "label_encoder_classes.npy")
model_path = os.path.join(MODEL_DIR, "mlp_subcluster.pth")

In [4]:
class MLPCluster(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_classes)
        )
    
    def forward(self, x):
        return self.net(x)

# Cell 2 - Setting and Running the model

In [5]:
scaler_mean = np.load(scaler_mean_path)
scaler_scale = np.load(scaler_scale_path)
print("Scaler caricato.")

Scaler caricato.


In [6]:
feature_cols = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "loudness", "speechiness", "tempo", "valence",
    "duration_ms"
]

le_classes = np.load(label_encoder_path, allow_pickle=True)
num_classes = len(le_classes)
print(f"Numero di classi: {num_classes}")

input_dim = len(feature_cols)
hidden_dim = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLPCluster(input_dim=input_dim, hidden_dim=hidden_dim, num_classes=num_classes).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

print(f"Modello caricato su: {device}")

Numero di classi: 11
Modello caricato su: cpu


In [7]:
DATA_PROCESSED_DIR = os.path.join("..", "data", "processed")
data_path = os.path.join(DATA_PROCESSED_DIR, "spotify_dataset_clustered.csv")

df = pd.read_csv(data_path)
print("Dataset loaded:", df.shape)
df.head()

Dataset loaded: (169909, 18)


Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,duration_ms,macro_cluster,subcluster,subcluster_label
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0,1928,0.995,0.708,0.195,0.563,0.151,-12.428,0.0506,118.469,0.779,158648,1,1_1,Warm Emotional Calm
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0,1928,0.994,0.379,0.0135,0.901,0.0763,-28.454,0.0462,83.972,0.0767,282133,1,1_2,Epic Intense
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0,1928,0.604,0.749,0.22,0.0,0.119,-19.924,0.929,107.177,0.88,104300,0,0_1,Long Spoken Emotional
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0,1928,0.995,0.781,0.13,0.887,0.111,-14.734,0.0926,108.003,0.72,180760,1,1_1,Warm Emotional Calm
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",1,1928,0.99,0.21,0.204,0.908,0.098,-16.829,0.0424,62.149,0.0693,687733,1,1_2,Epic Intense


In [8]:

df_model = df.dropna(subset=feature_cols).copy()
print("Dataset after dropna:", df_model.shape)

Dataset after dropna: (169909, 18)


In [9]:
from sklearn.preprocessing import StandardScaler

X = df_model[feature_cols].values.astype("float32")

scaler = StandardScaler()
scaler.mean_ = scaler_mean
scaler.scale_ = scaler_scale

X_scaled = scaler.transform(X)

with torch.no_grad():
    inputs = torch.tensor(X_scaled, dtype=torch.float32).to(device)
    outputs = model(inputs)
    predicted_classes = torch.argmax(outputs, dim=1).cpu().numpy()

df_model["predicted_subcluster"] = [le_classes[i] for i in predicted_classes]

print("Clusters assigned to dataset.")

Clusters assigned to dataset.


# Cell 4 - engnieering the output 

In [10]:
import numpy as np

# Drop PCA columns if they exist (gestisci se non ci sono)
columns_to_drop = [col for col in ["pca_x", "pca_y"] if col in df_model.columns]
if columns_to_drop:
    df_model = df_model.drop(columns=columns_to_drop)

# Compute distances to centroids
df_model['distance_to_centroid'] = 0.0

for cluster_id in df_model["macro_cluster"].unique():
    cluster_data = df_model[df_model["macro_cluster"] == cluster_id]
    cluster_features = cluster_data[feature_cols].values
    centroid = cluster_features.mean(axis=0)
    distances = np.linalg.norm(cluster_features - centroid, axis=1)
    df_model.loc[df_model["macro_cluster"] == cluster_id, 'distance_to_centroid'] = distances

# Compute normalized distance within each cluster
df_model['distance_normalized'] = df_model.groupby('macro_cluster')['distance_to_centroid'] \
                                           .transform(lambda x: (x - x.min()) / (x.max() - x.min()) if x.max() > x.min() else 0)

# Computing metrics for ranking
df_model['rank_distance'] = df_model.groupby('macro_cluster')['distance_to_centroid'] \
                                    .rank(ascending=True)  

df_model['rank_popularity'] = df_model.groupby('macro_cluster')['popularity'] \
                                      .rank(ascending=False)

weight_distance = 0.4
weight_pop = 0.6
df_model['combined_rank'] = weight_distance * (1 - df_model['rank_distance'] / df_model.groupby('macro_cluster')['rank_distance'].transform('max')) + \
                            weight_pop * (1 - df_model['rank_popularity'] / df_model.groupby('macro_cluster')['rank_popularity'].transform('max'))

# Final output
df_model.head()

Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,...,duration_ms,macro_cluster,subcluster,subcluster_label,predicted_subcluster,distance_to_centroid,distance_normalized,rank_distance,rank_popularity,combined_rank
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0,1928,0.995,0.708,0.195,0.563,0.151,...,158648,1,1_1,Warm Emotional Calm,1_1,100703.516632,0.019574,20975.0,27506.0,0.148018
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0,1928,0.994,0.379,0.0135,0.901,0.0763,...,282133,1,1_2,Epic Intense,1_2,22781.502565,0.004427,3622.5,27506.0,0.356481
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0,1928,0.604,0.749,0.22,0.0,0.119,...,104300,0,0_1,Long Spoken Emotional,0_1,76130.232211,0.069047,1768.0,2392.0,0.217544
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0,1928,0.995,0.781,0.13,0.887,0.111,...,180760,1,1_1,Warm Emotional Calm,1_1,78591.516292,0.015276,15560.0,27506.0,0.213071
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",1,1928,0.99,0.21,0.204,0.908,0.098,...,687733,1,1_2,Epic Intense,1_2,428381.486465,0.083274,32284.0,21278.5,0.148001


# Cell 4 - Playlist creation function

In [11]:
from collections import Counter
from datetime import datetime
def recommend_playlist(mood, activity, time_of_day, age):

    #moody matrix
    mood_map = {
        "relax": ["0_0", "0_1", "1_0", "1_1"],
        "happy": ["2_1", "2_2", "2_5"],
        "sad": ["1_0", "1_1"],
        "workout": ["2_3", "2_4", "2_5"],
        "focus": ["0_0", "1_0", "1_2"],
        "party": ["2_0", "2_1", "2_5"]
    }

    activity_map = {
        "study time": ["0_0", "1_0", "1_2"],
        "walking": ["2_1", "2_2", "0_1"],
        "running": ["2_3", "2_4", "2_5"],
        "relaxing": ["0_0", "0_1", "1_0"],
        "party": ["2_0", "2_1", "2_5"]
    }

    time_map = {
        "morning": ["2_2", "2_5", "0_1"],
        "afternoon": ["2_1", "2_2"],
        "evening": ["0_0", "1_1", "2_0"],
        "night": ["0_0", "1_0", "1_1"]
    }

    #cluster selection

    clusters = []
    clusters += mood_map.get(mood, [])
    clusters += activity_map.get(activity, [])
    clusters += time_map.get(time_of_day, [])

    if mood == "party" and activity == "party" and time_of_day == 'night':
        if "0_0" in clusters:
            clusters.remove("0_0")
        if "1_0" in clusters:
            clusters.remove("1_0")
        if "1_1" in clusters:
            clusters.remove("1_1")
        clusters += ["2_0", "2_1", "2_5"] 

    # first 10/15 songs from weighted clusters based popularity and distance to centroid
    Coun = dict(Counter(clusters))
    cluster_sum = len(clusters)
    if mood in ("party", "happy"):
        playlist_length = 10
    else:
        playlist_length = 15

    weighted_clusters = {}

    for cluster, count in Coun.items():
        weight = count / cluster_sum
        n_songs = max(1, int(weight * playlist_length))
        weighted_clusters[cluster] = n_songs

    candidated_songs = pd.DataFrame()
    
    for cluster, count in weighted_clusters.items():
        candidated_songs = pd.concat([
        candidated_songs,
        df_model[df_model["subcluster"] == cluster].sort_values('combined_rank', ascending = False).head(count)
    ])


    # age filtering for party and happy mood
    year = datetime.now().year
    birth_year = int(year) - int(age)
    
    if mood in ("party", "happy"):
        candidated_songs = pd.concat([
        candidated_songs,
        df_model[(df_model["subcluster"] == max(Coun, key=Coun.get)) & ((df_model["year"] >= birth_year + 18) & (df_model["year"] <= birth_year + 35)) ].sort_values('popularity', ascending = False).head(5)
    ])
    
    
    return candidated_songs.head(20)


recommend_playlist('party','party','night','24')

Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,...,duration_ms,macro_cluster,subcluster,subcluster_label,predicted_subcluster,distance_to_centroid,distance_normalized,rank_distance,rank_popularity,combined_rank
87769,0b9oOr2ZgvyQu88wzixux9,This Is America,['Childish Gambino'],83,2018,0.117,0.854,0.463,0.0,0.354,...,225773,2,2_0,Uplifting Gentle,2_0,60.56025,5e-05,70.0,240.0,0.998649
84441,2WfaOiMkCvy7F5fcp2zZ8L,Take on Me,['a-ha'],83,1985,0.018,0.573,0.902,0.00125,0.0928,...,225280,2,2_0,Uplifting Gentle,2_0,554.500122,0.000513,803.0,240.0,0.996439
116555,7sQKy5vlPQllr0k9IjYJv3,Sigues Con El,"['Dímelo Flow', 'Arcangel', 'Sech']",86,2019,0.11,0.883,0.668,0.000529,0.0734,...,226533,2,2_0,Uplifting Gentle,2_0,700.296223,0.000649,1028.0,108.0,0.996388
97449,5kRPPEWFJIMox5qIkQkiz5,Sweet Creature,['Harry Styles'],77,2017,0.746,0.427,0.262,0.0,0.068,...,224867,2,2_1,Intense High-Energy,2_1,966.536834,0.000899,1451.0,1045.5,0.990659
87980,4PV0uE5pZSh44E3NqNNDEH,Selfish,['Madison Beer'],86,2020,0.627,0.378,0.461,0.0,0.386,...,223270,2,2_1,Intense High-Energy,2_1,2563.794524,0.002397,3807.0,108.0,0.98801
97574,4WzhjxvLP95y7AMDy0Atwb,Out Of Love,['Alessia Cara'],78,2018,0.8,0.492,0.267,0.0,0.249,...,227693,2,2_1,Intense High-Energy,2_1,1859.578391,0.001737,2752.0,826.5,0.987777
144167,1Ej96GIBCTvgH7tNX1r3qr,Otro Trago,"['Sech', 'Darell']",79,2019,0.136,0.747,0.7,0.000167,0.11,...,225933,2,2_5,Energetic Live Mood,2_5,114.943314,0.000101,166.0,646.0,0.996431
87625,1pKeFVVUOPjFsOABub0OaV,Side To Side,"['Ariana Grande', 'Nicki Minaj']",78,2016,0.0528,0.65,0.736,0.0,0.235,...,226160,2,2_5,Energetic Live Mood,2_5,329.081535,0.000301,492.0,826.5,0.99459
107151,2fVHrSxsQbJUuj9MW9zG1e,Who Do You Love,"['The Chainsmokers', '5 Seconds of Summer']",79,2019,0.0101,0.51,0.846,0.0,0.208,...,226733,2,2_5,Energetic Live Mood,2_5,899.840662,0.000837,1349.0,646.0,0.992864
87940,7ytR5pFWmSjzHJIeQkgog4,ROCKSTAR (feat. Roddy Ricch),"['DaBaby', 'Roddy Ricch']",99,2020,0.247,0.746,0.69,0.0,0.101,...,181733,2,2_0,Uplifting Gentle,2_0,44100.431794,0.041345,64890.0,2.0,0.804375


In [12]:
from collections import Counter
from datetime import datetime
import pandas as pd
import numpy as np

def recommend_playlist(mood, activity, time_of_day, age):

    #moody matrix
    mood_map = {
        "relax": ["0_0", "0_1", "1_0", "1_1"],
        "happy": ["2_1", "2_2", "2_5"],
        "sad": ["1_0", "1_1"],
        "workout": ["2_3", "2_4", "2_5"],
        "focus": ["0_0", "1_0", "1_2"],
        "party": ["2_0", "2_1", "2_5"]
    }

    activity_map = {
        "study time": ["0_0", "1_0", "1_2"],
        "walking": ["2_1", "2_2", "0_1"],
        "running": ["2_3", "2_4", "2_5"],
        "relaxing": ["0_0", "0_1", "1_0"],
        "party": ["2_0", "2_1", "2_5"]
    }

    time_map = {
        "morning": ["2_2", "2_5", "0_1"],
        "afternoon": ["2_1", "2_2"],
        "evening": ["0_0", "1_1", "2_0"],
        "night": ["0_0", "1_0", "1_1"]
    }
    
    #cluster selection
    clusters = []
    clusters += mood_map.get(mood, [])
    clusters += activity_map.get(activity, [])
    clusters += time_map.get(time_of_day, [])

    if mood == "party" and activity == "party" and time_of_day == 'night':
        if "0_0" in clusters:
            clusters.remove("0_0")
        if "1_0" in clusters:
            clusters.remove("1_0")
        if "1_1" in clusters:
            clusters.remove("1_1")
        clusters += ["2_0", "2_1", "2_5"] 

    # first 10/15 songs from weighted clusters based popularity and distance to centroid
    Coun = dict(Counter(clusters))
    cluster_sum = len(clusters)
    if mood in ("party", "happy"):
        playlist_length = 10
    else:
        playlist_length = 15

    weighted_clusters = {}

    for cluster, count in Coun.items():
        weight = count / cluster_sum
        n_songs = max(1, int(weight * playlist_length))
        weighted_clusters[cluster] = n_songs
        
    #Take 3× the required number of songs so we have margin after applying filters
    candidated_songs = pd.DataFrame()

    for cluster, count in weighted_clusters.items():
        cluster_songs = (
            df_model[df_model["subcluster"] == cluster]
            .sort_values('combined_rank', ascending=False)
            .head(count * 3)
        )
        candidated_songs = pd.concat([candidated_songs, cluster_songs])

    #age filtering for party and happy mood
    year = datetime.now().year
    birth_year = int(year) - int(age)
    
    if mood in ("party", "happy"):
        #Search for songs released during the user's 15–30 age period (youth years)
        youth_start = birth_year + 15
        youth_end = birth_year + 30

        #Only filter if the range is valid (not future)
        if youth_start <= year:
            age_filtered = df_model[
                (df_model["subcluster"] == max(Coun, key=Coun.get)) &
                (df_model["year"] >= youth_start) &
                (df_model["year"] <= min(youth_end, year))
            ].sort_values('popularity', ascending=False).head(5)

            candidated_songs = pd.concat([candidated_songs, age_filtered])
            
    #Avoid duplicate songs
    candidated_songs = candidated_songs.drop_duplicates(subset=['track_id'])
    final_playlist = []
    artist_count = {}
    
    #Sort by combined_rank to take the best ones
    for _, song in candidated_songs.sort_values('combined_rank', ascending=False).iterrows():
        artist = song['artist_name']
        #Check if this artist already has 2 songs
        if artist_count.get(artist, 0) < 2:
            final_playlist.append(song)
            artist_count[artist] = artist_count.get(artist, 0) + 1
        #Stop when you reach the desired length
        if len(final_playlist) >= 20:
            break

    result = pd.DataFrame(final_playlist)
    
    return result


recommend_playlist('party','party','night','24')

Unnamed: 0,track_id,track_name,artist_name,popularity,year,acousticness,danceability,energy,instrumentalness,liveness,...,duration_ms,macro_cluster,subcluster,subcluster_label,predicted_subcluster,distance_to_centroid,distance_normalized,rank_distance,rank_popularity,combined_rank
87769,0b9oOr2ZgvyQu88wzixux9,This Is America,['Childish Gambino'],83,2018,0.117,0.854,0.463,0.0,0.354,...,225773,2,2_0,Uplifting Gentle,2_0,60.56025,5e-05,70.0,240.0,0.998649
84441,2WfaOiMkCvy7F5fcp2zZ8L,Take on Me,['a-ha'],83,1985,0.018,0.573,0.902,0.00125,0.0928,...,225280,2,2_0,Uplifting Gentle,2_0,554.500122,0.000513,803.0,240.0,0.996439
144167,1Ej96GIBCTvgH7tNX1r3qr,Otro Trago,"['Sech', 'Darell']",79,2019,0.136,0.747,0.7,0.000167,0.11,...,225933,2,2_5,Energetic Live Mood,2_5,114.943314,0.000101,166.0,646.0,0.996431
116555,7sQKy5vlPQllr0k9IjYJv3,Sigues Con El,"['Dímelo Flow', 'Arcangel', 'Sech']",86,2019,0.11,0.883,0.668,0.000529,0.0734,...,226533,2,2_0,Uplifting Gentle,2_0,700.296223,0.000649,1028.0,108.0,0.996388
87956,56uXDJRCuoS7abX3SkzHKQ,Myron,['Lil Uzi Vert'],86,2020,0.055,0.771,0.654,0.0,0.214,...,224955,2,2_0,Uplifting Gentle,2_0,879.172108,0.000817,1320.0,108.0,0.995508
87584,6b8Be6ljOzmkOmFslEb23P,24K Magic,['Bruno Mars'],78,2016,0.034,0.818,0.803,0.0,0.153,...,225983,2,2_0,Uplifting Gentle,2_0,150.162953,0.000134,226.0,826.5,0.995392
87625,1pKeFVVUOPjFsOABub0OaV,Side To Side,"['Ariana Grande', 'Nicki Minaj']",78,2016,0.0528,0.65,0.736,0.0,0.235,...,226160,2,2_5,Energetic Live Mood,2_5,329.081535,0.000301,492.0,826.5,0.99459
86643,1hGy2eLcmC8eKx7qr1tOqx,Beautiful Girls,['Sean Kingston'],78,2007,0.15,0.762,0.661,0.0,0.256,...,225373,2,2_0,Uplifting Gentle,2_0,460.57418,0.000425,664.0,826.5,0.994072
97756,4Zjdzxx0dsavsr7Ehr8fGE,Prospect (ft. Lil Baby),"['iann dior', 'Lil Baby']",78,2020,0.203,0.816,0.579,0.0,0.127,...,226314,2,2_0,Uplifting Gentle,2_0,480.595656,0.000443,695.5,826.5,0.993977
97673,00ko9WaS4jOX1kEk3gvHjf,Hot Shower,"['Chance the Rapper', 'MadeinTYO', 'DaBaby']",76,2019,0.00157,0.899,0.509,0.0,0.0573,...,225924,2,2_0,Uplifting Gentle,2_0,95.798914,8.3e-05,130.0,1309.5,0.993387
