<a href="https://colab.research.google.com/github/vinicius-verona/PlaylistContinuationRecommender/blob/main/MusicRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math


# **ELM IMPL**


In [10]:
EXPERIMENTS = True
inputW = dict()
inputB = dict()


"""
ELM (Extreme Learning Machine) is a single hidden layer feedforward neural network that sets random input weights and bias.

Its calculations follows the equation 1:
Eq 1 -> f(x) = \sum_{i=1}^{N} \beta_i \cdot g(w_i \cdot x + b_i)

where,
    N is the number of total neurons in the input layer (IL);
    x is the input array;
    w_i is the weight associated to a neuron in the IL;0
    b_i is the polarization (bias) associated to a neuron in the IL;
    g(⋅) is the activation function applied to the neurons in the IL;
    β_i is the weight associated to the neurons in the output layer (OL).

In the ELM training phase, the weights w_i and b_i ​are randomly assigned and determined.
The training phase consists only of determining the weights of the output layer β_i, which can be done efficiently using a least squares formula.
"""
class ELM:
    def __init__(self, input_size, hidden_size, output_size, activation='relu'):
        self.input_size  = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        functions = {
            'relu': self.relu,
            'sigmoid': self.sigmoid,
            'tanh': self.tanh,
            'binary_step': self.binary_step,
            'softplus': self.softplus,
            'leaky_relu': self.leaky_relu,  # Notar que para usar leaky_relu, é necessário passar o parâmetro alpha de alguma forma
            'softmax': self.softmax,
            'swish': self.swish,  # Notar que para usar leaky_relu, é necessário passar o parâmetro beta de alguma forma
            'elu': self.elu

        }
        self.activation_function = functions.get(activation, None)

        # Weights and Biases -> normal random number
        # Generate two matrix of shape (input_size, hidden_size) and (hidden_size, 1) in a normal distribution

        self.input_weights = np.random.normal(size=(self.input_size, self.hidden_size))
        self.biases        = np.random.normal(size=[1, self.hidden_size])

        global inputW
        global inputB
        key = f's-{hidden_size}'

        if (EXPERIMENTS):
            if (key not in inputW):
                inputW[key] = self.input_weights.copy()
            else:
                self.input_weights = inputW[key].copy()

            if (key not in inputB):
                inputB[key] = self.biases.copy()
            else:
                self.biases = inputB[key].copy()



    def train(self, X, Y):
        # Compute hidden layer output and output layer weights using pseudo-inverse
        # X.dot(self.input_weights) must have the same number of columns and self.biases = self.hidden_size
        hidden_output       = self.activation_function(X.dot(self.input_weights) + self.biases)
        self.output_weights = np.dot(np.linalg.pinv(hidden_output), Y)



    def predict(self, X):
        # Compute hidden layer output for prediction
        hidden_output = self.activation_function(X.dot(self.input_weights) + self.biases)

        # Return predict
        return np.dot(hidden_output, self.output_weights)



    def relu(self, x):
        return np.maximum(x, 0, x)

    def sigmoid(self, x):
        return 1/(1 + np.exp(-x))

    def softmax(self, x):
        x = np.array(x)  # Convert to NumPy array
        exp_vals = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_vals / np.sum(exp_vals, axis=-1, keepdims=True)

    def swish(self, x, beta=1.0):
        return x * (1 / (1 + np.exp(-beta * x)))

    def elu(self, x, alpha=1.0):
        return np.where(x > 0, x, alpha * (np.exp(x) - 1))

    def tanh(self, x):
        return np.tanh(x);
        # return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

    def binary_step(self, x):
        a = np.copy(x);
        a[a<0] = 0;
        a[a>0] = 1;
        return a;

    def softplus(self, x):
        return np.log1p(np.exp(x))

    def leaky_relu(self, x, alpha=0.01):
        return np.where(x > 0, x, x * alpha)


"""
Split a dataframe into training and testing sets.

Parameters:
- dataframe: The input dataframe to be split.
- train_percentage: The percentage of data to be used for training.
- test_percentage: The percentage of data to be used for testing.
- random_state: Seed for random number generation.

Returns:
- train_set: The training set dataframe.
- test_set: The testing set dataframe.
"""
def split_dataframe(dataframe, train_percentage=0.8, test_percentage=0.2, random_state=None):
    train_set, test_set = train_test_split(dataframe, train_size=train_percentage, test_size=test_percentage, random_state=random_state)
    return train_set, test_set



In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def search_matching_song(x_train, features):
    ndf = dataset_no_duplicates.copy()
    ndf = ndf.drop(columns=IGNORE_COLUMNS, axis=1)
    ndf = ndf.drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
    ndf = normalize(convert_to_number(ndf))
    ndf = ndf.reset_index(drop=True)


    x_train = normalize_with_reference(x_train, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))

    # Find all rows where any column has a value less than 0 or greater than 1


    # Print these rows


    #print(features)
    # print(x_train)
    #print('---')
    #print(normalize_with_reference(features, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)))

    # Merge ndf with x_train
    merged_df = ndf.merge(x_train, indicator=True, how='left')
    merged_df = merged_df.reset_index(drop=True)

    # Filter out rows where the values are only in ndf
    songs_not_in_x_train = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')

    # Reset index if needed
    # songs_not_in_x_train = songs_not_in_x_train.drop(columns=['artist_name', 'playlist_pid'], axis=1)
    songs_not_in_x_train = songs_not_in_x_train.dropna()
    songs_not_in_x_train = songs_not_in_x_train.reset_index(drop=True)

    normalized_features = normalize_with_reference(features, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))


    # Based on the mean of the features, calculate the cosine similarity with songs_not_in_x_train
    similarity = cosine_similarity(normalized_features.values.reshape(1,-1), songs_not_in_x_train)
    index_of_max_similarity = similarity.argmax()
    song = ndf.iloc[index_of_max_similarity]
    song = song.to_frame().T

    #print(song.columns)


    return song

In [12]:
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

FEATURES=['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

# Função para calcular o MSE
def calculate_mse(model, X_test, Y_true):
    Y_pred = model.predict(X_test)
    return mean_squared_error(Y_true, Y_pred)

# Função para calcular distancia entre solução ideal e prevista
def calculate_md(model, X_test, Y_true):
    Y_pred = pd.DataFrame(model.predict(X_test), columns=FEATURES)
    pred = Y_pred.tail(1)
    true = Y_true.tail(1)
    recommended_song = search_matching_song(X_test, pred)
    return (euclidean_dist(recommended_song, true))

# Função para testar diferentes configurações de parâmetros e avaliar usando o MSE
def test_elm_parameters(X_train, Y_train, X_test, Y_test, parameters, final_elm_value):
    results = []
    final_elm_array_values = []
    for param in parameters:
        elm = ELM(input_size=X_train.shape[1], hidden_size=param['hidden_size'], output_size=Y_train.shape[1], activation=param['activation'])
        elm.train(X_train, Y_train)
        error = calculate_md(elm, X_test, Y_test)
        results.append({'hidden_size': param['hidden_size'], 'activation': param['activation'], 'ERROR': error})
        final_elm_value["Method"] = f"ELM({param['hidden_size']} x {param['activation']})"
        final_elm_value["Error"] = error
        final_elm_array_values.append(final_elm_value)
    return results, final_elm_array_values



In [13]:
IGNORE_COLUMNS=[
    'playlist_name',
    'track_uri',
    'track_name',
    'artist_uri',
    'album_uri',
    'album_name',
    'position_in_playlist',
    'type',
    'uri',
    'track_href',
    'analysis_url'
]


In [14]:
# Load dataset
df_path = 1

if df_path == 1:
    df = pd.read_csv('/content/drive/My Drive/TP-Recommender-System/input/spotify-dataset.csv', sep=';')
    dataset = df.copy()
else:
    df = pd.read_csv('/content/drive/My Drive/data/spotify-dataset.csv', sep=';')
    dataset = df.copy()

columns = dataset.columns.tolist()
columns = list(filter(lambda x: (x != 'playlist_pid' and x != 'position_in_playlist' and x != 'playlist_name'), columns))

dataset_no_duplicates = dataset.drop_duplicates(subset=columns)
dataset_no_duplicates.info()

  df = pd.read_csv('/content/drive/My Drive/TP-Recommender-System/input/spotify-dataset.csv', sep=';')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 38793 entries, 0 to 62599
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   playlist_pid          38793 non-null  object
 1   playlist_name         38793 non-null  object
 2   track_uri             38793 non-null  object
 3   track_name            38793 non-null  object
 4   artist_uri            38793 non-null  object
 5   artist_name           38793 non-null  object
 6   album_uri             38793 non-null  object
 7   album_name            38793 non-null  object
 8   position_in_playlist  38793 non-null  object
 9   danceability          38793 non-null  object
 10  energy                38793 non-null  object
 11  key                   38793 non-null  object
 12  loudness              38793 non-null  object
 13  mode                  38793 non-null  object
 14  speechiness           38793 non-null  object
 15  acousticness          38793 non-null

In [15]:
def convert_to_number(dataframe):
    dataframe = dataframe.copy()

    # List of columns to convert
    numeric_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                        'instrumentalness', 'liveness', 'valence', 'tempo', 'mode',
                        'key', 'duration_ms', 'time_signature', 'playlist_pid']

    for column in numeric_columns:
        try:
            dataframe.loc[:, column] = pd.to_numeric(dataframe[column], errors='coerce')
        except:
            pass

    return dataframe

In [16]:
def euclidean_dist(x, y):
    return np.linalg.norm(np.array(x) - np.array(y))

In [17]:
# Normalization of features
from sklearn.preprocessing import MinMaxScaler

def normalize(df):
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_normalized = df.copy()

    # Get numerical columns
    numerical_columns = df_normalized.select_dtypes(include=['float64', 'int64']).columns

    # Iterate through numerical columns and normalize them
    for column in numerical_columns:
        min_val = df_normalized[column].min()
        max_val = df_normalized[column].max()

        # Normalize the column using .loc
        if min_val != max_val:
            df_normalized.loc[:, column] = (df_normalized[column] - min_val) / (max_val - min_val)
        else:
            df_normalized.loc[:, column] = 0  # If min and max are the same, set the column to 0

    return df_normalized


def normalize_with_reference(df, reference_df):
    df = df.copy()

    # Get numerical columns from both DataFrames
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    reference_numerical_columns = reference_df.select_dtypes(include=['float64', 'int64']).columns

    # Iterate through numerical columns and normalize df with respect to reference_df
    for column in numerical_columns:
        if column in reference_numerical_columns:
            min_val = reference_df[column].min()
            max_val = reference_df[column].max()

            # Normalize the column based on the reference DataFrame
            if min_val != max_val:
                df.loc[:, column] = (df[column] - min_val) / (max_val - min_val)
            else:
                df.loc[:, column] = 0  # If min and max are the same, set the column to 0

    return df


# Comparação de Diferentes Parâmetros usando MSE

In [18]:
# # Creation of y_test = x_test[1:] + 1 song in the original playlist df
# y_test = x_test[1:].copy()
# n_song = test_set[1:2].copy()

# n_song = n_song.drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
# n_song = n_song.drop(columns=IGNORE_COLUMNS, axis=1)
# y_test = y_test.append(n_song, ignore_index=True)

# y_test = convert_to_number(y_test)
# y_test.info()

In [19]:
# # Exemplo de como usar a função test_elm_parameters
# parameters = [
#     {'hidden_size': 100, 'activation': 'relu'},
#     {'hidden_size': 200, 'activation': 'relu'},
#     {'hidden_size': 300, 'activation': 'relu'},
#     {'hidden_size': 100, 'activation': 'sigmoid'},
#     {'hidden_size': 200, 'activation': 'sigmoid'},
#     {'hidden_size': 300, 'activation': 'sigmoid'},
#     {'hidden_size': 100, 'activation': 'leaky_relu'},
#     {'hidden_size': 200, 'activation': 'leaky_relu'},
#     {'hidden_size': 300, 'activation': 'leaky_relu'},

#     # {'hidden_size': 50, 'activation': 'tanh'},
#     # {'hidden_size': 150, 'activation': 'tanh'},
#     # {'hidden_size': 250, 'activation': 'tanh'},
#     # {'hidden_size': 100, 'activation': 'binary_step'},
#     # {'hidden_size': 200, 'activation': 'binary_step'},

#     # {'hidden_size': 100, 'activation': 'softplus'},
#     # {'hidden_size': 150, 'activation': 'leaky_relu'},

#     # {'hidden_size': 200, 'activation': 'tanh'},
#     # {'hidden_size': 250, 'activation': 'binary_step'},

#     {'hidden_size': 1000, 'activation': 'relu'},
#     {'hidden_size': 2000, 'activation': 'relu'},
#     {'hidden_size': 3000, 'activation': 'relu'},
#     {'hidden_size': 1000, 'activation': 'sigmoid'},
#     {'hidden_size': 2000, 'activation': 'sigmoid'},
#     {'hidden_size': 3000, 'activation': 'sigmoid'},
#     {'hidden_size': 1000, 'activation': 'leaky_relu'},
#     {'hidden_size': 2000, 'activation': 'leaky_relu'},
#     {'hidden_size': 3000, 'activation': 'leaky_relu'},

#     # {'hidden_size': 500, 'activation': 'tanh'},
#     # {'hidden_size': 1500, 'activation': 'tanh'},
#     # {'hidden_size': 2500, 'activation': 'tanh'},
#     # {'hidden_size': 1000, 'activation': 'binary_step'},
#     # {'hidden_size': 2000, 'activation': 'binary_step'},

#     # {'hidden_size': 1000, 'activation': 'softplus'},
#     # {'hidden_size': 1500, 'activation': 'leaky_relu'},

#     # {'hidden_size': 2000, 'activation': 'tanh'},
#     # {'hidden_size': 2500, 'activation': 'binary_step'},
#     # Outras funções de ativação podem ser adicionadas conforme necessário
# ]

# # results = test_elm_parameters(x_train, y_train, x_test, y_test, parameters)

# # Ordenar os resultados pelo hidden_size
# results_sorted = sorted(results, key=lambda x: x['hidden_size'])

# # Imprimir os resultados ordenados
# print(tabulate(results_sorted, headers="keys", tablefmt="simple_grid"))

# # Ordenar os resultados pelo activation
# results_sorted = sorted(results, key=lambda x: x['activation'])

# # Imprimir os resultados ordenados
# print(tabulate(results_sorted, headers="keys", tablefmt="simple_grid"))



1. **Impacto do Tamanho da Camada Oculta:** Observa-se que modelos com um maior número de neurônios na camada oculta (`hidden_size` de 300) tendem a ter um desempenho melhor em termos de MSE (Mean Squared Error). Isso sugere que um maior número de neurônios pode capturar a complexidade dos dados de forma mais eficaz, levando a uma melhor precisão na previsão. No entanto, isso também pode aumentar o risco de overfitting, dependendo da quantidade de dados de treinamento disponíveis e da complexidade do problema.

2. **Influência da Função de Ativação:** A escolha da função de ativação tem um impacto significativo no desempenho do modelo. As funções `relu` e `sigmoid` aparecem várias vezes entre as melhores configurações, indicando que essas funções podem ser mais adequadas para este conjunto específico de dados e tarefa. No entanto, a `relu` parece ter uma leve vantagem em configurações com um número maior de neurônios, enquanto diferentes funções de ativação podem ter desempenhos variados dependendo do tamanho da camada oculta.

3. **Trade-off entre Complexidade do Modelo e Desempenho:** Enquanto modelos mais complexos (com mais neurônios) tendem a ter um MSE menor, também há casos em que aumentar excessivamente a complexidade não traz melhorias proporcionais no desempenho ou até mesmo resulta em desempenho pior. Isso é visível no desempenho variado de configurações com tamanhos de camada oculta intermediários (150, 200, 250), onde não apenas o tamanho da camada, mas também a escolha da função de ativação influencia o MSE. Isso ressalta a importância de encontrar um equilíbrio entre a capacidade do modelo e a generalização para dados não vistos.


# **Testes Experimentais - ELM**


In [20]:
import warnings

k_playlists = 10

EXPERIMENTS = True
inputW = dict()
inputB = dict()

def get_k_largest_playlist_pid(k):
    # The model is trained for each given playlist
    df['playlist_pid'] = pd.to_numeric(df['playlist_pid'], errors='coerce')
    return df.groupby('playlist_pid').count().nlargest(k, 'danceability').index

def get_k_playlist_pid(k):
    # The model is trained for each given playlist
    df['playlist_pid'] = pd.to_numeric(df['playlist_pid'], errors='coerce')
    unique_playlist_pids = df.groupby('playlist_pid').count().index

    # Shuffle the DataFrame to randomize the order
    shuffled_df = df.sample(frac=1, random_state=42)

    # Select the first k unique playlist_pid values
    random_playlist_pids = shuffled_df['playlist_pid'].unique()[:k]

    return random_playlist_pids


def get_dataset(pid):
    playlist = df[df['playlist_pid'] == pid].copy()
    playlist = playlist.sort_values(by='danceability')

    # Split playlist
    split_index = int(math.floor(len(playlist) * 0.7))
    train_set = playlist.iloc[:split_index].copy()
    test_set = playlist.iloc[split_index:].copy()
    return train_set, test_set

def clean_dataset(dataset):
    dataset = dataset.drop(columns=IGNORE_COLUMNS, axis=1)
    return dataset

playlist_pids = get_k_largest_playlist_pid(k_playlists)
playlist_pids



Float64Index([434.0, 983.0, 1228.0, 1277.0, 1368.0, 1445.0, 1518.0, 1865.0,
              131.0, 285.0],
             dtype='float64', name='playlist_pid')

In [21]:

# Suppress FutureWarning for append
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    results_elm = []

    for pid in playlist_pids:
      train, test = get_dataset(pid)

      # Clean the train and test datasets
      train = train.drop(columns=IGNORE_COLUMNS, axis=1)
      train = convert_to_number(train)
      test = test.drop(columns=IGNORE_COLUMNS, axis=1)
      test = convert_to_number(test)

      train = normalize_with_reference(train, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))
      test = normalize_with_reference(test, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))


      # Create x_train, y_train
      x_train = train.copy().drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
      n_song = test[0:1].copy()
      n_song = n_song.drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
      y_train = x_train[1:].copy()
      y_train = y_train.append(n_song, ignore_index=True)
      y_train = convert_to_number(y_train)

      # Create x_test, y_test
      x_test = y_train.copy()
      n_song = test[1:2].copy()
      n_song = n_song.drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
      y_test = x_test[1:].copy()
      y_test = y_test.append(n_song, ignore_index=True)
      y_test = convert_to_number(y_test)

      # Run experiments
      exp_parameters = [
          {'hidden_size': 100, 'activation': 'relu'},
          {'hidden_size': 100, 'activation': 'tanh'},
          {'hidden_size': 100, 'activation': 'leaky_relu'},

          {'hidden_size': 500, 'activation': 'relu'},
          {'hidden_size': 500, 'activation': 'tanh'},
          {'hidden_size': 500, 'activation': 'leaky_relu'},

          {'hidden_size': 1000, 'activation': 'relu'},
          {'hidden_size': 1000, 'activation': 'tanh'},
          {'hidden_size': 1000, 'activation': 'leaky_relu'},

          {'hidden_size': 5000, 'activation': 'relu'},
          {'hidden_size': 5000, 'activation': 'tanh'},
          {'hidden_size': 5000, 'activation': 'leaky_relu'},

          {'hidden_size': 10000, 'activation': 'relu'},
          {'hidden_size': 10000, 'activation': 'tanh'},
          {'hidden_size': 10000, 'activation': 'leaky_relu'},
      ]

      final_elm_value = {"PID": pid, "Method": "", "Error": ""}

      print(f'Experiment - PID: {pid} -> dataset sizes: x_train({x_train.shape}) and x_test({x_test.shape})')
      results, final_elm_value = test_elm_parameters(x_train, y_train, x_test, y_test, exp_parameters, final_elm_value)

      results_elm = results_elm + final_elm_value

      # Ordenar os resultados pelo hidden_size
      results_sorted = sorted(results, key=lambda x: x['hidden_size'])

      # Imprimir os resultados ordenados
      print(tabulate(results_sorted, headers="keys", tablefmt="simple_grid"))

      # Ordenar os resultados pelo activation
      results_sorted = sorted(results, key=lambda x: x['activation'])

      # Imprimir os resultados ordenados
      print(tabulate(results_sorted, headers="keys", tablefmt="simple_grid"))
      print('--------------------------------------------------------------------------------')

Experiment - PID: 434.0 -> dataset sizes: x_train((70, 13)) and x_test((70, 13))
┌───────────────┬──────────────┬──────────┐
│   hidden_size │ activation   │    ERROR │
├───────────────┼──────────────┼──────────┤
│           100 │ relu         │ 0.992019 │
├───────────────┼──────────────┼──────────┤
│           100 │ tanh         │ 1.17032  │
├───────────────┼──────────────┼──────────┤
│           100 │ leaky_relu   │ 0.992019 │
├───────────────┼──────────────┼──────────┤
│           500 │ relu         │ 1.18948  │
├───────────────┼──────────────┼──────────┤
│           500 │ tanh         │ 1.18948  │
├───────────────┼──────────────┼──────────┤
│           500 │ leaky_relu   │ 1.18948  │
├───────────────┼──────────────┼──────────┤
│          1000 │ relu         │ 1.58954  │
├───────────────┼──────────────┼──────────┤
│          1000 │ tanh         │ 1.17032  │
├───────────────┼──────────────┼──────────┤
│          1000 │ leaky_relu   │ 1.58954  │
├───────────────┼──────────────┼───────

# **Testes Experimentais - Cosine Similarity**

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import warnings

k_playlists = 10

DEBUG = False
EXPERIMENTS = True
inputW = dict()
inputB = dict()

def get_k_largest_playlist_pid(df, k):
    # The model is trained for each given playlist
    df.loc['playlist_pid'] = pd.to_numeric(df['playlist_pid'], errors='coerce')
    return df.groupby('playlist_pid').count().nlargest(k, 'danceability').index

def get_k_playlist_pid(df, k):
    # The model is trained for each given playlist
    df.loc['playlist_pid'] = pd.to_numeric(df['playlist_pid'], errors='coerce')
    unique_playlist_pids = df.groupby('playlist_pid').count().index

    # Shuffle the DataFrame to randomize the order
    shuffled_df = df.sample(frac=1, random_state=42)

    # Select the first k unique playlist_pid values
    random_playlist_pids = shuffled_df['playlist_pid'].unique()[:k]

    return random_playlist_pids

def get_dataset(df, pid):
    playlist = df[df['playlist_pid'] == pid].copy()
    playlist = playlist.sort_values(by='danceability')

    # Split playlist
    split_index = int(math.floor(len(playlist) * 0.7))
    train_set = playlist.iloc[:split_index].copy()
    test_set = playlist.iloc[split_index:].copy()
    return train_set, test_set

def clean_dataset(dataset):
    dataset = dataset.drop(columns=IGNORE_COLUMNS, axis=1)
    return dataset

odf = convert_to_number(dataset)
odf = odf.drop(columns=IGNORE_COLUMNS, axis=1).copy()
odf = odf.drop(columns=['id'], axis=1)

playlist_pids = get_k_largest_playlist_pid(odf, k_playlists)
playlist_pids


  dataframe.loc[:, column] = pd.to_numeric(dataframe[column], errors='coerce')


Float64Index([434.0, 983.0, 1228.0, 1277.0, 1368.0, 1445.0, 1518.0, 1865.0,
              131.0, 285.0],
             dtype='float64', name='playlist_pid')

In [23]:
ndf = normalize(convert_to_number(dataset_no_duplicates))
ndf = ndf.drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1)
ndf = ndf.reset_index(drop=True)

  dataframe.loc[:, column] = pd.to_numeric(dataframe[column], errors='coerce')


In [24]:
# Suppress FutureWarning for append
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    result_cos = []

    for pid in playlist_pids:
        print(f'--- PID {pid} ---')
        train, test = get_dataset(odf, pid)

        print(f'Detail: size of x_train({train.shape[0]})')

        train = normalize_with_reference(train, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))
        test = normalize_with_reference(test, convert_to_number(dataset_no_duplicates).drop(columns=IGNORE_COLUMNS, axis=1).drop(columns=['id', 'artist_name', 'playlist_pid'], axis=1))

        # Create x_train, y_train
        x_train = train.copy()
        n_song = test[0:1].copy()
        n_song = n_song
        y_train = x_train[1:].copy()
        y_train = y_train.append(n_song, ignore_index=True)
        y_train = convert_to_number(y_train)

        # Merge ndf with x_train
        merged_df = ndf.merge(x_train, indicator=True, how='left')
        merged_df = merged_df.reset_index(drop=True)

        # Filter out rows where the values are only in ndf
        songs_not_in_x_train = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')

        # Reset index if needed
        songs_not_in_x_train.reset_index(drop=True, inplace=True)
        x_train = x_train.drop(columns=['artist_name', 'playlist_pid'], axis=1)
        y_train = y_train.drop(columns=['artist_name', 'playlist_pid'], axis=1)
        songs_not_in_x_train = songs_not_in_x_train.drop(columns=['artist_name', 'playlist_pid'], axis=1)
        songs_not_in_x_train = songs_not_in_x_train.dropna()
        songs_not_in_x_train = songs_not_in_x_train.reset_index(drop=True)

        # Calculate x_train mean for each feature
        mean_features = x_train.mean()

        # Based on the mean of the features, calculate the cosine similarity with songs_not_in_x_train
        similarity = cosine_similarity(mean_features.values.reshape(1,-1), songs_not_in_x_train)
        index_of_max_similarity = similarity.argmax()
        song = ndf.iloc[index_of_max_similarity,:]
        song = song.to_frame().T


        # Calculate error
        error = euclidean_dist(song,y_train.tail(1))

        data = [
            ["Distance from mean_features", euclidean_dist(song, mean_features)],
            ["Error", error]
        ]

        # Only print the non-debug information in a tabulated format
        print(f'--- PID {pid} ---')
        print(tabulate(data, headers=["Metric", "Value"], tablefmt="grid"))

        final_value_cos = {"PID": pid, "Method": "COS", "Error": error}
        result_cos.append(final_value_cos)


--- PID 434.0 ---
Detail: size of x_train(70)
--- PID 434.0 ---
+-----------------------------+----------+
| Metric                      |    Value |
| Distance from mean_features | 0.823416 |
+-----------------------------+----------+
| Error                       | 1.24565  |
+-----------------------------+----------+
--- PID 983.0 ---
Detail: size of x_train(70)
--- PID 983.0 ---
+-----------------------------+----------+
| Metric                      |    Value |
| Distance from mean_features | 0.532853 |
+-----------------------------+----------+
| Error                       | 1.10868  |
+-----------------------------+----------+
--- PID 1228.0 ---
Detail: size of x_train(70)
--- PID 1228.0 ---
+-----------------------------+----------+
| Metric                      |    Value |
| Distance from mean_features | 0.879426 |
+-----------------------------+----------+
| Error                       | 1.1396   |
+-----------------------------+----------+
--- PID 1277.0 ---
Detail: size 

### Comparação

In [25]:
final_results = results_elm + result_cos

# Convert arrays of dictionaries to a list of lists for tabulate
def dicts_to_rows(dicts):
    return [[d["PID"], d["Method"], d["Error"]] for d in dicts]

final_results = dicts_to_rows(results_elm) + dicts_to_rows(result_cos)

# Sorting first by PID (as integers for proper numerical sorting) and then by Error (converted to float for numerical comparison)
final_results = sorted(final_results, key=lambda x: (int(x[0]), float(x[2])))

first_occurrences = {}

for row in final_results:
    pid, method, error = row
    pid_key_elm = f"{pid}_ELM"
    pid_key_cos = f"{pid}_COS"

    # Check if the method starts with "ELM" and it's the first occurrence for the PID
    if method.startswith("ELM") and pid_key_elm not in first_occurrences:
        first_occurrences[pid_key_elm] = row

    # Check if the method starts with "COS" and it's the first occurrence for the PID
    if method.startswith("COS") and pid_key_cos not in first_occurrences:
        first_occurrences[pid_key_cos] = row

# Extract the filtered results from the dictionary
filtered_results = list(first_occurrences.values())

# Sort the filtered results by PID again, as the dictionary might have altered the order
filtered_results = sorted(filtered_results, key=lambda x: int(x[0]))

# Print the filtered results
print("FINAL RESULTS (First ELM and COS for each PID)")
print(tabulate(filtered_results, headers=["PID", "Method", "Error"], tablefmt="simple_grid"))

FINAL RESULTS (First ELM and COS for each PID)
┌───────┬─────────────────────────┬──────────┐
│   PID │ Method                  │    Error │
├───────┼─────────────────────────┼──────────┤
│   131 │ COS                     │ 1.02573  │
├───────┼─────────────────────────┼──────────┤
│   131 │ ELM(10000 x leaky_relu) │ 1.27214  │
├───────┼─────────────────────────┼──────────┤
│   285 │ COS                     │ 0.987877 │
├───────┼─────────────────────────┼──────────┤
│   285 │ ELM(10000 x leaky_relu) │ 1.30671  │
├───────┼─────────────────────────┼──────────┤
│   434 │ ELM(10000 x leaky_relu) │ 1.18948  │
├───────┼─────────────────────────┼──────────┤
│   434 │ COS                     │ 1.24565  │
├───────┼─────────────────────────┼──────────┤
│   983 │ ELM(10000 x leaky_relu) │ 0.868433 │
├───────┼─────────────────────────┼──────────┤
│   983 │ COS                     │ 1.10868  │
├───────┼─────────────────────────┼──────────┤
│  1228 │ ELM(10000 x leaky_relu) │ 1.03031  │
├───────┼────