### Importando libs

In [None]:
# Basic imports
import pandas as pd
import tensorflow as tf
import numpy as np

# Preprocessing data imports
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

# AutoML imports
import autosklearn.classification

from tpot import TPOTClassifier
from flaml import AutoML

# Metrics imports
from sklearn.metrics import accuracy_score

### Lendo os dados

In [None]:
# Fixed random seed, reused by the train/test split and by each AutoML run.
seed = 42

# Load the ratings table from CSV.
# NOTE(review): the path suggests a 1M-row subset of MovieLens 25M — confirm.
ratings = pd.read_csv('ml-25m/ratings_1kk.csv')

#### Visualizando os dados

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Report how many distinct users and distinct movies the ratings table holds.
n_users = ratings.groupby('userId').ngroups
n_movies = ratings.groupby('movieId').ngroups
print('Número de usuários:', n_users)
print('Número de filmes:', n_movies)

#### Separação das features do modelo

In [None]:
# Only the user and movie identifiers are used to build the sequences.
feature_columns = ['userId', 'movieId']
X = ratings[feature_columns]

#### Gerando as labels para predição:

A seguinte estratégia foi adotada: o último filme avaliado por cada usuário é utilizado como o filme a ser predito. O objetivo, então, é organizar os dados em formato de série temporal (time series).

In [None]:
# Order the interactions chronologically so that the last element of each
# user's sequence really is the most recent movie. Without this step the
# "last" movie is simply whichever row comes last in the CSV.
# The sort is skipped when the table has no 'timestamp' column, in which case
# the original row order is kept.
# NOTE(review): assumes X still carries row labels taken from `ratings`.
if 'timestamp' in ratings.columns:
    chronological = ratings.loc[X.index, 'timestamp'].sort_values(kind='stable')
    X = X.loc[chronological.index]

# Keep only users with 26 to 49 interactions (both bounds are exclusive).
count = X.groupby('userId')['userId'].transform('count')
X = X[(count > 25) & (count < 50)]

# One list of movieIds per user, in row order within each user.
X = X.groupby('userId')['movieId'].apply(list).values

# Pad the ragged lists into a rectangular 2-D array. With the Keras defaults
# the padding value 0 is inserted at the start of shorter sequences, so the
# last column always holds each user's last movie.
X = pad_sequences(X, truncating='pre')

In [None]:
# Every column except the last is the input sequence; the last column is the
# movie to predict. The target keeps a trailing axis of length 1, so `y` is a
# 2-D column rather than a flat vector.
x, y = X[:, :-1], X[:, -1:]

In [None]:
# Shape of the input sequences: (rows, sequence length minus the target column).
x.shape

In [None]:
# Shape of the targets: one column, because `y` was sliced with `-1:`.
y.shape

#### Métricas

In [None]:
def dcg_score(y_true, y_score, k=5):
    """Compute the discounted cumulative gain (DCG) of one ranking at rank K.

    The items are ranked by descending ``y_score``; the relevance of the
    first ``k`` ranked items is turned into an exponential gain
    (``2 ** relevance - 1``) and discounted by ``log2(position + 2)``,
    where ``position`` starts at 0.

    Parameters
    ----------
    y_true : array-like
        Relevance of each item, looked up by the item's position.
    y_score : array-like
        Predicted score of each item, used only to order the items.
    k : int
        Number of top-ranked items taken into account.

    Returns
    -------
    score : float
    """
    # Ascending argsort read backwards gives the best-scored items first.
    ranking = np.argsort(y_score)[::-1]
    top_relevance = np.take(y_true, ranking[:k])

    positions = np.arange(len(top_relevance))
    gains = 2 ** top_relevance - 1

    return np.sum(gains / np.log2(positions + 2))


def ndcg_score(ground_truth, predictions, k=5, classes=None):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    NDCG measures the performance of a recommendation system based on the
    graded relevance of the recommended entities. It varies from 0.0 to 1.0,
    with 1.0 representing the ideal ranking of the entities. Here each sample
    has exactly one relevant entity: its true label.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples] or [n_samples, 1]
        True label of each sample.
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted scores or probabilities, one column per class.
    k : int
        Rank.
    classes : array-like, shape = [n_classes], optional
        Label represented by each column of ``predictions``, in column
        order (for scikit-learn style estimators, ``estimator.classes_``).
        When omitted, column ``i`` is taken to represent label ``i``, i.e.
        the labels must already be the integers ``0 .. n_classes - 1``.

    Returns
    -------
    score : float
        Mean NDCG over the samples. A sample whose true label matches no
        column contributes 0.0.

    Raises
    ------
    ValueError
        If ``classes`` is given and its length differs from the number of
        columns of ``predictions``.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> round(float(ndcg_score(ground_truth, predictions, k=2)), 4)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> round(float(ndcg_score(ground_truth, predictions, k=2)), 4)
    0.6667
    """
    # The first row fixes the number of classes (columns).
    n_classes = len(predictions[0])
    if classes is None:
        # Historical behaviour: the label value itself is the column index.
        classes = np.arange(n_classes)
    else:
        classes = np.asarray(classes)
        if len(classes) != n_classes:
            raise ValueError(
                'classes has {0} entries but predictions has {1} columns'.format(
                    len(classes), n_classes))

    # Accept flat label vectors as well as (n_samples, 1) column vectors.
    labels = np.asarray(ground_truth).ravel()

    scores = []
    for label, y_score in zip(labels, predictions):
        # One-hot relevance vector aligned with the columns of `predictions`.
        # Built per sample to avoid a full n_samples x n_classes matrix.
        y_true = (classes == label).astype(int)

        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)

        # A label that matches no column has no attainable gain.
        if best <= 0:
            score = 0.0
        else:
            score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

#### Split dos dados de treino e teste

In [None]:
# Hold out 20% of the rows for testing; the shared seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)

#### FLAML

In [None]:
# Create the FLAML searcher; all search settings are passed at fit time.
automl = AutoML()

In [None]:
# Run the FLAML search as a classification task optimised for accuracy.
# NOTE(review): time_budget is presumably in seconds (one hour) — confirm
# against the FLAML documentation.
# NOTE(review): y_train is passed as an (n, 1) column here, whereas the TPOT
# cell flattens it first — confirm FLAML accepts this shape.
automl.fit(X_train, 
           y_train, 
           task="classification", 
           metric='accuracy', 
           time_budget=3600,
           seed=seed,
           n_jobs=4)

In [None]:
# Summarise the FLAML search: winning learner, its configuration, the
# validation accuracy and the training time of the best trial.
best_learner = automl.best_estimator
best_config = automl.best_config
# The accuracy is reported as the complement of the best loss.
best_accuracy = 1 - automl.best_loss
best_train_time = automl.best_config_train_time

print('Best ML leaner:', best_learner)
print('Best hyperparmeter config:', best_config)
print('Best accuracy on validation data: {0:.4g}'.format(best_accuracy))
print('Training duration of best run: {0:.4g} s'.format(best_train_time))

In [None]:
# Display the underlying estimator object of the best model found by FLAML.
automl.model.estimator

In [None]:
# Hard label predictions (for accuracy) and per-class scores (for NDCG) on
# the held-out rows.
y_pred = automl.predict(X_test)
y_pred_proba = automl.predict_proba(X_test)

In [None]:
# Evaluate FLAML on the held-out rows: exact-match accuracy and NDCG@10.
# y_test is an (n, 1) column, so it is flattened to a list of labels for NDCG.
# NOTE(review): ndcg_score uses each true label as a column index of
# y_pred_proba, while the labels here are raw movieId values. Confirm that the
# predict_proba columns are indexed that way; otherwise map labels to column
# positions first.
print('accuracy', '=', accuracy_score(y_test, y_pred))
print('ndcg-score =', ndcg_score([i[0] for i in y_test], y_pred_proba, 10))

#### TPOT

In [None]:
# Configure the TPOT search with a population of 20 and the shared seed.
# NOTE(review): max_time_mins=60 presumably caps the search at one hour, and
# generations=0 presumably limits it to the initial population — confirm both
# against the TPOT documentation.
tpot = TPOTClassifier(n_jobs=4, generations=0, max_time_mins=60, population_size=20, verbosity=2, random_state=seed)

In [None]:
# Fit TPOT; the (n, 1) target column is flattened into a plain list of labels.
tpot.fit(X_train, [i[0] for i in y_train])

In [None]:
# Hard label predictions (for accuracy) and per-class scores (for NDCG) on
# the held-out rows.
y_pred = tpot.predict(X_test)
y_pred_proba = tpot.predict_proba(X_test)

In [None]:
# Evaluate TPOT on the held-out rows: exact-match accuracy and NDCG@10.
# y_test is an (n, 1) column; it is flattened to a list of labels before the
# NDCG call, exactly as the FLAML evaluation cell does, so every model is
# scored through the same code path.
# NOTE(review): ndcg_score uses each true label as a column index of
# y_pred_proba, while the labels here are raw movieId values. Confirm that the
# predict_proba columns are indexed that way; otherwise map labels to column
# positions first.
print('accuracy =', accuracy_score(y_test, y_pred))
print('ndcg-score =', ndcg_score([i[0] for i in y_test], y_pred_proba, 10))

#### AutoSklearn

In [None]:
# Configure the auto-sklearn search with the shared seed and four workers.
# NOTE(review): time_left_for_this_task is presumably in seconds (one hour) —
# confirm against the auto-sklearn documentation.
cls = autosklearn.classification.AutoSklearnClassifier(seed=seed, n_jobs=4, time_left_for_this_task=3600)

In [None]:
# Fit auto-sklearn on the training rows.
# NOTE(review): y_train is passed as an (n, 1) column here, whereas the TPOT
# cell flattens it first — confirm auto-sklearn accepts this shape.
cls.fit(X_train, y_train)

In [None]:
# Hard label predictions (for accuracy) and per-class scores (for NDCG) on
# the held-out rows.
y_pred = cls.predict(X_test)
y_pred_proba = cls.predict_proba(X_test)

In [None]:
# Evaluate auto-sklearn on the held-out rows: exact-match accuracy and NDCG@10.
# y_test is an (n, 1) column; it is flattened to a list of labels before the
# NDCG call, exactly as the FLAML evaluation cell does, so every model is
# scored through the same code path.
# NOTE(review): ndcg_score uses each true label as a column index of
# y_pred_proba, while the labels here are raw movieId values. Confirm that the
# predict_proba columns are indexed that way; otherwise map labels to column
# positions first.
print('accuracy =', accuracy_score(y_test, y_pred))
print('ndcg-score =', ndcg_score([i[0] for i in y_test], y_pred_proba, 10))