In [None]:
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np
import pandas as pd
import dill as pickle
from copy import deepcopy
from matplotlib import pyplot as plt
from matplotlib.colors import BASE_COLORS
from sklearn.model_selection import KFold

from uncertain.utils.data import Data
from uncertain.utils.training import train
from uncertain.utils.evaluation import test_recommendations, uncertainty_distributions

from uncertain.explicit import MF, CPMF, OrdRec
from uncertain.extras import Ensemble, Resample, UncertainWrapper, UserHeuristic, ItemHeuristic

# Read the rating interactions and give the four columns their working names.
data = pd.read_csv('data/data.csv').set_axis(
    ['user', 'item', 'score', 'timestamps'], axis='columns')

# Wrap the frame in the project's Data container (implicit=False, batches of 2048).
ML = Data(data, implicit=False, batch_size=2048)

# ExplicitMF (FunkSVD)

In [None]:
# Grid-search the weight decay of the baseline MF model. Each candidate is
# trained from scratch; the candidate with the lowest value returned by
# `train` is pickled to 'fitted/baseline.pth'.
best_loss = float('inf')  # no finite sentinel: the first comparable loss always wins
best_model = None
for wd in [1e-5, 5e-6, 1e-6]:
    model = MF(ML.n_user, ML.n_item, embedding_dim=10, lr=1e-3, weight_decay=wd)
    this_loss = train(model, ML)
    if this_loss < best_loss:
        best_loss = this_loss
        best_model = model
        with open('fitted/baseline.pth', 'wb') as f:
            pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# Later cells reuse `model` (and `model.weight_decay`) as the baseline, so bind
# the name to the selected candidate instead of whichever was trained last.
if best_model is not None:
    model = best_model

# Ensemble

In [None]:
# Ensemble members: the current `model` plus two freshly initialised MF
# models, each trained on the full training data.
# NOTE(review): weight_decay is hard-coded to 5e-06 here while the other cells
# read `model.weight_decay` — confirm both refer to the same setting.
models = [model]
for _ in range(2):
    member = MF(ML.n_user, ML.n_item, embedding_dim=10, lr=1e-3, weight_decay=5e-06)
    models.append(member)
    train(member, ML)

# Combine the members and persist the combined model.
model_ = Ensemble(models)
with open('fitted/ensemble.pkl', 'wb') as f:
    pickle.dump(model_, f, pickle.HIGHEST_PROTOCOL)

# Resample

In [None]:
# Resample: train three MF models, each on a random 80% subsample (drawn
# without replacement) of the training interactions, and combine them with
# the baseline `model`.
# NOTE(review): np.random is not seeded in this notebook, so the subsamples
# differ between runs.
models = []
og = deepcopy(ML.train)
try:
    for _ in range(3):
        # Temporarily swap the training set for the subsample.
        ML.train = og[np.random.choice(len(og), int(0.8*len(og)), replace=False), :]
        models.append(MF(ML.n_user, ML.n_item, embedding_dim=10, lr=1e-3, weight_decay=model.weight_decay))
        train(models[-1], ML)
    model_ = Resample(model, models)
    with open('fitted/resample.pkl', 'wb') as f:
        pickle.dump(model_, f, pickle.HIGHEST_PROTOCOL)
finally:
    # Always put the full training set back, even if training fails or is
    # interrupted; otherwise every later cell would silently run on a subsample.
    ML.train = og

# Zhu et al.

In [None]:
# Error model: fit a second MF model whose target is the absolute prediction
# error of an MF model, and pickle it to 'fitted/zhu.pth'.
#   1. 2-fold CV on the training set gives an out-of-fold absolute error for
#      every training row.
#   2. The third column of train is replaced by those errors; the third column
#      of val is replaced by the absolute error of the baseline `model`.
#   3. A new MF model is trained on the modified data.
# Presumably columns 0/1/2 of ML.train and ML.val are user/item/score — verify
# against the Data class.
# NOTE(review): KFold shuffles without a random_state, so folds differ between runs.
errors = np.empty(len(ML.train))
og = deepcopy(ML)  # pristine copy, used both as the data source and to restore ML
try:
    for train_idx, test_idx in KFold(n_splits=2, shuffle=True).split(ML.train):
        test = og.train[test_idx]
        ML.train = og.train[train_idx]
        model_ = MF(ML.n_user, ML.n_item, embedding_dim=10, lr=1e-3, weight_decay=model.weight_decay)
        train(model_, ML)
        fold_users = torch.tensor(test[:, 0]).long()
        fold_items = torch.tensor(test[:, 1]).long()
        errors[test_idx] = np.abs(model_.predict(fold_users, fold_items) - test[:, 2])

    # Replace the targets with the error estimates.
    ML.train = deepcopy(og.train)
    ML.train[:, 2] = errors
    ML.val[:, 2] = np.abs(model.predict(torch.tensor(ML.val[:, 0]).long(), torch.tensor(ML.val[:, 1]).long()) - ML.val[:, 2])

    model_ = MF(ML.n_user, ML.n_item, embedding_dim=10, lr=1e-3, weight_decay=model.weight_decay)
    train(model_, ML)
finally:
    # Always go back to the untouched data, even if a step above fails;
    # otherwise later cells would train on error targets instead of ratings.
    ML = og
with open('fitted/zhu.pth', 'wb') as f:
    pickle.dump(model_, f, pickle.HIGHEST_PROTOCOL)

# CPMF

In [None]:
# Grid-search both CPMF weight-decay settings over every combination of the
# same three values; the candidate with the lowest value returned by `train`
# is pickled to 'fitted/cpmf.pth'.
grid = [1e-5, 5e-6, 1e-6]
pairs = [(wd0, wd1) for wd0 in grid for wd1 in grid]

best_loss = 1000
for wd0, wd1 in pairs:
    model = CPMF(
        ML.n_user, ML.n_item,
        embedding_dim=10, lr=1e-3,
        weight_decay_MF=wd0, weight_decay_gammas=wd1,
    )
    this_loss = train(model, ML)
    if not this_loss < best_loss:
        continue
    # New best candidate: remember its loss and overwrite the checkpoint.
    best_loss = this_loss
    with open('fitted/cpmf.pth', 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# OrdRec

In [None]:
# Switch the data container to its ordinal representation before fitting OrdRec.
ML.to_ordinal()

# Grid-search both OrdRec weight-decay settings over every combination of the
# same three values; the candidate with the lowest value returned by `train`
# is pickled to 'fitted/ordrec.pth'.
grid = [1e-5, 5e-6, 1e-6]
pairs = [(wd0, wd1) for wd0 in grid for wd1 in grid]

best_loss = 1000
for wd0, wd1 in pairs:
    model = OrdRec(
        ML.n_user, ML.n_item, ML.score_labels,
        embedding_dim=10, lr=1e-3,
        weight_decay_MF=wd0, weight_decay_step=wd1,
    )
    this_loss = train(model, ML)
    if not this_loss < best_loss:
        continue
    # New best candidate: remember its loss and overwrite the checkpoint.
    best_loss = this_loss
    with open('fitted/ordrec.pth', 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# Results

In [None]:
def _load_fitted(path):
    """Unpickle and return the fitted model stored at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally by this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Evaluate every fitted model with `test_recommendations` and collect the
# outputs in `results`, one entry per model name.
results = {}

# Baseline
model = _load_fitted('fitted/baseline.pth')
results['MF'] = test_recommendations(model, ML, max_k=10)

# Heuristics
# minlength guarantees one entry per user id even when the highest ids have
# no interactions in train_val.
user_support = np.bincount(ML.train_val.user, minlength=ML.n_user)
# Per-item rating count (column 0) and rating variance (column 1). reindex
# adds a row for every item id in [0, n_item) that is absent from train_val;
# missing counts and undefined variances are set to 0.
item = ML.train_val.groupby('item').agg({'user': 'size', 'score': 'var'})
item = item.reindex(np.arange(ML.n_item)).fillna(0).to_numpy()
results['User support'] = test_recommendations(UserHeuristic(base_MF=model, uncertainty=-user_support), ML, max_k=10)
results['Item support'] = test_recommendations(ItemHeuristic(base_MF=model, uncertainty=-item[:, 0]), ML, max_k=10)
results['Item variance'] = test_recommendations(ItemHeuristic(base_MF=model, uncertainty=item[:, 1]), ML, max_k=10)

# Ensemble
model_ = _load_fitted('fitted/ensemble.pkl')
results['Ensemble'] = test_recommendations(model_, ML, max_k=10)

# Resample
model_ = _load_fitted('fitted/resample.pkl')
results['Resample'] = test_recommendations(model_, ML, max_k=10)

# Zhu: baseline predictions paired with the separately fitted error model.
model_ = _load_fitted('fitted/zhu.pth')
results['MF-CV'] = test_recommendations(UncertainWrapper(model, model_), ML, max_k=10)

# CPMF — stored under its own key so it no longer overwrites the baseline
# entry and so the later 'CPMF' lookups succeed.
model = _load_fitted('fitted/cpmf.pth')
results['CPMF'] = test_recommendations(model, ML, max_k=10)

# OrdRec — likewise stored under its own key.
model = _load_fitted('fitted/ordrec.pth')
results['OrdRec'] = test_recommendations(model, ML, max_k=10)

results_df = pd.DataFrame.from_dict(results, orient='index')
ratings = results_df[['RMSE', 'RPI', 'Classification']]
print(ratings)

# One colour per model. 'tab10' offers ten distinct colours, whereas
# BASE_COLORS has only eight entries (one of them white) and is too short
# for the nine models evaluated here.
keys = results_df.index.to_list()
palette = plt.get_cmap('tab10')
colors = {key: palette(i % palette.N) for i, key in enumerate(keys)}

# Figure 1: the 'Novelty' and 'Diversity' curves of every model in `keys`.
f, ax = plt.subplots(ncols=2, figsize=(18, 5))
panels = [
    (ax[0], 'Novelty', np.arange(1, 11)),
    (ax[1], 'Diversity', np.arange(2, 11)),
]
for key in keys:
    for axis, metric, ks in panels:
        axis.plot(ks, results_df[metric][key],
                  '-', color=colors[key], label=key, linewidth=3, alpha=0.6)
for (axis, _, ks), ylabel in zip(panels, ['Expected surprise@K', 'Diversity@K']):
    axis.set_xticks(ks)
    axis.set_xlabel('K', fontsize=20)
    axis.set_ylabel(ylabel, fontsize=20)
    axis.legend(ncol=2, fontsize=15)
f.tight_layout()

# Figure 2: 'Precision', 'Recall' and 'NDCG' curves on a shared x axis.
f, ax = plt.subplots(ncols=3, figsize=(18, 5), sharex=True)
panels = [
    (ax[0], 'Precision', 'Precision@K'),
    (ax[1], 'Recall', 'Recall@K'),
    (ax[2], 'NDCG', 'NDCG@K'),
]
for key in keys:
    for axis, metric, _ in panels:
        axis.plot(np.arange(1, 11), results_df[metric][key],
                  '-', color=colors[key], label=key, linewidth=3, alpha=0.6)
ax[0].set_xticks(np.arange(1, 11))
for axis, _, ylabel in panels:
    axis.set_xlabel('K', fontsize=20)
    axis.set_ylabel(ylabel, fontsize=20)
    axis.legend(ncol=2, fontsize=15)
f.tight_layout()

# Figure 3: 'Quantile RMSE' and 'RRI' curves, restricted to CPMF and OrdRec.
f, ax = plt.subplots(ncols=2, figsize=(18, 5))
keys = ['CPMF', 'OrdRec']
panels = [
    (ax[0], 'Quantile RMSE', np.arange(1, 21)),
    (ax[1], 'RRI', np.arange(1, 11)),
]
for key in keys:
    for axis, metric, xs in panels:
        axis.plot(xs, results_df[metric][key],
                  '-', color=colors[key], label=key, linewidth=3, alpha=0.6)
# Relabel the 20 tick positions with the values 0.05, 0.10, ..., 1.0.
quantiles = np.linspace(start=0.05, stop=1, num=20).tolist()
ax[0].set_xticks(np.arange(1, 21))
ax[0].set_xticklabels([round(q, 2) for q in quantiles])
for axis, xlabel, ylabel in zip(ax, ['Uncertainty quantile', 'K'], ['RMSE', 'RRI@K']):
    axis.set_xlabel(xlabel, fontsize=20)
    axis.set_ylabel(ylabel, fontsize=20)
    axis.legend(ncol=2, fontsize=20)
f.tight_layout()