In [1]:
# General Stuff:
from pathlib import Path

import numpy as np
import pandas as pd

# Our Stuff:
from models.AutoRecBase import AutoRecBase
from models.VarAutoRec import VarAutoRec
from models.MF import MF


from scripts.get_data import download_2_data_sets, ratings_to_train_test, ratings_to_train_test_u
from scripts.get_2_other_data import get_2_other_datasets, secondary_to_train_test, secondary_to_train_test_u
from utils.evaluate import evaluate_model
from utils.loading_utils import load_model, save_model

import torch
from torch import nn
import pytorch_lightning as pl

# Visualization Stuff
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 240)

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# Kept as the LAST expression of the cell so the notebook actually displays the
# selected device; a bare expression in the middle of a cell is evaluated and
# silently discarded.
device

# AutoRec - AutoEncoders Meet Collaborative Filtering - PyTorch

<a id="toc"></a>
## Table of Contents
1. [Introduction](#introduction)
1. [Conclusions](#conclusions)

<a id="introduction"></a>

## Introduction

In this notebook we will review a collaborative filtering approach using autoencoders, as suggested by Sedhain et al. in their 2015 paper "AutoRec: Autoencoders Meet Collaborative Filtering".

We will then introduce several improvements and assess them.

[Table of contents](#toc)

In [2]:
# Single switch deciding which dataset source the rest of the notebook uses:
# True  -> download_2_data_sets()
# False -> get_2_other_datasets()
is_default_dataset = True
fetch_datasets = download_2_data_sets if is_default_dataset else get_2_other_datasets
# Trailing semicolon: suppress any echoed return value, exactly as a call made
# inside an if-block would not be displayed by the notebook.
fetch_datasets();

In [3]:
# Choose the pair of loader factories that matches the selected dataset source.
if is_default_dataset:
    make_loaders, make_mf_loaders = ratings_to_train_test, ratings_to_train_test_u
else:
    make_loaders, make_mf_loaders = secondary_to_train_test, secondary_to_train_test_u

# Loaders used for the non-MF models.
# NOTE(review): the positional arguments presumably line up with the keyword
# arguments of the *_u variant below (dataset_size, validation_partition,
# train_partition, batch_size) -- confirm against the helper's signature.
train_loader, val_loader = make_loaders(1, 0, 1, 10)

# Loaders used for the MF model.
mf_train_loader, mf_val_loader = make_mf_loaders(dataset_size=1,
                                                  validation_partition=0,
                                                  train_partition=1,
                                                  batch_size=10)


In [4]:
# (Re)load the TensorBoard notebook extension; reload_ext makes the cell safe to
# run more than once in the same kernel.
%reload_ext tensorboard
# Start TensorBoard, reading event files from the lightning_logs directory.
%tensorboard --logdir lightning_logs

Launching TensorBoard...

Go to:  [TensorBoard](http://localhost:6006)

In [5]:
# Result containers, both intended to be keyed by (model, epoch, lr).
models_dict = dict()   # (model, epoch, lr) -> loss
models_state = dict()  # (model, epoch, lr) -> model.state_dict()

# Model classes included in the sweep. AutoRecBase and VarAutoRec are currently
# switched off; add them back to this list to include them.
models = [MF]

# Learning rates and activation classes to sweep over.
lrs = [1e-3, 2e-3, 4e-3, 1e-2]
activations = [nn.PReLU, nn.Sigmoid]

Sanity check:

In [6]:
# Sanity check: pull exactly one batch from each loader and print part of it.
# next(iter(...)) raises StopIteration on an empty loader, so a broken data
# pipeline fails here instead of the check silently printing nothing.

# First batch of the validation loader: first column of x.
x, y, m = next(iter(val_loader))
print(x[:, 0])

# First batch of the MF validation loader: first element of x.
x, y, m = next(iter(mf_val_loader))
print(x[0])

# First batch of the MF training loader: all three tensors of the batch.
x, y, r = next(iter(mf_train_loader))
print(x)
print(y)
print(r)

tensor([5., 3., 3., 3., 3., 4., 3., 4., 5., 5.])
tensor(1)
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([2294, 3186, 1566,  588, 1907,  783, 1836,  150,    1, 1962])
tensor([4, 4, 4, 4, 4, 4, 5, 5, 5, 4])


In [7]:
# Report the size of the validation dataset: its length is labelled as the user
# count, and the length of element [1] of its first sample as the item count.
print(f"Number of users: {len(val_loader.dataset)}")
print(f"Number of items: {len(val_loader.dataset[0][1])}")

Number of users: 6040
Number of items: 3706


Datasets:

In [8]:
# Sizes handed to the model constructors.
# NOTE(review): the preceding cell prints len(val_loader.dataset) as the number
# of USERS and len(val_loader.dataset[0][1]) as the number of ITEMS, so the two
# names below look swapped. Left unchanged because the models consume both
# values -- confirm which orientation they expect before flipping them.
val_dataset = val_loader.dataset
number_of_items = len(val_dataset)
number_of_users = len(val_dataset[0][1])

From the original paper:

In [9]:
# Hyperparameter grids (described in the notebook as taken from the original paper).
# Hidden-layer sizes to try:
latent_dims = [
    10, 20, 40, 80, 100,
    200, 300, 400, 500,
]
# Regularisation strengths to try (the last three are deliberately plain ints):
lambdas = [1e-3, 1e-2, 1e-1, 1, 100, 1000]

In [None]:
# Set should_train to False to skip the (long) training sweep entirely.
should_train = True
model_paths = []

if should_train:
    # Exhaustive grid search: one training run for every
    # (model class, activation, learning rate, latent size, λ) combination.
    for model_class in models:
        for activation in activations:
            for lr in lrs:
                for latent in latent_dims:
                    for λ in lambdas:
                        # The same activation class fills both activation slots.
                        model = model_class(
                            number_of_items=number_of_items,
                            num_of_users=number_of_users,
                            hidden_size=latent,
                            activation_function_1=activation,
                            activation_function_2=activation,
                            loss=nn.MSELoss(reduction='none'),
                            λ=λ,
                            lr=lr,
                        )

                        # Fresh trainer per configuration: no GPUs, 10 epochs max.
                        trainer = pl.Trainer(gpus=0, max_epochs=10)

                        # MF is fitted on the mf_* loaders; every other model on
                        # the plain train/val loaders.
                        fit_loaders = (
                            (mf_train_loader, mf_val_loader)
                            if type(model).__name__ == "MF"
                            else (train_loader, val_loader)
                        )
                        trainer.fit(model, *fit_loaders)

                        # Persist the run and remember whatever save_model returns.
                        save_kwargs = dict(
                            model_class=model_class,
                            trainer=trainer,
                            activation=activation,
                            hidden_size=latent,
                            lr=lr,
                            λ=λ,
                            is_default_dataset=is_default_dataset,
                        )
                        model_path = save_model(**save_kwargs)
                        model_paths.append(model_path)

# Printed even when training was skipped (the list is then empty).
print(model_paths)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name              | Type      | Params
------------------------------------------------
0 | embedding_user_mf | Embedding | 37.1 K
1 | embedding_item_mf | Embedding | 60.4 K
2 | l_0               | Linear    | 11    
3 | loss_func         | L1Loss    | 0     
------------------------------------------------
97.5 K    Trainable params
0         Non-trainable params
97.5 K    Total params
0.390     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  return F.l1_loss(input, target, reduction=self.reduction)


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name              | Type      | Params
------------------------------------------------
0 | embedding_user_mf | Embedding | 37.1 K
1 | embedding_item_mf | Embedding | 60.4 K
2 | l_0               | Linear    | 11    
3 | loss_func         | L1Loss    | 0     
------------------------------------------------
97.5 K    Trainable params
0         Non-trainable params
97.5 K    Total params
0.390     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
# Collects one 8-element row per (configuration, K, metric); keys are "row_<i>".
models_eval_dict = {}
Ks = [5, 10]
i = 0  # running row counter used to build the dictionary keys

# Walk the same hyperparameter grid as the training sweep and reload each model.
for model_class in models:
    for activation in activations:
        for lr in lrs:
            for latent in latent_dims:
                for λ in lambdas:
                    model = load_model(
                        model_class=model_class,
                        activation=activation,
                        hidden_size=latent,
                        lr=lr,
                        λ=λ,
                        is_default_dataset=is_default_dataset,
                    )
                    model_name = type(model).__name__
                    for K in Ks:
                        # NOTE(review): training fits MF on the mf_* loaders, but
                        # evaluation always uses val_loader -- confirm that
                        # evaluate_model handles MF with this loader.
                        hits, ndcgs, mrrs = evaluate_model(model, test_loader=val_loader, K=K)
                        # One row per metric, storing the mean of the returned values.
                        for metric, values in (("HR", hits), ("NDCG", ndcgs), ("MRR", mrrs)):
                            models_eval_dict[f"row_{i}"] = [
                                model_name, activation, latent, λ, lr, K, metric, np.mean(values),
                            ]
                            i += 1

In [None]:
# Column names, in the same order as the 8-element row lists in models_eval_dict.
columns = ["model", "activation", "latent_dim", "lambda", "lr","topk","metric","score"]
eval_df = pd.DataFrame.from_dict(models_eval_dict, orient='index', columns=columns)

# DataFrame.to_csv does not create missing directories, so make sure obj/ exists;
# otherwise the write fails after the whole evaluation sweep has already run.
Path("obj").mkdir(parents=True, exist_ok=True)
# Tab-separated dump of the evaluation table.
eval_df.to_csv("obj/eval_df", sep='\t')

eval_df