# Version updates

In [1]:
__version__ = 'v0.0.1'

1. Initial experiments

# Main code

## Setup

### Install other components

!pip install --quiet cornac==1.5.2 adjustText

### Import Libraries

In [2]:
import os
import sys
import itertools

import datetime
import scipy.sparse as sp
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
from adjustText import adjust_text
from collections import defaultdict
from time import sleep
from tqdm.notebook import tqdm

%matplotlib inline

import cornac
from cornac.utils import cache
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit, CrossValidation
from cornac.models import MF, NMF, WMF, BPR, BaselineOnly, NeuMF, CDL
from cornac.hyperopt import Discrete, Continuous
from cornac.hyperopt import GridSearch, RandomSearch

# Verbosity switch that the model constructors and the data split further down receive.
VERBOSE = True

# Log interpreter and Cornac versions so the recorded results can be tied to an environment.
print(
    f"System version: {sys.version}",
    f"Cornac version: {cornac.__version__}",
    sep="\n",
)

  from .autonotebook import tqdm as notebook_tqdm


System version: 3.7.13 (default, May 11 2022, 08:49:57) 
[GCC 10.2.1 20210110]
Cornac version: 1.14.2


### Use of SEED
Decide whether to fix a random seed for repeatability. The drawback is that training takes longer, because seeded runs are single-threaded.

This can be set to False during initial experimentation, and locked to True later when consistent results are needed.

In [3]:
# Seeding toggle: True = repeatable runs without parallelisation, False = unseeded.
USE_SEED = True

# Value handed to every `seed=` argument below; None leaves the runs unseeded.
SEED = 42 if USE_SEED else None

### Load data

In [4]:
# Read the wine ratings table; the path is relative to this notebook's directory.
df = pd.read_csv(
    '../../data/wine_ratings.csv',
)

In [5]:
# Preview the first five rows of the ratings table.
df.head(n=5)

Unnamed: 0,user_id,item_id,rating
0,34297304,4026015,3.0
1,34297304,2057563,4.0
2,34297304,1374478,3.0
3,34297304,1135067,2.0
4,34297304,7103,3.0


In [6]:
# (rows, columns) of the loaded ratings table.
df.shape

(169342, 3)

In [7]:
# Build a Cornac Dataset from the ratings frame, passing each row as a plain tuple
# without the DataFrame index.
dataset = cornac.data.Dataset.from_uir(
    df.itertuples(index=False)
)



### Display some facts about the dataset

In [8]:
%%time

# Summarise the size of the user-item rating data.
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()

print(f'Number of users: {n_users:,}')
print(f'Number of items: {n_items:,}')
# Reuse the Dataset built in the previous cell instead of constructing a second
# copy only to read its shape.
print(f'Shape of full matrix: {dataset.matrix.shape}')

# Sparsity = share of empty cells in the users x items matrix.
# A (user, item) pair occupies a single cell however many rows repeat it, so count
# distinct pairs rather than raw rows; otherwise duplicates overstate the fill rate.
n_observed_pairs = df[['user_id', 'item_id']].drop_duplicates().shape[0]
total_possible_rankings = n_users * n_items
sparsity = 1 - n_observed_pairs / total_possible_rankings
print(f'Sparsity of matrix: {100*sparsity:0.3f}%')
print()

Number of users: 5,065
Number of items: 44,725
Shape of full matrix: (5065, 44725)
Sparsity of matrix: 99.925%

CPU times: user 507 ms, sys: 8.96 ms, total: 516 ms
Wall time: 515 ms


## Run experiments on some of the potential candidate models

In [10]:
%%time
# Shared hyperparameters: latent dimension and number of training iterations.
K = 50
n_iterations = 100

# Evaluation settings. They are defined before the split and passed to it explicitly;
# previously they were assigned after RatioSplit was built and never used.
# NOTE(review): the split output reports a minimum rating of 1.0, so with this
# threshold every rating presumably counts as a positive for the ranking metrics -
# confirm this is intended.
rating_threshold = 1.0
exclude_unknowns = True

# Baseline models
baseline = BaselineOnly()

svd = cornac.models.SVD(
    k=K,
    max_iter=n_iterations,
    verbose=VERBOSE,
    seed=SEED,
    name=f'SVD(K={K})'
)

mostpop = cornac.models.MostPop(
    name='MostPop'
)

# NOTE(review): the recorded run shows identical loss and metrics for MF and SVD -
# presumably both resolve to the same model under these settings; confirm before
# keeping both in the comparison.
mf = cornac.models.MF(
    k=K,
    max_iter=n_iterations,
    verbose=VERBOSE,
    seed=SEED,
    name=f'MF(K={K})'
)

# Collaborative
bpr = cornac.models.BPR(
    k=K,
    max_iter=n_iterations,
    verbose=VERBOSE,
    seed=SEED,
    name=f'BPR(K={K})'
)

wmf = cornac.models.WMF(
    k=K,
    max_iter=n_iterations,
    verbose=VERBOSE,
    seed=SEED,
    name=f'WMF(K={K})'
)

# With Modality
# TODO

# 80/20 train/test split over the (user, item, rating) tuples of the ratings frame.
rs = RatioSplit(
    list(df.itertuples(index=False, name=None)),
    test_size=0.2,
    rating_threshold=rating_threshold,
    exclude_unknowns=exclude_unknowns,
    seed=SEED,
    verbose=VERBOSE,
)

# Define models to try
models = [
    baseline,
    svd,
    mostpop,
    mf,
    bpr,
    wmf,
]

# Define metrics: one rating-prediction metric (RMSE) plus ranking metrics, the
# top-k ones cut off at 20.
metrics = [
    cornac.metrics.RMSE(),
    cornac.metrics.FMeasure(k=20),
    cornac.metrics.AUC(),
    cornac.metrics.MRR(),
    cornac.metrics.NCRR(k=20),
    cornac.metrics.NDCG(k=20),
    cornac.metrics.Recall(k=20)
]
---
Training data:
Number of users = 5065
Number of items = 39728
Number of ratings = 120708
Max rating = 5.0
Min rating = 1.0
Global mean = 3.9
---
Test data:
Number of users = 4667
Number of items = 11752
Number of ratings = 27608
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5065
Total items = 39728
CPU times: user 502 ms, sys: 23.4 ms, total: 525 ms
Wall time: 524 ms


In [11]:
%%time
# Train and evaluate every candidate model on the shared split; models are saved
# under the 'cornac_experiments' directory.
model_experiment = cornac.Experiment(
    models=models,
    metrics=metrics,
    eval_method=rs,
    user_based=True,
    save_dir='cornac_experiments',
)
model_experiment.run()


[BaselineOnly] Training started!

[BaselineOnly] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 91428.97it/s]
Ranking: 100%|██████████| 4667/4667 [00:20<00:00, 226.52it/s]



[SVD(K=50)] Training started!


100%|██████████| 100/100 [00:00<00:00, 135.54it/s, loss=5010.04] 


Optimization finished!

[SVD(K=50)] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 64581.59it/s]
Ranking: 100%|██████████| 4667/4667 [00:23<00:00, 200.44it/s]


SVD(K=50) model is saved to cornac_experiments/SVD(K=50)/2022-06-09_06-25-12-220120.pkl

[MostPop] Training started!

[MostPop] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 81507.63it/s]
Ranking: 100%|██████████| 4667/4667 [00:10<00:00, 431.92it/s]



[MF(K=50)] Training started!


100%|██████████| 100/100 [00:00<00:00, 138.98it/s, loss=5010.04] 


Optimization finished!

[MF(K=50)] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 63922.31it/s]
Ranking: 100%|██████████| 4667/4667 [00:23<00:00, 200.69it/s]


MF(K=50) model is saved to cornac_experiments/MF(K=50)/2022-06-09_06-25-49-004933.pkl

[BPR(K=50)] Training started!


100%|██████████| 100/100 [00:02<00:00, 36.75it/s, correct=78.58%, skipped=0.09%]


Optimization finished!

[BPR(K=50)] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 62256.67it/s]
Ranking: 100%|██████████| 4667/4667 [00:23<00:00, 201.57it/s]


BPR(K=50) model is saved to cornac_experiments/BPR(K=50)/2022-06-09_06-26-15-938922.pkl

[WMF(K=50)] Training started!


100%|██████████| 100/100 [05:17<00:00,  3.18s/it, loss=13.7]


Learning completed!

[WMF(K=50)] Evaluation started!


Rating: 100%|██████████| 27608/27608 [00:00<00:00, 63911.02it/s]
Ranking: 100%|██████████| 4667/4667 [00:29<00:00, 156.35it/s]

WMF(K=50) model is saved to cornac_experiments/WMF(K=50)/2022-06-09_06-32-05-504021.pkl

TEST:
...
             |   RMSE |    AUC |  F1@20 |    MRR | NCRR@20 | NDCG@20 | Recall@20 | Train (s) | Test (s)
------------ + ------ + ------ + ------ + ------ + ------- + ------- + --------- + --------- + --------
BaselineOnly | 0.5089 | 0.5795 | 0.0034 | 0.0112 |  0.0032 |  0.0045 |    0.0071 |    0.0220 |  21.5613
SVD(K=50)    | 0.5001 | 0.5515 | 0.0009 | 0.0045 |  0.0014 |  0.0014 |    0.0016 |    0.7927 |  24.3040
MostPop      | 1.4995 | 0.7823 | 0.0132 | 0.0450 |  0.0162 |  0.0207 |    0.0321 |    0.0045 |  11.7229
MF(K=50)     | 0.5001 | 0.5515 | 0.0009 | 0.0045 |  0.0014 |  0.0014 |    0.0016 |    0.7722 |  24.2747
BPR(K=50)    | 2.9075 | 0.8354 | 0.0131 | 0.0450 |  0.0161 |  0.0206 |    0.0317 |    2.7402 |  24.1850
WMF(K=50)    | 2.5915 | 0.7475 | 0.0390 | 0.0948 |  0.0436 |  0.0583 |    0.0884 |  318.6737 |  30.8815

CPU times: user 1h 12min 35s, sys: 14min 39s, total: 1h 27min 14s
Wa




## Train on best model

### Training

We will train on the training data and validate on the probe data.

### Save the model

## Extract the recommendations

### TO BE COMPLETED


# Hyperparameter search

### TO BE COMPLETED