In [1]:
import os
import sys

import cornac
import numpy as np
import pandas as pd
import papermill as pm
import scrapbook as sb
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    get_top_k_items,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer

# Record the interpreter and Cornac versions this run was executed with.
print(
    "System version: {}".format(sys.version),
    "Cornac version: {}".format(cornac.__version__),
    sep="\n",
)

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.14.2


In [2]:
# --- Experiment configuration ---
# top k items to recommend
TOP_K = 10

# Model parameters (passed to the BPR model as `k` and `max_iter`)
NUM_FACTORS = 200
NUM_EPOCHS = 100

# --- Data loading ---
# Only the user / item / rating columns are read from the ratings file.
# The MovieLens column names (userId, movieId) are kept as-is rather than
# renamed to userID / itemID, which is why later cells pass the column
# names explicitly to the prediction and evaluation helpers.
fields = ['userId', 'movieId', 'rating']
data = pd.read_csv(
    'datasets/ml-latest-small/ratings.csv',
    usecols=fields,
)

In [3]:
# Preview the first five ratings to confirm the columns that were loaded.
data.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Random 75% / 25% train/test split. The seed is passed explicitly, using the
# same SEED as the dataset and model cells, so the split does not depend on
# the splitter's implicit default.
train, test = python_random_split(data, ratio=0.75, seed=SEED)

In [5]:
# Feed the training rows to Cornac as plain tuples (index excluded), in the
# frame's column order.
uir_tuples = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_tuples, seed=SEED)

# Report how many distinct users and items ended up in the training set.
print(
    'Number of users: {}'.format(train_set.num_users),
    'Number of items: {}'.format(train_set.num_items),
    sep='\n',
)

Number of users: 610
Number of items: 8787


## Model training

In [6]:
# Hyper-parameters for the BPR model, collected in one place so they are
# easy to inspect before the model is constructed.
bpr_params = {
    "k": NUM_FACTORS,
    "max_iter": NUM_EPOCHS,
    "learning_rate": 0.01,
    "lambda_reg": 0.001,
    "verbose": True,
    "seed": SEED,
}
bpr = cornac.models.BPR(**bpr_params)

In [7]:
# Fit the model on the Cornac training set and report the wall-clock time.
with Timer() as train_timer:
    bpr.fit(train_set)

print("Took {} seconds for training.".format(train_timer))

  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 5.9657 seconds for training.


## Prediction and Evaluation

In [8]:
# Score items for every user with the trained model. remove_seen=True is
# passed so that items a user already has in `train` are not returned.
with Timer() as predict_timer:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userId',
        itemcol='movieId',
        remove_seen=True,
    )

print("Took {} seconds for prediction.".format(predict_timer))

Took 4.4723 seconds for prediction.


In [9]:
# Restrict both frames to users that appear in the test set AND in the
# predictions, so the metric is computed over the same set of users.
test_users = set(test['userId'])
predicted_users = set(all_predictions['userId'])
common_users = test_users & predicted_users

rating_true_common = test.loc[test['userId'].isin(common_users)]
rating_pred_common = all_predictions.loc[
    all_predictions['userId'].isin(common_users)
]

# Number of users the final metric is averaged over.
n_users = len(common_users)

In [10]:
# Use the notebook-wide TOP_K setting instead of repeating the literal 10,
# so the cutoff is changed in a single place.
top_k = TOP_K

# Keep, for every user, the `top_k` rows with the highest `prediction` score.
# (Later cells read a "rank" column from this result.)
df_hit = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=top_k,
)

In [None]:
# Persist the split and the top-k rankings for use outside the notebook.
# The target directory is created first: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('./output', exist_ok=True)

train.to_csv('./output/train.csv', index=False)
test.to_csv('./output/test.csv', index=False)
df_hit.to_csv('./output/rankings.csv', index=False)

In [69]:
# Evaluate at the notebook-wide cutoff instead of repeating the literal 10.
k = TOP_K

# Reference NDCG@k from the library. The frames keep the MovieLens column
# names (userId / movieId), so the user and item columns are passed
# explicitly; any other ranking metric added here needs the same arguments.
eval_ndcg = ndcg_at_k(
    test,
    all_predictions,
    col_user='userId',
    col_item='movieId',
    col_prediction='prediction',
    k=k,
)

In [70]:
# Display the library-computed NDCG@k; the cells below reproduce it by hand.
eval_ndcg

0.24414025587794844

In [38]:

# Step 1 - top-k recommendations.
# rating_pred_common holds the predictions for the users shared with the test
# set; keep only each user's `top_k` highest-scoring rows.
top_k_recs = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=top_k,
)

# Step 2 - hits.
# A hit is a recommended (userId, movieId) pair that also occurs in the test
# set; the inner merge keeps exactly those pairs, together with their rank.
hit_columns = ['userId', 'movieId', "rank"]
df_hit = top_k_recs.merge(rating_true_common, on=['userId', 'movieId'])[hit_columns]

# Step 3 - per-user counts.
# "hit" is the number of hits, "actual" the number of test-set rows per user.
# Users without any hit drop out of the inner merge below.
hits_per_user = df_hit.groupby('userId', as_index=False)['userId'].agg({"hit": "count"})
actual_per_user = rating_true_common.groupby('userId', as_index=False)['userId'].agg(
    {"actual": "count"}
)
df_hit_count = hits_per_user.merge(actual_per_user, on='userId')

In [52]:
# Inspect the hits: recommended (userId, movieId) pairs that also occur in the
# test set, with the rank at which each was recommended.
df_hit

Unnamed: 0,userId,movieId,rank
0,1,1214,5
1,1,1198,9
2,4,2858,8
3,5,318,1
4,5,150,5
...,...,...,...
1366,610,3949,5
1367,610,58559,6
1368,610,51255,7
1369,610,27773,8


In [55]:
# Discounted cumulative gain per user.
# Relevance is treated as binary (every hit counts as 1), so each hit
# contributes 1 / ln(1 + rank); summing these per user gives the user's DCG.
# assign() returns a new frame, so df_hit itself is left unmodified.
df_dcg = (
    df_hit
    .assign(dcg=lambda hits: 1 / np.log1p(hits["rank"]))
    .groupby('userId', as_index=False, sort=False)
    .agg({"dcg": "sum"})
)

In [64]:
# Take the cutoff and the user count from earlier cells instead of literals.
# The cutoff must match the one used to build the top-k hits, and the user
# count must match the users shared by the test set and the predictions.
k = TOP_K
n_users = len(common_users)

# Attach each user's hit / actual counts to their DCG.
df_ndcg = pd.merge(df_dcg, df_hit_count, on=['userId'])

# Ideal DCG: the DCG obtained if the first min(actual, k) ranks were all hits.
df_ndcg["idcg"] = df_ndcg["actual"].apply(
    lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
)

In [65]:
# Per-user NDCG is DCG divided by IDCG. The sum is divided by n_users rather
# than by the number of rows, so users with no hits (absent from df_ndcg)
# count as zero in the average.
df_ndcg["dcg"].div(df_ndcg["idcg"]).sum() / n_users

0.24414025587794844

In [63]:
# Inspect the per-user table: DCG, hit count, test-set item count and IDCG.
df_ndcg

Unnamed: 0,userId,dcg,hit,actual,idcg
0,1,0.992405,2,46,6.554971
1,4,0.455120,1,57,6.554971
2,5,2.000806,2,8,5.703644
3,6,5.452737,8,78,6.554971
4,7,1.713753,3,44,6.554971
...,...,...,...,...,...
441,606,3.700570,5,253,6.554971
442,607,4.567769,6,55,6.554971
443,608,3.746917,7,200,6.554971
444,609,0.513898,1,8,5.703644


In [73]:
# Inspect the test-set ratings restricted to users that also have predictions.
rating_true_common

Unnamed: 0,userId,movieId,rating
67037,432,77866,4.5
42175,288,474,3.0
93850,599,4351,3.0
6187,42,2987,4.0
12229,75,1610,4.0
...,...,...,...
76051,479,135,1.0
35045,234,2414,3.0
14383,91,1968,3.0
46656,306,69406,5.0
