In [1]:
import os
import sys

import cornac
import numpy as np
import pandas as pd
import papermill as pm
import scrapbook as sb
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    get_top_k_items,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer

# Move the working directory up one level (the author's stated reason: custom
# imports). Later cells read 'datasets/...' relative to this new location.
# NOTE(review): the target is relative, so re-running this cell climbs one more
# level each time -- restart the kernel before executing it again.
os.chdir('../')

# Record the interpreter and Cornac versions used for this run.
python_version = sys.version
cornac_version = cornac.__version__
print("System version: {}".format(python_version))
print("Cornac version: {}".format(cornac_version))

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.14.2


In [2]:
# Dataset selection
DATASET_NAME = 'ml-25m'

# top k items to recommend
TOP_K = 10

# Model parameters
NUM_FACTORS = 200  # passed to BPR as `k`
NUM_EPOCHS = 100   # passed to BPR as `max_iter`

# Load only the user / item / rating columns of the shrunk ratings file.
fields = ['userId', 'movieId', 'rating']
ratings_path = 'datasets/' + DATASET_NAME + '/shrunk/ratings_small_v2.csv'
data = pd.read_csv(ratings_path, usecols=fields)

In [3]:
# Random train/test split with 70% of the ratings going to `train`.
# The seed is passed explicitly so the split is pinned to the same SEED that the
# Cornac dataset and the BPR model use, instead of relying on an implicit default.
train, test = python_random_split(data, ratio=0.70, seed=SEED)

In [198]:
# If the modified ratings file contains synthetic rows, isolate them here.
# Both frames are read without an index column, so rows are told apart purely by
# index label: every label of the modified file that does not occur in the
# original `data` frame is treated as a new (synthetic) row.
# NOTE(review): this presumes the modified file keeps the original rows and only
# appends new ones -- confirm against how that file was generated.
midified_ratings = pd.read_csv(
    'datasets/' + DATASET_NAME + '/modified/ratings_random_experiment.csv',
    usecols=fields,
)
is_synthetic_row = ~midified_ratings.index.isin(data.index)
synthetic_data = midified_ratings.loc[is_synthetic_row]

# TODO: concat the synthetic data with the test set (not done in this cell)


In [199]:
# Row counts: synthetic rows first, then the original ratings frame.
print(len(synthetic_data), len(data), sep='\n')

1400
502732


In [188]:
# Build the Cornac dataset from the rows of the training split, in column order
# (index excluded), seeded with SEED.
uir_rows = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_rows, seed=SEED)

# Report the user and item counts exposed by the dataset.
print(
    'Number of users: {}'.format(train_set.num_users),
    'Number of items: {}'.format(train_set.num_items),
    sep='\n',
)

Number of users: 3500
Number of items: 15006


## Model training

In [5]:
# BPR hyper-parameters; factor count and epoch budget come from the config cell.
bpr_params = dict(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED,
)
bpr = cornac.models.BPR(**bpr_params)

In [6]:
# Fit BPR on the Cornac training set and report the time measured by Timer.
with Timer() as train_timer:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(train_timer))

  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 31.6342 seconds for training.


## Prediction and Evaluation

In [None]:
# Produce ranking predictions with the trained model, asking predict_ranking to
# drop items already present in `train` (remove_seen=True), and time the call.
with Timer() as predict_timer:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userId',
        itemcol='movieId',
        remove_seen=True,
    )
print("Took {} seconds for prediction.".format(predict_timer))

In [None]:
# Restrict ground truth and predictions to the users that appear in both frames.
test_users = set(test['userId'])
predicted_users = set(all_predictions['userId'])
common_users = test_users & predicted_users

rating_true_common = test[test['userId'].isin(common_users)]
rating_pred_common = all_predictions[all_predictions['userId'].isin(common_users)]
n_users = len(common_users)

In [None]:
# Number of recommendations kept per user. Aliased from the config constant
# TOP_K so the cut-off is defined in one place; the lowercase name is kept
# because a later cell still reads `top_k`.
top_k = TOP_K

# Top-k recommended items for every user, selected by the 'prediction' column.
df_hit = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=top_k,
)

In [None]:
# Persist the splits and the per-user rankings for this dataset.
# The target directory is created first: DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
output_dir = '../output/' + DATASET_NAME
os.makedirs(output_dir, exist_ok=True)

train.to_csv(output_dir + '/train.csv', index=False)
test.to_csv(output_dir + '/test.csv', index=False)
# NOTE(review): `df_hit` is rebound to a hits-only frame by a later cell, so run
# this cell before that one if rankings.csv should hold the full top-k list.
df_hit.to_csv(output_dir + '/rankings.csv', index=False)
# data.to_csv('./datasets/ml-25m/ratings_small.csv', index=False)

In [None]:
# Ranking cut-off for the metrics, taken from the config constant TOP_K instead
# of repeating the literal 10.
k = TOP_K
# eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
# NDCG@k of the predictions against the held-out test ratings.
eval_ndcg = ndcg_at_k(
    test,
    all_predictions,
    col_user='userId',
    col_item='movieId',
    col_prediction='prediction',
    k=k,
)
# eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
# eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)

# print("MAP:\t%f" % eval_map,
#       "NDCG:\t%f" % eval_ndcg,
#       "Precision@K:\t%f" % eval_precision,
#       "Recall@K:\t%f" % eval_recall, sep='\n')

In [None]:
# Display the value returned by ndcg_at_k.
eval_ndcg

In [None]:

# Manual reconstruction of the inputs to NDCG.
# rating_pred_common: all predictions for every user for items they haven't seen
# df_top_k: top-k recommended items for every user. It is kept under its own name
# so that `df_hit` is not bound to two different frames inside this cell, and the
# cut-off comes from the config constant TOP_K.
df_top_k = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=TOP_K,
)
# df_hit: all the item hits, the items that the recommender got in the top-k which are in the test set
# (pd.merge defaults to an inner join, so only matching user/item pairs remain)
df_hit = pd.merge(df_top_k, rating_true_common, on=['userId', 'movieId'])[
    ['userId', 'movieId', "rank"]
]

# count the number of hits vs actual relevant items per user
df_hit_count = pd.merge(
    df_hit.groupby('userId', as_index=False)['userId'].agg({"hit": "count"}),
    rating_true_common.groupby('userId', as_index=False)['userId'].agg(
        {"actual": "count"}
    ),
    on='userId',
)

In [None]:
# Display the current contents of df_hit.
df_hit

In [None]:
# calculate discounted gain for hit items
# relevance in this case is always 1, so each hit contributes 1 / log(1 + rank)
dcg_per_hit = df_hit.assign(dcg=1 / np.log1p(df_hit["rank"]))
# sum up discount gained to get discount cumulative gain per user
# (built from a separate per-hit frame so `df_dcg` only ever names the per-user result)
df_dcg = dcg_per_hit.groupby('userId', as_index=False, sort=False).agg({"dcg": "sum"})

In [None]:
# Ranking cut-off, taken from the config constant TOP_K.
k = TOP_K
# Number of users the NDCG is averaged over: every user that has both test
# ratings and predictions. This was previously hard-coded to 610, which does not
# match this dataset and skews the average.
n_users = len(common_users)

# calculate ideal discounted cumulative gain
df_ndcg = pd.merge(df_dcg, df_hit_count, on=['userId'])
# The ideal ranking places min(actual, k) relevant items at ranks 1..min(actual, k).
df_ndcg["idcg"] = df_ndcg["actual"].apply(
    lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
)

In [None]:
# Normalized DCG: per-user DCG divided by per-user IDCG, summed and then
# divided by n_users.
ndcg_per_user = df_ndcg["dcg"] / df_ndcg["idcg"]
ndcg_per_user.sum() / n_users

In [None]:
# Display the per-user frame holding the dcg, hit, actual and idcg columns.
df_ndcg

In [None]:
# Display the test ratings restricted to users that also have predictions.
rating_true_common