In [1]:
import os
import sys

import cornac
import pandas as pd
import papermill as pm
import scrapbook as sb

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import get_top_k_items
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.constants import SEED
from recommenders.utils.timer import Timer

# Change to the parent directory (for custom imports); the relative
# 'datasets/...' and 'output/...' paths used later resolve against it.
# A bare os.chdir('../') climbs one more level each time this cell is re-run
# in the same kernel, so the target is computed once on first execution and
# reused afterwards, which makes the cell idempotent.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_ROOT)

# Record the interpreter and Cornac versions next to the results.
print("System version: {}".format(sys.version))
print("Cornac version: {}".format(cornac.__version__))

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.14.2


In [2]:
# --- Experiment configuration ---
DATASET_NAME = 'ml-latest-small'

# Number of items to recommend per user.
TOP_K = 10

# BPR hyper-parameters: number of latent factors and number of training epochs.
NUM_FACTORS = 200
NUM_EPOCHS = 100

# --- Load the cleaned ratings, keeping only the columns listed in `fields` ---
fields = ['userId', 'movieId', 'rating']
ratings_path = 'datasets/' + DATASET_NAME + '/clean/ratings.csv'
data = pd.read_csv(ratings_path, usecols=fields)

In [5]:
# Number of rows in the ratings frame (same value as len(data)).
# NOTE(review): the execution counters show this cell (In [5]) was run after
# the synthetic-data cell that follows it (In [4]), so the displayed count
# presumably already includes the synthetic rows; re-run top to bottom to confirm.
data.shape[0]

415650

In [4]:
# Append the synthetic ratings to the whole dataset (before the train/test split).
# Rows of the modified file whose index label does not occur in `data` are
# treated as synthetic.
# NOTE(review): the match is on index labels only, so this presumably relies on
# the modified file keeping the original rows at the same positions; confirm.
modified_ratings = pd.read_csv(
    'datasets/' + DATASET_NAME + '/modified/ratings_random_experiment.csv',
    usecols=fields,
)
is_new_row = ~modified_ratings.index.isin(data.index)
synthetic_data = modified_ratings.loc[is_new_row]
data = pd.concat([data, synthetic_data])

In [6]:
# Randomly split the combined ratings into train and test with a 0.75 ratio.
# The seed is passed explicitly so the split is tied to the same SEED that the
# Cornac dataset and the BPR model use, rather than to the splitter's implicit
# default.
train, test = python_random_split(data, ratio=0.75, seed=SEED)

In [None]:
# # if there's synthetic data, add it to the test set.
# # the idea is that the new synthetic data will have different indexes, 
# # so we look for those new indexes that are not found in the initial dataset.
# midified_ratings = pd.read_csv('datasets/' + DATASET_NAME + '/modified/ratings_random_experiment.csv', usecols=fields)
# synthetic_data = midified_ratings.loc[~midified_ratings.index.isin(data.index.to_list())]

# # concat the synthetic data with the test set
# test = pd.concat([test, synthetic_data])

In [7]:
# Build the Cornac training set from the rows of `train`.
# NOTE(review): the rows are passed as positional tuples, so this assumes the
# frame's columns are ordered userId, movieId, rating; confirm against the CSV.
uir_rows = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_rows, seed=SEED)

# Report how many distinct users and items the training set contains.
print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 385
Number of items: 13318


## Model training

In [8]:
# Hyper-parameters for the Cornac BPR model, gathered in one place so they are
# easy to read and tweak before the model is constructed.
bpr_params = dict(
    k=NUM_FACTORS,        # set from NUM_FACTORS in the configuration cell
    max_iter=NUM_EPOCHS,  # set from NUM_EPOCHS in the configuration cell
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED,
)
bpr = cornac.models.BPR(**bpr_params)

In [9]:
# Fit the BPR model on the training set and report how long it took.
with Timer() as train_timer:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(train_timer))

  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 27.1703 seconds for training.


## Prediction and Evaluation

In [10]:
# Produce ranking predictions with the trained model and time the call.
# remove_seen=True is passed together with `train`; presumably this drops the
# user/item pairs already present in the training data (confirm in the
# predict_ranking documentation).
with Timer() as predict_timer:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userId',
        itemcol='movieId',
        remove_seen=True,
    )
print("Took {} seconds for prediction.".format(predict_timer))

Took 4.9185 seconds for prediction.


In [11]:
# Restrict the ground truth and the predictions to the users that appear in
# both frames, so the two cover the same set of users.
test_users = set(test['userId'])
predicted_users = set(all_predictions['userId'])
common_users = test_users & predicted_users

rating_true_common = test.loc[test['userId'].isin(common_users)]
rating_pred_common = all_predictions.loc[all_predictions['userId'].isin(common_users)]
n_users = len(common_users)

In [12]:
# Keep the top-k items per user, ranked by the 'prediction' column.
# The cut-off comes from the TOP_K constant in the configuration cell rather
# than a second hard-coded 10, so the value is defined in exactly one place.
df_hit = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=TOP_K,
)

In [13]:
# Save the artefacts of this run: the train/test split, the per-user top-k
# rankings and the full prediction table.
output_dir = 'output/' + DATASET_NAME
# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail at the very last cell, after training has run.
os.makedirs(output_dir, exist_ok=True)

run_outputs = {
    'train.csv': train,
    'test.csv': test,
    'rankings.csv': df_hit,
    'all_predictions.csv': all_predictions,
}
for filename, frame in run_outputs.items():
    frame.to_csv(output_dir + '/' + filename, index=False)