In [1]:
import sys
import os
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

# Step up one directory so the relative 'datasets/' and 'output/' paths used by the
# later cells resolve. The guard keeps this cell idempotent: re-running it in the same
# kernel no longer climbs one more level on every execution.
if not os.path.isdir('datasets'):
    os.chdir('../')

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.9.0 | packaged by conda-forge | (default, Nov 26 2020, 07:53:15) [MSC v.1916 64 bit (AMD64)]
Surprise version: 1.1.3


In [2]:
DATASET_NAME = 'ml-1m'
fields = ['userId', 'movieId', 'rating']

# Read only the three rating columns from the cleaned ratings file of the chosen dataset.
clean_ratings_path = 'datasets/' + DATASET_NAME + '/clean/ratings.csv'
data = pd.read_csv(clean_ratings_path, usecols=fields)

data.head()

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [3]:
# Extend the dataset with the synthetic ratings before the train/test split.
# Rows of the modified file whose index label is absent from `data` are taken as the
# synthetic ones and appended.
# NOTE(review): the match is on row index labels, not on rating contents - presumably the
# modified file is the clean file with extra rows appended; confirm.
midified_ratings = pd.read_csv(
    'datasets/' + DATASET_NAME + '/modified/ratings_random_experiment.csv',
    usecols=fields,
)
already_present = midified_ratings.index.isin(data.index)
synthetic_data = midified_ratings.loc[~already_present]
data = pd.concat([data, synthetic_data])

In [4]:
# Randomly split the combined ratings into train and test frames using a 0.75 ratio.
# NOTE(review): no seed is passed, so repeatability relies on the splitter's default - confirm.
train, test = python_random_split(data, ratio=0.75)

In [5]:
# Build the Surprise trainset from the training split.
# load_from_df reads the frame positionally as (user, item, rating), so the column order
# is pinned explicitly instead of relying on the order the CSV happened to have.
# The rating scale is spelled out rather than borrowed from the built-in 'ml-100k' reader,
# whose name is misleading for this dataset; (1, 5) is the scale that reader declares.
rating_reader = surprise.Reader(rating_scale=(1, 5))
train_set = surprise.Dataset.load_from_df(
    train[['userId', 'movieId', 'rating']], reader=rating_reader
).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x2578ebd5910>

In [6]:
# SVD model configured with 200 factors and 30 epochs; random_state is fixed at 0 and
# verbose=True makes the fit report each epoch.
svd = surprise.SVD(
    n_factors=200,
    n_epochs=30,
    random_state=0,
    verbose=True,
)

# Fit on the full trainset while timing the call.
with Timer() as train_time:
    svd.fit(train_set)

training_seconds = train_time.interval
print("Took {} seconds for training.".format(training_seconds))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 26.606669099999998 seconds for training.


In [7]:
# Run the fitted model over the test split; the frame uses 'userId'/'movieId' rather than
# the helper's default column names, so both are passed explicitly.
column_names = {'usercol': 'userId', 'itemcol': 'movieId'}
predictions = predict(svd, test, **column_names)
predictions.head()

Unnamed: 0,userId,movieId,prediction
0,839,2474,3.817925
1,1447,2797,3.887148
2,1980,3048,2.713711
3,1447,3274,2.907078
4,225,3418,3.893396


In [8]:
# Compute ranking predictions from the training frame, timing the call (the recorded run
# took several minutes).
# NOTE(review): remove_seen=True presumably drops user/item pairs already present in
# `train` - confirm against the recommenders documentation.
ranking_kwargs = dict(usercol='userId', itemcol='movieId', remove_seen=True)
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd, train, **ranking_kwargs)

prediction_seconds = test_time.interval
print("Took {} seconds for prediction.".format(prediction_seconds))

Took 386.1867082 seconds for prediction.


In [9]:
# Preview the first five rows of the ranking predictions.
all_predictions.head(n=5)

Unnamed: 0,userId,movieId,prediction
733434,1605,3549,4.083208
733435,1605,2370,3.52895
733436,1605,1959,2.961816
733437,1605,3594,3.412933
733438,1605,2091,3.090755


In [10]:
# Restrict both the ground-truth and the predicted frames to users that appear in each,
# so the two cover exactly the same set of users.
test_users = set(test['userId'])
predicted_users = set(all_predictions['userId'])
common_users = test_users & predicted_users

rating_true_common = test.loc[test['userId'].isin(common_users)]
rating_pred_common = all_predictions.loc[all_predictions['userId'].isin(common_users)]
n_users = len(common_users)

In [11]:
# get_top_k_items is already imported in the notebook's import cell, so it is not
# re-imported here.

# Number of highest-scored items kept per user.
top_k = 10

# Keep, for each user, the top_k rows ranked by the 'prediction' column.
df_hit = get_top_k_items(
    dataframe=rating_pred_common,
    col_user='userId',
    col_rating='prediction',
    k=top_k,
)

In [12]:
# Save this run's artefacts: the train/test split, the top-k rankings and the full
# prediction table, each as a CSV without the index column.
# The target folder is created first so a missing 'output/<dataset>/' directory does not
# make the first to_csv call fail.
output_dir = 'output/' + DATASET_NAME
os.makedirs(output_dir, exist_ok=True)

for csv_name, frame in (
    ('train.csv', train),
    ('test.csv', test),
    ('rankings.csv', df_hit),
    ('all_predictions.csv', all_predictions),
):
    frame.to_csv(output_dir + '/' + csv_name, index=False)