In [1]:
## exploring the surprise package using the jester dataset, as opposed to the MovieLens dataset

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [7]:
data = Dataset.load_builtin('jester')

# Use the famous SVD algorithm.
algo = SVD()

# Run 3-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    4.4254  4.4418  4.4299  4.4324  0.0069  
MAE (testset)     3.3299  3.3424  3.3341  3.3355  0.0052  
Fit time          79.80   79.47   79.82   79.70   0.16    
Test time         6.23    6.09    5.56    5.96    0.29    


{'test_rmse': array([4.42543513, 4.44175378, 4.42991528]),
 'test_mae': array([3.3299064 , 3.34239317, 3.33406385]),
 'fit_time': (79.7993631362915, 79.47046732902527, 79.82240152359009),
 'test_time': (6.230987310409546, 6.089756965637207, 5.560126304626465)}

In [4]:
from surprise import accuracy
from surprise.model_selection import train_test_split

In [5]:
trainset, testset = train_test_split(data, test_size=.25)

In [6]:
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 4.4857


4.485710111720141

In [10]:
from surprise import KNNBasic

In [12]:
## this snippet returns the following error message:
## MemoryError: Unable to allocate 13.0 GiB for an array with shape (59132, 59132) and data type int32
## Perhaps shrinking down the subspace could be a solution?

# # Retrieve the trainset.
# trainset = data.build_full_trainset()

# # Build an algorithm, and train it.
# algo = KNNBasic()
# algo.fit(trainset)

In [13]:
from collections import defaultdict

In [14]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
# data = Dataset.load_builtin('ml-100k')
# trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

In [15]:
## filters for just the first 2 users
dict(list(top_n.items())[:2])

{'1': [('140', 10),
  ('148', 9.268272502534833),
  ('114', 9.251100700004795),
  ('145', 9.133674704024061),
  ('56', 7.420145100767315),
  ('138', 6.472817478521407),
  ('71', 6.137252870466734),
  ('143', 6.110420663009521),
  ('142', 5.790979376179711),
  ('96', 5.762768176944148)],
 '2': [('96', 9.244926240974976),
  ('42', 8.758231105166407),
  ('113', 8.6033628828407),
  ('84', 8.431754863747468),
  ('90', 8.131098798724844),
  ('52', 7.283908434602852),
  ('116', 7.202755609315308),
  ('122', 6.997517945759986),
  ('127', 6.8010848583154075),
  ('63', 6.7207042149052825)]}

In [16]:
## Next step:
## convert .dat file to dataframe to compare and interpret ranked items