In [None]:
## tutorial of surprise, a Python package for matrix factorization / KNN recommender
## surprise is not for content-based recommendations
## tutorial url:
## https://surprise.readthedocs.io/en/stable/getting_started.html

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\liamk/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9361  0.9388  0.9410  0.9304  0.9331  0.9359  0.0038  
MAE (testset)     0.7384  0.7384  0.7415  0.7322  0.7372  0.7375  0.0030  
Fit time          5.07    5.32    5.37    5.38    5.37    5.30    0.12    
Test time         0.17    0.19    0.16    0.14    0.14    0.16    0.02    


{'test_rmse': array([0.93610272, 0.9388455 , 0.94095554, 0.93041995, 0.93308442]),
 'test_mae': array([0.7383522 , 0.7384209 , 0.74149551, 0.73215999, 0.73717831]),
 'fit_time': (5.065744638442993,
  5.320963144302368,
  5.371641635894775,
  5.379650592803955,
  5.369688510894775),
 'test_time': (0.1735367774963379,
  0.1904909610748291,
  0.1575772762298584,
  0.14262080192565918,
  0.14161372184753418)}

## Split and predict

In [None]:
## tutorial url:
## https://surprise.readthedocs.io/en/stable/getting_started.html

In [7]:
from surprise import accuracy
from surprise.model_selection import train_test_split

In [8]:
trainset, testset = train_test_split(data, test_size=.25)

In [9]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9403


0.9402789192887656

## KNN

In [13]:
## tutorial url:
## https://surprise.readthedocs.io/en/stable/getting_started.html

In [10]:
from surprise import KNNBasic

In [11]:
# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = KNNBasic()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x22c2e5b4be0>

In [12]:
uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
iid = str(302)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


In [17]:
uid = str(100)  # raw user id (as in the ratings file). They are **strings**!
iid = str(314)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 100        item: 314        r_ui = 4.00   est = 1.00   {'actual_k': 5, 'was_impossible': False}


## Top N Recommendations for user

In [19]:
from collections import defaultdict

In [22]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
# data = Dataset.load_builtin('ml-100k')
# trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

## returns top n items for each user
# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [29]:
## filters for just the first 2 users
dict(list(top_n.items())[:2])

{'196': [('64', 4.769798580013519),
  ('114', 4.6087618010957785),
  ('318', 4.56637306569748),
  ('408', 4.516162819041641),
  ('357', 4.487254416788829),
  ('479', 4.481264796988658),
  ('169', 4.477466060096285),
  ('515', 4.465058006971798),
  ('98', 4.442689993258031),
  ('657', 4.4307557881510595)],
 '186': [('313', 4.547914828124093),
  ('513', 4.508279698704069),
  ('531', 4.4927791433497175),
  ('479', 4.4878758874877045),
  ('50', 4.446428689141754),
  ('498', 4.41514210258116),
  ('22', 4.413821533323922),
  ('223', 4.390087962053175),
  ('657', 4.38899060938893),
  ('921', 4.379178433047381)]}

## Get K nearest of items

In [36]:
from surprise import KNNBaseline
from surprise import get_dataset_dir
import io

In [39]:
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algortihm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
## another example
# toy_story_raw_id = name_to_rid['Indiana Jones and the Last Crusade (1989)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

The 10 nearest neighbors of Toy Story are:
Raiders of the Lost Ark (1981)
Back to the Future (1985)
Return of the Jedi (1983)
Independence Day (ID4) (1996)
Star Wars (1977)
While You Were Sleeping (1995)
Liar Liar (1997)
Jurassic Park (1993)
Better Off Dead... (1985)
Maverick (1994)


## Get K nearest of users (self-attempt)

In [60]:
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

print()
print(f'Top 10 users most similiar to user id 100: {algo.get_neighbors(100, k=10)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Top 10 users most similiar to user id 100: [90, 384, 162, 105, 333, 534, 308, 377, 888, 802]


## Compute Precision and Recall

In [40]:
from surprise.model_selection import KFold

In [41]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


# data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.6420643336868154
0.23974209073365432
0.6510969568294418
0.24136661766843756
0.6320961470484281
0.2430249134124122
0.6285031847133764
0.2295466936597985
0.629462707670556
0.2380865422492218
