# Recommender System library Using Surprise 
Surprise is an easy-to-use Python scikit for recommender systems.

We are going use this library to ingest movie rating data and predict user's likelihood of preferring certain movie titles than the others.


In [2]:
import pandas as pd

# from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD

from collections import defaultdict
from surprise.model_selection import cross_validate

In [3]:
# df = pd.read_csv('../input/ratings.csv')
df = pd.read_csv('../input/ratings_small.csv')
# df['rating'] = df['rating'].astype('int')

#reader = Reader(rating_scale=(1, 5))
reader = Reader()

algo = SVD()

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [4]:
# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8953  0.8995  0.8980  0.8934  0.8925  0.8957  0.0027  
MAE (testset)     0.6906  0.6926  0.6906  0.6874  0.6880  0.6898  0.0019  
Fit time          3.60    3.61    3.62    3.64    3.62    3.62    0.01    
Test time         0.10    0.10    0.10    0.10    0.10    0.10    0.00    


{'test_rmse': array([0.89526201, 0.8994588 , 0.89802362, 0.89341195, 0.89246095]),
 'test_mae': array([0.69061343, 0.69259067, 0.69060777, 0.68737038, 0.68797111]),
 'fit_time': (3.5985260009765625,
  3.609151840209961,
  3.6246118545532227,
  3.6354658603668213,
  3.623538017272949),
 'test_time': (0.10282611846923828,
  0.0993502140045166,
  0.09896326065063477,
  0.0982980728149414,
  0.10041594505310059)}

In [5]:
from surprise.model_selection import KFold
from surprise import accuracy

kf = KFold(n_splits=3)

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8940
RMSE: 0.9044
RMSE: 0.9075


In [6]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [7]:
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])


 357]
612 [4963, 2918, 69122, 1682, 94959, 2502, 47099, 364, 80463, 551]
320 [318, 296, 4226, 527, 356, 2762, 30749, 64614, 104841, 2010]
287 [1228, 1283, 3114, 364, 2571, 1196, 76093, 911, 79132, 98491]
403 [1219, 1, 1387, 1215, 1349, 1258, 32, 1255, 253, 1321]
52 [8368, 6539, 5791, 6942, 40815, 5816, 5812, 5878, 866, 39183]
601 [4226, 44191, 58559, 48780, 7361, 7022, 68358, 493, 6953, 2567]
420 [541, 1230, 1080, 904, 913, 908, 1220, 898, 1278, 1196]
510 [2858, 3897, 1247, 527, 940, 1704, 281, 541, 3034, 3114]
245 [1252, 2268, 2918, 2762, 3897, 1704, 1625, 4993, 4011, 1089]
167 [1270, 1240, 1210, 2454, 480, 1097, 2081, 3101, 3157, 2699]
120 [296, 608, 1240, 4963, 1036, 733, 3448, 3481, 2716, 1994]
239 [1259, 1060, 2599, 2144, 1288, 1278, 69, 223, 457, 1580]
376 [1212, 1299, 1207, 3201, 1247, 1230, 1267, 1952, 593, 4223]
401 [356, 541, 1198, 3471, 1356, 1, 3751, 1374, 3702, 1214]
139 [318, 541, 1259, 589, 5989, 2571, 1206, 3147, 2324, 480]
382 [3897, 7361, 4886, 88125, 318, 86882, 4963

In [11]:
top_n[515]

[(59315, 4.72424136179409),
 (2739, 4.628268590639445),
 (80463, 4.445845414524306),
 (53125, 4.057227891577574),
 (1566, 4.056761592009366),
 (5015, 4.0067541832172795),
 (3275, 3.9958071307829655),
 (60, 3.454916533076711),
 (519, 3.254663061938639)]