1. Basic usage

    1.1 Automatic cross-validation

In [1]:
from surprise import Dataset, SVD
from surprise.model_selection import cross_validate

# Fetch the built-in movielens-100k ratings (downloaded on first use if needed).
data = Dataset.load_builtin('ml-100k')

In [2]:
# We'll use the famous SVD algorithm, instantiated here with its default parameters
algo = SVD()

In [3]:
# Evaluate the algorithm with 5-fold cross-validation, reporting RMSE and MAE;
# verbose=True prints the per-fold summary table.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9427  0.9318  0.9348  0.9333  0.9389  0.9363  0.0040  
MAE (testset)     0.7416  0.7307  0.7380  0.7356  0.7401  0.7372  0.0038  
Fit time          4.56    4.42    4.43    4.41    4.46    4.46    0.05    
Test time         0.84    0.14    0.12    0.14    0.11    0.27    0.29    


{'test_rmse': array([0.94266092, 0.93180989, 0.93478787, 0.93326425, 0.93892981]),
 'test_mae': array([0.74163883, 0.730727  , 0.73798741, 0.73560876, 0.74014399]),
 'fit_time': (4.555803537368774,
  4.423142910003662,
  4.431175470352173,
  4.414164781570435,
  4.4560816287994385),
 'test_time': (0.842231273651123,
  0.13862919807434082,
  0.11668729782104492,
  0.13962697982788086,
  0.11170220375061035)}

    1.2 Train-test split and the fit() method

In [4]:
from surprise import Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

# Fetch the built-in movielens-100k ratings (downloaded on first use if needed).
data = Dataset.load_builtin('ml-100k')

In [5]:
# Randomly split the ratings into a trainset and a testset;
# the testset receives 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

In [6]:
# We'll use the famous SVD algorithm (a fresh instance with default parameters)
algo = SVD()

In [37]:
# Train the algorithm on the trainset, then predict ratings for the testset.
# fit() and test() are called here as two separate steps.
algo.fit(trainset)
predictions = algo.test(testset)

In [38]:
# Show only the first few predictions: the testset holds 25% of the ratings,
# so displaying the whole list would flood the notebook output.
predictions[:10]

[Prediction(uid='1', iid='3', r_ui=4.0, est=3.209984476408625, details={'was_impossible': False}),
 Prediction(uid='1', iid='13', r_ui=5.0, est=3.406438476287819, details={'was_impossible': False}),
 Prediction(uid='1', iid='15', r_ui=5.0, est=3.679458750627329, details={'was_impossible': False}),
 Prediction(uid='1', iid='18', r_ui=4.0, est=3.341154880376516, details={'was_impossible': False}),
 Prediction(uid='1', iid='19', r_ui=5.0, est=3.805277301137605, details={'was_impossible': False}),
 Prediction(uid='1', iid='28', r_ui=4.0, est=3.8063264158617574, details={'was_impossible': False}),
 Prediction(uid='1', iid='29', r_ui=1.0, est=3.006277098019457, details={'was_impossible': False}),
 Prediction(uid='1', iid='52', r_ui=4.0, est=3.7595961289572153, details={'was_impossible': False}),
 Prediction(uid='1', iid='59', r_ui=5.0, est=3.9375908267144433, details={'was_impossible': False}),
 Prediction(uid='1', iid='83', r_ui=3.0, est=3.8845647396301564, details={'was_impossible': False}

In [8]:
# Then compute the RMSE of the predictions; rmse() prints the score and also returns it
accuracy.rmse(predictions)

RMSE: 0.9370


0.9370026894700734

In [9]:
# Note that you can train and test an algorithm with the following one-liner
# (the result of fit() is chained directly into test()):
predictions = algo.fit(trainset).test(testset)

In [10]:
# RMSE of the predictions obtained from the re-fitted algorithm
accuracy.rmse(predictions)

RMSE: 0.9367


0.9366534632262384

    1.3 Train on a whole trainset and the predict() method

In [11]:
from surprise import Dataset, KNNBasic

# Fetch the built-in movielens-100k ratings.
data = Dataset.load_builtin('ml-100k')

In [12]:
# Retrieve the trainset: it is built from the full dataset (no train/test split is made here)
trainset = data.build_full_trainset()

In [13]:
# Build a KNNBasic algorithm with its default options, and train it on the full trainset
algo = KNNBasic()
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2b4928e01d0>

In [14]:
# Ratings can now be estimated directly with the predict() method.
# Say we are interested in user 196 and item 302 (make sure they're in the
# trainset!), and we know that the true rating is r_ui = 4.
# Raw ids are written as in the ratings file, i.e. they are **strings**!
uid = '196'  # raw user id
iid = '302'  # raw item id

# Ask for the prediction of this specific (user, item) pair and print it.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


2. Use a custom dataset

    Surprise has a set of builtin datasets, but you can of course use a custom dataset. Loading a rating dataset can be done either from a file (e.g. a csv file), or from a pandas dataframe. Either way, you will need to define a Reader object for Surprise to be able to parse the file or the dataframe.

    2.1 Load a rating dataset from a csv file

In [15]:
# To load a dataset from a file (e.g. a csv file), you will need the load_from_file() method:
import os
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Path to the dataset file. The path is built from the current user's home
# directory (where the built-in ml-100k data was downloaded to in the cells
# above) instead of hard-coding one machine's absolute path, so the notebook
# also runs for other users and on other operating systems.
file_path = os.path.join(
    os.path.expanduser('~'), '.surprise_data', 'ml-100k', 'ml-100k', 'u.data'
)

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep='\t')

data = Dataset.load_from_file(file_path, reader=reader)

In [16]:
# The custom-loaded dataset can be used like any other one, e.g. with cross_validate
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9407  0.9433  0.9383  0.9439  0.9525  0.9437  0.0048  
MAE (testset)     0.7443  0.7468  0.7437  0.7515  0.7552  0.7483  0.0044  
Fit time          0.17    0.19    0.19    0.18    0.19    0.19    0.01    
Test time         0.08    0.10    0.12    0.12    0.12    0.11    0.01    


{'test_rmse': array([0.94070162, 0.94325116, 0.93827524, 0.94393216, 0.9524632 ]),
 'test_mae': array([0.74434879, 0.74678171, 0.74374399, 0.7515291 , 0.75522879]),
 'fit_time': (0.17253971099853516,
  0.19447922706604004,
  0.18949365615844727,
  0.18453431129455566,
  0.1874995231628418),
 'test_time': (0.08376169204711914,
  0.0967409610748291,
  0.11668705940246582,
  0.11668705940246582,
  0.11768412590026855)}

    2.2 Load a rating dataset from a pandas DataFrame

In [17]:
# To load a dataset from a pandas dataframe, you will need the load_from_df() method. You will also need a Reader object,
# but only the rating_scale parameter must be specified. The dataframe must have three columns, 
# corresponding to the user (raw) ids, the item (raw) ids, and the ratings in this order. 
# Each row thus corresponds to a given rating. This is not restrictive as you can reorder the columns of your dataframe easily.
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Build the dataframe from a dict of columns. Column names are irrelevant.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1],
}
df = pd.DataFrame(data=ratings_dict)

In [18]:
# Display the small ratings dataframe (5 rows)
df

Unnamed: 0,itemID,userID,rating
0,1,9,3
1,1,32,2
2,1,2,4
3,2,45,3
4,2,user_foo,1


In [19]:
# A reader is still needed, but only the rating_scale parameter is required.
reader = Reader(rating_scale=(1, 5))

In [20]:
# Select the columns as user id, item id and rating (in that order),
# which is the layout load_from_df() expects.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [21]:
# The dataframe-backed dataset can be used as we please, e.g. with cross_validate
# (only 2 folds here).
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([1.57547085, 1.59500372]),
 'test_mae': array([1.40726814, 1.50640634]),
 'fit_time': (0.0, 0.0),
 'test_time': (0.0, 0.0)}

3. Use cross-validation 

    3.1 Use cross-validation iterators

In [22]:
# The cross_validate() function does all the hard work of cross-validation for us.
# For finer control, we can instead instantiate a cross-validation iterator,
# then make predictions over each split with the iterator's split() method
# and the algorithm's test() method.
# This example uses a classical K-fold cross-validation procedure with 3 splits.
from surprise import Dataset, SVD, accuracy
from surprise.model_selection import KFold

# Fetch the built-in movielens-100k ratings.
data = Dataset.load_builtin('ml-100k')

# The cross-validation iterator: 3 folds.
kf = KFold(n_splits=3)

algo = SVD()

In [23]:
for trainset, testset in kf.split(data):
    # Fit on this fold's trainset, then predict its testset.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print the Root Mean Squared Error of the fold.
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9424
RMSE: 0.9470
RMSE: 0.9468


    3.2 Use predefined cross-validation iterators

In [24]:
# A special case of cross-validation is when the folds are already predefined by some files.
# For instance, the movielens-100K dataset already provides 5 train and test files (u1.base, u1.test … u5.base, u5.test).
# Surprise can handle this case by using a surprise.model_selection.split.PredefinedKFold object:
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Path to the dataset folder. It is built from the current user's home
# directory rather than a hard-coded, machine-specific absolute path, so the
# notebook also runs for other users and on other operating systems.
files_dir = os.path.join(
    os.path.expanduser('~'), '.surprise_data', 'ml-100k', 'ml-100k'
)

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
# os.path.join inserts the separator, so files_dir needs no trailing slash.
train_file = os.path.join(files_dir, 'u%d.base')
test_file = os.path.join(files_dir, 'u%d.test')
folds_files = [(train_file % i, test_file % i) for i in range(1, 6)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

In [25]:
# Fresh SVD instance (default parameters) for the predefined-folds run
algo = SVD()

In [26]:
for trainset, testset in pkf.split(data):
    # Fit on this predefined fold's trainset, then predict its testset.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print the Root Mean Squared Error of the fold.
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9519
RMSE: 0.9401
RMSE: 0.9312
RMSE: 0.9320
RMSE: 0.9319


    Of course, nothing prevents you from only loading a single file for training and a single file for testing. However, the folds_files parameter still needs to be a list.

4. Tune(调整) algorithm parameters with GridSearchCV

    The cross_validate() function reports accuracy metric over a cross-validation procedure for a given set of parameters. If you want to know which parameter combination yields the best results, the GridSearchCV class comes to the rescue. Given a dict of parameters, this class exhaustively tries all the combinations of parameters and reports the best parameters for any accuracy measure (averaged over the different splits). 

In [27]:
# Example: try different values for the n_epochs, lr_all and reg_all
# parameters of the SVD algorithm.
from surprise import Dataset, SVD
from surprise.model_selection import GridSearchCV

# Use movielens-100k
data = Dataset.load_builtin('ml-100k')

# Two candidate values per parameter, i.e. 2 * 2 * 2 = 8 combinations.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
# Note that the algorithm class itself (not an instance) is passed in.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [28]:
# Run the grid search on the dataset
gs.fit(data)

In [29]:
# best RMSE score
print(gs.best_score['rmse'])

0.9641831047333728


In [30]:
# Combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


    We are here evaluating the average RMSE and MAE over a 3-fold cross-validation procedure, but any cross-validation iterator can be used.

    Once fit() has been called, the best_estimator attribute gives us an algorithm instance with the optimal set of parameters, which can be used how we please:

In [31]:
# We can now use the algorithm that yields the best rmse:
# take the estimator stored under the 'rmse' key and train it on the full trainset.
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b493d48eb8>

Note

    Dictionary parameters such as bsl_options and sim_options require particular treatment. See usage example below:

In [32]:
# Grid for an algorithm that takes similarity options: the nested sim_options
# dict lists candidate values for each of its own keys.
# The key is spelled 'min_support'; the misspelled 'min_suport' would not be
# recognised as the min_support option.
param_grid = {
    'k': [10, 20],
    'sim_options': {
        'name': ['msd', 'cosine'],
        'min_support': [1, 5],
        'user_based': [False],
    },
}

In [33]:
# Both kinds of dictionary parameters can naturally be combined,
# for example for the KNNBaseline algorithm:
param_grid = {
    'bsl_options': {
        'method': ['als', 'sgd'],
        'reg': [1, 2],
    },
    'k': [2, 3],
    'sim_options': {
        'name': ['msd', 'cosine'],
        'min_support': [1, 5],
        'user_based': [False],
    },
}

In [34]:
# For further analysis, the cv_results attribute has all the needed information
# and can be imported into a pandas dataframe.
# NOTE(review): relies on `pd` having been imported in an earlier cell — confirm it is in scope on a fresh run.
results_df = pd.DataFrame.from_dict(gs.cv_results)

In [35]:
# Inspect the raw cv_results dict: one array per metric and split, one entry per parameter combination
gs.cv_results

{'split0_test_rmse': array([0.99753609, 1.00404734, 0.97432612, 0.983051  , 0.97874205,
        0.98657131, 0.96444155, 0.97410058]),
 'split1_test_rmse': array([0.99205488, 0.99828238, 0.96949079, 0.97820284, 0.9734913 ,
        0.98169526, 0.96005226, 0.96966746]),
 'split2_test_rmse': array([1.00162802, 1.00789653, 0.97798728, 0.98689593, 0.98221953,
        0.99040454, 0.9680555 , 0.97818143]),
 'mean_test_rmse': array([0.997073  , 1.00340875, 0.97393473, 0.98271659, 0.97815096,
        0.9862237 , 0.9641831 , 0.97398316]),
 'std_test_rmse': array([0.00392191, 0.00395085, 0.0034797 , 0.00355681, 0.00358771,
        0.00356404, 0.00327242, 0.00347681]),
 'rank_test_rmse': array([7, 8, 2, 5, 4, 6, 1, 3], dtype=int64),
 'split0_test_mae': array([0.80650722, 0.81557525, 0.78270655, 0.79352316, 0.78689419,
        0.797029  , 0.77334321, 0.78494728]),
 'split1_test_mae': array([0.80076225, 0.80943114, 0.77767916, 0.78853261, 0.78170445,
        0.7919644 , 0.76909475, 0.78054282]),
 'sp

In [36]:
# As you can see, each list has as many entries as there are parameter combinations.
# It corresponds to the following table:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.997536,0.992055,1.001628,0.997073,0.003922,7,0.806507,0.800762,0.80982,0.805696,0.003742,7,0.944474,0.002952,0.229036,0.025189,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,1.004047,0.998282,1.007897,1.003409,0.003951,8,0.815575,0.809431,0.818813,0.814606,0.003891,8,0.922885,0.003303,0.225377,0.023283,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.974326,0.969491,0.977987,0.973935,0.00348,2,0.782707,0.777679,0.78498,0.781789,0.003051,2,0.931518,0.003562,0.243026,0.002344,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.983051,0.978203,0.986896,0.982717,0.003557,5,0.793523,0.788533,0.796243,0.792766,0.003193,5,0.921555,0.001402,0.227049,0.022527,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.978742,0.973491,0.98222,0.978151,0.003588,4,0.786894,0.781704,0.789268,0.785956,0.003158,4,1.868356,0.026802,0.221735,0.022801,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4
5,0.986571,0.981695,0.990405,0.986224,0.003564,6,0.797029,0.791964,0.799891,0.796295,0.003277,6,1.850717,0.003853,0.221407,0.023303,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}",10,0.002,0.6
6,0.964442,0.960052,0.968056,0.964183,0.003272,1,0.773343,0.769095,0.775161,0.772533,0.002542,1,1.855684,0.00863,0.225065,0.020956,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",10,0.005,0.4
7,0.974101,0.969667,0.978181,0.973983,0.003477,3,0.784947,0.780543,0.787485,0.784325,0.002868,3,1.860696,0.012976,0.223383,0.024038,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}",10,0.005,0.6


5. Command line usage

Surprise can also be used from the command line, for example:

    surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}" -load-builtin ml-100k -n-folds 3

See detailed usage by running:

    surprise -h