http://surpriselib.com

# Load data

In [1]:
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')

# Manual pipeline

## Split data in train and test

In [2]:
from surprise.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42)

In [3]:
test

[('907', '143', 5.0),
 ('371', '210', 4.0),
 ('218', '42', 4.0),
 ('829', '170', 4.0),
 ('733', '277', 1.0),
 ('363', '1512', 1.0),
 ('193', '487', 5.0),
 ('808', '313', 5.0),
 ('557', '682', 2.0),
 ('774', '196', 3.0),
 ('638', '118', 3.0),
 ('632', '81', 5.0),
 ('417', '200', 4.0),
 ('580', '471', 3.0),
 ('640', '91', 4.0),
 ('450', '328', 4.0),
 ('596', '13', 2.0),
 ('586', '467', 4.0),
 ('653', '502', 2.0),
 ('378', '517', 3.0),
 ('405', '65', 1.0),
 ('279', '399', 4.0),
 ('327', '293', 3.0),
 ('346', '276', 1.0),
 ('59', '928', 4.0),
 ('514', '22', 4.0),
 ('807', '402', 5.0),
 ('473', '327', 3.0),
 ('342', '324', 1.0),
 ('269', '136', 4.0),
 ('654', '1', 4.0),
 ('250', '28', 4.0),
 ('282', '689', 2.0),
 ('534', '619', 4.0),
 ('194', '481', 3.0),
 ('184', '118', 2.0),
 ('291', '739', 3.0),
 ('293', '31', 2.0),
 ('943', '1028', 2.0),
 ('65', '69', 3.0),
 ('562', '135', 5.0),
 ('466', '62', 3.0),
 ('847', '317', 3.0),
 ('650', '521', 3.0),
 ('656', '326', 1.0),
 ('366', '53', 5.0),
 

In [4]:
train.n_users, train.n_items

(943, 1651)

## Train model

- Use 2 models : user & item based
https://surprise.readthedocs.io/en/stable/knn_inspired.html

In [5]:
from surprise import SVD

model = SVD()

In [6]:
model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x127928779d0>

## Make predictions

In [7]:
predictions = model.test(test)


In [11]:
predictions

[Prediction(uid='907', iid='143', r_ui=5.0, est=5, details={'was_impossible': False}),
 Prediction(uid='371', iid='210', r_ui=4.0, est=4.191122865147293, details={'was_impossible': False}),
 Prediction(uid='218', iid='42', r_ui=4.0, est=3.473083841095421, details={'was_impossible': False}),
 Prediction(uid='829', iid='170', r_ui=4.0, est=3.949191067572329, details={'was_impossible': False}),
 Prediction(uid='733', iid='277', r_ui=1.0, est=2.9758992679839533, details={'was_impossible': False}),
 Prediction(uid='363', iid='1512', r_ui=1.0, est=3.205272170479884, details={'was_impossible': False}),
 Prediction(uid='193', iid='487', r_ui=5.0, est=3.675045161329235, details={'was_impossible': False}),
 Prediction(uid='808', iid='313', r_ui=5.0, est=4.731864584859961, details={'was_impossible': False}),
 Prediction(uid='557', iid='682', r_ui=2.0, est=3.3218953362227355, details={'was_impossible': False}),
 Prediction(uid='774', iid='196', r_ui=3.0, est=2.4977535694803823, details={'was_impos

## Evaluation

In [8]:
from surprise import accuracy

accuracy.rmse(predictions=predictions)

RMSE: 0.9350


0.9350316172478067

In [9]:
accuracy.mae(predictions=predictions)

MAE:  0.7377


0.7377047556566634

# Cross validation

In [10]:
from surprise.model_selection import cross_validate

cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9401  0.9307  0.9316  0.9381  0.9355  0.9352  0.0036  
MAE (testset)     0.7414  0.7320  0.7364  0.7394  0.7375  0.7373  0.0031  
Fit time          3.65    3.63    3.59    3.60    3.59    3.61    0.03    
Test time         0.14    0.10    0.13    0.13    0.10    0.12    0.02    


{'test_rmse': array([0.94013013, 0.9306818 , 0.93155582, 0.93810254, 0.93554429]),
 'test_mae': array([0.7413599 , 0.7320472 , 0.73638252, 0.73939694, 0.73747982]),
 'fit_time': (3.6541028022766113,
  3.6313018798828125,
  3.5908095836639404,
  3.5965473651885986,
  3.5859591960906982),
 'test_time': (0.13564324378967285,
  0.09873199462890625,
  0.1326453685760498,
  0.13463926315307617,
  0.09778022766113281)}