In [35]:
import surprise

In [36]:
import numpy as np
import pandas as pd
import urllib
import io
import zipfile

# download zip
tmpfile = zip

# tmpfile = zipfile.ZipFile('data/')

dataset = pd.read_csv(open('data/ratings.csv'))
dataset.head()
dataset = dataset.drop('timestamp', axis=1)
dataset.columns = ['uid','iid','rating']
dataset

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [37]:
lower_rating = dataset['rating'].min()
upper_rating = dataset['rating'].max()
print(f"Review Range: {lower_rating} to {upper_rating}")

Review Range: 0.5 to 5.0


In [38]:
reader = surprise.Reader(rating_scale= (0.5,5.))
data = surprise.Dataset.load_from_df(dataset,reader)


In [39]:
alg = surprise.SVDpp()
output = alg.fit(data.build_full_trainset())

# the uids and iids should be set as strings
pred = alg.predict(uid='50', iid='52')
score = pred.est
print(score)

3.501556983616962


In [40]:
# get a list of all movie ids
iids = dataset['iid'].unique()

# get a list of iids that uid 50 has rated
iids50 = dataset.loc[dataset['uid']==50,'iid']

# remove the iids that uid 50 has rated from the list of all movie ids
iids_to_pred = np.setdiff1d(iids,iids50)


In [41]:
testset = [[50, iid, 4.] for iid in iids_to_pred]
predictions = alg.test(testset)
predictions[0]

Prediction(uid=50, iid=2, r_ui=4.0, est=2.5636480943615227, details={'was_impossible': False})

In [42]:
pred_ratings = np.array([pred.est for pred in predictions])

# find the index of the maximum predicted rating
i_max = pred_ratings.argmax()

iid = iids_to_pred[i_max]
print(f'Top item for user 50 has iid {iid} with predicted rating {pred_ratings[i_max]}')

Top item for user 50 has iid 1221 with predicted rating 3.695392402950901


In [43]:
param_grid = {'lr_all': [0.01,.01], 'reg_all':[.1,.5]}

In [44]:
gs = surprise.model_selection.GridSearchCV(surprise.SVDpp, param_grid, measures=['rmse','mae'], cv=3)
gs.fit(data)

print(gs.best_params['rmse'])

{'lr_all': 0.01, 'reg_all': 0.1}


In [45]:
alg = surprise.SVDpp(lr_all=.001) # parameter choices can be added here
output = surprise.model_selection.cross_validate(alg,data,verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8915  0.8924  0.8918  0.8922  0.8981  0.8932  0.0025  
MAE (testset)     0.6901  0.6910  0.6917  0.6912  0.6924  0.6913  0.0008  
Fit time          59.00   58.82   57.94   56.90   56.67   57.87   0.96    
Test time         5.19    5.85    5.09    5.29    5.03    5.29    0.29    
