In [1]:
# import libraries

import pandas as pd
import numpy as np

from surprise import Dataset, Reader, SVD, NormalPredictor
from surprise.model_selection import cross_validate, GridSearchCV, PredefinedKFold
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

from sklearn.model_selection import train_test_split

In [2]:
# import ratings csv as df

ratings = pd.read_csv('data/ratings.csv')

#drop unnecessary timestamp

ratings.drop(['timestamp'],inplace=True,axis=1)

ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
#convert df into surprise dataset

rdr = Reader(rating_scale=(1,5)) # optional paramter: 'user item rating'

data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=rdr)


#split into train, test sets

y = pd.DataFrame(ratings['rating'])
X = ratings.drop('rating',axis=1)
print(X.head(),y.head())
print(type(X),type(y))
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25)

train = pd.concat([X_train,y_train],axis=1)

test = pd.concat([X_test,y_test],axis=1)



   userId  movieId
0       1        1
1       1        3
2       1        6
3       1       47
4       1       50    rating
0     4.0
1     4.0
2     4.0
3     5.0
4     5.0
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [4]:
#load trainset, testset for gridsearch

trainset= Dataset.load_from_df(train[['userId','movieId','rating']], reader=rdr)

testset = Dataset.load_from_df(test[['userId','movieId','rating']], reader=rdr)

type(trainset)
#train_set = data.build_full_trainset()



surprise.dataset.DatasetAutoFolds

In [5]:
#instantiate gridsearch for best SVD model
param_grid = {'n_factors':[50,100,150,200], 'n_epochs':[30,],'lr_all':[.05,.1,.2],'reg_all':[.02,.05,.01]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'],cv=3,n_jobs=-1)


#fit gridsearch to train
gs.fit(trainset)

params = gs.best_params['rmse']

params

{'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.05, 'reg_all': 0.05}

In [6]:
gs.best_score['rmse']

0.879502777627348

In [10]:
from surprise import accuracy
svd_gs = SVD(n_factors= 200, n_epochs=30, lr_all= 0.05, reg_all= 0.05)

data = trainset.build_full_trainset()

test_data = testset.build_full_trainset()


test_set = test_data.build_anti_testset()
svd_gs.fit(data)
predictions = svd_gs.test(test_set)

print(accuracy.rmse(predictions))

RMSE: 0.5584
0.5583951565758049


In [None]:
print(cross_validate(svd_gs,trainset,measures=['rmse'],n_jobs=-1))

