In [1]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import numpy as np

In [2]:
# Location of the cleaned ratings CSV (the variant without standardization)
data_dir = "../data/processed/ratings_cleaned.csv"
# The file is decoded as latin-1
ratings = pd.read_csv(filepath_or_buffer=data_dir, encoding="latin-1")

In [11]:
# Inspect the column dtypes and non-null counts of the loaded ratings frame
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433671 entries, 0 to 433670
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   userID      433671 non-null  int64  
 1   ISBN        433671 non-null  object 
 2   bookRating  433671 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.9+ MB


In [3]:
# Rating bounds handed to Surprise.
# NOTE(review): the error metrics reported further down are around 0.2-0.3,
# which looks small for a 0-10 scale — confirm the actual range of bookRating.
reader = Reader(rating_scale=(0, 10))

# Wrap the user, item and rating columns (selected in that order) as a Surprise dataset
data = Dataset.load_from_df(ratings.loc[:, ['userID', 'ISBN', 'bookRating']], reader)

To get a baseline, we predict the average training rating for every item in the test set and then compute the RMSE (and MAE) of those predictions.

In [4]:
# Split the dataset into a training set (75%) and a test set (25%).
# A fixed random_state keeps the split, and every metric computed from it,
# reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Compute the average rating over the training set only
avg_rating = np.mean([rating for (_, _, rating) in trainset.all_ratings()])

# Predict this average rating for every (user, item, rating) tuple in the test set
predictions = [avg_rating for _ in testset]
actual_ratings = [rating for (_, _, rating) in testset]

# Convert to arrays once and reuse the error vector for both metrics
errors = np.array(predictions) - np.array(actual_ratings)

# Calculate the baseline RMSE and MAE
mse = np.mean(errors**2)
rmse_baseline = np.sqrt(mse)
mae_baseline = np.mean(np.abs(errors))

print(f'Baseline RMSE using average rating: {rmse_baseline}')
print(f'Baseline MAE using average rating: {mae_baseline}')

Baseline RMSE using average rating: 0.25286942344335483
Baseline MAE using average rating: 0.19011309225380876


With the baseline established, we can now train a model and check whether it achieves a lower error than the baseline.

### SVD

In [16]:
# Define the SVD algorithm object.
# SVD is initialised and trained stochastically, so a fixed random_state
# makes the fitted model (and its scores) reproducible.
svd = SVD(random_state=42)

# Train the algorithm on the training set
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b0c7cdb6a0>

In [17]:
# Score the held-out test set with the fitted SVD model
predictions = svd.test(testset)

# Report both error metrics (RMSE and MAE); each call prints its value and returns it
rmse = accuracy.rmse(predictions, verbose=True)
mae = accuracy.mae(predictions, verbose=True)

RMSE: 0.2372
MAE:  0.1694


The RMSE improved only slightly over the baseline. Let's use cross-validation to check that this result is stable across different splits.

In [18]:
from surprise.model_selection import cross_validate

# Fresh, untrained SVD instance used only for cross-validation
svd_cv = SVD()

# 5-fold cross-validation over the full dataset, reporting RMSE and MAE per fold
cross_validate(
    algo=svd_cv,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2376  0.2400  0.2344  0.2376  0.2351  0.2369  0.0020  
MAE (testset)     0.1693  0.1706  0.1682  0.1698  0.1683  0.1692  0.0009  
Fit time          21.71   21.02   22.05   21.08   21.53   21.48   0.39    
Test time         0.58    0.58    0.56    1.01    1.10    0.77    0.24    


{'test_rmse': array([0.23759023, 0.23997848, 0.23440952, 0.23762851, 0.23511512]),
 'test_mae': array([0.16933493, 0.17055345, 0.16820098, 0.16979305, 0.1682825 ]),
 'fit_time': (21.706475734710693,
  21.020883560180664,
  22.048755407333374,
  21.07952117919922,
  21.533440828323364),
 'test_time': (0.5811326503753662,
  0.577904224395752,
  0.5647735595703125,
  1.005915880203247,
  1.1047651767730713)}

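The small standard deviation across folds shows that the results are consistent from split to split. A grid search over the hyperparameters is a natural next step; first, let's compare a few other algorithms.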

### NMF

In [14]:
from surprise import NMF

# Create and train the NMF model.
# n_factors is the number of latent factors (a tunable hyperparameter);
# random_state fixes the random initialisation so the fit is reproducible.
nmf_model = NMF(n_factors=150, random_state=42)
nmf_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x272736f0b20>

In [15]:
# Predict ratings for the test set with the fitted NMF model
predictions = nmf_model.test(testset)

# Calculate RMSE and MAE.
# verbose=False stops the accuracy helpers from printing on their own, so
# each metric appears once (via the labelled prints below), not twice.
rmse_nmf = accuracy.rmse(predictions, verbose=False)
mae_nmf = accuracy.mae(predictions, verbose=False)

print(f'NMF RMSE: {rmse_nmf}')
print(f'NMF MAE: {mae_nmf}')

RMSE: 0.2662
MAE:  0.1937
NMF RMSE: 0.2662168171469821
NMF MAE: 0.1936650650670354


### SVD++

In [5]:
from surprise import SVDpp

# Create and train the SVD++ model.
# random_state fixes the random initialisation so the fit is reproducible.
svdpp_model = SVDpp(random_state=42)
svdpp_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x27a176b3d90>

In [6]:
# Predict ratings for the test set with the fitted SVD++ model
predictions = svdpp_model.test(testset)

# Calculate RMSE and MAE.
# verbose=False stops the accuracy helpers from printing on their own, so
# each metric appears once (via the labelled prints below), not twice.
rmse_svdpp = accuracy.rmse(predictions, verbose=False)
mae_svdpp = accuracy.mae(predictions, verbose=False)

print(f'SVD++ RMSE: {rmse_svdpp}')
print(f'SVD++ MAE: {mae_svdpp}')


RMSE: 0.2303
MAE:  0.1628
SVD++ RMSE: 0.23027854060933062
SVD++ MAE: 0.1627650180544325


### Single prediction with SVD++

In [17]:
# Raw user ID: an int, matching the int64 userID column of the ratings frame
uid = 276780
# Raw item ID: the ISBN as a string (the ISBN column is text; note the leading zero)
iid = '0806917695'

# Get the SVD++ estimate for this (user, item) pair; verbose=True prints the prediction
pred = svdpp_model.predict(uid, iid, verbose=True)


user: 276780     item: 0806917695 r_ui = None   est = 2.13   {'was_impossible': False}
