In [1]:
!pip install numpy
!pip install scipy
!pip install scikit-surprise

Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import numpy as np
import pandas as pd
import surprise
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV

import os
import random

# settings
from IPython.display import display
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

## Fine Tuning Probabalistic Matrix Factorization with Suprise


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing the dataset

In [4]:
users = pd.read_excel('/content/drive/MyDrive/Work/Datasets/BX-Users.xlsx')
books = pd.read_excel('/content/drive/MyDrive/Work/Datasets/BX-Books.xlsx')
ratings = pd.read_excel('/content/drive/MyDrive/Work/Datasets/BX-Book-Ratings.xlsx')

In [5]:
display(users.head(2))
display(books.head(2))
display(ratings.head(2))

Unnamed: 0,user_id,username,password,firstname,lastname,age
0,1,1username,1password,1firstname,1lastname,20
1,2,2username,2password,2firstname,2lastname,54


Unnamed: 0,isbn,title,author,publicationYear,publisher,image
0,B0002K6K8O,The Underground City,Jules Verne,2022,Jules Verne,http://images.amazon.com/images/P/B0002K6K8O.0...
1,B0002JV9PY,The Blockade Runners,Jules Verne,2022,Jules Verne,http://images.amazon.com/images/P/B0002JV9PY.0...


Unnamed: 0,id,user_id,isbn,rating
0,1149780,276723,5162443314,8
1,1149779,276721,590442449,10


In [6]:
# dropping duplicates
ratings.drop_duplicates(inplace=True)

# display unique records in ratings table
print('we have',ratings.shape[0], 'ratings')
print('the number of unique users we have is:', len(ratings.user_id.unique()))
print('the number of unique books we have is:', len(ratings.isbn.unique()))
print("The median user rated %d books."%ratings.user_id.value_counts().median())
print("the min rating is: %d"%ratings.rating.min(), 'The max rating is: %d'%ratings.rating.max())
ratings.head()

we have 1149777 ratings
the number of unique users we have is: 105283
the number of unique books we have is: 340025
The median user rated 1 books.
the min rating is: 0 The max rating is: 10


Unnamed: 0,id,user_id,isbn,rating
0,1149780,276723,5162443314,8
1,1149779,276721,590442449,10
2,1149778,276709,515107662,10
3,1149777,276706,679447156,0
4,1149776,276704,1563526298,9


In [7]:
# Assuming 'ratings' is your DataFrame containing user IDs, item IDs, and ratings
# Example: ratings = pd.DataFrame({'userID': [...], 'itemID': [...], 'rating': [...]})

# Define a Reader and the rating_scale
reader = Reader(rating_scale=(1, 10))  # Adjust rating_scale according to your dataset

# Load the dataset from the DataFrame
data = Dataset.load_from_df(ratings[['user_id', 'isbn', 'rating']], reader)

# Define the parameter grid for SVD
param_grid = {
    'n_epochs': [5, 10, 20],  # Number of epochs
    'lr_all': [0.002, 0.005, 0.01],  # Learning rate
    'n_factors': [50, 100, 150],  # Number of factors
    'reg_all': [0.02, 0.05, 0.1]  # Regularization term
}

# Use GridSearchCV to find the best parameters for the SVD algorithm
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

# Fit the GridSearchCV
gs.fit(data)

# Best RMSE score
print('Best RMSE:', gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print('Best parameters:', gs.best_params['rmse'])

# Use the best parameters to create a new SVD model
best_svd = SVD(**gs.best_params['rmse'])

# You can now train this model on the full dataset and use it for predictions
# For example, using cross-validation to evaluate its performance:
cross_validate(best_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Best RMSE: 3.43210825857518
Best parameters: {'n_epochs': 5, 'lr_all': 0.01, 'n_factors': 50, 'reg_all': 0.1}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4355  3.4253  3.4181  3.4169  3.4211  3.4234  0.0067  
MAE (testset)     2.9230  2.9138  2.9071  2.9063  2.9098  2.9120  0.0061  
Fit time          6.70    8.28    7.31    7.52    8.12    7.58    0.57    
Test time         4.33    2.82    2.76    2.71    2.59    3.04    0.65    


{'test_rmse': array([3.43554479, 3.42526173, 3.41812075, 3.41689224, 3.42110814]),
 'test_mae': array([2.92304599, 2.9138369 , 2.90707058, 2.90628035, 2.90981015]),
 'fit_time': (6.6955649852752686,
  8.275355577468872,
  7.311617851257324,
  7.523254871368408,
  8.115606546401978),
 'test_time': (4.332972526550293,
  2.8162922859191895,
  2.7631022930145264,
  2.7102596759796143,
  2.5945777893066406)}

In [8]:
# Assuming 'ratings' DataFrame contains columns ['user_id', 'isbn', 'rating']
# Select a random user
random_user = random.choice(ratings['user_id'].unique())

# Select a random book
random_book = random.choice(ratings['isbn'].unique())

print(f"Random User ID: {random_user}")
print(f"Random Book ID: {random_book}")


Random User ID: 90626
Random Book ID: 8423644723


In [9]:
# Assuming best_svd is your trained SVD model with the best parameters
pred = best_svd.predict(uid=str(random_user), iid=str(random_book))

# Print the predicted rating
print(f"Predicted rating for user {pred.uid} on book {pred.iid} is: {pred.est}")


Predicted rating for user 90626 on book 8423644723 is: 2.867680920873821


In [10]:
# Function to get recommendations for a user
def get_recommendations(user_id, model, n=10):
    # Convert the user's ratings to an anti-testset (books the user hasn't rated)
    testset = [[user_id, isbn, 0] for isbn in ratings['isbn'].unique() if isbn not in ratings[ratings['user_id'] == user_id]['isbn']]
    predictions = model.test(testset)

    # Get the top N recommendations
    top_n_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]

    recommended_books = [(pred.iid, pred.est) for pred in top_n_predictions]
    return recommended_books

# Example usage
user_id = '203092'
n = 5
recommended_books = get_recommendations(user_id, best_svd, n=n)
print(f"Top {n} recommended books for user {user_id}:")
for isbn, rating_pred in recommended_books:
    print(f"ISBN: {isbn}, Predicted Rating: {rating_pred}")
    # print(f"{books[books.isbn == isbn].title}")

Top 5 recommended books for user 203092:
ISBN: 1844262553, Predicted Rating: 7.227283768176363
ISBN: 156528207, Predicted Rating: 5.968324407215775
ISBN: 894808249, Predicted Rating: 5.757496431723364
ISBN: 439136350, Predicted Rating: 5.630034416062595
ISBN: 439139597, Predicted Rating: 5.601467384054841


In [14]:
# Dump the model
import pickle
path = "/content/drive/MyDrive/Work/Datasets/"

# saving the model
file_path = os.path.join(path, "model.pkl")
with open(file_path, 'wb') as file:
  pickle.dump(best_svd, file)

print(f"Model saved to {file_path}")

Model saved to /content/drive/MyDrive/Work/Datasets/model.pkl


***Important Notes***
- <b>Model Compatibility</b>: When you load the model in a different environment or after updating libraries, ensure the environment is compatible with the one used for training the model. This includes having the same versions of Surprise, scikit-learn, numpy, and other relevant libraries.
- <b>Security</b>: Be cautious when loading pickle files from untrusted sources, as they can execute arbitrary code.

In [15]:
# Load the model
with open(file_path, 'rb') as file:
  model = pickle.load(file)

In [16]:
# Function to get recommendations for a user
def get_recommendations(user_id, model, n=10):
    # Convert the user's ratings to an anti-testset (books the user hasn't rated)
    testset = [[user_id, isbn, 0] for isbn in ratings['isbn'].unique() if isbn not in ratings[ratings['user_id'] == user_id]['isbn']]
    predictions = model.test(testset)

    # Get the top N recommendations
    top_n_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]

    recommended_books = [(pred.iid, pred.est) for pred in top_n_predictions]
    return recommended_books

# Example usage
user_id = '203092'
n = 5
recommended_books = get_recommendations(user_id, model, n=n)
print(f"Top {n} recommended books for user {user_id}:")
for isbn, rating_pred in recommended_books:
    print(f"ISBN: {isbn}, Predicted Rating: {rating_pred}")
    # print(f"{books[books.isbn == isbn].title}")

Top 5 recommended books for user 203092:
ISBN: 1844262553, Predicted Rating: 7.227283768176363
ISBN: 156528207, Predicted Rating: 5.968324407215775
ISBN: 894808249, Predicted Rating: 5.757496431723364
ISBN: 439136350, Predicted Rating: 5.630034416062595
ISBN: 439139597, Predicted Rating: 5.601467384054841
