# Book Reading Propensity: Collaborative Filtering Approach

## Matrix Factorization & "k Nearest Neighbors"

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import os

from surprise import Reader, Dataset, SVD, BaselineOnly, NMF, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt

%matplotlib inline

## Read the Data

In [None]:
train_val_path = './data/goodreads_2016_train_val.csv'

In [2]:
df_train_val = pd.read_csv(train_val_path)
print(f'Data size: {df_train_val.shape[0]: >8,d}.')
df_train_val.head()

Data size: 3,491,950.


Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,10673579,5
1,7504b2aee1ecb5b2872d3da381c6c91e,29069989,2
2,7504b2aee1ecb5b2872d3da381c6c91e,8882815,3
3,7504b2aee1ecb5b2872d3da381c6c91e,6693332,4
4,7504b2aee1ecb5b2872d3da381c6c91e,4588949,4


In [3]:
del df_train_val

## Matrix Factorization with the Scikit Suprise: Model Selection
References:
 * https://surpriselib.com/
 * https://surprise.readthedocs.io/en/stable/index.html

### Building a Dataset

In [4]:
# REF: https://rubikscode.net/2020/04/27/collaborative-filtering-with-machine-learning-and-python/
class DatasetBuilder():
    def __init__(self, data_location):
        reader = Reader(rating_scale=(1, 5))
        self.ratings = pd.read_csv(data_location)
        
        self.dataset = Dataset.load_from_df(self.ratings[['user_id', 'book_id', 'rating']], reader)
        self.train_dataset, self.test_dataset = train_test_split(self.dataset, test_size=0.2)

In [5]:
train = DatasetBuilder(train_val_path)

### Grid Search: SVD

In [33]:
# Full Search Grid:
# Note: In the spirit of fast incremental improvements, 
# we break down the search grid into smaller subgrids.
# The next two cells illustrate one such experiment

svd_param_grid_full = {
    "n_factors": [32, 64, 128],
    "n_epochs": [10, 20, 50, 100],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.1, 0.5]
}

In [55]:
%%time
svd_param_grid = {
    "n_factors": [128],
    "n_epochs": [50, 100],
    "lr_all": [0.005],
    "reg_all": [0.1]
}

gs = GridSearchCV(SVD, svd_param_grid, measures=["rmse", "mae", "fcp"], cv=3, n_jobs=3, joblib_verbose=1)

gs.fit(train.dataset)
print('*'*50)
print('Best RMSE score, with hyperparameters')
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
print('*'*50)
print('Best MAE score, with hyperparameters')
print(gs.best_score["mae"])
print(gs.best_params["mae"])
print('*'*50)
print('Best FCP score, with hyperparameters')
print(gs.best_score["fcp"])
print(gs.best_params["fcp"])
print('*'*50)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


**************************************************
Best RMSE score, with hyperparameters
1.1549409149407357
{'n_factors': 128, 'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.1}
**************************************************
Best MAE score, with hyperparameters
0.8318418249949565
{'n_factors': 128, 'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.1}
**************************************************
Best FCP score, with hyperparameters
0.6187826379436273
{'n_factors': 128, 'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.1}
**************************************************
CPU times: user 50 s, sys: 1min 20s, total: 2min 10s
Wall time: 11min 2s


[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed: 10.3min finished


In [8]:
# gs.cv_results

### Grid Search: NMF

In [33]:
# Full Search Grid:
# Note: In the spirit of fast incremental improvements, 
# we break down the search grid into smaller subgrids.
# The next couple of cells illustrate one such experiment

nmf_param_grid_full = {
    "n_factors": [32, 64, 128],
    "n_epochs": [10, 20, 50, 100],
    "biased": [False, True]
}

In [6]:
%%time
nmf_param_grid = {
    "n_factors": [128],
    "n_epochs": [10],
    "biased": [False, True]
}

gs = GridSearchCV(NMF, nmf_param_grid, measures=["rmse", "mae", "fcp"], cv=3, n_jobs=3, joblib_verbose=1)

gs.fit(train.dataset)
print('*'*50)
print('Best RMSE score, with hyperparameters')
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
print('*'*50)
print('Best MAE score, with hyperparameters')
print(gs.best_score["mae"])
print(gs.best_params["mae"])
print('*'*50)
print('Best FCP score, with hyperparameters')
print(gs.best_score["fcp"])
print(gs.best_params["fcp"])
print('*'*50)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


**************************************************
Best RMSE score, with hyperparameters
1.3348718206119399
{'n_factors': 128, 'n_epochs': 10, 'biased': False}
**************************************************
Best MAE score, with hyperparameters
0.9369168304480354
{'n_factors': 128, 'n_epochs': 10, 'biased': False}
**************************************************
Best FCP score, with hyperparameters
0.3638301603132758
{'n_factors': 128, 'n_epochs': 10, 'biased': True}
**************************************************
CPU times: user 41.7 s, sys: 36.7 s, total: 1min 18s
Wall time: 11min 19s


[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed: 10.8min finished


In [7]:
gs.cv_results

{'split0_test_rmse': array([1.33480871, 1.91178115]),
 'split1_test_rmse': array([1.33507148, 1.92561447]),
 'split2_test_rmse': array([1.33473527, 1.94094462]),
 'mean_test_rmse': array([1.33487182, 1.92611342]),
 'std_test_rmse': array([0.00014433, 0.01191116]),
 'rank_test_rmse': array([1, 2]),
 'split0_test_mae': array([0.93674739, 1.45747351]),
 'split1_test_mae': array([0.93707775, 1.47037776]),
 'split2_test_mae': array([0.93692536, 1.49048659]),
 'mean_test_mae': array([0.93691683, 1.47277929]),
 'std_test_mae': array([0.000135  , 0.01358409]),
 'rank_test_mae': array([1, 2]),
 'split0_test_fcp': array([0.33495661, 0.37628206]),
 'split1_test_fcp': array([0.33938084, 0.35712743]),
 'split2_test_fcp': array([0.32837866, 0.358081  ]),
 'mean_test_fcp': array([0.33423871, 0.36383016]),
 'std_test_fcp': array([0.00452022, 0.00881342]),
 'rank_test_fcp': array([2, 1]),
 'mean_fit_time': array([189.97589628, 252.0327901 ]),
 'std_fit_time': array([ 5.72445159, 20.29270348]),
 'mean_t

### Evaluate & Compare the Selected MF Algorithms

In [9]:
class TrainEvalAlgos():
    def __init__(self, dataset):
        self.algos = []
        self.dataset = dataset
        
    def addAlgorithm(self, algo):
        self.algos.append(algo)
        
    def train_and_evaluate(self):
        for algo in self.algos:
            algo.fit(self.dataset.train_dataset)
            predictions = algo.test(self.dataset.test_dataset)
            rmse = accuracy.rmse(predictions)
            mae = accuracy.mae(predictions)
            fcp = accuracy.fcp(predictions)
            print('-----------')
            print(f'{algo.__class__.__name__}') 
            print('-----------')
            print(f'      Metrics - RMSE: {rmse}, MAE: {mae}, FCP: {fcp}')
            print('-----------')

In [None]:
%%time
train_eval_run = TrainEvalAlgos(train)

n_factors = 128
print(f'n_factors = {n_factors}')
print('*'*50)
svd = SVD(**{'n_factors': n_factors, 'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.1})
train_eval_run.addAlgorithm(svd)

nmf = NMF(**{'n_factors': n_factors, 'n_epochs': 100, 'biased': False})
train_eval_run.addAlgorithm(nmf)

bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
als = BaselineOnly(bsl_options=bsl_options)
train_eval_run.addAlgorithm(als)

train_eval_run.train_and_evaluate()
print('*'*50)

n_factors = 128
**************************************************
RMSE: 0.7878
MAE:  0.6016
FCP:  0.6110
-----------
SVD
-----------
      Metrics - RMSE: 0.7877730588340818, MAE: 0.6016093212901044, FCP: 0.6109515092713509
-----------


## Inference

 * From the assignment: "*The input for the model will be (user_id, item_id) with other feature data and the output will be (positive or negative, confidence value)*"
 * __Propencity model and "cold starts":__
     * Whenever both the __user and the item are "known"__ (i.e., previously observed), we use previously selected model that showed the best performance on the train set.
     * Whenever we encounter a __"cold start" case (a user and/or item is new, previously unobserved)__, we will use the __baseline model__, i.e. an average rating, with corrections for a user/item bias (if we happen to have that data).
 * __Estimating the confidence:__ 
     * We can use the accuracy of the binary ("positive/negative") predictions on the train set, as an estimate of the confidence
     * As an incremental improvement, we can adjust the confidence based on the number of ratings provided by the user (the more we know about the user's preferences, the more accurately we can make a recommendation) 

In [2]:
test_path = './data/goodreads_2016_test.csv'

df_test = pd.read_csv(test_path)
print(f'Data size: {df_test.shape[0]: >8,d}.')
df_test.head()

Data size: 3,491,950.


Unnamed: 0,user_id,book_id,rating,would_recommend
0,8842281e1d1347389f2ab93d60773d4d,10673579,5,1
1,7504b2aee1ecb5b2872d3da381c6c91e,29069989,2,0
2,7504b2aee1ecb5b2872d3da381c6c91e,8882815,3,0
3,7504b2aee1ecb5b2872d3da381c6c91e,6693332,4,1
4,7504b2aee1ecb5b2872d3da381c6c91e,4588949,4,1


In [None]:
del df_test

In [4]:
test = DatasetBuilder(test_path)

In [None]:
def user_item_propensity(user_id, item_id, baseline_model, main_model):
    if user_id in new_users and item_id in new_items:
        # Use the baseline model: global average rating
        rating = baseline_model.predict()
        confidence = None
        return rating, confidence
    elif user_id in new_users:
        # Correct for the item bias
        rating = baseline_model.predict(item_id)
        confidence = None
    elif item_id in new_items:
        # Correct for the user bias
        rating = baseline_model.predict(user_id)
        confidence = None
    else:
        # Run the main model
        rating = main_model.predict(user_id, item_id)
        confidence = None

## Save the Results

## Create a Label/Target: "Would Recommend"

In [6]:
# df['would_recommend'] = df['rating'].apply(lambda x: 1 if x >= 4 else 0)
# df.head()

Unnamed: 0,user_id,book_id,month,day,hour,rating,would_recommend
0,8842281e1d1347389f2ab93d60773d4d,29058155,12,13,11,3,0
1,8842281e1d1347389f2ab93d60773d4d,10673579,1,8,1,5,1
2,8842281e1d1347389f2ab93d60773d4d,76620,12,14,12,5,1
3,7504b2aee1ecb5b2872d3da381c6c91e,29069989,8,17,19,2,0
4,7504b2aee1ecb5b2872d3da381c6c91e,8882815,6,21,16,3,0
