# Collaborative Filtering System

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from surprise import KNNWithMeans, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [2]:
# constants
PATH = '../data/data.csv'

## Import Data

In [3]:
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [4]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


## Method 1

In [5]:
# generate a pivot table with readers on the index and books on the column and values being the ratings
pt_df = df.pivot_table(
    columns = 'book_id',
    index = 'reader_id',
    values = 'book_rating'
).fillna(0)

# convert to a csr matrix
mat = pt_df.values
mat = csr_matrix(mat)

In [6]:
def normalize(pred_ratings):
    '''
    This function will normalize the input pred_ratings
    
    params:
        pred_ratings (List -> List) : The prediction ratings 
    '''
    return (pred_ratings - pred_ratings.min()) / (pred_ratings.max() - pred_ratings.min())

In [7]:
def generate_prediction_df(mat, pt_df, n_factors):
    '''
    This function will calculate the single value decomposition of the input matrix
    given n_factors. It will then generate and normalize the user rating predictions.
    
    params:
        mat (CSR Matrix) : scipy csr matrix corresponding to the pivot table (pt_df)
        pt_df (DataFrame) : pandas dataframe which is a pivot table
        n_factors (Integer) : Number of singular values and vectors to compute. 
                              Must be 1 <= n_factors < min(mat.shape). 
    '''
    
    if not 1 <= n_factors < min(mat.shape):
        raise ValueError("Must be 1 <= n_factors < min(mat.shape)")
        
    # matrix factorization
    u, s, v = svds(mat, k = n_factors)
    s = np.diag(s)

    # calculate pred ratings
    pred_ratings = np.dot(np.dot(u, s), v) 
    pred_ratings = normalize(pred_ratings)
    
    # convert to df
    pred_df = pd.DataFrame(
        pred_ratings,
        columns = pt_df.columns,
        index = list(pt_df.index)
    ).transpose()
    return pred_df

In [8]:
%time pred_df = generate_prediction_df(mat, pt_df, 10)

CPU times: user 4min 24s, sys: 21.6 s, total: 4min 45s
Wall time: 40 s


In [9]:
def recommend_items(pred_df, usr_id, n_recs):
    '''
    Given a usr_id and pred_df this function will recommend
    items to the user.
    
    params:
        pred_df (DataFrame) : generated from `generate_prediction_df` function
        usr_id (Integer) : The user you wish to get item recommendations for
        n_recs (Integer) : The number of recommendations you want for this user
    '''
    
    usr_pred = pred_df[usr_id].sort_values(ascending = False).reset_index().rename(columns = {usr_id : 'sim'})
    rec_df = usr_pred.sort_values(by = 'sim', ascending = False).head(n_recs)
    return rec_df

In [10]:
recommend_items(pred_df, 5, 5)

Unnamed: 0,book_id,sim
0,2994,0.680722
1,1083,0.114266
2,2577,0.114266
3,937,0.019964
4,1710,0.019859


## Method 2

In [11]:
pt_df = df.pivot_table(
    columns = 'book_id', 
    index = 'author_id', 
    values = 'book_rating'
).fillna(0)

In [12]:
def find_similar_readers(pt_df, reader_id, n_recs):
    '''
    This function will find similar readers to the user specified reader_id
    '''
    
    # separate reader of interest & all other readers 
    reader = pt_df[pt_df.index == reader_id]
    other_readers = pt_df[pt_df.index != reader_id]

    # get similarity of current reader and all other readers
    sim = cosine_similarity(reader, other_readers)[0].tolist()
    idx = other_readers.index.tolist()

    # create a similarity dictionary for this user w.r.t all other users
    idx_sim = dict(zip(idx, sim))
    idx_sim = sorted(idx_sim.items(), key=lambda x: x[1], reverse=True)

    similar_readers = idx_sim[:n_recs]
    readers = [rdr[0] for rdr in similar_readers]

    return readers

In [13]:
find_similar_readers(pt_df = pt_df, reader_id = 226, n_recs = 5)

[319, 191, 145, 162, 212]

## Method 3

In [14]:
rdf = df[['reader_id', 'book_id', 'book_rating']]

In [15]:
#Reading the dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rdf, reader)

In [16]:
#Splitting the dataset
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)


In [17]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=5, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fdb43561310>

In [18]:
# run the trained model against the testset
test_pred = algo.test(testset)

In [19]:
# get RMSE
accuracy.rmse(test_pred, verbose=True)

RMSE: 2.9306


2.9306185721359865

In [20]:
algo.predict(uid = 10, iid = 43)

Prediction(uid=10, iid=43, r_ui=None, est=5, details={'actual_k': 0, 'was_impossible': False})