# Hybrid System

In [67]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import train_test_split

In [2]:
# constants
PATH = '../data/data.csv'

## Import Data

In [3]:
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [4]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


## Content Based

In [5]:
rmat = df.pivot_table(
    columns = 'book_id',
    index = 'reader_id',
    values = 'book_rating'
).fillna(0)

In [6]:
%%time
#Compute the cosine similarity matrix 
cosine_sim = cosine_similarity(rmat, rmat)
cosine_sim = pd.DataFrame(cosine_sim, index=rmat.index, columns=rmat.index)

CPU times: user 6min 12s, sys: 1min 38s, total: 7min 50s
Wall time: 1min 36s


## Collaborative Filtering

In [8]:
reader = Reader()
data = Dataset.load_from_df(df[['reader_id', 'book_id', 'book_rating']], reader)

In [9]:
# split data into train test
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

In [10]:
# train
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fae082a17f0>

In [65]:
# run the trained model against the testset
test_pred = svd.test(testset)

In [68]:
# get RMSE
accuracy.rmse(test_pred, verbose=True)

RMSE: 2.9300


2.9299770991365213

## Hybrid

In [56]:
def hybrid(reader_id, book_id, n_recs, df, cosine_sim, svd_model = svd):
    '''
    This function represents a hybrid recommendation system, it will have the following flow:
        1. Use a content-based model (cosine_similarity) to compute the 50 most similar books
        2. Compute the predicted ratings that the user might give these 50 books using a collaborative
           filtering model (SVD)
        3. Return the top n books with the highest predicted rating
        
    params:
        reader_id (Integer) : The reader_id 
        book_id (Integer) : The book_id 
        n_recs (Integer) : The number of recommendations you want
        df (DataFrame) : Original dataframe with all book information 
        cosine_sim (DataFrame) : The cosine similarity dataframe
        svd_model (Model) : SVD model
    '''
    
    # sort similarity values in decreasing order and take top 50 results
    sim = list(enumerate(cosine_sim[int(book_id)]))
    sim = sorted(sim, key=lambda x: x[1], reverse=True)
    sim = sim[1:50]
    
    # get book metadata
    book_idx = [i[0] for i in sim]
    books = df.iloc[book_idx][['book_id', 'book_rating', 'num_pages', 'publish_year', 'book_price', 'reader_id']]
    
    # predict using the svd_model
    books['est'] = books.apply(lambda x: svd_model.predict(reader_id, x['book_id'], x['book_rating']).est, axis = 1)
    
    # sort predictions in decreasing order and return top n_recs
    books = books.sort_values('est', ascending=False)
    return books.head(n_recs)

In [72]:
hybrid(234,4539,5)

Unnamed: 0,book_id,book_rating,num_pages,publish_year,book_price,reader_id,est
24451,2306,5,276,2011,67,8187,5.0
28108,2419,2,548,2011,126,5865,5.0
10646,2328,8,594,2002,12,22890,5.0
13206,829,3,96,2004,195,8629,5.0
20887,2880,8,83,2013,82,28214,5.0
