# Hybrid System

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

In [2]:
# constants
PATH = '../data/data.csv'

## Import Data

In [3]:
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [4]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


## Flow
1. Take in a book_id and user_id as input
2. Use a content-based model to compute the 50 most similar books
3. Compute the predicted ratings that the user might give these 50 books using a collaborative filter
4. Return the top n books with the highest predicted rating

In [5]:
rmat = df.pivot_table(
    columns = 'book_id',
    index = 'reader_id',
    values = 'book_rating'
).fillna(0)

In [6]:
%%time
#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(rmat, rmat)

CPU times: user 6min 12s, sys: 1min 38s, total: 7min 50s
Wall time: 1min 36s


In [7]:
%%time
#Convert into pandas dataframe
cosine_sim = pd.DataFrame(cosine_sim, index=rmat.index, columns=rmat.index)

CPU times: user 29.3 ms, sys: 11.3 ms, total: 40.6 ms
Wall time: 7.4 ms


In [8]:
#Build the SVD based Collaborative filter
reader = Reader()
data = Dataset.load_from_df(df[['reader_id', 'book_id', 'book_rating']], reader)


In [9]:
#Splitting the dataset
trainset, testset = train_test_split(data, test_size=0.3,random_state=10)

In [10]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fae082a17f0>

In [56]:
def hybrid(reader_id, book_id, n_recs):
    sim_scores = list(enumerate(cosine_sim[int(book_id)]))
    
    #Sort the (index, score) tuples in decreasing order of similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Select the top 50 tuples, excluding the first 
    #(as it is the similarity score of the movie with itself)
    sim_scores = sim_scores[1:50]
    
    #Store the cosine_sim indices of the top 25 movies in a list
    book_indices = [i[0] for i in sim_scores]

    #Extract the metadata of the aforementioned movies
    books = df.iloc[book_indices][['book_id', 'book_rating', 'num_pages', 'publish_year', 'book_price', 'reader_id']]
    
    #Compute the predicted ratings using the SVD filter
    books['est'] = books.apply(lambda x: svd.predict(reader_id, x['book_id'], x['book_rating']).est, axis = 1)
    
    #Sort the movies in decreasing order of predicted rating
    books = books.sort_values('est', ascending=False)
    
    #Return the top 10 movies as recommendations
    return books.head(n_recs)

In [64]:
hybrid(25433,2324,4)

Unnamed: 0,book_id,book_rating,num_pages,publish_year,book_price,reader_id,est
8806,2488,7,637,2001,198,4411,5.0
8292,502,9,298,2019,81,27627,5.0
23621,1184,7,403,2013,88,16860,5.0
16789,2293,7,573,2021,166,9000,5.0
