# Collaborative Filtering System

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

In [2]:
# constants
PATH = '../data/data.csv'

## Import Data

In [3]:
df = pd.read_csv(PATH)
df.shape

(100000, 10)

In [4]:
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price,text_lang
0,655,52,4,11482,300,4,8,2012,94,7
1,2713,90,3,6479,469,1,8,2012,33,5
2,409,17,2,25472,435,1,12,2001,196,4
3,1150,234,10,23950,529,2,23,2019,79,2
4,2424,390,5,13046,395,2,20,2010,200,4


We will try to build a system which will predict the users rating of books.

In [5]:
ratings_df = df[['reader_id', 'book_id', 'book_rating']].copy()

In [6]:
#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings_df.copy()
y = ratings_df['reader_id']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [7]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [8]:
#Define the baseline model to always return 6.
def baseline(reader_id, book_id):
    return 6.0

In [9]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['reader_id'], X_test['book_id'])
    
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['book_rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [10]:
score(baseline)

2.913154075339419

## User - User Based

In [11]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='book_rating', index='reader_id', columns='book_id')

r_matrix.head()

book_id,1,2,3,4,5,6,7,8,9,10,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,3000
reader_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,


### Mean

In [12]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(reader_id, book_id):
    
    #Check if movie_id exists in r_matrix
    if book_id in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[book_id].mean()
    
    else:
        #Default to a rating of 6.0 in the absence of any information
        mean_rating = 6.0
    
    return mean_rating

In [13]:
#Compute RMSE for the Mean model
score(cf_user_mean)

2.9400202223375866

### Weighted Mean

In [14]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)

In [15]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [16]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head(10)

reader_id,1,3,4,5,6,7,8,9,10,11,...,29990,29991,29992,29993,29994,29995,29996,29997,29998,29999
reader_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.328266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328266,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
cosine_sim.shape

(27027, 27027)

In [18]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(reader_id, book_id):
    
    #Check if movie_id exists in r_matrix
    if book_id in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[reader_id]
        
        #Get the user ratings for the movie in question
        m_ratings = r_matrix[book_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 6.0 in the absence of any information
        wmean_rating = 6.0
    
    return wmean_rating

In [19]:
# score(cf_user_wmean)

## Preprocess Data

In [14]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

### Approach 2

In [15]:
# we will pivot the dataset such that the readers are on the rows and the columns will be the books
# each value in the pivot table will consist of the rating that user has provided for that book
pt_df = df.pivot_table(
    columns = 'book_id', 
    index = 'author_id', 
    values = 'book_rating'
).fillna(0)

In [16]:
mat = pt_df.values

In [17]:
mat = csr_matrix(mat)

In [18]:
users_ids = list(pt_df.index)
users_ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [19]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(mat, k = NUMBER_OF_FACTORS_MF)

In [20]:
U.shape

(450, 15)

In [21]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [22]:
Vt.shape

(15, 3000)

In [23]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 6.43742092e-01, -7.54639768e-02,  7.38403163e-02, ...,
         2.11190768e-01,  6.58350934e-01,  3.99074880e-01],
       [ 6.79534929e-01,  4.71032402e-01,  1.89129106e-01, ...,
         7.33608225e-01,  2.29125244e-01,  5.22129066e-01],
       [ 1.67325863e-01,  2.46421761e-01,  8.70571741e-01, ...,
         6.32495512e-01,  6.21927457e-01,  5.55995590e-01],
       ...,
       [ 4.86659935e-01, -3.73243117e-02, -6.69284325e-01, ...,
         5.30717906e-04,  1.49756732e+00,  4.22877184e-01],
       [ 6.87560573e-01,  9.10996433e-03,  1.80066892e-01, ...,
         2.85722805e-01,  3.03105142e-01,  3.18669987e-01],
       [ 8.27202375e-01,  8.39761622e-01,  5.52377780e-01, ...,
         2.52764945e-01, -1.26726447e-01,  4.86223771e-01]])

In [24]:
usr_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [25]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(usr_ratings_norm, columns = pt_df.columns, index=users_ids).transpose()
# cf_preds_df.head(10)

In [26]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['book_id'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'book_id', 
                                                          right_on = 'book_id')[['recStrength', 'book_id']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, df)


In [27]:
cf_recommender_model.recommend_items(1)

Unnamed: 0,book_id,recStrength
0,2887,0.558143
1,809,0.51535
2,1516,0.504145
3,490,0.498952
4,360,0.49797
5,1107,0.495817
6,2254,0.488178
7,794,0.479507
8,1322,0.475991
9,415,0.475401


## Recommend

### Approach 3

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
pt_df = df.pivot_table(
    columns = 'book_id', 
    index = 'author_id', 
    values = 'book_rating'
).fillna(0)

In [35]:
def find_similar_readers(pt_df, reader_id, n_recs):
    '''
    This function will find similar readers to the user specified reader_id
    '''
    
    # separate reader of interest & all other readers 
    reader = pt_df[pt_df.index == reader_id]
    other_readers = pt_df[pt_df.index != reader_id]

    # get similarity of current reader and all other readers
    sim = cosine_similarity(reader, other_readers)[0].tolist()
    idx = other_readers.index.tolist()

    # create a similarity dictionary for this user w.r.t all other users
    idx_sim = dict(zip(idx, sim))
    idx_sim = sorted(idx_sim.items(), key=lambda x: x[1], reverse=True)

    similar_readers = idx_sim[:k]
    readers = [rdr[0] for rdr in similar_readers]

    return readers

In [37]:
find_similar_readers(pt_df = pt_df, reader_id = 226, n_recs = 5)

[319, 191, 145, 162, 212]

In [38]:
def recommend_item(user_index, similar_user_indices, matrix, items=5):
    
    # load vectors for similar users
    similar_users = matrix[matrix.index.isin(similar_user_indices)]
    # calc avg ratings across the 3 similar users
    similar_users = similar_users.mean(axis=0)
    # convert to dataframe so its easy to sort and filter
    similar_users_df = pd.DataFrame(similar_users, columns=['mean'])
    
    
    # load vector for the current user
    user_df = matrix[matrix.index == user_index]
    # transpose it so its easier to filter
    user_df_transposed = user_df.transpose()
    # rename the column as 'rating'
    user_df_transposed.columns = ['rating']
    # remove any rows without a 0 value. Anime not watched yet
    user_df_transposed = user_df_transposed[user_df_transposed['rating']==0]
    # generate a list of animes the user has not seen
    animes_unseen = user_df_transposed.index.tolist()
    
    # filter avg ratings of similar users for only anime the current user has not seen
    similar_users_df_filtered = similar_users_df[similar_users_df.index.isin(animes_unseen)]
    # order the dataframe
    similar_users_df_ordered = similar_users_df.sort_values(by=['mean'], ascending=False)
    # grab the top n anime   
    top_n_anime = similar_users_df_ordered.head(items)
    top_n_anime_indices = top_n_anime.index.tolist()
    # lookup these anime in the other dataframe to find names
    anime_information = df[df['book_id'].isin(top_n_anime_indices)]
    
    return anime_information #items
# try it out
recommend_item(206, similar_user_indices, pt_df)

NameError: name 'similar_user_indices' is not defined