Next key step in building CF-based recommendation systems is to generate user-item ratings matrix from the ratings table.
 

Using SKlearn, we are going to use a variety of functions to find similarity, predict, and recommend different books.

In [None]:
#!pip3 install surprise

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sklearn
from surprise.model_selection import cross_validate
from sklearn.metrics.pairwise import pairwise_distances

import re
import surprise

import time
import warnings



In [3]:
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='ISO-8859–1',on_bad_lines='skip',quotechar='"',sep=";",escapechar= "\\")
books = pd.read_csv('BX-Books.csv', encoding='ISO-8859–1',on_bad_lines='skip',quotechar='"',sep=";",escapechar= "\\")
users = pd.read_csv('BX-Users.csv', encoding='ISO-8859–1',on_bad_lines='skip',quotechar='"',sep=";",escapechar= "\\")



In [41]:
#print(ratings.shape)
ratings.columns
ratings = ratings.rename(columns={"User-ID": "UserID", "Book-Rating": "BookRating", "Book-Author": "BookAuthor", "Book-Title": "BookTitle","Year-Of-Publication": "PublicationYear"})


#print(books.shape)
#print(users.shape)

0          276725
1          276726
2          276727
3          276729
4          276729
            ...  
1149775    276704
1149776    276706
1149777    276709
1149778    276721
1149779    276723
Name: UserID, Length: 1149780, dtype: int64

In [23]:
#remove implicit data
drop_duplicate_ratings = ratings.drop_duplicates().dropna()
explicit_book_ratings = drop_duplicate_ratings[drop_duplicate_ratings['Book-Rating'] > 0]


#merge Users and Ratings dataset
reviews_and_users = pd.merge(left=explicit_book_ratings,right= books, how = 'inner').merge(users.dropna(), how = 'inner')
reviews_and_users = reviews_and_users.drop_duplicates()

#Dataset Cleaning
reviews_and_users = reviews_and_users.drop(columns = ['Location','Image-URL-S','Image-URL-M','Image-URL-L'])
reviews_and_users = reviews_and_users.rename(columns={"User-ID": "UserID", "Book-Rating": "BookRating", "Book-Author": "BookAuthor", "Book-Title": "BookTitle","Year-Of-Publication": "PublicationYear"})
reviews_and_users['BookAuthor'] = reviews_and_users['BookAuthor'].str.title()

For the testing set, I am reducing the dataset down to users who have reviewed 100 books & Books with 100 ratings.


In [32]:
reviews_and_users


Unnamed: 0,UserID,ISBN,BookRating,BookTitle,BookAuthor,PublicationYear,Publisher,Age
0,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,16.0
1,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,16.0
2,16877,038550120X,9,A Painted House,John Grisham,2001,Doubleday,37.0
3,16877,034539657X,7,Dark Rivers of the Heart,Dean R. Koontz,1995,Ballantine Books,37.0
4,16877,0743211383,3,Dreamcatcher,Stephen King,2001,Scribner,37.0
...,...,...,...,...,...,...,...,...
269625,276660,0583307841,8,ROBOT RACE (MICRO ADV 6),David Antony Kroft,1985,HarperCollins Publishers,15.0
269626,276664,0004703723,9,Dictionary Of Economics-2Nd Ed,Christopher Pass,1991,Trafalgar Square,31.0
269627,276664,0140136908,7,History of Economic Thought (Penguin Economics),William J. Barber,1992,Penguin USA,31.0
269628,276664,0631189629,9,British Social Policy Since 1945 (Making Conte...,Howard Glennerster,1996,Blackwell Publishers,31.0


In [46]:
def popular_ratings(ratings, user_threshold=200, rating_threshold=200, book_threshold=1):
    counts_users = ratings.UserID.value_counts()
    counts_ratings = ratings.BookRating.value_counts()
    sample_ratings = ratings[ratings['UserID'].isin(counts_users[counts_users >= user_threshold].index)]
    sample_ratings = sample_ratings[ratings['BookRating'].isin(counts_ratings[counts_ratings >= rating_threshold].index)]
    isbn_group = sample_ratings.groupby('ISBN', as_index=False)['BookRating'].count()
    sample_ratings = sample_ratings[sample_ratings.ISBN.isin(list(isbn_group[isbn_group.BookRating > book_threshold].ISBN.values))]
    return sample_ratings



In [52]:
sampled_ratings = popular_ratings(reviews_and_users, user_threshold=400, rating_threshold=400, book_threshold=1)
rating_matrix = sampled_ratings.pivot(index='UserID',
                                         columns='ISBN',
                                         values= 'BookRating').fillna(0)
print(sampled_ratings.shape)

rating_matrix

(4047, 8)


  sample_ratings = sample_ratings[ratings['BookRating'].isin(counts_ratings[counts_ratings >= rating_threshold].index)]


ISBN,0001056107,002026478X,0060002050,006000441X,0060004606,0060004622,006000469X,0060004746,0060008865,0060011904,...,1854710443,1855385074,1878448900,1890862185,1890862290,189205101X,1892065487,1895565014,1902852427,1932112138
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,10.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
60244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0
63714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93047,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0
95359,8.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
userid_list = book_ratings.index.tolist()
column_names = ["UserID", "Location", "Age"]
sampled_users = pd.DataFrame(columns = column_names)
sampled_users = users.loc[users['User-ID'].isin(userid_list)]
sampled_users = sampled_users.reset_index()
sampled_users = sampled_users.drop(['index'], axis=1)

sampled_users.head(-1)






Unnamed: 0,User-ID,Location,Age
0,16795,"mechanicsville, maryland, usa",47.0
1,23872,"tulsa, oklahoma, usa",22.0
2,56399,"n/a, surrey, united kingdom",63.0
3,60244,"alvin, texas, usa",47.0
4,63714,"milton keynes, england, united kingdom",29.0
5,69078,"new york, new york, usa",42.0
6,76626,"london, england, united kingdom",38.0
7,78973,"amadora, lisboa, portugal",29.0
8,93047,"nashua, new hampshire, usa",52.0
9,95359,"charleston, west virginia, usa",33.0


In [74]:
book_isbn_list = book_ratings.columns.values.tolist()
column_names = ["ISBN", "BookTitle", "BookAuthor", "PublicationYear", "Publisher"]
sampled_books = pd.DataFrame(columns = column_names)
sampled_books = books.loc[books['ISBN'].isin(book_isbn_list)]
sampled_books = sampled_books.reset_index()
sampled_books = sampled_books.drop(['index'], axis=1)
sampled_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...
1,0446310786,To Kill a Mockingbird,Harper Lee,1988,Little Brown &amp; Company,http://images.amazon.com/images/P/0446310786.0...,http://images.amazon.com/images/P/0446310786.0...,http://images.amazon.com/images/P/0446310786.0...
2,0449005615,Seabiscuit: An American Legend,LAURA HILLENBRAND,2002,Ballantine Books,http://images.amazon.com/images/P/0449005615.0...,http://images.amazon.com/images/P/0449005615.0...,http://images.amazon.com/images/P/0449005615.0...
3,0553582747,From the Corner of His Eye,Dean Koontz,2001,Bantam Books,http://images.amazon.com/images/P/0553582747.0...,http://images.amazon.com/images/P/0553582747.0...,http://images.amazon.com/images/P/0553582747.0...
4,042518630X,Purity in Death,J.D. Robb,2002,Berkley Publishing Group,http://images.amazon.com/images/P/042518630X.0...,http://images.amazon.com/images/P/042518630X.0...,http://images.amazon.com/images/P/042518630X.0...


The Data is more or less ready now to be used for a Collaborative Filtering Model.

The Model I will go with is a KNN Model (K Nearest Neighbors) there is more information about the process below:

In K Nearest Neighbors for collaborative filtering, we use the number of k people who most similar to the person we are looking for to find good recommendations. 

The best value for k depends on the problem. We use KNN with Means algorithm for building user-based recommender system. 

This algorithm takes into account the mean ratings of each user.
We use cosine similarity measure to compute the closeness of users with each other.



In [84]:
#get neighbors of target user based on similarity measure.
#find k nearest neighbors and use their ratings to recommend the items to the target user.


class UserBasedCollaborativeFiltering():
    
    def __init__(self, users, books, ratings, k=10, max_rating=10.0):
        self.users = users
        self.users = self.users.reset_index()
        self.users = self.users.drop(columns=['index'])
        
        self.books = books
        
        self.ratings = ratings
        self.ratings = self.ratings.reset_index()
        self.ratings = self.ratings.drop(columns=['UserID'])
        
        self.k = k
        self.max_rating = max_rating
    
    def normalize(self, dataframe):

        row_sum_ratings = dataframe.sum(axis=1) # sum entries of rows
        non_zero_count = dataframe.astype(bool).sum(axis=1) # count non-zero entries of rows 
        
        dataframe_mean = row_sum_ratings / non_zero_count # mean of rows
        
        self.normalized_ratings = dataframe.subtract(dataframe_mean, axis = 0) # subtract on rows(iteration over columns!)
    
    def compute_similarity(self, x, y):
        return np.dot(x, y)/ (np.linalg.norm(x) * np.linalg.norm(y))


    def create_similarity_matrix(self):          
        num_users = len(self.users)
        similarity_array = np.array([self.compute_similarity(self.ratings.iloc[i,:], self.ratings.iloc[j,:])
                                    for i in range(num_users) for j in range(num_users)])
        similarity_matrix = pd.DataFrame(data = similarity_array.reshape(self.users.shape[0], self.users.shape[0]))
        
        return similarity_matrix

    def get_neighbors(self, user_id, similarity_matrix):
        user_index = self.users.loc[self.users['User-ID'] == user_id].index.values[0]
        user_similairities = similarity_matrix.iloc[user_index].values
        temp_neighbors_index = user_similairities.argsort()[-(self.k + 1):][::-1]
        neighbor_index = np.delete(temp_neighbors_index, np.where(temp_neighbors_index[user_index] == user_index))

        return neighbor_index    
        
    def score_item(self, user_id, neighbor_rating, neighbor_similarity, ratings):
        user_index = self.users.loc[self.users['User-ID'] == user_id].index.values[0]
        active_user_mean_rating = np.mean(ratings.iloc[user_index, :])
        score = np.dot(neighbor_similarity, neighbor_rating) + active_user_mean_rating
        data = score.reshape(1, len(score))
        columns = neighbor_rating.columns
        return pd.DataFrame(data= data , columns= columns)
    
    

    def recommend(self, user_id):
        user_index = self.users.loc[self.users['User-ID'] == user_id].index.values[0]
        user_ratings = book_ratings.iloc[user_index]
        recommendation_columns = []

        for i in range(len(user_ratings.index)):
            isbn = user_ratings.index[i]
            rating = user_ratings.values[i]
            if rating == 0.0:
                recommendation_columns.append(isbn)

        self.normalize(self.ratings)  
        similarity_matrix = self.create_similarity_matrix()
        neighbor_index = self.get_neighbors(user_id, similarity_matrix)
        neighbor_rating = self.normalized_ratings.loc[neighbor_index][recommendation_columns]
        neighbor_similarity = similarity_matrix[user_index].loc[neighbor_index]
        recommendation_score = self.score_item(user_id, neighbor_rating, neighbor_similarity, self.ratings)
        recommended_book_ISBNs = recommendation_score.stack().nlargest(self.k)
        recommended_book_ISBNs = [recommended_book_ISBNs.index.values[i][1] for i in range(len(recommended_book_ISBNs))]
        recommended_books = self.books.loc[self.books['ISBN'].isin(recommended_book_ISBNs)]

        return recommended_books
    

In [88]:
user_based_cf = UserBasedCollaborativeFiltering(sampled_users, sampled_books, book_ratings)
similarity_matrix = user_based_cf.create_similarity_matrix()

user_id = 23872
recommendations = user_based_cf.recommend(user_id)


In [91]:
recommendations.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
7,804106304,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),http://images.amazon.com/images/P/0804106304.0...,http://images.amazon.com/images/P/0804106304.0...,http://images.amazon.com/images/P/0804106304.0...
39,316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...
66,61009059,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...
224,399146431,The Bonesetter's Daughter,Amy Tan,2001,Putnam Publishing Group,http://images.amazon.com/images/P/0399146431.0...,http://images.amazon.com/images/P/0399146431.0...,http://images.amazon.com/images/P/0399146431.0...
255,811802981,The Golden Mean: In Which the Extraordinary Co...,Nick Bantock,1993,Chronicle Books,http://images.amazon.com/images/P/0811802981.0...,http://images.amazon.com/images/P/0811802981.0...,http://images.amazon.com/images/P/0811802981.0...


There is a glaring flaw in the design of this model:
For every time you want a recommendation, you have to find similarity between every single Neighbor.

This becomes computationally more expensive for every user involved.

In [1]:
    
    def __init__(self, users, books, ratings, k=10, max_rating=10.0):
        self.users = users
        self.users = self.users.reset_index()
        self.users = self.users.drop(columns=['index'])
        
        self.books = books
        
        self.ratings = ratings
        self.ratings = self.ratings.reset_index()
        self.ratings = self.ratings.drop(columns=['UserID'])
        
        self.k = k
        self.max_rating = max_rating
        
        self.frequencies = {}
        self.deviations = {}
        
    
    def prepare_data(self):
                
        user_indices = list(self.ratings.index.values)

        users_ratings = []
        for user_index in user_indices:
            rated_book_indices = list(self.ratings.iloc[user_index].to_numpy().nonzero()[0])
            users_ratings.append({user_index: dict(self.ratings[self.ratings.columns[rated_book_indices]].iloc[user_index])})
    
        self.users_ratings = users_ratings
        
        return self.users_ratings
        
        
    def compute_deviations(self):
        users_ratings = self.users_ratings
        num_users = len(self.users)
        
        for i in range(num_users):
            for ratings in self.users_ratings[i].values():
                for item, rating in ratings.items():
                    self.frequencies.setdefault(item, {})
                    self.deviations.setdefault(item, {})
                    
                    for (item2, rating2) in ratings.items():
                        if item != item2:
                            self.frequencies[item].setdefault(item2, 0)
                            self.deviations[item].setdefault(item2, 0.0)
                            self.frequencies[item][item2] += 1
                            self.deviations[item][item2] += rating - rating2
            
            for (item, ratings) in self.deviations.items():
                for item2 in ratings:
                    ratings[item2] /= self.frequencies[item][item2]
    
    
    def slope_one_recommend(self, user_ratings):
        recommendations = {}
        frequencies = {}
        
        for (user_item, user_rating) in user_ratings.items():
        
            for (diff_item, diff_ratings) in self.deviations.items():
                if diff_item not in user_ratings and user_item in self.deviations[diff_item]:
                    freq = self.frequencies[diff_item][user_item]
                    recommendations.setdefault(diff_item, 0.0)
                    frequencies.setdefault(diff_item, 0)
        
                    recommendations[diff_item] += (diff_ratings[user_item] + user_rating) * freq
                    frequencies[diff_item] += freq
        
        recommendations = [(k, v / frequencies[k]) for (k, v) in recommendations.items()]
        
        recommendations.sort(key=lambda ratings: ratings[1], reverse = True)
        
        return recommendations
    
    
    def recommend(self, recommendations):
        top_k_recommendations = recommendations[: self.k]
        
        isbns = [recommendation[0] for recommendation in top_k_recommendations]
        
        recommended_books = [self.books.loc[self.books['ISBN'] == isbn] for isbn in isbns]
        return pd.concat(recommended_books)

In [2]:
item_based_cf = ItemBasedCollaborativeFiltering(sampled_users, sampled_books, book_ratings)
users_ratings = item_based_cf.prepare_data()
item_based_cf.compute_deviations()

user_index = 1
pd.DataFrame(sampled_users.iloc[user_index])


NameError: name 'ItemBasedCollaborativeFiltering' is not defined

In [None]:
recommendations = item_based_cf.slope_one_recommend(users_ratings[user_index][user_index])
item_based_cf.recommend(recommendations)