The next key step in building a collaborative-filtering (CF) recommendation system is to generate the user-item ratings matrix from the ratings table.
 

Using scikit-learn, we are going to define a set of functions that find similar users and items, predict ratings, and recommend books.

In [83]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os, sys
import re


In [8]:
# All three CSV files are read with the same dialect: ';'-separated, '"'-quoted,
# backslash-escaped, and rows that cannot be parsed are skipped instead of raising.
# The codec name is spelled with plain ASCII hyphens ("ISO-8859-1"); a typographic
# en dash in the name only resolves through codec-name normalisation and is easy
# to break when the string is copied elsewhere.
CSV_READ_OPTIONS = dict(
    encoding='ISO-8859-1',
    on_bad_lines='skip',
    quotechar='"',
    sep=";",
    escapechar="\\",
)

ratings = pd.read_csv('BX-Book-Ratings-Clean.csv', **CSV_READ_OPTIONS)
books = pd.read_csv('BX_Books.csv', **CSV_READ_OPTIONS)
users = pd.read_csv('BX-Users.csv', **CSV_READ_OPTIONS)



In [38]:
# Preview the first rows of the ratings table loaded above.
ratings.head()



Unnamed: 0,User-ID,ISBN,Book-Rating
412,276925,0006511929,0
413,276925,002542730X,10
414,276925,0060520507,0
415,276925,0060930934,0
416,276925,0060951303,0


In [41]:
# Keep explicit ratings only: rows whose rating is greater than 0.
drop_duplicate_ratings = ratings.drop_duplicates()
is_explicit = drop_duplicate_ratings['Book-Rating'] > 0
explicit_book_ratings = drop_duplicate_ratings[is_explicit]

# Inner-join the explicit ratings with the books table, then with the users table.
# No join keys are given, so pandas joins on the columns the frames have in common.
ratings_with_books = explicit_book_ratings.merge(books, how='inner')
reviews_and_users = ratings_with_books.merge(users, how='inner').drop_duplicates()

# Dataset cleaning: drop the unused location column, switch to space-separated
# column names and title-case the author names.
column_renames = {
    "User-ID": "User ID",
    "Book-Rating": "Book Rating",
    "Book-Author": "Book Author",
    "Book-Title": "Book Title",
    "Year-Of-Publication": "Publication Year",
}
reviews_and_users = (
    reviews_and_users
    .drop(columns=['Location'])
    .rename(columns=column_renames)
)
reviews_and_users['Book Author'] = reviews_and_users['Book Author'].str.title()

For the testing set, I am reducing the dataset to users who have rated at least 100 books and to books that have received at least 100 ratings.


In [43]:
# Minimum number of ratings a user must have given / a book must have received.
MIN_RATINGS = 100

# Number of ratings per user and per book, counted on the cleaned table.
counts = reviews_and_users["User ID"].value_counts()
counts2 = reviews_and_users["ISBN"].value_counts()

active_users = counts[counts >= MIN_RATINGS].index
popular_books = counts2[counts2 >= MIN_RATINGS].index

# Keep a row only when BOTH its user and its book reach the threshold.
# (Books are matched by ISBN against the per-book counts; comparing rating
# values against user ids, or restarting from the unfiltered table, would
# silently discard the user filter.)
# NOTE: this rebinds `ratings`, which until now held the raw ratings table.
ratings = reviews_and_users[
    reviews_and_users["User ID"].isin(active_users)
    & reviews_and_users["ISBN"].isin(popular_books)
]


In [60]:
# Build the user-item matrix: one row per user, one column per ISBN, each cell
# holding that user's rating of that book (NaN where the user has not rated it).
# The pivot is computed once; it is an expensive operation on a matrix this wide.
# NOTE(review): the matrix is built from `reviews_and_users`, not from the
# filtered `ratings` frame - confirm that this is intended.
ratings_matrix = reviews_and_users.pivot(index='User ID',
                                         columns='ISBN',
                                         values='Book Rating')

# Row labels (user ids) and column labels (ISBNs) of the matrix.
userID = ratings_matrix.index
ISBN = ratings_matrix.columns

print(ratings_matrix.shape)
ratings_matrix.head()


(1810, 90731)


ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B0000T6KHI,B0000T6KIM,B0000VZEJQ,B0000X8HIE,B00011SOXI,B00013AX9E,B0001FZGRQ,B0001GMSV2,B0001I1KOG,B000234N3A
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
507,,,,,,,,,,,...,,,,,,,,,,
882,,,,,,,,,,,...,,,,,,,,,,
1424,,,,,,,,,,,...,,,,,,,,,,


In [61]:
# NaNs cannot be handled by the training algorithms, so they are replaced by 0,
# which indicates the absence of a rating. The fill is done in place on the
# existing matrix.
ratings_matrix.fillna(0, inplace = True)
# Store the filled matrix as 32-bit integers.
ratings_matrix = ratings_matrix.astype(np.int32)

In [62]:
# Preview the first five rows of the filled, integer-typed matrix.
ratings_matrix.head(5)

ISBN,0000913154,0001046438,000104687X,0001047213,0001047973,000104799X,0001048082,0001053736,0001053744,0001055607,...,B0000T6KHI,B0000T6KIM,B0000VZEJQ,B0000X8HIE,B00011SOXI,B00013AX9E,B0001FZGRQ,B0001GMSV2,B0001I1KOG,B000234N3A
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
507,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# Notebook-wide defaults used as default arguments by the similarity,
# prediction and recommendation functions:
#   k      - number of neighbours to look up
#   metric - distance metric handed to NearestNeighbors
k = 10
metric = 'cosine'

@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at ``os.devnull``.
    The previous stream is put back when the body finishes, including when
    it raises.
    """
    saved_stream = sys.stdout
    with open(os.devnull, "w") as sink:
        sys.stdout = sink
        try:
            yield
        finally:
            # Always restore, so an exception in the body cannot leave
            # stdout pointing at the (soon to be closed) null device.
            sys.stdout = saved_stream

In [66]:
def findksimilarusers(user_id, ratings, metric = metric, k=k):
    """Find the users whose rating rows are closest to ``user_id``'s row.

    A brute-force ``NearestNeighbors`` model is fitted on the rows of
    ``ratings`` and queried with the row of ``user_id``. ``k + 1``
    neighbours are requested - presumably because the query row itself is
    normally returned as its own nearest neighbour.

    Per the original author's note, these similarities are the same as those
    obtained via ``pairwise_distances``.

    Parameters
    ----------
    user_id : label in ``ratings.index``
    ratings : pandas.DataFrame
        User-item matrix (rows = users, columns = items).
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    k : int
        Number of neighbours wanted (``k + 1`` are queried).

    Returns
    -------
    similarities : 1-D array of ``1 - distance`` for each returned neighbour.
    indices : array of row positions as returned by ``kneighbors``.
    """
    knn = NearestNeighbors(metric=metric, algorithm='brute')
    knn.fit(ratings)

    row_position = ratings.index.get_loc(user_id)
    query_vector = ratings.iloc[row_position, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query_vector, n_neighbors=k + 1)

    # Convert distances into similarities.
    return 1 - distances.flatten(), indices

In [67]:
def findksimilaritems(item_id, ratings, metric=metric, k=k):
    """Find the items whose rating columns are closest to ``item_id``'s column.

    The matrix is transposed so that every item becomes a row, a brute-force
    ``NearestNeighbors`` model is fitted on those rows and queried with the
    row of ``item_id``. ``k + 1`` neighbours are requested - presumably
    because the query item itself is normally returned as its own nearest
    neighbour.

    Parameters
    ----------
    item_id : label in ``ratings.columns``
    ratings : pandas.DataFrame
        User-item matrix (rows = users, columns = items).
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    k : int
        Number of neighbours wanted (``k + 1`` are queried).

    Returns
    -------
    similarities : 1-D array of ``1 - distance`` for each returned neighbour.
    indices : array of item positions as returned by ``kneighbors``.
    """
    item_rows = ratings.T
    item_position = item_rows.index.get_loc(item_id)

    knn = NearestNeighbors(metric=metric, algorithm='brute')
    knn.fit(item_rows)

    query_vector = item_rows.iloc[item_position, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query_vector, n_neighbors=k + 1)

    # Convert distances into similarities.
    return 1 - distances.flatten(), indices

In [71]:
def predict_itembased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating ``user_id`` would give ``item_id`` (item-based approach).

    The prediction is the similarity-weighted average of the user's own
    ratings of the items most similar to ``item_id``, rounded to the nearest
    integer and clamped to the range 1..10.

    Parameters
    ----------
    user_id : label in ``ratings.index``
    item_id : label in ``ratings.columns``
    ratings : pandas.DataFrame
        User-item matrix (rows = users, columns = items).
    metric : str
        Distance metric forwarded to ``findksimilaritems``.
    k : int
        Number of neighbouring items forwarded to ``findksimilaritems``.

    Returns
    -------
    int
        Predicted rating between 1 and 10. The result is also printed.

    Raises
    ------
    KeyError
        If ``user_id`` or ``item_id`` is not a label of ``ratings``.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)

    # Forward metric and k so the caller's choice is honoured instead of
    # silently falling back to the notebook-wide defaults.
    similarities, indices = findksimilaritems(item_id, ratings, metric, k)
    neighbour_locs = indices.flatten()

    # Exclude the target item itself by position rather than assuming it is
    # always returned with similarity 1 (ties between identical columns can
    # push it out of the result). Keep at most k neighbours.
    is_other_item = neighbour_locs != item_loc
    neighbour_locs = neighbour_locs[is_other_item][:k]
    weights = similarities[is_other_item][:k]

    neighbour_ratings = ratings.iloc[user_loc, neighbour_locs].to_numpy()
    wtd_sum = np.sum(neighbour_ratings * weights)
    sum_wt = np.sum(weights)

    # With no usable weight (all-zero or NaN similarities) there is nothing
    # to average; fall back to 0, which the clamp below turns into 1.
    raw_prediction = wtd_sum / sum_wt if sum_wt != 0 else 0.0
    if not np.isfinite(raw_prediction):
        raw_prediction = 0.0
    prediction = int(round(float(raw_prediction)))

    # On very sparse data the correlation metric may yield negative (or
    # out-of-scale) predictions; clamp them to the valid rating range.
    if prediction <= 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [88]:
def predict_userbased(user_id, item_id, ratings, metric = metric, k=k):
    """Predict the rating ``user_id`` would give ``item_id`` (user-based approach).

    The prediction is the user's mean rating (taken over the whole row of
    ``ratings``) plus the similarity-weighted average of how far each
    neighbouring user's rating of ``item_id`` lies from that neighbour's own
    row mean. The result is truncated to an integer and then clamped.

    Parameters
    ----------
    user_id : label in ``ratings.index``
    item_id : label in ``ratings.columns``
    ratings : pandas.DataFrame
        User-item matrix (rows = users, columns = items).
    metric : str
        Distance metric forwarded to ``findksimilarusers``.
    k : int
        Number of neighbouring users forwarded to ``findksimilarusers``.

    Returns
    -------
    int
        Predicted rating; negative values become 1 and values above 10
        become 10. The result is also printed.

    Raises
    ------
    KeyError
        If ``user_id`` or ``item_id`` is not a label of ``ratings``.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, k)
    mean_ratings = ratings.iloc[user_loc, :].mean()

    # Exclude the target user by position rather than assuming it is always
    # returned with similarity 1 (ties between identical rows can push it out
    # of the result). Keep at most k neighbours.
    neighbour_locs = indices.flatten()
    is_other_user = neighbour_locs != user_loc
    neighbour_locs = neighbour_locs[is_other_user][:k]
    weights = similarities[is_other_user][:k]

    # Each neighbour's rating of the item, centred on that neighbour's row mean.
    neighbour_rows = ratings.iloc[neighbour_locs, :]
    ratings_diff = (
        neighbour_rows.iloc[:, item_loc].to_numpy()
        - neighbour_rows.mean(axis=1).to_numpy()
    )
    wtd_sum = np.sum(ratings_diff * weights)
    sum_wt = np.sum(weights)

    # With no usable weight (all-zero or NaN similarities) fall back to the
    # user's own mean instead of crashing on int(nan).
    adjustment = wtd_sum / sum_wt if sum_wt != 0 else 0.0
    if not np.isfinite(adjustment):
        adjustment = 0.0
    prediction = int(mean_ratings + adjustment)

    # Clamp AFTER the prediction has been computed; applied to the initial
    # placeholder value the clamp could never take effect.
    # NOTE(review): the item-based predictor also maps a prediction of 0 to 1
    # (`<= 0`); confirm whether this function should do the same.
    if prediction < 0:
        prediction = 1
    elif prediction > 10:
        prediction = 10

    print(f'Prediction rating for User {user_id} -> item {item_id}: {prediction}')

    return prediction
            

In [79]:
# Item-based prediction for one sample (user, book) pair.
prediction = predict_itembased(
    user_id=11676,
    item_id='0001056107',
    ratings=ratings_matrix,
)


Predicted rating for user 11676 -> item 0001056107: 1


In [75]:
def recommendItem(user_id, ratings, metric=metric):
    """Display a dropdown that recommends up to 10 books for ``user_id``.

    The dropdown offers item-based and user-based approaches, each with the
    correlation or cosine metric. When its value changes, a rating is
    predicted for every book the user has NOT rated yet (rating of 0 in
    ``ratings``) and the 10 books with the highest predicted rating are
    printed by title.

    Each prediction refits a nearest-neighbour model, so scoring every
    unrated book can take a long time on a wide matrix.

    Parameters
    ----------
    user_id : int
        Must be one of the labels in ``ratings.index``; otherwise a message
        listing the valid ids is printed and nothing else happens.
    ratings : pandas.DataFrame
        User-item matrix (rows = users, columns = ISBNs, 0 = not rated).
    metric : str
        Accepted for interface compatibility; the metric actually used is
        the one selected in the dropdown.

    Returns
    -------
    None
    """
    # Accept Python and NumPy integers (ids taken from the matrix index are
    # NumPy integers), but not booleans.
    is_integer_id = isinstance(user_id, (int, np.integer)) and not isinstance(user_id, bool)
    if not is_integer_id or user_id not in ratings.index.values:
        # List the ids of the matrix that was actually passed in.
        print("User id should be a valid integer from this list :\n\n {} ".format(np.array_str(ratings.index.values)))
        return

    ids = ['Item-based (correlation)','Item-based (cosine)','User-based (correlation)','User-based (cosine)']
    select = widgets.Dropdown(options=ids, value=ids[0],description='Select approach', width='1000px')

    def on_change(change):
        clear_output(wait=True)
        if change['type'] != 'change' or change['name'] != 'value':
            return

        approach = select.value
        chosen_metric = 'correlation' if approach.endswith('(correlation)') else 'cosine'
        predict = predict_itembased if approach.startswith('Item-based') else predict_userbased

        # Only books the user has not rated yet are candidates.
        user_ratings = ratings.loc[user_id]
        unrated_items = user_ratings.index[user_ratings == 0]

        # The predictors print one line per call; silence them while scoring.
        with suppress_stdout():
            predictions = {
                item: predict(user_id, item, ratings, chosen_metric)
                for item in unrated_items
            }
        recommended = pd.Series(predictions, dtype='float64').sort_values(ascending=False)[:10]

        # Map ISBN -> title. Assumes `books` has 'ISBN' and 'Book-Title'
        # columns, as the column rename in the cleaning cell implies; an ISBN
        # without a title is printed as-is.
        titles = books.drop_duplicates(subset='ISBN').set_index('ISBN')['Book-Title']
        print(f"As per {approach} approach....Following books are recommended...")
        for rank, isbn in enumerate(recommended.index, start=1):
            print("{0}. {1}".format(rank, titles.get(isbn, isbn)))

    # React to changes of the selected value only, not to every trait update.
    select.observe(on_change, names='value')
    display(select)

In [93]:
# Smoke checks: nearest items and an item-based prediction for one sample book,
# then an id that is not in the matrix to exercise the validation message.
similarities, indices = findksimilaritems(item_id='0001056107', ratings=ratings_matrix)
prediction = predict_itembased(user_id=11676, item_id='0001056107', ratings=ratings_matrix)
recommendItem(user_id=999999, ratings=ratings_matrix)


Predicted rating for user 11676 -> item 0001056107: 1
User id should be a valid integer from this list :

 [   183    254    507 ... 278137 278188 278418] 


In [99]:
# Render a small script plus a link that shows/hides the notebook's stderr
# output areas (elements matching `div.output_stderr`). The script relies on
# jQuery's `$` being available in the page - presumably the classic Notebook
# front end; verify before relying on it elsewhere.
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

In [100]:
# Show the recommendation dropdown for user 4385.
recommendItem(4385,ratings_matrix)