In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [15]:
class NBCF:
    
    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1): # uuCF=1: user-user oriented, uuCF=0: item-item oriented rec system
        '''
        Generate utility matrix Y
        '''
        self.uuCF = uuCF
        # exchange position of the first 2 columns in item-item rec system, representing matrix transpose
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]] 
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        
    def add(self, new_data):
        '''
        Update Utility matrix by adding new rows to the bottom of Utility Matrix
        For simplicity, suppose there's no new users or items, nor changes in ratings
        '''
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        '''
        Normalize utility matrix Yhat
        '''
        users = self.Y_data[:, 0] # first column of the Y_data- all users
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,)) #a vector with size=number of users tp store mean user ratings
        
        for n in range(self.n_users):
            # row indices of ratings by user n
            # indices need to be converted to intergers 
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all  ratings by user n
            item_ids = self.Y_data[ids, 1]
            # the corresponding rating values
            ratings = self.Y_data[ids, 2]
            # caculate mean of those ratings
            m = np.mean(ratings)
            # avoid empty array and nan value
            if np.isnan(m):
                m = 0
            # update vector mu
            self.mu[n] = m
            #normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n] #only save the ratings that user n did
        
        # Rating matris is a sparse matrix. 
        # ex: n_users=1M, n_items=100k -> rating matrix has shape (100k, 1M), 
        # many of its value maybe 0 and we may not have enough memory to store this matrix
        # For memory and computing efficiency, only store non-zeros values and their locations
        # using sparse coo matrix (row location, col location, value)
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                                      (self.Ybar_data[:,1], 
                                       self.Ybar_data[:, 0])), 
                                      (self.n_items, self.n_users))
        # convert this into Compressed sparse row fomat
        self.Ybar = self.Ybar.tocsr()
    
    def similarity(self):
        '''
        User similarity matrix S
        '''
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
        
    def fit(self):
        '''
        Normalize data and calculate similarity matrix again after adding ratings
        '''
        self.normalize_Y()
        self.similarity()
        
    def __pred(self, u, i, normalized=1):
        '''
        Predict the rating of user u for item i (normalized)  
        '''
        # find indices of all users that rated item i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # find users of those indices
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # find similarity between the user u with other users who also rated item i
        sim = self.S[u, users_rated_i]
        # find the k most similar users
        a = np.argsort(sim)[-self.k:]  #index of k most similar users
        # the corresponding similarity levels
        nearest_s = sim[a]
        # the rating for item i rated by k most similar users
        r = self.Ybar[i, users_rated_i[a]]
        #apply the formula
        if normalized:
             # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized=1):
        if self.uuCF:
            return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized) #if item-item, transpose matrix
    
    def recommend(self, u, normalized=1):
        """
        Determine all items should be recommended for user u. (uuCF =1)
        or all users who might have interest on item u (uuCF = 0)
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:,0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append(i)
                    
        return recommended_items
    
    def print_recommendation(self):
        '''
        Print items recommended for each users
        '''
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('Recommend item(s):', recommended_items, 'to user', u)
            else: 
                print ('Recommend item', u, 'to user(s) : ', recommended_items)   

In [16]:
#test

r_cols =['user_id', 'item_id', 'rating']
ratings = pd.read_csv('./DATA/ex.dat.txt', sep = ' ',names = r_cols, encoding = 'latin-1')
Y_data = ratings.values

rs = NBCF(Y_data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

Recommendation: 
Recommend item(s): [2] to user 0
Recommend item(s): [1] to user 1
Recommend item(s): [] to user 2
Recommend item(s): [4] to user 3
Recommend item(s): [4] to user 4
Recommend item(s): [0, 3, 4] to user 5
Recommend item(s): [1] to user 6


In [17]:
rs = NBCF(Y_data, k = 2, uuCF = 0)
rs.fit()

rs.print_recommendation()

Recommendation: 
Recommend item 0 to user(s) :  []
Recommend item 1 to user(s) :  [1]
Recommend item 2 to user(s) :  [0]
Recommend item 3 to user(s) :  [5]
Recommend item 4 to user(s) :  [3, 4, 5]


APPLY TO MovieLens100k DATASET

In [20]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('./DATA/ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('./DATA/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [24]:
# result with User-user NDCF:

rs = NBCF(rate_train, k = 30, uuCF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print ('User-user CF, RMSE =', RMSE)


User-user CF, RMSE = 0.9951981100882598


In [25]:
rs = NBCF(rate_train, k = 30, uuCF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print ('Item-item CF, RMSE =', RMSE)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Item-item CF, RMSE = 0.9867912132705384
