# Collaborative Filtering example 

In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in xrange(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1] 
            # and the corresponding ratings 
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        for i in xrange(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 
    
    def recommend2(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
    
        for i in xrange(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

    def print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print 'Recommendation: '
        for u in xrange(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print '    Recommend item(s):', recommended_items, 'for user', u
            else: 
                print '    Recommend item', u, 'for user(s) : ', recommended_items

In [2]:
# data file 
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings.as_matrix()

rs = CF(Y_data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

Recommendation: 
    Recommend item(s): [2] for user 0
    Recommend item(s): [1] for user 1
    Recommend item(s): [] for user 2
    Recommend item(s): [4] for user 3
    Recommend item(s): [4] for user 4
    Recommend item(s): [0, 3, 4] for user 5
    Recommend item(s): [1] for user 6


In [3]:
rs = CF(Y_data, k = 2, uuCF = 0)
rs.fit()

rs.print_recommendation()

Recommendation: 
    Recommend item 0 for user(s) :  []
    Recommend item 1 for user(s) :  [1]
    Recommend item 2 for user(s) :  [0]
    Recommend item 3 for user(s) :  [5]
    Recommend item 4 for user(s) :  [3, 4, 5]


## MovieLens

In [4]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [5]:
rs = CF(rate_train, k = 30, uuCF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in xrange(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print 'User-user CF, RMSE =', RMSE

User-user CF, RMSE = 0.995198110088


In [7]:
rs = CF(rate_train, k = 30, uuCF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in xrange(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print 'Item-item CF, RMSE =', RMSE

Item-item CF, RMSE = 0.986791213271


# Matrix Factorization

In [104]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class MF(object):
    """docstring for CF"""
    def __init__(self, Y_data, K, lam = 0.01, Xinit = None, Winit = None, 
                 learning_rate = 0.01, max_iter = 1000, print_every = 100):
        self.Y_data = Y_data
        self.K = K
        self.lam = lam
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.print_every = print_every
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        
        if Xinit is None: 
            self.X = np.random.randn(self.n_items, K)
        else:
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: 
            self.W = Winit
            
        #self.all_users = self.Y_data[:,0] # all users (may be duplicated)
        self.n_ratings = Y_data.shape[0]
#         self.all_items = 
        
    def loss(self):
        L = 0 
        for i in xrange(self.Y_data.shape[0]):
            # user, item, rating
            n, m, rate = int(self.Y_data[i, 0]), int(self.Y_data[i, 1]), self.Y_data[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
        # regularization, don't ever forget this 
        L /= self.n_ratings
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 

    
    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user n, and the corresponding ratings
        """
        # y = self.Y_data[:,0] # all users (may be duplicated)
        # item indices rated by user_id
        # we need to +1 to user_id since in the rate_matrix, id starts from 1 
        # while index in python starts from 0
        ids = np.where(self.Y_data[:,0] == user_id)[0] 
        item_ids = self.Y_data[ids, 1].astype(np.int32) # index starts from 0 
        ratings = self.Y_data[ids, 2]
        return (item_ids, ratings)
        
        
    def get_users_who_rate_items(self, item_id):
        """
        get all users who rated item m and get the corresponding ratings
        """
        ids = np.where(self.Y_data[:,1] == item_id)[0] 
        user_ids = self.Y_data[ids, 0].astype(np.int32)
        ratings = self.Y_data[ids, 2]
        return (user_ids, ratings)
        
    def updateX(self):
        for m in xrange(self.n_items):
            user_ids, ratings = self.get_users_who_rate_items(m)
            Wm = self.W[:, user_ids]
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))
    
    def updateW(self):
        for n in xrange(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            grad_wn = -Xn.T.dot(ratings- Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
    
    def fit(self):
        for it in xrange(self.max_iter):
            self.updateX()
            self.updateW()
            if it % self.print_every == 0:
                print 'iter =', it, ', loss =', self.loss()
    
    
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        return self.X[i, :].dot(self.W[:, u])
    
    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        
        y_pred = self.X.dot(self.W[:, user_id])
        predicted_ratings= []
        for i in xrange(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        
        return predicted_ratings
        
# data file 
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings.as_matrix()


# rs = MF(Y_data, K = 2, max_iter = 10000, print_every = 1000)
# print rs.loss()
# print rs.get_items_rated_by_user(0)
# print rs.get_users_who_rate_items(0)
# rs.fit()
# rs.pred(6, 1)

iter = 0 , loss = 98.7008775425
iter = 1000 , loss = 0.342079701171
iter = 2000 , loss = 0.337830611833
iter = 3000 , loss = 0.337028421695
iter = 4000 , loss = 0.336791618403
iter = 5000 , loss = 0.336705926441
iter = 6000 , loss = 0.336671618102
iter = 7000 , loss = 0.336657033935
iter = 8000 , loss = 0.336650531035
iter = 9000 , loss = 0.336647486818


2.0722618757536737

In [76]:
rs.pred(5, 3)

3.5935106759483237

In [82]:
print rs.pred_for_user(5)

[(0, 2.3778602942313158), (3, 3.5935106759483237), (4, 4.7285231643348915)]


# Áp dụng lên MovieLens 100k

In [83]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [88]:
rs = MF(rate_train, K = 50, lam = 0.01, print_every = 10, learning_rate = 0.001, max_iter = 500)
rs.fit()

iter = 0 , loss = 1569278.5507
iter = 10 , loss = 149286.939727
iter = 20 , loss = 66308.0309511
iter = 30 , loss = 44584.8958369
iter = 40 , loss = 35328.1342293
iter = 50 , loss = 30316.6241823
iter = 60 , loss = 27185.1790385
iter = 70 , loss = 25032.6215557
iter = 80 , loss = 23448.6925012
iter = 90 , loss = 22222.2541493
iter = 100 , loss = 21234.4851404
iter = 110 , loss = 20413.8069973
iter = 120 , loss = 19714.7231755
iter = 130 , loss = 19107.0285835
iter = 140 , loss = 18569.9323588
iter = 150 , loss = 18088.6797059
iter = 160 , loss = 17652.5210832
iter = 170 , loss = 17253.4440334
iter = 180 , loss = 16885.354923
iter = 190 , loss = 16543.5357269
iter = 200 , loss = 16224.2742722
iter = 210 , loss = 15924.6069141
iter = 220 , loss = 15642.1358856
iter = 230 , loss = 15374.8973365
iter = 240 , loss = 15121.2644635
iter = 250 , loss = 14879.8753676
iter = 260 , loss = 14649.578621
iter = 270 , loss = 14429.3917064
iter = 280 , loss = 14218.468939
iter = 290 , loss = 14016.076

In [91]:
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in xrange(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print 'Item-item CF, RMSE =', RMSE

Item-item CF, RMSE = 2.96112237793


In [102]:
rs2 = MF(rate_train, K = 10, lam = 0.01, print_every = 20, learning_rate = 0.001, max_iter = 200)
rs2.fit()

iter = 0 , loss = 834968.255442
iter = 20 , loss = 63485.6626282
iter = 40 , loss = 42930.3071275
iter = 60 , loss = 37754.5380156
iter = 80 , loss = 35325.5213646
iter = 100 , loss = 33808.7359909
iter = 120 , loss = 32706.0820541
iter = 140 , loss = 31837.7213261
iter = 160 , loss = 31122.6640069
iter = 180 , loss = 30516.7608351


In [109]:
rs2 = MF(rate_train, K = 10, lam = 0.1, print_every = 20, \
         learning_rate = 0.001, max_iter = 200, Xinit = rs2.X, Winit = rs2.W)
rs2.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in xrange(n_tests):
    pred = rs2.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print 'Item-item CF, RMSE =', RMSE

iter = 0 , loss = 24260.0454056
iter = 20 , loss = 24197.0029039
iter = 40 , loss = 24140.5045481
iter = 60 , loss = 24083.5045692
iter = 80 , loss = 24017.802411
iter = 100 , loss = 23940.2497239
iter = 120 , loss = 23855.1684985
iter = 140 , loss = 23770.0085465
iter = 160 , loss = 23690.5411917
iter = 180 , loss = 23620.0167905
Item-item CF, RMSE = 1.27275453169


In [None]:
rs3 = MF(rate_train, K = 20, lam = 0.1, print_every = 20, \
         learning_rate = 0.001, max_iter = 500)
rs3.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in xrange(n_tests):
    pred = rs3.pred(rate_test[n, 0], rate_test[n, 1])
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print 'Item-item CF, RMSE =', RMSE

iter = 0 , loss = 1034909.86738
iter = 20 , loss = 67304.2053151
iter = 40 , loss = 41571.4099447
iter = 60 , loss = 34918.5504421
iter = 80 , loss = 31836.6336816
iter = 100 , loss = 29956.6596148
iter = 120 , loss = 28619.950963
iter = 140 , loss = 27579.2825843
iter = 160 , loss = 26721.9522912
iter = 180 , loss = 25988.912468
iter = 200 , loss = 25345.9122325
iter = 220 , loss = 24771.5424811
iter = 240 , loss = 24251.6451836
iter = 260 , loss = 23776.4215032
iter = 280 , loss = 23338.8127281
iter = 300 , loss = 22933.5416392
iter = 320 , loss = 22556.5238398
iter = 340 , loss = 22204.4976021
iter = 360 , loss = 21874.786983
iter = 380 , loss = 21565.1476009
iter = 400 , loss = 21273.6640641
iter = 420 , loss = 20998.6798201
iter = 440 , loss = 20738.7475182
