# Matrix Factorization

In [45]:
from __future__ import print_function 
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

class MF(object):
    def __init__(self, Y, K, lam = 0.02, Xinit = None, Winit = None, 
                 learning_rate = 0.005, max_epoch = 20, print_every = 2):
        self.Y      = Y
        self.user   = Y[:, 0]
        self.item   = Y[:, 1]
        self.rating = Y[:, 2]
        self.n_users       = int(np.max(self.user)) + 1 
        self.n_items       = int(np.max(self.item)) + 1
        self.n_ratings     = Y.shape[0] # number of known ratings
        self.X = .1*np.random.randn(self.n_items, K) if Xinit is None else Xinit 
        self.W = .1*np.random.randn(self.n_users, K) if Winit is None else Winit 
        self.b = np.zeros(self.n_items) # item biases
        self.d = np.zeros(self.n_users) # user biases
        self.mu = np.mean(self.rating)
        self.K      = K    # 
        self.lam    = lam  # regularization parameter 
        self.learning_rate = learning_rate
        self.max_epoch      = max_epoch # maximum number of iterations 
        self.print_every   = print_every # print loss+ RMSE on training data after each ? iters 

    def _loss(self):
        L = 0 
        for n in range(self.n_ratings):
            # user_id, item_id, rating
            u, i, rating = self.user[n], self.item[n], self.rating[n]
            pred_rating = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu 
            L += 0.5*(pred_rating - rating)**2 + .5*self.lam*(self.b[i]*2 + 
                    self.d[u]*2 + np.sum(self.X[i]**2) + np.sum(self.W[u]**2))
        
        return L/self.n_ratings
    
    def fit(self):
        for it in range(self.max_epoch):
            # mix data 
            idx = np.random.permutation(self.n_ratings)
            for n in idx:
                u, i, rating = self.user[n], self.item[n], self.rating[n]
                pred_rating = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu 
                error = pred_rating - rating 
                self.b[i] -= self.learning_rate*(error + self.lam*self.b[i])
                self.d[u] -= self.learning_rate*(error + self.lam*self.d[u])
                self.X[i] -= self.learning_rate*(error*self.W[u] + self.lam*self.X[i])
                self.W[u] -= self.learning_rate*(error*self.X[i] + self.lam*self.W[u])

            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                print('iter = %d, loss = %.4f, RMSE train = %.4f'%(it + 1, self._loss(), rmse_train))
    
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        """
        u, i = int(u), int(i)
        pred = self.X[i].dot(self.W[u]) + self.b[i] + self.d[u] + self.mu# + bias
        return max(0, min(5, pred)) # pred should be between 0 and 5 in MoviesLen 
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test 
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE
        

# Áp dụng lên MovieLens 100k

In [46]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [47]:
rs = MF(rate_train, K = 100, lam = .1, print_every = 1, learning_rate = 0.005, max_epoch = 50)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nMatrix Factorization CF, RMSE = %.4f' %RMSE)

iter = 1, loss = 0.5727, RMSE train = 0.9862
iter = 2, loss = 0.5272, RMSE train = 0.9508
iter = 3, loss = 0.5020, RMSE train = 0.9332
iter = 4, loss = 0.4855, RMSE train = 0.9219
iter = 5, loss = 0.4730, RMSE train = 0.9136
iter = 6, loss = 0.4640, RMSE train = 0.9076
iter = 7, loss = 0.4567, RMSE train = 0.9027
iter = 8, loss = 0.4511, RMSE train = 0.8986
iter = 9, loss = 0.4459, RMSE train = 0.8950
iter = 10, loss = 0.4417, RMSE train = 0.8918
iter = 11, loss = 0.4379, RMSE train = 0.8887
iter = 12, loss = 0.4348, RMSE train = 0.8857
iter = 13, loss = 0.4317, RMSE train = 0.8827
iter = 14, loss = 0.4290, RMSE train = 0.8799
iter = 15, loss = 0.4265, RMSE train = 0.8770
iter = 16, loss = 0.4243, RMSE train = 0.8740
iter = 17, loss = 0.4217, RMSE train = 0.8707
iter = 18, loss = 0.4196, RMSE train = 0.8676
iter = 19, loss = 0.4177, RMSE train = 0.8643
iter = 20, loss = 0.4154, RMSE train = 0.8608
iter = 21, loss = 0.4136, RMSE train = 0.8573
iter = 22, loss = 0.4112, RMSE train = 0.85

In [36]:
rs = MF(rate_train, K = 100, lam = .1, print_every = 1, learning_rate = 0.005, max_epoch = 50, Xinit = rs.X, Winit = rs.W)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nMatrix Factorization CF, RMSE = %.4f' %RMSE)

iter = 1, loss = 0.4503, RMSE train = 0.5233
iter = 2, loss = 0.4244, RMSE train = 0.5069
iter = 3, loss = 0.4116, RMSE train = 0.5101
iter = 4, loss = 0.4039, RMSE train = 0.5180
iter = 5, loss = 0.3982, RMSE train = 0.5263
iter = 6, loss = 0.3938, RMSE train = 0.5343
iter = 7, loss = 0.3904, RMSE train = 0.5418
iter = 8, loss = 0.3878, RMSE train = 0.5487
iter = 9, loss = 0.3852, RMSE train = 0.5547
iter = 10, loss = 0.3832, RMSE train = 0.5602
iter = 11, loss = 0.3813, RMSE train = 0.5652
iter = 12, loss = 0.3798, RMSE train = 0.5696
iter = 13, loss = 0.3783, RMSE train = 0.5739
iter = 14, loss = 0.3773, RMSE train = 0.5777
iter = 15, loss = 0.3761, RMSE train = 0.5812
iter = 16, loss = 0.3750, RMSE train = 0.5842
iter = 17, loss = 0.3741, RMSE train = 0.5872
iter = 18, loss = 0.3729, RMSE train = 0.5898
iter = 19, loss = 0.3722, RMSE train = 0.5923
iter = 20, loss = 0.3716, RMSE train = 0.5947
iter = 21, loss = 0.3707, RMSE train = 0.5967
iter = 22, loss = 0.3699, RMSE train = 0.59

In [40]:
rs.X.shape
rs.n_items

5