In [236]:
import numpy as np
from sklearn.cross_validation import train_test_split

## データの用意

In [237]:
dir_name = './ml-100k/'
# read the number of users and items
f = open(dir_name + 'u.info')
user_num, item_num = [(int)(li.split(" ")[0]) for li in f.readlines()[:-1]]
f.close()

# read ratings
R = np.zeros((user_num,item_num))
print R.shape
f = open(dir_name + 'u.data')
lines = [map(lambda x: int(x), li.split("\t")[:-1]) for li in f.readlines()]
f.close()
for li in lines:
    R[li[0]-1,li[1]-1] = li[2]


(943, 1682)


## Matrix Factorization

誤差関数は以下の通り
\begin{equation*} \min_{p,q,b_u,b_i} \sum_{(u,i) \in R} (r_{u,i} - p_u q^{T}_i - \mu - b_u - b_i)^{2} + \lambda (\|p\|^{2} + \|q\|^{2}  + b^{2}_u + b^{2}_i)\end{equation*}


In [249]:
def accuracy_score(ans,pred):
    """
    要素が0以外の正解率を計算
    """
    cmp_num = (ans != 0).sum() # ansの0でない要素の数
    mistake_num = (pred * (ans != 0) != ans).sum() #ansの０以外の要素のうちpredと不一致の数
    return (cmp_num - mistake_num) / float(cmp_num)

def cost_func(R,P,Q,mu,b_u,b_i,lamb):
    """
    return error fucntion
    """
    sigma = np.sum(((R - mu - b_u[:,np.newaxis] - b_i[np.newaxis,:] - np.dot(P,Q.T))*(R != 0))**2)#二乗誤差
    cost = sigma + (lamb/2) * (np.sum(b_i**2) + np.sum(b_u**2) + np.linalg.norm(P) + np.linalg.norm(Q))
    return cost

def matrix_factorization(R,latent_dim = 20,alpha=0.005,lamb=0.03):
    """
    R : 2-D np.array (the number of Users, the number of Items)
    latent_dim : the number of latent dimention 
    alpha : the learning rate of SGD
    lamb : regularization parameter
    """
    #split data
    test_R = R[:R.shape[0]/2, :R.shape[1]/2].copy()    
    R[:R.shape[0]/2, :R.shape[1]/2] = 0
    
    # initialize parameter
    low = 0.1
    high = 0.5
    P = np.random.uniform(low=low,high =high,size=(R.shape[0],latent_dim))
    Q = np.random.uniform(low=low,high =high,size=(R.shape[1],latent_dim))
    mu = R.sum() / (R != 0).sum() #  the average rating over all items
    b_u = np.random.uniform(low=low, high=high,size=(R.shape[0])) # assume the average rating of each user - mu
    b_i = np.random.uniform(low=low,high=high,size=(R.shape[1])) # assume the average rating of each item - mu
    
    #train and test
    for epoch in xrange(300):  
        for i,j in np.argwhere(R != 0):# i: user ID, j: Item ID
            err = R[i,j] - np.dot(P[i],Q[j].T) - mu - b_u[i] - b_i[j] 
            #update(SGD)
            b_u[i] += alpha*(err - lamb*b_u[i])
            b_i[j] += alpha*(err - lamb*b_i[j])
            Q[j] += alpha*(err*P[i] - lamb*Q[j])            
            P[i] += alpha*(err*Q[j] - lamb*P[i])
        #test
        train_cost = cost_func(R,P,Q,mu,b_u,b_i,lamb)
        pred = np.around(np.dot(P,Q.T)+ mu + b_u[:,np.newaxis] + b_i[np.newaxis,:]).astype("int8")
        test_cost = cost_func(R[:R.shape[0]/2, :R.shape[1]/2],P[:R.shape[0]/2],Q[:R.shape[1]/2],mu,b_u[:R.shape[0]/2],b_i[:R.shape[1]/2],lamb)
        if (epoch+1) % 10 == 0 or epoch == 0:
            print "epoch: %s, train cost: %.3f, train accuracy : %.3f, test cost: %.3f, test accuracy: %.3f" \
                % (epoch+1, train_cost, accuracy_score(R,pred), test_cost, accuracy_score(test_R,pred[:R.shape[0]/2, :R.shape[1]/2]))
            
        
    return pred
        

In [None]:
pred = matrix_factorization(R.copy() ,50)


epoch: 1, train cost: 120075.149, train accuracy : 0.266, test cost: 3.212, test accuracy: 0.219
epoch: 10, train cost: 42502.599, train accuracy : 0.430, test cost: 3.916, test accuracy: 0.319
epoch: 20, train cost: 32416.104, train accuracy : 0.483, test cost: 4.525, test accuracy: 0.345
epoch: 30, train cost: 23339.034, train accuracy : 0.552, test cost: 4.780, test accuracy: 0.355
epoch: 40, train cost: 16933.663, train accuracy : 0.626, test cost: 4.901, test accuracy: 0.358
epoch: 50, train cost: 12921.768, train accuracy : 0.695, test cost: 4.962, test accuracy: 0.359
epoch: 60, train cost: 10396.895, train accuracy : 0.750, test cost: 4.987, test accuracy: 0.358
epoch: 70, train cost: 8744.863, train accuracy : 0.791, test cost: 4.992, test accuracy: 0.356
epoch: 80, train cost: 7617.492, train accuracy : 0.822, test cost: 4.983, test accuracy: 0.355
epoch: 90, train cost: 6817.294, train accuracy : 0.846, test cost: 4.966, test accuracy: 0.354
