In [255]:
import numpy as np
from sklearn.cross_validation import train_test_split

## データの用意

In [256]:
dir_name = './ml-100k/'
# read the number of users and items
f = open(dir_name + 'u.info')
user_num, item_num = [(int)(li.split(" ")[0]) for li in f.readlines()[:-1]]
f.close()

# read ratings
R = np.zeros((user_num,item_num))
print R.shape
f = open(dir_name + 'u.data')
lines = [map(lambda x: int(x), li.split("\t")[:-1]) for li in f.readlines()]
f.close()
for li in lines:
    R[li[0]-1,li[1]-1] = li[2]


(943, 1682)


## Matrix Factorization

誤差関数は以下の通り
\begin{equation*} \min_{p,q,b_u,b_i} \sum_{(u,i) \in R} (r_{u,i} - p_u q^{T}_i - \mu - b_u - b_i)^{2} + \lambda (\|p\|^{2} + \|q\|^{2}  + b^{2}_u + b^{2}_i)\end{equation*}


In [257]:
def accuracy_score(ans,pred):
    """
    要素が0以外の正解率を計算
    """
    cmp_num = (ans != 0).sum() # ansの0でない要素の数
    mistake_num = (pred * (ans != 0) != ans).sum() #ansの０以外の要素のうちpredと不一致の数
    return (cmp_num - mistake_num) / float(cmp_num)

def cost_func(R,P,Q,mu,b_u,b_i,lamb):
    """
    return error fucntion
    """
    sigma = np.sum(((R - mu - b_u[:,np.newaxis] - b_i[np.newaxis,:] - np.dot(P,Q.T))*(R != 0))**2)#二乗誤差
    cost = sigma + (lamb/2) * (np.sum(b_i**2) + np.sum(b_u**2) + np.linalg.norm(P) + np.linalg.norm(Q))
    return cost

def matrix_factorization(R,latent_dim = 20,alpha=0.005,lamb=0.03):
    """
    R : 2-D np.array (the number of Users, the number of Items)
    latent_dim : the number of latent dimention 
    alpha : the learning rate of SGD
    lamb : regularization parameter
    """
    #split data
    test_R = R[:R.shape[0]/2, :R.shape[1]/2].copy()    
    R[:R.shape[0]/2, :R.shape[1]/2] = 0
    
    # initialize parameter
    low = 0.1
    high = 0.5
    P = np.random.uniform(low=low,high =high,size=(R.shape[0],latent_dim))
    Q = np.random.uniform(low=low,high =high,size=(R.shape[1],latent_dim))
    mu = R.sum() / (R != 0).sum() #  the average rating over all items
    b_u = np.random.uniform(low=low, high=high,size=(R.shape[0])) # assume the average rating of each user - mu
    b_i = np.random.uniform(low=low,high=high,size=(R.shape[1])) # assume the average rating of each item - mu
    
    #train and test
    for epoch in xrange(300):  
        for i,j in np.argwhere(R != 0):# i: user ID, j: Item ID
            err = R[i,j] - np.dot(P[i],Q[j].T) - mu - b_u[i] - b_i[j] 
            #update(SGD)
            b_u[i] += alpha*(err - lamb*b_u[i])
            b_i[j] += alpha*(err - lamb*b_i[j])
            Q[j] += alpha*(err*P[i] - lamb*Q[j])            
            P[i] += alpha*(err*Q[j] - lamb*P[i])
        #test
        train_cost = cost_func(R,P,Q,mu,b_u,b_i,lamb)
        pred = np.around(np.dot(P,Q.T)+ mu + b_u[:,np.newaxis] + b_i[np.newaxis,:]).astype("int8")
        test_cost = cost_func(R[:R.shape[0]/2, :R.shape[1]/2],P[:R.shape[0]/2],Q[:R.shape[1]/2],mu,b_u[:R.shape[0]/2],b_i[:R.shape[1]/2],lamb)
        if (epoch+1) % 20 == 0 or epoch == 0:
            print "epoch: %s, train cost: %.3f, train accuracy : %.3f, test cost: %.3f, test accuracy: %.3f" \
                % (epoch+1, train_cost, accuracy_score(R,pred), test_cost, accuracy_score(test_R,pred[:R.shape[0]/2, :R.shape[1]/2]))
            
        
    return pred
        

In [258]:
pred = matrix_factorization(R.copy() ,50)


epoch: 1, train cost: 119814.368, train accuracy : 0.266, test cost: 3.234, test accuracy: 0.220
epoch: 20, train cost: 32389.288, train accuracy : 0.483, test cost: 4.556, test accuracy: 0.342
epoch: 40, train cost: 16977.212, train accuracy : 0.628, test cost: 4.912, test accuracy: 0.356
epoch: 60, train cost: 10374.159, train accuracy : 0.752, test cost: 4.984, test accuracy: 0.357
epoch: 80, train cost: 7579.391, train accuracy : 0.826, test cost: 4.971, test accuracy: 0.355
epoch: 100, train cost: 6188.980, train accuracy : 0.865, test cost: 4.929, test accuracy: 0.351
epoch: 120, train cost: 5395.588, train accuracy : 0.887, test cost: 4.879, test accuracy: 0.349
epoch: 140, train cost: 4895.185, train accuracy : 0.901, test cost: 4.831, test accuracy: 0.347
epoch: 160, train cost: 4556.067, train accuracy : 0.911, test cost: 4.787, test accuracy: 0.346
epoch: 180, train cost: 4313.713, train accuracy : 0.917, test cost: 4.748, test accuracy: 0.344
epoch: 200, train cost: 4133.24