In [9]:
import os

import numpy as np
import pandas as pd

def make_data(arr, min_keep=10, max_keep=30, low=0, high=10):
    """Clip a rating matrix and keep only a random subset of entries per row.

    Every value of ``arr`` is first clipped to ``[low, high]``. Then, for each
    row, a count is drawn uniformly from ``min_keep..max_keep`` (inclusive) and
    that many distinct column positions are chosen at random; the clipped
    values at those positions are copied into the result and every other cell
    is left at 0.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array of raw values.
    min_keep, max_keep : int, optional
        Inclusive bounds for the number of entries kept per row.
    low, high : number, optional
        Bounds the values are clipped to.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``arr``.

    Notes
    -----
    Uses NumPy's global random state (``np.random``), so results depend on the
    current seed.
    """
    arr = np.clip(arr, low, high)
    n_rows, n_cols = arr.shape
    res = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        # One randint followed by one choice per row: the same draw sequence
        # as the original implementation, so seeded runs are unchanged.
        keep = np.random.randint(min_keep, max_keep + 1)
        # Sampling without replacement cannot return more positions than the
        # row has; without this cap a narrow matrix raises ValueError.
        keep = min(keep, n_cols)
        cols = np.random.choice(n_cols, keep, replace=False)
        res[i, cols] = arr[i, cols]
    return res

# Uniform ratings: draws from [0, 10) rounded to 2 decimal places, then
# clipped and randomly thinned per row by make_data.
rate_uniform = make_data(np.round(10 * np.random.rand(100, 1000), 2))
# Normal ratings: mean 5, standard deviation 1.8, rounded to whole numbers,
# then clipped and randomly thinned per row by make_data.
# NOTE(review): scale=1.8 is a standard deviation (variance ~3.24); an earlier
# comment here stated variance 2 -- confirm which one is intended.
rate_normal = make_data(np.round(np.random.normal(loc=5, scale=1.8, size=(100, 1000))))

In [10]:
def data_save(arr, name):
    """Write a 2-D array to ``data/<name>.csv`` with 1-based row/column labels.

    Parameters
    ----------
    arr : array-like
        2-D numeric data; it is stored as floats.
    name : str
        File name without extension; the file is written to
        ``data/<name>.csv`` relative to the current working directory.

    Returns
    -------
    None

    Notes
    -----
    Row and column labels are derived from the shape of ``arr`` (1..n_rows and
    1..n_cols) rather than being fixed at 100 x 1000, and the target directory
    is created when it does not exist yet.
    """
    n_rows, n_cols = np.shape(arr)
    df = pd.DataFrame(
        data=arr,
        index=range(1, n_rows + 1),
        columns=range(1, n_cols + 1),
        dtype='float',
    )
    path = 'data/{}.csv'.format(name)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path)

# Persist both synthetic rating matrices as data/rate_uniform.csv and
# data/rate_normal.csv.
data_save(rate_uniform, 'rate_uniform')
data_save(rate_normal, 'rate_normal')

In [7]:
# Seed NumPy's global RNG so the np.random draws made after this point (such
# as the random P/Q initialisation in matrix_factorization) are repeatable.
# NOTE(review): in this file's cell order the data-generation cell sits above
# this call, so on a top-to-bottom run rate_uniform / rate_normal are drawn
# before the seed is set -- confirm whether seeding should move to the first cell.
np.random.seed(seed=0)

# Residual used by the update rule
def get_rating_error(r, p, q):
    """Return the difference between an observed rating and its prediction.

    ``r`` is the observed rating; the prediction is ``np.dot(p, q)`` of the
    two factor vectors ``p`` and ``q``.
    """
    predicted = np.dot(p, q)
    return r - predicted

# Loss function
def get_error(R, P, Q, beta):
    """Return the loss of the factorisation ``P.T @ Q`` against ``R``.

    The loss is the sum of squared residuals over the non-zero cells of ``R``
    (zero cells are treated as unobserved and skipped) plus
    ``beta / 2 * (norm(P) + norm(Q))``, where ``norm`` is ``np.linalg.norm``
    with its default arguments.

    Parameters
    ----------
    R : array-like, shape (n_rows, n_cols)
        Rating matrix; 0 marks a missing entry.
    P : numpy.ndarray, shape (K, n_rows)
        Row factors; column ``i`` belongs to row ``i`` of ``R``.
    Q : numpy.ndarray, shape (K, n_cols)
        Column factors; column ``j`` belongs to column ``j`` of ``R``.
    beta : float
        Weight of the norm penalty.

    Returns
    -------
    numpy.float64
        The loss value.

    Notes
    -----
    The residuals are evaluated in one vectorised pass over the observed
    cells only, instead of a Python loop over every cell.

    NOTE(review): the penalty uses the plain (unsquared) norms. Conventional
    L2 regularisation squares them; it is left as-is here because
    matrix_factorization compares this value with its ``threshold`` -- confirm
    the intended formula before changing it.
    """
    R = np.asarray(R)
    rows, cols = np.nonzero(R)
    # Column-wise dot products P[:, i] . Q[:, j] for every observed (i, j).
    predicted = np.sum(P[:, rows] * Q[:, cols], axis=0)
    residuals = R[rows, cols] - predicted
    error = np.sum(residuals ** 2)
    error += beta / 2 * (np.linalg.norm(P) + np.linalg.norm(Q))
    return error

# R = matrix to approximate, K = number of latent dimensions
def matrix_factorization(R, K, steps=1000, alpha=0.0005, beta=0.1, threshold=100):
    """Approximate ``R`` by ``P.T @ Q`` using per-entry gradient updates.

    ``P`` (K x n_rows) and ``Q`` (K x n_cols) start from ``np.random.rand``.
    On every step each non-zero cell of ``R`` is visited in row-major order
    and the matching columns of ``P`` and ``Q`` are nudged along the residual.
    After each step the loss from ``get_error`` is computed; training stops
    early once it falls below ``threshold``.

    Parameters
    ----------
    R : array-like, shape (n_rows, n_cols)
        Matrix to approximate; 0 marks a missing entry that is skipped.
    K : int
        Number of latent dimensions.
    steps : int, optional
        Maximum number of passes over the observed cells.
    alpha : float, optional
        Learning rate.
    beta : float, optional
        Penalty weight handed to ``get_error``.
    threshold : float, optional
        Loss value below which training stops.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``P`` with shape (K, n_rows) and ``Q`` with shape (K, n_cols).

    Notes
    -----
    Progress is printed every 10th step and once more on the step where the
    loss drops below ``threshold`` (a single line even when both coincide),
    followed by the completion message.

    NOTE(review): ``beta`` only affects the reported loss; the update step
    below contains no regularisation term -- confirm whether that is intended.
    """
    P = np.random.rand(K, len(R))
    Q = np.random.rand(K, len(R[0]))
    # The set of observed cells does not change during training, so locate it
    # once instead of rescanning the whole matrix on every step. np.nonzero
    # reports indices in row-major order, i.e. the order of the nested i/j loops.
    observed = list(zip(*np.nonzero(R)))
    for step in range(steps):
        for i, j in observed:
            err = get_rating_error(R[i][j], P[:, i], Q[:, j])
            # All K components at once; as in the element-wise version, Q is
            # updated with the already-updated P column.
            P[:, i] += alpha * (2 * err * Q[:, j])
            Q[:, j] += alpha * (2 * err * P[:, i])
        error = get_error(R, P, Q, beta)
        converged = error < threshold
        if step % 10 == 0 or converged:
            print('Step{}  error : {}'.format(step, error))
        if converged:
            print('学習終了')
            break
    return P, Q

In [11]:
# Factorise the uniform ratings, rebuild the full matrix and keep the
# predictions inside the 0-10 rating range before saving them.
P, Q = matrix_factorization(rate_uniform, 10)
pred_rate_uniform = np.clip(np.dot(P.T, Q), 0, 10)
data_save(pred_rate_uniform, 'pred_rate_uniform')

# Same for the normal ratings. The upper clip previously substituted
# pred_rate_uniform here, which replaced the normal predictions with the
# uniform ones; np.clip works on the one array and avoids that mix-up.
P, Q = matrix_factorization(rate_normal, 10)
pred_rate_normal = np.clip(np.dot(P.T, Q), 0, 10)
data_save(pred_rate_normal, 'pred_rate_normal')

Step0  error : 27381.892462899297
Step10  error : 17594.603792161368
Step20  error : 13695.111601741399
Step30  error : 11679.922451678167
Step40  error : 10344.916226373378
Step50  error : 9325.132835238779
Step60  error : 8476.404668950563
Step70  error : 7726.791825938097
Step80  error : 7036.82311862267
Step90  error : 6385.396474067415
Step100  error : 5763.033357642666
Step110  error : 5167.707336529846
Step120  error : 4601.77689274637
Step130  error : 4069.595708281962
Step140  error : 3575.730135494936
Step150  error : 3123.782827469289
Step160  error : 2715.7772018797345
Step170  error : 2351.998965169603
Step180  error : 2031.1593132137484
Step190  error : 1750.742583279611
Step200  error : 1507.4199205118573
Step210  error : 1297.4415127478358
Step220  error : 1116.9557611948671
Step230  error : 962.2366752944125
Step240  error : 829.8245782094683
Step250  error : 716.5976765778994
Step260  error : 619.795085729804
Step270  error : 537.0093664668856
Step280  error : 466.162