In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount("/content/drive")

import numpy as np
import pandas as pd

# Read the tab-separated ratings file (it has no header row, so the column
# names are supplied here) and discard the timestamp column in the same step.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = (
    pd.read_csv(
        '/content/drive/My Drive/data/u.data',
        sep='\t',
        names=rating_cols,
    )
    .drop(columns='timestamp')
)

def RMSE(y_true, y_pred):
  """Return the root-mean-square error between two array-likes.

  Args:
    y_true: Observed values (anything ``np.array`` accepts).
    y_pred: Predicted values, broadcast-compatible with ``y_true``.

  Returns:
    The square root of the mean squared difference.
  """
  residuals = np.array(y_true) - np.array(y_pred)
  mean_squared_error = np.mean(residuals ** 2)
  return np.sqrt(mean_squared_error)

def mf_rmse(R, PQ):
  """RMSE between a rating matrix and its reconstruction, over observed cells.

  Only positions where ``R`` is not NaN contribute; predictions for missing
  ratings are ignored.

  Args:
    R: 2-D array of true ratings with NaN marking missing entries.
    PQ: 2-D array of predicted ratings with the same shape as ``R``.

  Returns:
    The RMSE over the non-NaN entries of ``R`` (NaN if there are none).
  """
  R = np.asarray(R)
  PQ = np.asarray(PQ)

  # A boolean mask selects the observed entries in row-major order, the same
  # order the element-by-element double loop visited them, without a Python
  # loop over every cell.
  observed = ~np.isnan(R)
  return RMSE(R[observed], PQ[observed])

Mounted at /content/drive


In [None]:
# Build the full user x movie rating matrix from `ratings` (all rows are used;
# no train/test split is applied here). User/movie pairs that have no rating
# become NaN.
R = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .to_numpy()
)
display(R)

array([[ 5.,  3.,  4., ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 5., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan,  5., nan, ..., nan, nan, nan]])

In [None]:
num_users, num_movies = R.shape   # R.shape is (rows, columns) = (users, movies)

K = 100  # number of latent features
SEED = 42  # fixed seed so the random initialization is reproducible across runs
rng = np.random.default_rng(SEED)

# Fill both factor matrices with draws from the standard normal distribution.
P = rng.normal(size=(num_users, K))   # user factors, one row per user
Q = rng.normal(size=(K, num_movies))  # movie factors, one column per movie

In [None]:
learning_rate = 0.005  # SGD step size
r_rate = 0.005  # L2 regularization strength

# Find the (user, movie) positions that have a rating once, instead of
# re-scanning every cell of R for NaN on each epoch. np.argwhere returns them
# in row-major order, i.e. the same order a nested user/movie loop visits them.
observed = np.argwhere(~np.isnan(R))

for epoch in range(20):  # number of passes over the observed ratings
  for i, j in observed:
    pred_r = np.dot(P[i, :], Q[:, j])   # predicted rating
    error = R[i, j] - pred_r            # true rating minus prediction

    # Compute both steps from the factors as they were when `error` was
    # measured; updating P[i] first and then using it for Q[:, j] would mix
    # the old and new points in a single SGD step.
    p_step = learning_rate*(error * Q[:, j] - r_rate*P[i, :])
    q_step = learning_rate*(error * P[i, :] - r_rate*Q[:, j])
    P[i, :] += p_step
    Q[:, j] += q_step

  # NOTE: this RMSE is measured on the same ratings used for fitting, so it is
  # a training error, not an estimate of accuracy on unseen ratings.
  print("RMSE", epoch, mf_rmse(R, np.dot(P, Q)))


RMSE 0 3.0137838109842785
RMSE 1 1.5728348248999777
RMSE 2 1.1820292571403717
RMSE 3 0.9989365191805852
RMSE 4 0.8893742905696391
RMSE 5 0.8162674016721403
RMSE 6 0.7638754013060753
RMSE 7 0.7243632899471222
RMSE 8 0.6933943904361902
RMSE 9 0.6683749205968696
RMSE 10 0.6476594997470292
RMSE 11 0.6301545409624354
RMSE 12 0.6151051549569216
RMSE 13 0.6019736801423146
RMSE 14 0.5903669880808485
RMSE 15 0.5799911598336929
RMSE 16 0.5706222202829679
RMSE 17 0.5620866598320579
RMSE 18 0.5542481226954281
RMSE 19 0.5469980952039377


In [None]:
# Show the observed rating matrix followed by its reconstruction P x Q, each
# preceded by a label.
display("R", R, "PQ", P @ Q)

'R'

array([[ 5.,  3.,  4., ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 5., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan,  5., nan, ..., nan, nan, nan]])

'PQ'

array([[  4.27918634,   3.07788811,   4.59180113, ...,   1.66558568,
          0.46932598,  -1.51602565],
       [  3.34069234,   1.20363506,  -6.36627378, ...,   3.27375807,
         -5.79801673,   5.87447271],
       [  0.33454104,   3.89058908,  -7.99997869, ...,  -0.53309784,
         -4.98593468,   0.55576142],
       ...,
       [  5.20846914,  15.63765173, -11.21535474, ...,   9.09609584,
          8.51585366,  -4.53310087],
       [  4.99164637,  -2.3656031 ,   4.10014516, ...,  11.02236546,
          9.36125927,  -6.21162549],
       [  2.95585568,   4.51698819,   7.81731077, ...,  -5.06242686,
         -3.63498128,  -4.71339752]])