### SGD 기반 MF 알고리즘

> 잠재요인 개수 K를 정한다

> 주어진 K에 따라 $P(M \times K)$와 $Q(N \times K)$행렬을 만들고 초기화

> 예측 평점 행렬 $\hat{R} = PQ^{T}$ 구하기

> R에 있는 실제 평점에 대해 오차를 줄이기 위한 P,Q 수정

> 오차가 기준값 이하로 줄거나 반복 횟수가 채워질 때까지 반복

In [3]:
import numpy as np
import pandas as pd

# Ratings: tab-separated file; column names are supplied explicitly.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '/content/drive/MyDrive/recosys/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
# Keep only the three columns used for modelling and cast them to int.
ratings = ratings[['user_id', 'movie_id', 'rating']].astype(int)

# Users: pipe-separated file.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    '/content/drive/MyDrive/recosys/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Movies: pipe-separated file; five descriptive columns followed by one
# column per genre.
# NOTE(review): the MovieLens 100K README names these genres 'Film-Noir',
# 'War' and 'Western'; 'Film_Horror', 'war' and 'western' below look like
# typos. Confirm that no later cell relies on these names before renaming.
i_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Horror', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'war', 'western',
]

movies = pd.read_csv(
    '/content/drive/MyDrive/recosys/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

### MF Class

> K: 잠재요인 개수

> alpha: 학습률 $\alpha$

> beta: 정규화 계수 $\beta$

> iterations: SGD 계산 반복 횟수

> verbose: SGD의 중간 학습 과정 출력 여부

In [7]:
class MF:
  """Biased matrix-factorization recommender trained with SGD.

  The rating of user ``i`` for item ``j`` is predicted as
  ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Entries of the rating matrix that
  equal 0 are treated as "no rating" and are skipped by both training and
  evaluation.

  Args:
    ratings: 2-D user-by-item rating matrix (anything ``np.array`` accepts,
      e.g. a pivoted DataFrame); 0 marks a missing rating.
    K: number of latent factors.
    alpha: learning rate.
    beta: regularization coefficient.
    iterations: number of SGD passes over the observed ratings.
    verbose: if true, print the training RMSE every 10th iteration.
  """

  def __init__(self, ratings, K, alpha, beta, iterations, verbose = True):
    # Convert the (possibly DataFrame) ratings into a plain ndarray.
    self.R = np.array(ratings)
    # Rows are users, columns are items.
    self.num_users, self.num_items = np.shape(self.R)
    self.K = K
    self.alpha = alpha
    self.beta = beta
    self.iterations = iterations
    self.verbose = verbose

  def rmse(self):
    """Return the RMSE of the current model over all observed ratings.

    As a side effect the per-rating predictions and errors are stored in
    ``self.predictions`` and ``self.errors`` (1-D arrays ordered like
    ``self.R.nonzero()``). The model parameters (``P``, ``Q``, ``b``,
    ``b_u``, ``b_d``) must already exist, i.e. ``train`` has been started.
    """
    xs, ys = self.R.nonzero()     # indices of the observed (non-zero) ratings
    # Vectorized form of get_prediction() applied to every (x, y) pair:
    # row-wise dot products of the matching P and Q rows, plus the biases.
    latent = np.sum(self.P[xs, :] * self.Q[ys, :], axis=1)
    self.predictions = self.b + self.b_u[xs] + self.b_d[ys] + latent
    self.errors = self.R[xs, ys] - self.predictions
    return np.sqrt(np.mean(self.errors**2))

  def train(self):
    """Initialise the parameters and run ``self.iterations`` SGD epochs.

    Returns:
      A list of ``(iteration, train_rmse)`` tuples, one per epoch, with
      ``iteration`` starting at 1.

    Raises:
      ValueError: if the rating matrix has no non-zero entry (there would
        be nothing to train on and the global mean would be undefined).
    """
    rows, columns = self.R.nonzero()      # observed ratings only
    if rows.size == 0:
      raise ValueError("ratings contains no non-zero entries to train on")

    # Latent factors start as small Gaussian noise.
    self.P = np.random.normal(scale = 1./self.K, size = (self.num_users, self.K))
    self.Q = np.random.normal(scale = 1./self.K, size = (self.num_items, self.K))

    # User / item biases start at zero; the global bias is the mean rating.
    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    self.b = np.mean(self.R[rows, columns])

    # (user index, item index, rating) triples that SGD iterates over.
    self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

    training_process = []
    for epoch in range(1, self.iterations + 1):
      np.random.shuffle(self.samples)     # visit samples in a new order each epoch
      self.sgd()
      rmse = self.rmse()
      training_process.append((epoch, rmse))
      if self.verbose and epoch % 10 == 0:
        print("Iteration: %d; Train RMSE = %.4f" % (epoch, rmse))

    return training_process

  def get_prediction(self, i, j):
    """Return the predicted rating of user index ``i`` for item index ``j``."""
    return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

  def sgd(self):
    """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
    for i, j, r in self.samples:
      e = r - self.get_prediction(i, j)

      self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
      self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

      # Snapshot P[i] so that the Q[j] gradient is evaluated at the same
      # (pre-update) parameters as the P[i] gradient; using the freshly
      # updated P[i] would mix two different parameter states in one step.
      p_i = self.P[i, :].copy()
      self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
      self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

# Build the user-by-movie rating matrix; user/movie pairs without a rating
# become 0, which MF treats as "no rating".
R_temp = ratings.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
).fillna(0)

# Factorize with 30 latent factors for 100 SGD iterations.
mf = MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
train_process = mf.train()
train_process

Iteration: 10; Train RMSE = 0.9585
Iteration: 20; Train RMSE = 0.9374
Iteration: 30; Train RMSE = 0.9281
Iteration: 40; Train RMSE = 0.9225
Iteration: 50; Train RMSE = 0.9183
Iteration: 60; Train RMSE = 0.9144
Iteration: 70; Train RMSE = 0.9097
Iteration: 80; Train RMSE = 0.9032
Iteration: 90; Train RMSE = 0.8942
Iteration: 100; Train RMSE = 0.8827


[(1, 1.0676652566074398),
 (2, 1.034363542913019),
 (3, 1.013055951517938),
 (4, 0.9982590950603741),
 (5, 0.987374581696917),
 (6, 0.9790177116413347),
 (7, 0.9723604021790533),
 (8, 0.9669296595627146),
 (9, 0.9623814446721866),
 (10, 0.9585141800037442),
 (11, 0.9551721433208192),
 (12, 0.952245157643139),
 (13, 0.9496584131980759),
 (14, 0.9473540464314776),
 (15, 0.9452811855325286),
 (16, 0.9434019580092713),
 (17, 0.9416891750888297),
 (18, 0.9401265219794797),
 (19, 0.9386878063435309),
 (20, 0.9373634208172436),
 (21, 0.9361342980774443),
 (22, 0.9349920176049518),
 (23, 0.933927569625219),
 (24, 0.9329301413501324),
 (25, 0.931995468273633),
 (26, 0.9311181728782818),
 (27, 0.9302932700260507),
 (28, 0.9295099275024064),
 (29, 0.9287706770987305),
 (30, 0.9280680276093409),
 (31, 0.9274028030597279),
 (32, 0.9267631892908171),
 (33, 0.9261532789986427),
 (34, 0.9255724205637507),
 (35, 0.925011297398623),
 (36, 0.9244768699124747),
 (37, 0.9239619946473585),
 (38, 0.923460602