### Train/Test 분리 MF 알고리즘

In [1]:
import numpy as np
import pandas as pd

# Directory holding the data files; kept in one place so the three reads below
# cannot drift apart.
# NOTE(review): the file names/layout look like the MovieLens 100K dataset — confirm.
DATA_DIR = '/content/drive/MyDrive/recosys'

# Ratings: tab-separated, no header row. The timestamp column is read but dropped
# immediately; the remaining columns are cast to int.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(DATA_DIR + '/u.data', sep = '\t', names = r_cols, encoding = 'latin-1')
ratings = ratings[['user_id', 'movie_id', 'rating']].astype(int)

# Users: pipe-separated, no header row.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(DATA_DIR + '/u.user', sep = '|', names = u_cols, encoding = 'latin-1')

# Movies: pipe-separated, no header row; five descriptive columns followed by
# one flag column per genre.
# Fix: the 16th column was labelled 'Film_Horror', which duplicated the adjacent
# 'Horror' genre; in the MovieLens 100K layout that column is 'Film-Noir'.
# NOTE(review): 'war'/'western' are left lower-case exactly as before so any
# existing column lookups keep working.
i_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB URL', 'unknown', 'Action',
          'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
          'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'war', 'western']

movies = pd.read_csv(DATA_DIR + '/u.item', sep = '|', names = i_cols, encoding = 'latin-1')

#### Train/Test 분리

In [2]:
from sklearn.utils import  shuffle
# Fraction of the (shuffled) ratings that goes into the training split.
TRAIN_SIZE = 0.75

# Shuffle with a fixed seed so the split is reproducible, then cut the shuffled
# frame once: rows before `cutoff` are train, the rest are test.
ratings = shuffle(ratings, random_state = 1)
cutoff = int(len(ratings) * TRAIN_SIZE)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

### 새로운 class NEW_MF 생성

In [13]:
class NEW_MF():
  """Biased matrix factorization trained with SGD, with optional held-out test ratings.

  A rating is predicted as ``b + b_u[i] + b_d[j] + P[i] . Q[j]`` where ``b`` is the
  mean of the non-zero entries of the rating matrix, ``b_u`` / ``b_d`` are per-user /
  per-item biases and ``P`` / ``Q`` are K-dimensional latent factor matrices.
  A zero in the rating matrix means "no rating".

  Attributes set by ``__init__``:
    R: rating matrix as a NumPy array (a copy of the values of ``ratings``).
    user_id_index / index_user_id: row label -> row position and the reverse.
    item_id_index / index_item_id: column label -> column position and the reverse.
    test_set: list of ``[row, col, rating]`` entries; empty until ``set_test`` is called.
  """

  def __init__(self, ratings, K, alpha, beta, iterations, verbose = True):
    """Store the rating matrix, the id <-> position mappings and the hyper-parameters.

    Args:
      ratings: DataFrame whose index holds user ids, whose columns hold item ids
        and whose values are ratings (0 = no rating).
      K: number of latent factors.
      alpha: SGD learning rate.
      beta: regularization strength.
      iterations: number of SGD passes over the known ratings.
      verbose: when True, progress is printed every 10th iteration.
    """
    self.R = np.array(ratings)

    # Iterating a DataFrame yields its column labels (the item ids).
    self.item_id_index = {one_id: i for i, one_id in enumerate(ratings)}
    self.index_item_id = dict(enumerate(ratings))

    # Row labels are the user ids; reading the index directly avoids transposing
    # the whole frame just to iterate over its labels.
    self.user_id_index = {one_id: i for i, one_id in enumerate(ratings.index)}
    self.index_user_id = dict(enumerate(ratings.index))

    self.num_users, self.num_items = np.shape(self.R)
    self.K = K
    self.alpha = alpha
    self.beta = beta
    self.iterations = iterations
    self.verbose = verbose

    # Defined up front so test() / test_rmse() work even if set_test() was never called.
    self.test_set = []

  def _predict_pairs(self, rows, cols):
    """Return the predictions for the (rows[k], cols[k]) pairs as one array."""
    latent = (self.P[rows] * self.Q[cols]).sum(axis = 1)
    return self.b + self.b_u[rows] + self.b_d[cols] + latent

  def rmse(self) :
    """Return the RMSE over every non-zero entry of R.

    Also stores the per-entry predictions and errors in ``self.predictions`` and
    ``self.errors`` (NumPy arrays, in the order returned by ``R.nonzero()``).
    """
    xs, ys = self.R.nonzero()
    self.predictions = self._predict_pairs(xs, ys)
    self.errors = self.R[xs, ys] - self.predictions
    return np.sqrt(np.mean(self.errors**2))

  def set_test(self, ratings_test):
    """Hold out the given ratings as the test set.

    The first three columns of ``ratings_test`` are read positionally as
    (user id, item id, rating). Each listed cell of R is set to 0 so that it is
    no longer part of the training samples.

    Returns:
      The list of ``[row, col, rating]`` entries, also stored in ``self.test_set``.

    Raises:
      KeyError: if a user id or item id is not present in the rating matrix.
    """
    test_set = []
    # One pass over plain tuples instead of three .iloc scalar lookups per row.
    for user_id, item_id, rating in ratings_test.iloc[:, :3].itertuples(index = False, name = None):
      x = self.user_id_index[user_id]
      y = self.item_id_index[item_id]
      test_set.append([x, y, rating])
      self.R[x, y] = 0

    self.test_set = test_set
    return test_set

  def test_rmse(self):
    """Return the RMSE over the held-out test set, or NaN when the test set is empty."""
    if not self.test_set:
      # An RMSE over zero samples is undefined; report NaN instead of dividing by zero.
      return np.nan
    rows, cols, actual = (np.array(column) for column in zip(*self.test_set))
    errors = actual - self._predict_pairs(rows, cols)
    return np.sqrt(np.mean(errors**2))

  def _initialize(self):
    """Reset P, Q, the biases and the global mean, and rebuild the SGD sample list."""
    self.P = np.random.normal(scale = 1./self.K, size = (self.num_users, self.K))
    self.Q = np.random.normal(scale = 1./self.K, size = (self.num_items, self.K))

    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)

    # Only non-zero cells count as ratings: they define the global mean and the samples.
    rows, columns = self.R.nonzero()
    self.b = np.mean(self.R[rows, columns])
    self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

  def _run_epoch(self):
    """Shuffle the samples, run one SGD pass and return the resulting train RMSE."""
    np.random.shuffle(self.samples)
    self.sgd()
    return self.rmse()

  def train(self):
    """Fit the model from scratch, tracking only the training RMSE.

    Returns:
      A list of ``(iteration, train_rmse)`` tuples, one per iteration.
    """
    self._initialize()

    training_process = []
    for i in range(self.iterations):
      rmse = self._run_epoch()
      training_process.append((i+1, rmse))
      if self.verbose and (i + 1) % 10 == 0:
        print("Iteration: %d; Train RMSE = %.4f" % (i+1, rmse))

    return training_process

  def test(self):
    """Fit the model from scratch, tracking both training and test RMSE.

    Returns:
      A list of ``(iteration, train_rmse, test_rmse)`` tuples, one per iteration.
      ``test_rmse`` is NaN when no test set has been registered via ``set_test``.
    """
    self._initialize()

    training_process = []
    for i in range(self.iterations):
      rmse1 = self._run_epoch()
      rmse2 = self.test_rmse()
      training_process.append((i+1, rmse1, rmse2))
      if self.verbose and (i + 1) % 10 == 0:
        print("Iteration: %d; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))

    return training_process

  def get_prediction(self, i, j):
    """Return the predicted rating for row position ``i`` and column position ``j``."""
    return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

  def get_one_prediction(self, user_id, item_id):
    """Return the prediction for a (user id, item id) pair, rounded to 4 decimals.

    Raises:
      KeyError: if either id is not present in the rating matrix.
    """
    return np.round(self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id]), 4)

  def full_prediction(self):
    """Return the full (num_users x num_items) prediction matrix, rounded to 4 decimals."""
    return np.round(self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T), 4)

  def sgd(self):
    """Run one SGD pass over ``self.samples``, updating b_u, b_d, P and Q in place."""
    for i, j, r in self.samples:
      e = r - self.get_prediction(i, j)

      self.b_u[i] += self.alpha * (e - self.beta*self.b_u[i])
      self.b_d[j] += self.alpha * (e - self.beta*self.b_d[j])

      # The Q update reads the P row that was just updated on the line above;
      # this ordering is kept as-is so training results are unchanged.
      self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta*self.P[i, :])
      self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta*self.Q[j, :])

# Build the user x movie rating matrix from the full ratings table; missing
# (user, movie) pairs become 0.
R_temp = ratings.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')
R_temp = R_temp.fillna(0)

# Create the model, register the held-out ratings as the test set (set_test
# zeroes those cells in the model's matrix), then fit while tracking both RMSEs.
mf = NEW_MF(R_temp, K = 30, alpha = 0.001, beta = 0.02, iterations = 100, verbose = True)
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration: 10; Train RMSE = 0.9659 ; Test RMSE = 0.9834
Iteration: 20; Train RMSE = 0.9410 ; Test RMSE = 0.9645
Iteration: 30; Train RMSE = 0.9298 ; Test RMSE = 0.9566
Iteration: 40; Train RMSE = 0.9231 ; Test RMSE = 0.9524
Iteration: 50; Train RMSE = 0.9184 ; Test RMSE = 0.9497
Iteration: 60; Train RMSE = 0.9146 ; Test RMSE = 0.9479
Iteration: 70; Train RMSE = 0.9111 ; Test RMSE = 0.9466
Iteration: 80; Train RMSE = 0.9073 ; Test RMSE = 0.9454
Iteration: 90; Train RMSE = 0.9028 ; Test RMSE = 0.9443
Iteration: 100; Train RMSE = 0.8970 ; Test RMSE = 0.9429


In [14]:
# Print the full prediction matrix, then the single prediction for user id 1 / movie id 2.
print(mf.full_prediction(), mf.get_one_prediction(1, 2), sep = '\n')

[[3.7953 3.3796 3.05   ... 3.3422 3.4693 3.4858]
 [3.9307 3.4865 3.1209 ... 3.4242 3.5528 3.5534]
 [3.3253 2.8801 2.5401 ... 2.8067 2.9407 2.9307]
 ...
 [4.1931 3.7796 3.45   ... 3.7151 3.8132 3.8206]
 [4.3722 3.9039 3.5822 ... 3.8435 3.9505 3.9634]
 [3.8422 3.3788 3.021  ... 3.2897 3.4495 3.4018]]
3.3796
