In [1]:
# 행렬요인화(MF) - Train/Test 분리

import numpy as np
import pandas as pd

# Column layout of the tab-separated ratings file, which has no header row.
r_cols = ["user_id", "movie_id", "rating", "timestamp"]

# Read the file, keep only the three columns used below (timestamp is dropped)
# and cast them to int, all in one chain.
ratings = (
    pd.read_csv("./ml-100k/u.data", sep="\t", names=r_cols, encoding="latin-1")
    .loc[:, ["user_id", "movie_id", "rating"]]
    .astype(int)
)

In [2]:
# train test 분리
from sklearn.utils import shuffle

# Fraction of the ratings used for training; the remainder becomes the test set.
TRAIN_SIZE = 0.75

# shuffle() keeps each row's original index label, so sorting by index first
# restores the as-loaded order. Without this, re-running the cell would shuffle
# the already-shuffled frame and silently produce a different train/test split.
ratings = shuffle(ratings.sort_index(), random_state=1)

# Split by position: the first TRAIN_SIZE share of the shuffled rows is train.
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]
assert len(ratings_train) + len(ratings_test) == len(ratings)

In [3]:
# New MF class for training & testing
class NEW_MF:
    """Biased matrix factorization trained with SGD, with a held-out test set.

    The predicted rating for user row ``i`` and item column ``j`` is
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. An entry of 0 in the rating matrix
    is treated as "no rating" (only non-zero cells are used for training).

    Typical use: construct, optionally call :meth:`set_test` to hold ratings
    out, then call :meth:`test` to train.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix and the hyper-parameters.

        Args:
            ratings: pandas DataFrame whose index holds user ids and whose
                columns hold item ids; 0 marks a missing rating.
            K: number of latent factors per user / item.
            alpha: SGD step size.
            beta: regularization weight applied to biases and factors.
            iterations: number of passes over the training samples.
            verbose: if true, print train/test RMSE every 10th iteration.
        """
        # np.array() copies, so zeroing test cells later does not touch `ratings`.
        self.R = np.array(ratings)
        # (2) Dictionaries mapping user_id / item_id to row / column positions
        # of R, and back.
        self.item_id_index = {
            item_id: pos for pos, item_id in enumerate(ratings.columns)
        }
        self.index_item_id = {
            pos: item_id for item_id, pos in self.item_id_index.items()
        }
        self.user_id_index = {
            user_id: pos for pos, user_id in enumerate(ratings.index)
        }
        self.index_user_id = {
            pos: user_id for user_id, pos in self.user_id_index.items()
        }
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        # Empty until set_test() is called, so test() also works without a test set.
        self.test_set = []

    def _predict_many(self, rows, cols):
        """Return predictions for paired integer arrays of row/column positions."""
        return (
            self.b
            + self.b_u[rows]
            + self.b_d[cols]
            + (self.P[rows] * self.Q[cols]).sum(axis=1)
        )

    # RMSE on the training set
    def rmse(self):
        """Return the RMSE over all non-zero cells of ``R``.

        Also stores the per-cell predictions and errors as arrays in
        ``self.predictions`` and ``self.errors``.
        """
        rows, cols = self.R.nonzero()
        self.predictions = self._predict_many(rows, cols)
        self.errors = self.R[rows, cols] - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    # Ratings for user i and item j
    def get_prediction(self, i, j):
        """Return the predicted rating for row position ``i``, column position ``j``."""
        return self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :])

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            # Snapshot P[i] first: both factor updates must be computed from the
            # same parameters that produced `e`, not from a half-updated state.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    # (3) Choose the test set
    def set_test(self, ratings_test):
        """Hold the given ratings out of training.

        Args:
            ratings_test: DataFrame whose first three columns are, in order,
                user id, item id and rating.

        Returns:
            List of ``[row_pos, col_pos, rating]`` entries (also stored in
            ``self.test_set``). The matching cells of ``R`` are set to 0.

        Raises:
            KeyError: if a user id or item id is not present in the rating
                matrix given to the constructor.
        """
        test_set = []
        # itertuples avoids three scalar .iloc lookups per row.
        triples = ratings_test.iloc[:, :3].itertuples(index=False, name=None)
        for user_id, item_id, rating in triples:
            x = self.user_id_index[user_id]
            y = self.item_id_index[item_id]
            test_set.append([x, y, rating])
            self.R[x, y] = 0  # Setting test set ratings to 0
        self.test_set = test_set
        return test_set  # Return test set

    # RMSE on the test set
    def test_rmse(self):
        """Return the RMSE over ``self.test_set``, or NaN when it is empty."""
        if len(self.test_set) == 0:
            return np.nan
        rows = np.array([entry[0] for entry in self.test_set], dtype=int)
        cols = np.array([entry[1] for entry in self.test_set], dtype=int)
        actual = np.array([entry[2] for entry in self.test_set], dtype=float)
        errors = actual - self._predict_many(rows, cols)
        return np.sqrt(np.mean(errors**2))

    # Train while tracking accuracy on the test set
    def test(self):
        """Train the model and record train/test RMSE after every iteration.

        Initial factors and the per-iteration sample order come from NumPy's
        global RNG (``np.random``); seed it beforehand for repeatable results.

        Returns:
            List of ``(iteration, train_rmse, test_rmse)`` tuples, one per
            iteration, with ``iteration`` starting at 1.
        """
        # Initializing user-feature and item-feature matrix
        self.P = np.random.normal(scale=1.0 / self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1.0 / self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # List of training samples
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i + 1, rmse1, rmse2))
            if self.verbose:
                if (i + 1) % 10 == 0:
                    print(
                        "Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f"
                        % (i + 1, rmse1, rmse2)
                    )
        return training_process

    # Ratings for given user_id and item_id
    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for the given user id and item id."""
        return self.get_prediction(
            self.user_id_index[user_id], self.item_id_index[item_id]
        )

    # Full user-movie rating matrix
    def full_prediction(self):
        """Return the (num_users, num_items) array of predictions for every cell."""
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_d[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )

##### <<<<< (3)

In [4]:
# Testing MF RMSE
# Seed NumPy's global RNG: NEW_MF draws its initial factors and shuffles the
# training samples through np.random, so without a seed every run of this cell
# prints different RMSE values.
np.random.seed(1)

# Build the user x movie matrix from ALL ratings (0 = no rating) so every user and
# movie gets a row / column; set_test() then zeroes the held-out test ratings.
R_temp = ratings.pivot(index="user_id", columns="movie_id", values="rating").fillna(0)
mf = NEW_MF(R_temp, K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)
test_set = mf.set_test(ratings_test)
result = mf.test()

# Printing predictions
print(mf.full_prediction())
print(mf.get_one_prediction(1, 2))

Iteration: 10 ; Train RMSE = 0.9659 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9410 ; Test RMSE = 0.9645
Iteration: 30 ; Train RMSE = 0.9298 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9230 ; Test RMSE = 0.9523
Iteration: 50 ; Train RMSE = 0.9183 ; Test RMSE = 0.9496
Iteration: 60 ; Train RMSE = 0.9144 ; Test RMSE = 0.9477
Iteration: 70 ; Train RMSE = 0.9106 ; Test RMSE = 0.9462
Iteration: 80 ; Train RMSE = 0.9064 ; Test RMSE = 0.9447
Iteration: 90 ; Train RMSE = 0.9012 ; Test RMSE = 0.9431
Iteration: 100 ; Train RMSE = 0.8945 ; Test RMSE = 0.9410
[[3.74848262 3.37034646 3.0617788  ... 3.36274278 3.45639603 3.47250255]
 [3.89805395 3.49129002 3.11861408 ... 3.4410111  3.5470898  3.55439958]
 [3.33794338 2.88497424 2.53519301 ... 2.82080945 2.94234044 2.93944035]
 ...
 [4.22920018 3.7770487  3.44020472 ... 3.71893366 3.82406039 3.83406692]
 [4.35576741 3.88953168 3.57792934 ... 3.83085527 3.94562007 3.93397369]
 [3.84419952 3.37629022 3.0184947  ... 3.3158461  3.4084072

- 4-1.ipynb에서 train/test set을 분리하는 방법을 shuffle() 대신에 앞 장에서 사용한 train_test_split()을 사용해서 분리하도록 수정하고 실행해 보세요. 실행 결과 RMSE에 차이가 많이 난다면 왜 차이가 발생했을지 설명하세요.