In [1]:
import os
# Move into the data directory so the relative './ml-100k/...' paths used
# below resolve. The location can be overridden with the RECSYS_DATA_DIR
# environment variable; when it is unset the original Colab/Drive path is used.
os.chdir(os.environ.get(
    'RECSYS_DATA_DIR',
    '/content/drive/MyDrive/yeonjun/공부/RecSys/intro_to_recsys/data',
))

In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from datetime import datetime

In [3]:
# Load the pre-split train/test arrays from ./ml-100k, relative to the
# working directory set by the os.chdir call above.
# NOTE(review): presumably MovieLens-100k rating matrices; PMF below indexes
# them as 2-D [i, j] arrays and treats a value of 0 as "no rating" — confirm
# the .npy files follow that layout.
train = np.load('./ml-100k/ml_100k_train.npy')
test = np.load('./ml-100k/ml_100k_test.npy')

In [4]:
class Config:
    """Namespace of hyper-parameters shared by the notebook (read as class attributes)."""

    # --- reproducibility -------------------------------------------------
    seed = 1995

    # --- model -----------------------------------------------------------
    D = 30            # number of latent dimensions of U and V
    sigma = 0.01      # with sigma_u / sigma_v, sets the L2 weights (sigma/sigma_*)**2
    sigma_u = 0.1     # also the std-dev used to initialise U
    sigma_v = 0.1     # also the std-dev used to initialise V

    # --- optimisation ----------------------------------------------------
    learning_rate = 0.01
    epochs = 30
    batch_size = 1024
    early_stopping_round = 0


# Single shared instance that the rest of the notebook reads from.
config = Config()

In [5]:
class PMF():
    """Probabilistic Matrix Factorization trained with per-rating SGD.

    Ratings are rescaled with ``(r - 1) / (K - 1)`` (``K`` is the maximum
    value in the training matrix, so a rating of 1 maps to 0 and ``K`` maps
    to 1) and predicted as ``sigmoid(U[:, i] . V[:, j])``. Entries equal to 0
    in either matrix are treated as "no rating" and are skipped.
    """

    def __init__(self, train_df, test_df, cfg=None):
        """Store the data and draw the initial latent factors.

        Args:
            train_df: 2-D array of training ratings, indexed ``[i, j]``;
                0 marks a missing rating.
            test_df: 2-D array of held-out ratings, same convention.
                ``evaluate`` compares it element-wise with the ``(N, M)``
                prediction matrix.
            cfg: optional object exposing the hyper-parameters
                (``learning_rate``, ``epochs``, ``D``, ``batch_size``,
                ``sigma``, ``sigma_u``, ``sigma_v``). Defaults to the
                module-level ``config``.

        Note:
            U and V are sampled from NumPy's global RNG here, so the seed
            must be set *before* constructing the model to get reproducible
            runs.
        """
        self.cfg = config if cfg is None else cfg

        self.R_tr = train_df
        self.R_tst = test_df

        self.N = train_df.shape[0]
        self.M = train_df.shape[1]
        # Largest training rating; used by scale()/descale().
        self.K = np.max(train_df)

        # L2 regularisation weights derived from the prior/noise std-devs.
        self.lambda_u = (self.cfg.sigma/self.cfg.sigma_u)**2
        self.lambda_v = (self.cfg.sigma/self.cfg.sigma_v)**2

        # Latent factors, one column per row (U) / column (V) of the rating matrix.
        self.U = np.random.normal(0, self.cfg.sigma_u, (self.cfg.D, self.N))
        self.V = np.random.normal(0, self.cfg.sigma_v, (self.cfg.D, self.M))

        # Per-epoch metrics, keyed by epoch index.
        self.loss_tr = defaultdict(float)
        self.loss_tst_sc = defaultdict(float)
        self.loss_tst_dsc = defaultdict(float)

        self.batch_size = self.cfg.batch_size

    def fit(self):
        """Run ``cfg.epochs`` passes of SGD over the observed training ratings.

        After each epoch the training RMSE (on the scaled ratings) is stored
        in ``loss_tr`` and the test RMSEs from ``evaluate`` are stored in
        ``loss_tst_sc`` / ``loss_tst_dsc``. Progress is printed every 10th
        epoch and on the last one.

        Raises:
            ValueError: if the training matrix has no non-zero entry.
        """
        start = datetime.now()

        # Indices of the observed ratings. They do not change between epochs,
        # so compute them once instead of rescanning all N*M cells in Python.
        # np.nonzero returns them in row-major order, i.e. the same order the
        # nested ``for i ... for j ...`` scan would visit them.
        rows, cols = np.nonzero(self.R_tr)
        n = len(rows)
        if n == 0:
            raise ValueError('training matrix contains no non-zero ratings')

        for epoch in range(self.cfg.epochs):
            # stochastic: one U/V update per observed rating
            for i, j in zip(rows, cols):
                self.loss_tr[epoch] += self.gradient_descent(i, j)

            self.loss_tr[epoch] = np.sqrt(self.loss_tr[epoch]/n)
            self.loss_tst_sc[epoch], self.loss_tst_dsc[epoch] = self.evaluate()

            if epoch % 10 == 0 or epoch == self.cfg.epochs-1:
                print(f'EPOCH {epoch+1} : TRAINING RMSE {self.loss_tr[epoch]:.5f}, VALID SCALED RMSE {self.loss_tst_sc[epoch]:.5f}, ORG RMSE {self.loss_tst_dsc[epoch]:.5f}')
        end = datetime.now()
        print(f'Training takes time {end-start}')

    def scoring(self, i, j):
        """Return the scaled prediction ``sigmoid(U[:, i] . V[:, j])`` for one cell."""
        dot_prod = np.dot(self.U[:, i], self.V[:, j])
        return 1/(1 + np.exp(-dot_prod))

    def scale(self, value):
        """Map ratings linearly so that 1 -> 0 and ``K`` -> 1."""
        return (value-1)/(self.K-1)

    def descale(self, value):
        """Inverse of ``scale``: map 0 -> 1 and 1 -> ``K``."""
        return (self.K - 1)*value + 1

    def gradient(self, i, j):
        """Compute the update directions for ``U[:, i]`` and ``V[:, j]``.

        Returns:
            ``(du, dv, squared_error)`` where the error is the scaled rating
            minus the current prediction. ``du``/``dv`` are ascent directions
            (error times the other factor, minus the L2 penalty term).
            NOTE(review): the sigmoid derivative is not applied to the error
            term here — confirm this simplification is intended.
        """
        loss = self.scale(self.R_tr[i,j]) - self.scoring(i, j)
        du = loss*self.V[:, j] - self.lambda_u*self.U[:, i]
        dv = loss*self.U[:, i] - self.lambda_v*self.V[:, j]
        return du, dv, loss**2

    def gradient_descent(self, i, j):
        """Apply one SGD step for cell ``(i, j)`` and return its squared error.

        Both directions are computed from the pre-update factors before
        either ``U[:, i]`` or ``V[:, j]`` is modified.
        """
        du, dv, loss = self.gradient(i, j)
        self.U[:, i] = self.U[:, i] + self.cfg.learning_rate * du
        self.V[:, j] = self.V[:, j] + self.cfg.learning_rate * dv
        return loss

    def predict(self):
        """Return the full ``(N, M)`` matrix of scaled predictions."""
        dot_prod = np.dot(self.U.T, self.V)
        return 1/(1 + np.exp(-dot_prod))

    def evaluate(self):
        """Compute test RMSE over the non-zero entries of the test matrix.

        Returns:
            ``(rmse_scaled, rmse_original)``: the RMSE between scaled test
            ratings and scaled predictions, and the RMSE between raw test
            ratings and descaled predictions.
        """
        pred_scaled = self.predict()
        pred_descaled = self.descale(pred_scaled)

        # Only cells that actually hold a test rating contribute.
        rating_idx = self.R_tst != 0

        loss_pred_scaled = np.sqrt(np.mean(((self.scale(self.R_tst) - pred_scaled)[rating_idx])**2))
        loss_pred_descaled = np.sqrt(np.mean(((self.R_tst - pred_descaled)[rating_idx])**2))

        return loss_pred_scaled, loss_pred_descaled

    def plot_loss(self):
        """Plot the per-epoch training RMSE and scaled test RMSE recorded by ``fit``."""
        fig, ax = plt.subplots(1,1, figsize=(10, 5))

        ax.plot(list(self.loss_tr.keys()), list(self.loss_tr.values()), color='orange', label='train')
        # Both curves are on the scaled-rating axis, so plot the scaled test
        # RMSE (the attribute is loss_tst_sc; there is no loss_tst).
        ax.plot(list(self.loss_tst_sc.keys()), list(self.loss_tst_sc.values()), color='green', label='valid')
        ax.set_xlabel('epoch')
        ax.set_ylabel('RMSE (scaled ratings)')
        ax.legend()
        plt.show()

        

In [6]:
# PMF.__init__ samples the initial U and V from NumPy's global RNG, so the
# seed has to be set before the model is constructed for the run to be
# reproducible.
np.random.seed(config.seed)
pmf = PMF(train, test)

In [7]:
# Seed NumPy's global RNG, then train the model (prints the per-epoch RMSEs
# and the total training time).
# NOTE(review): `pmf` was constructed in the previous cell, and that is where
# U and V are drawn from np.random; fit() as written draws no random numbers,
# so this seed call by itself does not make the initialization reproducible.
np.random.seed(config.seed)
pmf.fit()

EPOCH 1 : TRAINING RMSE 0.30954, VALID SCALED RMSE 0.31823, ORG RMSE 1.27294
EPOCH 11 : TRAINING RMSE 0.29117, VALID SCALED RMSE 0.30526, ORG RMSE 1.22104
EPOCH 21 : TRAINING RMSE 0.24295, VALID SCALED RMSE 0.26079, ORG RMSE 1.04316
EPOCH 30 : TRAINING RMSE 0.22912, VALID SCALED RMSE 0.25105, ORG RMSE 1.00420
Training takes time 0:01:10.029979
