In [1]:
# Memory-based recommender systems: collaborative filtering (CF)
# Model-based recommender systems: matrix factorization (MF), deep-learning (DL) based models

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
# Data directory, given relative to the notebook's working directory
# (presumably holds the MovieLens 100K files u.user / u.item / u.data -- confirm).
data_dir = '../../../../data/'

# --- Users: pipe-separated file without a header row ---
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    data_dir + 'u.user',
    names=u_cols,
    sep='|',
    encoding='latin-1',
)

# --- Movies: descriptive columns followed by one flag column per genre ---
# NOTE: the spelling "Childeren's" is kept as-is because it becomes a column
# name that other cells may reference.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childeren\'s',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
    'War', 'Western',
]
movies = pd.read_csv(
    data_dir + 'u.item',
    names=i_cols,
    sep='|',
    encoding='latin-1',
)

# --- Ratings: tab-separated (user, movie, rating, timestamp) records ---
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    data_dir + 'u.data',
    names=r_cols,
    sep='\t',
    encoding='latin-1',
)


In [13]:
# MF Class
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as
    ``bias + bias_u[i] + bias_i[j] + P[i, :] . Q[j, :]``.
    Entries of the rating matrix that are exactly 0 are treated as
    unobserved: they are skipped by both training and RMSE evaluation.

    Parameters
    ----------
    ratings : array-like, 2-D
        User x item rating matrix (rows are users, columns are items);
        0 marks a missing rating.
    K : int
        Number of latent factors.
    alpha : float
        SGD learning rate.
    beta : float
        Regularization factor applied to biases and latent factors.
    iterations : int
        Number of SGD passes over the observed ratings.
    verbose : bool, default True
        If True, print the training RMSE every 10 iterations.
    random_state : None, int, or numpy random generator, default None
        Source of randomness for factor initialization and sample shuffling.
        ``None`` uses the global ``np.random`` state (the previous behaviour),
        an int seeds a fresh ``np.random.RandomState`` on every ``train()``
        call, and an object exposing ``normal`` and ``shuffle`` is used as is.

    Raises
    ------
    ValueError
        If ``ratings`` is not 2-dimensional.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True, random_state=None):
        self.R = np.array(ratings)
        if self.R.ndim != 2:
            raise ValueError(
                "ratings must be a 2-D user x item matrix, got %d dimension(s)" % self.R.ndim
            )
        self.num_users, self.num_items = self.R.shape
        self.K = K # number of latent factor
        self.alpha = alpha # learning rate
        self.beta = beta # regularization factor
        self.iterations = iterations # SGD iterations
        self.verbose = verbose # print SGD training process or not
        self.random_state = random_state # seed / generator for reproducible runs

    def _get_rng(self):
        """Return the object that provides ``normal`` and ``shuffle``."""
        seed = self.random_state
        if seed is None:
            # Fall back to the global numpy state so np.random.seed() still works.
            return np.random
        if hasattr(seed, 'normal') and hasattr(seed, 'shuffle'):
            # Already a RandomState / Generator-like object.
            return seed
        return np.random.RandomState(seed)

    def rmse(self):
        """Return the RMSE over all observed (non-zero) ratings.

        Also stores the per-rating ``self.predictions`` and ``self.errors``
        arrays, ordered like ``self.R.nonzero()``. Requires ``train()`` to
        have been called (or the model parameters to be set) beforehand.
        """
        xs, ys = self.R.nonzero() # non-zero index
        # Vectorized equivalent of calling get_prediction for every (x, y):
        # the row-wise dot product of P[xs] and Q[ys] is done by einsum.
        latent = np.einsum('ik,ik->i', self.P[xs], self.Q[ys])
        self.predictions = self.bias + self.bias_u[xs] + self.bias_i[ys] + latent
        self.errors = self.R[xs, ys] - self.predictions

        return np.sqrt(np.mean(self.errors**2))

    def train(self):
        """Fit the model with SGD.

        Returns
        -------
        list of (int, float)
            ``(iteration, train_rmse)`` for every iteration, starting at 1.
        """
        rng = self._get_rng()

        # Initialize user, item feature matrix
        self.P = rng.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = rng.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize bias terms
        self.bias_u = np.zeros(self.num_users)
        self.bias_i = np.zeros(self.num_items)

        # Observed ratings; computed once and reused for the mean and samples
        rows, columns = self.R.nonzero()
        self.bias = np.mean(self.R[rows, columns]) # total mean

        # List of training samples
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        # SGD for given iterations
        training_processes = []
        for epoch in range(1, self.iterations + 1):
            rng.shuffle(self.samples)
            self.SGD()
            train_rmse = self.rmse()
            training_processes.append((epoch, train_rmse))
            if self.verbose and epoch % 10 == 0:
                print("Iteration : %d ; Train RMSE = %.4f" % (epoch, train_rmse))

        return training_processes

    def get_prediction(self, i, j):
        """Return the predicted rating of user index ``i`` for item index ``j``."""
        prediction = self.bias + self.bias_u[i] + self.bias_i[j] + self.P[i, :].dot(self.Q[j, :])

        return prediction

    def SGD(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors."""
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.bias_u[i] += self.alpha * (e - self.beta * self.bias_u[i])
            self.bias_i[j] += self.alpha * (e - self.beta * self.bias_i[j])

            # Snapshot P[i] first: the gradient w.r.t. Q[j] must use the same
            # P[i] that produced the error e, not the freshly updated one.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])

In [None]:
# Dense user x movie rating matrix; user/movie pairs without a rating are
# filled with 0.
R_temp = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

# Fit the matrix-factorization model on the full rating matrix.
mf = MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
train_process = mf.train()

Iteration : 10 ; Train RMSE = 0.9585
Iteration : 20 ; Train RMSE = 0.9374
Iteration : 30 ; Train RMSE = 0.9281
Iteration : 40 ; Train RMSE = 0.9226
Iteration : 50 ; Train RMSE = 0.9185
Iteration : 60 ; Train RMSE = 0.9147
Iteration : 70 ; Train RMSE = 0.9103
