In [1]:
# Memory-based recommender systems: collaborative filtering (CF)
# Model-based recommender systems: matrix factorization (MF), deep-learning (DL) based models

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [3]:
# Data directory, given relative to the notebook's working directory
data_dir = '../../../../data/'

# Column names for the three files; passing them via `names=` means the
# files are read as having no header row.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Childeren\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
         ]
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# Read users and movies (pipe-separated) and ratings (tab-separated)
users = pd.read_csv(
    data_dir+'u.user',
    names=u_cols,
    sep='|',
    encoding='latin-1',
)
movies = pd.read_csv(
    data_dir+'u.item',
    names=i_cols,
    sep='|',
    encoding='latin-1',
)
ratings = pd.read_csv(
    data_dir+'u.data',
    names=r_cols,
    sep='\t',
    encoding='latin-1',
)


In [13]:
# MF Class
class MF():
    """Biased matrix factorization fitted with stochastic gradient descent (SGD).

    The rating of user row ``i`` for item column ``j`` is predicted as
    ``bias + bias_u[i] + bias_i[j] + P[i] . Q[j]``. Cells of the rating matrix
    that are equal to 0 are treated as "not rated" and are skipped both in
    training and in the RMSE computation.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix and the hyper-parameters.

        Args:
            ratings: 2-D array-like (rows = users, columns = items); 0 marks a
                missing rating.
            K: number of latent factors.
            alpha: SGD learning rate.
            beta: regularization factor.
            iterations: number of SGD passes over the observed ratings.
            verbose: if True, ``train`` prints the train RMSE every 10 passes.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K # number of latent factor
        self.alpha = alpha # learning rate
        self.beta = beta # regularization factor
        self.iterations = iterations # SGD iterations
        self.verbose = verbose # print SGD training process or not

    def rmse(self):
        """Return the RMSE over every non-zero cell of ``self.R``.

        Side effect: the per-cell predictions and errors are kept as arrays in
        ``self.predictions`` and ``self.errors``.
        """
        xs, ys = self.R.nonzero() # positions of the observed ratings
        self.predictions = np.array([self.get_prediction(x, y) for x, y in zip(xs, ys)])
        self.errors = np.array([self.R[x, y] for x, y in zip(xs, ys)]) - self.predictions

        return np.sqrt(np.mean(self.errors**2))

    def train(self):
        """Initialize the parameters, run SGD and return the training history.

        Returns:
            List of ``(iteration, train_rmse)`` tuples, one per SGD pass, with
            ``iteration`` starting at 1.
        """
        # Initialize user, item feature matrix
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize bias terms
        self.bias_u = np.zeros(self.num_users)
        self.bias_i = np.zeros(self.num_items)
        self.bias = np.mean(self.R[self.R.nonzero()]) # mean of the observed ratings

        # List of training samples: (user row, item column, rating)
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        # SGD for given iterations
        training_processes = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.SGD()
            rmse = self.rmse()
            training_processes.append((i+1, rmse))
            if self.verbose and (i+1) % 10 == 0:
                print("Iteration : %d ; Train RMSE = %.4f" % (i+1, rmse))

        return training_processes

    def get_prediction(self, i, j):
        """Return the predicted rating for user row ``i`` and item column ``j``."""
        return self.bias + self.bias_u[i] + self.bias_i[j] + self.P[i, :].dot(self.Q[j, :])

    def SGD(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.bias_u[i] += self.alpha * (e - self.beta * self.bias_u[i])
            self.bias_i[j] += self.alpha * (e - self.beta * self.bias_i[j])

            # Snapshot the user factors first: the item-factor gradient must be
            # taken at the same point as the user-factor gradient, not at the
            # user row that has already been stepped.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_old)
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])

In [14]:
# Build the user x movie rating matrix (unrated cells become 0), then fit the baseline model
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)
mf = MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
train_process = mf.train()

Iteration : 10 ; Train RMSE = 0.9585
Iteration : 20 ; Train RMSE = 0.9374
Iteration : 30 ; Train RMSE = 0.9281
Iteration : 40 ; Train RMSE = 0.9226
Iteration : 50 ; Train RMSE = 0.9185
Iteration : 60 ; Train RMSE = 0.9147
Iteration : 70 ; Train RMSE = 0.9103
Iteration : 80 ; Train RMSE = 0.9045
Iteration : 90 ; Train RMSE = 0.8961
Iteration : 100 ; Train RMSE = 0.8847


### Train/Test split

In [15]:
from sklearn.utils import shuffle

TRAIN_SIZE = 0.75
# Shuffle into a new name instead of overwriting `ratings`: re-running this
# cell then reproduces the same split, rather than re-shuffling rows that
# were already shuffled by a previous run.
ratings_shuffled = shuffle(ratings, random_state=1)
cutoff = int(TRAIN_SIZE * len(ratings_shuffled))
ratings_train = ratings_shuffled.iloc[:cutoff]
ratings_test = ratings_shuffled.iloc[cutoff:]

In [20]:
# NEW MF Class
class NEW_MF():
    """Biased matrix factorization (SGD) with a held-out test set.

    The rating of user row ``i`` for item column ``j`` is predicted as
    ``bias + bias_u[i] + bias_i[j] + P[i] . Q[j]``. Cells of the rating matrix
    equal to 0 are treated as "not rated". Test ratings registered through
    ``set_test`` are zeroed out of the matrix so they are never trained on.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix, the hyper-parameters and the id maps.

        Args:
            ratings: DataFrame whose index labels are user ids and whose column
                labels are item ids; 0 marks a missing rating.
            K: number of latent factors.
            alpha: SGD learning rate.
            beta: regularization factor.
            iterations: number of SGD passes over the training ratings.
            verbose: if True, ``test`` prints train/test RMSE every 10 passes.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K # number of latent factor
        self.alpha = alpha # learning rate
        self.beta = beta # regularization factor
        self.iterations = iterations # SGD iterations
        self.verbose = verbose # print SGD training process or not

        # Map ids to internal matrix positions (and back), so that ids which do
        # not coincide with the row/column positions are still handled correctly.
        self.item_id_index = {one_id: i for i, one_id in enumerate(ratings.columns)}
        self.index_item_id = {i: one_id for i, one_id in enumerate(ratings.columns)}
        self.user_id_index = {one_id: i for i, one_id in enumerate(ratings.index)}
        self.index_user_id = {i: one_id for i, one_id in enumerate(ratings.index)}

    def set_test(self, ratings_test):
        """Register the test ratings and remove them from the training matrix.

        Args:
            ratings_test: DataFrame whose first three columns are, by position,
                user id, item id and rating.

        Returns:
            List of ``[user_row, item_column, rating]`` entries, also stored in
            ``self.test_set``.

        Raises:
            KeyError: if a user or item id is not present in the rating matrix.
        """
        test_set = []
        # Walk the three positional columns together instead of doing a
        # per-cell .iloc lookup for every row.
        triples = zip(ratings_test.iloc[:, 0], ratings_test.iloc[:, 1], ratings_test.iloc[:, 2])
        for user_id, item_id, rating in triples:
            x = self.user_id_index[user_id]
            y = self.item_id_index[item_id]
            test_set.append([x, y, rating])
            self.R[x, y] = 0 # hide the test rating from training
        self.test_set = test_set

        return test_set

    def rmse(self):
        """Return the RMSE over every non-zero (training) cell of ``self.R``.

        Side effect: the per-cell predictions and errors are kept as arrays in
        ``self.predictions`` and ``self.errors``.
        """
        xs, ys = self.R.nonzero() # positions of the training ratings
        self.predictions = np.array([self.get_prediction(x, y) for x, y in zip(xs, ys)])
        self.errors = np.array([self.R[x, y] for x, y in zip(xs, ys)]) - self.predictions

        return np.sqrt(np.mean(self.errors**2))

    def test_rmse(self):
        """Return the RMSE over the entries of ``self.test_set``."""
        error = 0
        for x, y, rating in self.test_set:
            error += pow(rating - self.get_prediction(x, y), 2) # add error square

        return np.sqrt(error/len(self.test_set))

    def test(self):
        """Initialize the parameters, run SGD and track train and test RMSE.

        ``set_test`` must have been called first, because the test RMSE is
        evaluated after every pass.

        Returns:
            List of ``(iteration, train_rmse, test_rmse)`` tuples, one per SGD
            pass, with ``iteration`` starting at 1.
        """
        # Initialize user, item feature matrix
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize bias terms
        self.bias_u = np.zeros(self.num_users)
        self.bias_i = np.zeros(self.num_items)
        self.bias = np.mean(self.R[self.R.nonzero()]) # mean of the training ratings

        # List of training samples: (user row, item column, rating)
        rows, columns = self.R.nonzero()
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        # SGD for given iterations
        training_processes = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.SGD()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_processes.append((i+1, rmse1, rmse2))
            if self.verbose and (i+1) % 10 == 0:
                print("Iteration : %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))

        return training_processes

    def get_prediction(self, i, j):
        """Return the predicted rating for user row ``i`` and item column ``j``."""
        return self.bias + self.bias_u[i] + self.bias_i[j] + self.P[i, :].dot(self.Q[j, :])

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for the given user id and item id."""
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

    def full_prediction(self):
        """Return the (num_users, num_items) matrix of all predicted ratings."""
        # User biases vary down the rows, item biases across the columns, so
        # bias_i has to be broadcast as a row vector, not as a column vector.
        return self.bias + self.bias_u[:, np.newaxis] + self.bias_i[np.newaxis, :] + self.P.dot(self.Q.T)

    def SGD(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.bias_u[i] += self.alpha * (e - self.beta * self.bias_u[i])
            self.bias_i[j] += self.alpha * (e - self.beta * self.bias_i[j])

            # Snapshot the user factors first: the item-factor gradient must be
            # taken at the same point as the user-factor gradient, not at the
            # user row that has already been stepped.
            p_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_old)
            self.Q[j, :] += self.alpha * (e * p_old - self.beta * self.Q[j, :])

In [21]:
# Rebuild the full user x movie matrix (unrated cells become 0); set_test then
# zeroes the held-out ratings inside the model before training starts.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)
mf = NEW_MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)
test_set = mf.set_test(ratings_test)
result = mf.test()

Iteration : 10 ; Train RMSE = 0.9659 ; Test RMSE = 0.9834
Iteration : 20 ; Train RMSE = 0.9409 ; Test RMSE = 0.9645
Iteration : 30 ; Train RMSE = 0.9297 ; Test RMSE = 0.9566
Iteration : 40 ; Train RMSE = 0.9230 ; Test RMSE = 0.9523
Iteration : 50 ; Train RMSE = 0.9183 ; Test RMSE = 0.9497
Iteration : 60 ; Train RMSE = 0.9144 ; Test RMSE = 0.9479
Iteration : 70 ; Train RMSE = 0.9108 ; Test RMSE = 0.9466
Iteration : 80 ; Train RMSE = 0.9069 ; Test RMSE = 0.9454
Iteration : 90 ; Train RMSE = 0.9021 ; Test RMSE = 0.9441
Iteration : 100 ; Train RMSE = 0.8960 ; Test RMSE = 0.9427


In [22]:
# Find optimal K
results = []
index =[]
# The rating matrix does not depend on K, so build it once instead of on every
# pass; NEW_MF converts it with np.array(...) and zeroes test cells in that
# array, so the frame itself is not modified between runs.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
for K in range(50,261,10):
    print('K =', K)
    mf = NEW_MF(R_temp, K=K, alpha=0.001, beta=0.02, iterations=100, verbose=True)
    test_set = mf.set_test(ratings_test)
    result = mf.test()
    index.append(K)
    results.append(result)

K = 50
Iteration : 10 ; Train RMSE = 0.9661 ; Test RMSE = 0.9834
Iteration : 20 ; Train RMSE = 0.9414 ; Test RMSE = 0.9644
Iteration : 30 ; Train RMSE = 0.9305 ; Test RMSE = 0.9566
Iteration : 40 ; Train RMSE = 0.9241 ; Test RMSE = 0.9523
Iteration : 50 ; Train RMSE = 0.9197 ; Test RMSE = 0.9496
Iteration : 60 ; Train RMSE = 0.9163 ; Test RMSE = 0.9478
Iteration : 70 ; Train RMSE = 0.9133 ; Test RMSE = 0.9464
Iteration : 80 ; Train RMSE = 0.9102 ; Test RMSE = 0.9452
Iteration : 90 ; Train RMSE = 0.9065 ; Test RMSE = 0.9439
Iteration : 100 ; Train RMSE = 0.9018 ; Test RMSE = 0.9423
K = 60
Iteration : 10 ; Train RMSE = 0.9662 ; Test RMSE = 0.9834
Iteration : 20 ; Train RMSE = 0.9415 ; Test RMSE = 0.9644
Iteration : 30 ; Train RMSE = 0.9307 ; Test RMSE = 0.9566
Iteration : 40 ; Train RMSE = 0.9243 ; Test RMSE = 0.9523
Iteration : 50 ; Train RMSE = 0.9201 ; Test RMSE = 0.9497
Iteration : 60 ; Train RMSE = 0.9168 ; Test RMSE = 0.9478
Iteration : 70 ; Train RMSE = 0.9140 ; Test RMSE = 0.9465

KeyboardInterrupt: 

In [None]:
# Find optimal iterations
# For every K that finished training, find the iteration with the lowest test
# RMSE. Each summary row is [K, best iteration (1-based), best test RMSE].
summary = []
for k_value, history in zip(index, results):
    # Each history record is (iteration, train RMSE, test RMSE)
    test_rmse_by_iter = [record[2] for record in history]
    # argmin returns the first minimum and avoids binding a variable called
    # `min`, which would shadow the builtin for the rest of the session.
    best_pos = int(np.argmin(test_rmse_by_iter))
    summary.append([k_value, best_pos + 1, test_rmse_by_iter[best_pos]])

In [None]:
# Plot
import matplotlib.pyplot as plt

# Best test RMSE reached for each K that finished training
best_rmse = [row[2] for row in summary]
# Slice `index` to the finished runs so x and y always have the same length;
# markers keep a single finished K visible (a one-point line draws nothing).
plt.plot(index[:len(best_rmse)], best_rmse, marker='o')
if best_rmse:
    # Fit the y-range to the data: a fixed (0.89, 0.94) window hides the curve
    # whenever the best test RMSE stays above 0.94.
    y_pad = 0.005
    plt.ylim(np.min(best_rmse) - y_pad, np.max(best_rmse) + y_pad)
plt.xlabel('K')
plt.ylabel('RMSE')
plt.show()