In [None]:
from __future__ import print_function

import time
from collections import defaultdict

import cPickle as pickle
import numpy as np
import scipy as sp
from scipy.optimize import minimize

In [None]:
# load raw data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a trusted "all_data.pickle".
start_time = time.time()
# the context manager closes the file handle even if unpickling fails
with open("all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
# elapsed load time in seconds
print(time.time() - start_time)

In [None]:
# split into train and validation sets
# the first NUM_TRAIN records are used for training, the remainder for validation;
# a single named constant keeps the two slices from drifting apart
NUM_TRAIN = 900000
train_data = all_data[:NUM_TRAIN]
valid_data = all_data[NUM_TRAIN:]

In [None]:
# pre-process 0: build id <-> index infrastructure

# get all users and items; sorting makes the id -> index assignment
# independent of set iteration order
user_ids = sorted(set(d['reviewerID'] for d in all_data))
item_ids = sorted(set(d['itemID'] for d in all_data))

# user and item numbers
num_users = len(user_ids)
num_items = len(item_ids)

# build id <-> index maps: an id's index is its position in the sorted list
item_id_map_index = dict(zip(item_ids, range(num_items)))
item_index_map_id = dict(enumerate(item_ids))

user_id_map_index = dict(zip(user_ids, range(num_users)))
user_index_map_id = dict(enumerate(user_ids))

In [None]:
# pre-process 1: build train_rating_array, valid_rating_array, all_rating_array

def _build_rating_array(data):
    """Convert review records into an integer array of [user_index, item_index, rating] rows.

    Parameters
    ----------
    data : iterable of records indexable by 'reviewerID', 'itemID' and 'rating'.

    Returns
    -------
    numpy.ndarray of shape (len(data), 3) with integer dtype.  Ids are mapped
    through user_id_map_index / item_id_map_index; an unknown id raises KeyError.
    The integer cast truncates any fractional part of a rating.
    """
    rows = [[user_id_map_index[d['reviewerID']],
             item_id_map_index[d['itemID']],
             d['rating']]
            for d in data]
    # reshape keeps the result 2-D (0 x 3) even when `data` is empty,
    # so column slicing such as [:, 2] still works downstream
    return np.array(rows).astype(int).reshape(-1, 3)


train_rating_array = _build_rating_array(train_data)
valid_rating_array = _build_rating_array(valid_data)
all_rating_array = _build_rating_array(all_data)

In [None]:
def get_mse(ratings, ratings_predict):
    """Return the mean squared error between true and predicted ratings.

    Parameters
    ----------
    ratings : array-like of numbers (list, tuple or ndarray).
    ratings_predict : array-like of numbers, broadcastable against `ratings`.

    Returns
    -------
    The mean of the element-wise squared differences, computed in floating point.
    """
    # convert both inputs before doing arithmetic so that plain lists are
    # accepted for either argument, not only for the predictions
    actual = np.asarray(ratings, dtype=float)
    predicted = np.asarray(ratings_predict, dtype=float)
    return np.mean((actual - predicted) ** 2.)

def get_rmse(ratings, ratings_predict):
    """Return the root mean squared error, i.e. the square root of get_mse."""
    mse = get_mse(ratings, ratings_predict)
    return mse ** 0.5

In [None]:
# parameter-vector helpers
# theta is one flat vector holding, in order: beta_users, beta_items,
# gamma_users and gamma_items.  The helpers are defined before the globals
# below because initialising theta calls them.

def pack(beta_users, beta_items, gamma_users, gamma_items):
    """Flatten the four parameter arrays into a single 1-D vector.

    The arrays are concatenated in argument order, each flattened with ravel().
    """
    return np.concatenate((beta_users.ravel(),
                           beta_items.ravel(),
                           gamma_users.ravel(),
                           gamma_items.ravel()))


def unpack(theta):
    """Split a flat parameter vector back into its four arrays.

    Reads the module-level K, user_num and item_num at call time.

    Returns
    -------
    list [beta_users, beta_items, gamma_users, gamma_items] with shapes
    (user_num,), (item_num,), (K, user_num) and (K, item_num); all four are
    slices of `theta`.
    """
    beta_users_end = user_num
    beta_items_end = beta_users_end + item_num
    gamma_users_end = beta_items_end + K * user_num

    beta_users = theta[:beta_users_end]
    beta_items = theta[beta_users_end:beta_items_end]
    gamma_users = theta[beta_items_end:gamma_users_end].reshape((K, user_num))
    gamma_items = theta[gamma_users_end:].reshape((K, item_num))
    return [beta_users, beta_items, gamma_users, gamma_items]


def init_theta(K):
    """Return a packed parameter vector drawn from a normal(0, 0.5) distribution.

    `K` is the number of rows of the two gamma matrices; the user and item
    counts are read from the module-level user_num and item_num.
    """
    beta_users = np.random.normal(0, 0.5, (user_num, ))
    beta_items = np.random.normal(0, 0.5, (item_num, ))
    gamma_users = np.random.normal(0, 0.5, (K, user_num))
    gamma_items = np.random.normal(0, 0.5, (K, item_num))
    return pack(beta_users, beta_items, gamma_users, gamma_items)


# global variables
K = 10      # length of each user / item gamma vector
lam = 0.02  # weight of the squared-norm penalty in objective() and gradient()
alpha = np.mean(train_rating_array[:, 2])  # mean training rating, the constant term of every prediction

user_num = len(user_ids)
item_num = len(item_ids)

# seed the global numpy RNG so that a fresh "Restart & Run All" reproduces the
# same initial parameters
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
theta = init_theta(K) # all parameters

# sanity check: packing the unpacked parameters must reproduce theta exactly
theta_new = pack(*unpack(theta))
assert np.array_equal(theta, theta_new)

In [None]:
def objective(theta):
    """Return the regularized squared-error cost of `theta` on the training ratings.

    cost = 0.5 * (sum_rows (prediction - rating)^2 + lam * ||theta||^2)
    with prediction = alpha + beta_user + beta_item + dot(gamma_user, gamma_item).

    Parameters
    ----------
    theta : 1-D parameter vector in the layout expected by unpack().

    Returns
    -------
    The scalar cost.  Reads the module-level train_rating_array, alpha and lam.
    """
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)
    user_idx = train_rating_array[:, 0]
    item_idx = train_rating_array[:, 1]
    ratings = train_rating_array[:, 2].astype(float)

    # evaluate every training row in one vectorized pass instead of a
    # Python-level loop; column n of each fancy-indexed gamma matrix belongs
    # to training row n
    interactions = np.sum(gamma_users[:, user_idx] * gamma_items[:, item_idx], axis=0)
    residuals = (float(alpha)
                 + beta_users[user_idx]
                 + beta_items[item_idx]
                 + interactions
                 - ratings)

    # squared error plus squared L2 norm of all parameters
    cost = np.dot(residuals, residuals) + lam * np.dot(theta, theta)
    return 0.5 * cost

In [None]:
def gradient(theta):
    """Return the gradient of objective() with respect to `theta`.

    Parameters
    ----------
    theta : 1-D parameter vector in the layout expected by unpack().

    Returns
    -------
    1-D array with the same layout and length as `theta`.  Reads the
    module-level train_rating_array, alpha, lam, K, user_num and item_num.
    """
    # unpack
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)
    user_idx = train_rating_array[:, 0]
    item_idx = train_rating_array[:, 1]
    ratings = train_rating_array[:, 2].astype(float)

    # per-row gamma vectors: column n belongs to training row n
    user_factors = gamma_users[:, user_idx]
    item_factors = gamma_items[:, item_idx]

    # prediction error of every training row, computed in one vectorized pass
    residuals = (float(alpha)
                 + beta_users[user_idx]
                 + beta_items[item_idx]
                 + np.sum(user_factors * item_factors, axis=0)
                 - ratings)

    # accumulate gradients: bincount sums the weights of all rows that share
    # an index, which replaces the per-row "+=" accumulation
    beta_users_grad = np.bincount(user_idx, weights=residuals, minlength=user_num)
    beta_items_grad = np.bincount(item_idx, weights=residuals, minlength=item_num)
    gamma_users_grad = np.zeros((K, user_num))
    gamma_items_grad = np.zeros((K, item_num))
    for k in range(K):
        # d(prediction)/d(gamma_user[k]) is gamma_item[k], and vice versa
        gamma_users_grad[k] = np.bincount(user_idx,
                                          weights=residuals * item_factors[k],
                                          minlength=user_num)
        gamma_items_grad[k] = np.bincount(item_idx,
                                          weights=residuals * user_factors[k],
                                          minlength=item_num)

    # pack
    grad = pack(beta_users_grad,
                beta_items_grad,
                gamma_users_grad,
                gamma_items_grad)
    # regularization gradient: derivative of 0.5 * lam * ||theta||^2
    return grad + lam * theta

In [None]:
def predict_one_rating(user_index, item_index, theta):
    """Predict the rating of one (user, item) pair under parameters `theta`.

    Parameters
    ----------
    user_index, item_index : index of the user / item; converted with int().
    theta : 1-D parameter vector in the layout expected by unpack().

    Returns
    -------
    alpha + beta_user + beta_item + dot(gamma_user, gamma_item), where alpha
    is the module-level constant term.
    """
    user_index = int(user_index)
    item_index = int(item_index)
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)

    # unpack() shapes the gamma matrices as (K, num), one column per
    # user / item, so a single user's or item's vector is a column, not a row
    # (objective() and gradient() index them the same way)
    gamma_user = gamma_users[:, user_index]
    gamma_item = gamma_items[:, item_index]

    beta_user = beta_users[user_index]
    beta_item = beta_items[item_index]

    return alpha + beta_user + beta_item + np.dot(gamma_user, gamma_item)

In [None]:
def test_and_get_rmse(data, theta):
    """Return the RMSE of the model's predictions on `data`.

    Parameters
    ----------
    data : 2-D array whose rows are [user_index, item_index, rating].
    theta : 1-D parameter vector passed through to predict_one_rating().
    """
    ratings_predict = [predict_one_rating(user_index, item_index, theta)
                       for user_index, item_index in data[:, :2]]
    gt_ratings = data[:, 2]
    # ground truth first, predictions second, matching get_rmse's signature
    return get_rmse(gt_ratings, ratings_predict)

In [None]:
def progress_callback(theta):
    """Print the train and validation RMSE for the parameter vector `theta`."""
    reports = (("train rmse:", train_rating_array),
               ("valid rmse:", valid_rating_array))
    for label, rating_array in reports:
        print(label, test_and_get_rmse(rating_array, theta))

In [None]:
# fit theta with L-BFGS-B using the analytic gradient; `minimize` is imported
# in the notebook's top import cell.  progress_callback prints the train and
# valid rmse whenever the optimizer invokes it.
res = minimize(objective,
               theta,
               method='L-BFGS-B',
               jac=gradient,
               options={'disp': True, 'maxiter': 200},
               callback=progress_callback)