In [50]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from collections import defaultdict

In [51]:
# load raw data (and report how long the load took, in seconds)
start_time = time.time()
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# file that comes from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open("all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
print(time.time() - start_time)

15.4132521152


In [52]:
# split the records by position: the first 900000 entries form the training
# set, everything after them is held out for validation
train_data, valid_data = all_data[:900000], all_data[900000:]

In [53]:
# pre-process 0: build id <-> index infrastructure

# distinct user ids and item ids seen anywhere in the data, in sorted order
user_ids = sorted({d['reviewerID'] for d in all_data})
item_ids = sorted({d['itemID'] for d in all_data})

# number of distinct users and items
num_users, num_items = len(user_ids), len(item_ids)

# item lookup tables: id -> position in item_ids, and position -> id
item_id_map_index = {item_id: index for index, item_id in enumerate(item_ids)}
item_index_map_id = dict(enumerate(item_ids))

# user lookup tables: id -> position in user_ids, and position -> id
user_id_map_index = {user_id: index for index, user_id in enumerate(user_ids)}
user_index_map_id = dict(enumerate(user_ids))

In [54]:
# pre-process 1: build train_rating_array, valid_rating_array, all_rating_array

def _build_rating_array(data):
    """Convert review records into an integer array of rating triples.

    Parameters
    ----------
    data : iterable of dict
        Records providing the keys 'reviewerID', 'itemID' and 'rating'.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per record, laid out as
        [user_index, item_index, rating].  The indices come from
        user_id_map_index / item_id_map_index.  An empty input yields an
        array of shape (0, 3) so that column slicing keeps working.

    Raises
    ------
    KeyError
        If a record refers to a user or item id missing from the maps.
    """
    rows = [[user_id_map_index[record['reviewerID']],
             item_id_map_index[record['itemID']],
             record['rating']]
            for record in data]
    # NOTE(review): astype(int) truncates any fractional part of a rating;
    # presumably ratings are whole numbers -- confirm against the data.
    return np.array(rows).astype(int).reshape(-1, 3)


train_rating_array = _build_rating_array(train_data)
valid_rating_array = _build_rating_array(valid_data)
all_rating_array = _build_rating_array(all_data)

In [80]:
def get_mse(ratings, ratings_predict):
    """Return the mean squared error between two equally shaped sequences.

    Both arguments are converted to float arrays before the element-wise
    difference is squared and averaged.
    """
    actual = np.array(ratings).astype('float')
    predicted = np.array(ratings_predict).astype('float')
    squared_error = (actual - predicted) ** 2.
    return np.mean(squared_error)

def get_rmse(ratings, ratings_predict):
    """Return the root mean squared error: the square root of get_mse."""
    mse = get_mse(ratings, ratings_predict)
    return mse ** 0.5

In [81]:
# global variables
K = 5      # width of each gamma (latent factor) row, see init_theta / unpack
lam = 0.1  # weight of the squared-norm penalty in objective / gradient
# global offset of the model: mean of the rating column of the training set
alpha = train_rating_array[:, 2].mean()
# number of distinct users / items; sizes the parameter vectors
user_num, item_num = len(user_ids), len(item_ids)

In [82]:
def init_theta(K, seed=None):
    """Draw a random initial parameter vector for the model.

    Every parameter is sampled from a normal distribution with mean 0 and
    standard deviation 0.5.

    Parameters
    ----------
    K : int
        Number of columns of each gamma matrix.  `unpack` reshapes with the
        module-level K, so pass that same value here.
    seed : int or None, optional
        When given, a dedicated np.random.RandomState(seed) is used so the
        initialisation is reproducible.  With the default None the global
        NumPy random state is used.

    Returns
    -------
    numpy.ndarray
        Flat vector laid out by `pack` as
        [beta_users | beta_items | gamma_users | gamma_items], of length
        user_num + item_num + (user_num + item_num) * K.
    """
    # np.random (module) and RandomState both expose the same normal() API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    beta_users = rng.normal(0, 0.5, (user_num, ))
    beta_items = rng.normal(0, 0.5, (item_num, ))
    gamma_users = rng.normal(0, 0.5, (user_num, K))
    gamma_items = rng.normal(0, 0.5, (item_num, K))
    return pack(beta_users, beta_items, gamma_users, gamma_items)

def pack(beta_users, beta_items, gamma_users, gamma_items):
    """Flatten the four parameter arrays into a single 1-D vector.

    The pieces are raveled and joined in the order
    beta_users, beta_items, gamma_users, gamma_items.
    """
    pieces = (beta_users, beta_items, gamma_users, gamma_items)
    return np.concatenate([piece.ravel() for piece in pieces])

def unpack(theta):
    """Split a flat parameter vector back into its four parameter arrays.

    Inverse of `pack`; segment sizes come from the module-level user_num,
    item_num and K.

    Returns
    -------
    list
        [beta_users, beta_items, gamma_users, gamma_items] with shapes
        (user_num,), (item_num,), (user_num, K) and (item_num, K).
    """
    # end offsets of the first three segments inside theta
    beta_users_end = user_num
    beta_items_end = beta_users_end + item_num
    gamma_users_end = beta_items_end + user_num * K

    beta_users = theta[:beta_users_end]
    beta_items = theta[beta_users_end:beta_items_end]
    gamma_users = theta[beta_items_end:gamma_users_end].reshape((user_num, K))
    # whatever remains after the first three segments is gamma_items
    gamma_items = theta[gamma_users_end:].reshape((item_num, K))
    return [beta_users, beta_items, gamma_users, gamma_items]

In [83]:
# init theta: one flat vector holding all model parameters
theta = init_theta(K)

# sanity check: unpacking and re-packing must reproduce theta exactly
a, b, c, d = unpack(theta)
theta_new = pack(a, b, c, d)
assert np.array_equal(theta, theta_new)

In [84]:
def objective(theta):
    """Regularised squared-error cost of the model on the training set.

    For every row [user_index, item_index, rating] of train_rating_array
    the prediction is
        alpha + beta_users[u] + beta_items[i] + dot(gamma_users[u], gamma_items[i])
    and the returned value is
        0.5 * (sum of squared residuals + lam * ||theta||^2).

    Parameters
    ----------
    theta : numpy.ndarray
        Flat parameter vector in the layout produced by `pack`.

    Returns
    -------
    float
        The scalar cost.
    """
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)

    user_idx = train_rating_array[:, 0]
    item_idx = train_rating_array[:, 1]
    ratings = train_rating_array[:, 2].astype(float)

    # Evaluate all training rows at once instead of looping in Python;
    # the row-wise dot product is a product followed by a sum over axis 1.
    interactions = np.sum(gamma_users[user_idx] * gamma_items[item_idx], axis=1)
    predictions = (float(alpha)
                   + beta_users[user_idx]
                   + beta_items[item_idx]
                   + interactions)
    residuals = predictions - ratings

    cost = np.dot(residuals, residuals)
    # squared L2 norm of all parameters, weighted by lam
    cost += lam * np.dot(theta, theta)
    return 0.5 * cost

In [85]:
def gradient(theta):
    """Gradient of `objective` with respect to theta.

    Parameters
    ----------
    theta : numpy.ndarray
        Flat parameter vector in the layout produced by `pack`.

    Returns
    -------
    numpy.ndarray
        Flat gradient vector with the same layout and length as theta.
    """
    # unpack
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)

    user_idx = train_rating_array[:, 0]
    item_idx = train_rating_array[:, 1]
    ratings = train_rating_array[:, 2].astype(float)

    # residual (prediction - rating) of every training row, computed at once
    interactions = np.sum(gamma_users[user_idx] * gamma_items[item_idx], axis=1)
    residuals = (float(alpha)
                 + beta_users[user_idx]
                 + beta_items[item_idx]
                 + interactions
                 - ratings)

    # init gradient buffers
    beta_users_grad = np.zeros((user_num, ))
    beta_items_grad = np.zeros((item_num, ))
    gamma_users_grad = np.zeros((user_num, K))
    gamma_items_grad = np.zeros((item_num, K))

    # accumulate gradients; np.add.at is required because the same user or
    # item index occurs in many rows, and a plain `buf[idx] += values` would
    # only apply one contribution per repeated index
    np.add.at(beta_users_grad, user_idx, residuals)
    np.add.at(beta_items_grad, item_idx, residuals)
    scaled = residuals[:, np.newaxis]
    np.add.at(gamma_users_grad, user_idx, scaled * gamma_items[item_idx])
    np.add.at(gamma_items_grad, item_idx, scaled * gamma_users[user_idx])

    # pack
    grad = pack(beta_users_grad,
                beta_items_grad,
                gamma_users_grad,
                gamma_items_grad)
    # regularization gradient: derivative of 0.5 * lam * ||theta||^2
    return grad + lam * theta

In [86]:
def predict_one_rating(user_index, item_index, theta):
    """Predict the rating of one (user, item) pair under parameters theta.

    The indices are cast to int before use.  The prediction is
    alpha + beta_user + beta_item + dot(gamma_user, gamma_item).
    """
    u, i = int(user_index), int(item_index)
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)

    # global offset plus the user and item bias terms
    baseline = alpha + beta_users[u] + beta_items[i]
    # user/item interaction from the gamma rows
    interaction = np.dot(gamma_users[u], gamma_items[i])
    return baseline + interaction

In [87]:
def test_and_get_rmse(data, theta):
    """Return the RMSE of the model's predictions on a rating array.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose rows are [user_index, item_index, rating].
    theta : numpy.ndarray
        Flat parameter vector in the layout produced by `pack`.

    Returns
    -------
    float
        Root mean squared error between the ratings in `data` and the
        predictions alpha + beta_user + beta_item + dot(gamma_user, gamma_item)
        (the same formula as predict_one_rating).
    """
    # Unpack once and predict every row in a single vectorised expression
    # rather than unpacking theta again for each row.
    beta_users, beta_items, gamma_users, gamma_items = unpack(theta)
    user_idx = data[:, 0].astype(int)
    item_idx = data[:, 1].astype(int)
    ratings_predict = (alpha
                       + beta_users[user_idx]
                       + beta_items[item_idx]
                       + np.sum(gamma_users[user_idx] * gamma_items[item_idx],
                                axis=1))
    ratings = data[:, 2]
    # argument order follows get_rmse's signature: (ratings, ratings_predict)
    return get_rmse(ratings, ratings_predict)

In [88]:
def progress_callback(theta):
    """Print the train and validation RMSE for the parameter vector theta.

    Passed as the `callback` argument of scipy.optimize.minimize.
    """
    reports = (("train rmse:", train_rating_array),
               ("valid rmse:", valid_rating_array))
    for label, rating_array in reports:
        print(label, test_and_get_rmse(rating_array, theta))

In [89]:
from scipy.optimize import minimize

# Fit the model: minimise `objective` from the initial `theta` with L-BFGS-B,
# using the analytic `gradient`, and report the train/valid RMSE through
# progress_callback.
# Alternative kept from an earlier run: the same call without the callback
# and with 'maxiter': 200.
res = minimize(
    fun=objective,
    x0=theta,
    method='L-BFGS-B',
    jac=gradient,
    callback=progress_callback,
    options={'disp': True, 'maxiter': 30},
)

train rmse: 1.29650094144
valid rmse: 1.30389540733
train rmse: 1.17836835016
valid rmse: 1.20858732859
train rmse: 1.06475543159
valid rmse: 1.12014975647
train rmse: 0.998287729727
valid rmse: 1.07068844843
train rmse: 0.928642141916
valid rmse: 1.02719702524
train rmse: 0.871930025005
valid rmse: 0.979331873854
train rmse: 0.855027444141
valid rmse: 0.966213190552
train rmse: 0.829484229965
valid rmse: 0.948542254602
train rmse: 0.825875629486
valid rmse: 0.956138577208
train rmse: 0.789287216926
valid rmse: 0.924402873369
train rmse: 0.779613289794
valid rmse: 0.917921912801
train rmse: 0.768115943628
valid rmse: 0.912069557618
train rmse: 0.752295422855
valid rmse: 0.904906905998
train rmse: 0.74411602731
valid rmse: 0.903449279289
train rmse: 0.728953255333
valid rmse: 0.899319134626
train rmse: 0.722057614637
valid rmse: 0.897911046667
train rmse: 0.714289723952
valid rmse: 0.897876705598
train rmse: 0.706690318592
valid rmse: 0.898989272496
train rmse: 0.699305539342
valid rmse

KeyboardInterrupt: 