In [1]:
# my version of the algorithm used in the baseline

from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os
from __future__ import print_function

data_root = os.path.expanduser("~") + '/data/CSE255/'

In [2]:
start_time = time.time()
all_data = pickle.load(open(data_root + "all_data.pickle", "rb"))
print(time.time() - start_time)

17.0732281208


In [3]:
all_size = len(all_data)
train_size = 900000
# train_size = all_size
valid_size = 100000
train_data = all_data[:train_size]
valid_data = all_data[all_size - valid_size:]

In [21]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    return np.sum(np.fabs(helpfuls_predict - helpfuls.astype(float))) / helpfuls.shape[0]

def get_valid_mae(valid_data, alpha, beta_us, beta_is):
    helpfuls = np.array([float(d['helpful']['nHelpful']) for d in valid_data])
    helpfuls_predict = np.array([predict_helpful(d, alpha, beta_us, beta_is) for d in valid_data])
    return get_mae(helpfuls, helpfuls_predict)

In [22]:
# pre-process 0: build id <-> index infastructure

# get all items and users
item_ids = sorted(list(set([d['itemID'] for d in all_data])))
user_ids = sorted(list(set([d['reviewerID'] for d in all_data])))

# user and item numbers
num_items = len(item_ids)
num_users = len(user_ids)

# build id <-> index map
item_id_map_index = dict()
item_index_map_id = dict()
for index, item_id in enumerate(item_ids):
    item_id_map_index[item_id] = index
    item_index_map_id[index] = item_id
    
user_id_map_index = dict()
user_index_map_id = dict()
for index, user_id in enumerate(user_ids):
    user_id_map_index[user_id] = index
    user_index_map_id[index] = user_id

In [23]:
# pre-process 1: build train_ratio_array, valid_ratio_array

def get_ratio(d):
    return float(d['helpful']['nHelpful']) / float(d['helpful']['outOf'])

# build array [user_index, item_index, ratio]
train_ratio_array = []
for d in train_data:
    user_index = user_id_map_index[d['reviewerID']]
    item_index = item_id_map_index[d['itemID']]
    if float(d['helpful']['outOf']) != 0:
        ratio = get_ratio(d)
        train_ratio_array.append([user_index, item_index, ratio])
train_ratio_array = np.array(train_ratio_array)

# build array [user_index, item_index, ratio]
valid_ratio_array = []
for d in valid_data:
    user_index = user_id_map_index[d['reviewerID']]
    item_index = item_id_map_index[d['itemID']]
    if float(d['helpful']['outOf']) != 0:
        ratio = get_ratio(d)
        valid_ratio_array.append([user_index, item_index, ratio])
valid_ratio_array = np.array(valid_ratio_array)

# build array [user_index, item_index, ratio]
all_ratio_array = []
for d in all_data:
    user_index = user_id_map_index[d['reviewerID']]
    item_index = item_id_map_index[d['itemID']]
    if float(d['helpful']['outOf']) != 0:
        ratio = get_ratio(d)
        all_ratio_array.append([user_index, item_index, ratio])
all_ratio_array = np.array(all_ratio_array)

In [31]:
# utility and update functions

def get_valid_mse(lam, alpha, beta_us, beta_is, ratio_array, valid_ratio_array):
    predicts = alpha + beta_us[valid_ratio_array[:, 0].astype(int)] + beta_is[valid_ratio_array[:, 1].astype(int)]
    ratios = valid_ratio_array[:, 2].astype(float)
    return (1. / valid_ratio_array.shape[0]) * np.sum((predicts - ratios) ** 2.0)

def get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    predicts = alpha + beta_us[train_ratio_array[:, 0].astype(int)] + beta_is[train_ratio_array[:, 1].astype(int)]
    ratios = train_ratio_array[:, 2].astype(float)
    return np.sum((predicts - ratios) ** 2.) + lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))

def alpha_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    sum_Rui = np.sum(train_ratio_array[:, 2])
    sum_beta_u = np.sum(beta_us[train_ratio_array[:, 0].astype(int)]) # fancy indexing
    sum_beta_i = np.sum(beta_is[train_ratio_array[:, 1].astype(int)]) # fancy indexing
    return (sum_Rui - sum_beta_u - sum_beta_i) / train_ratio_array.shape[0]

def beta_us_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    new_beta_us = np.zeros_like(beta_us)
    for user_index in xrange(num_users):
        # [the set of items] reviewed by user u
        Iu = Ruis[user_index].keys()
        Iu_size = len(Iu)
        # sums
        sum_Rui = np.sum(Ruis[user_index].values())
        sum_alpha = Iu_size * alpha
        sum_beta_i = np.sum(beta_is[Iu])
        # write result
        new_beta_us[user_index] = float(sum_Rui - sum_alpha - sum_beta_i) / (lam + Iu_size)
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    new_beta_is = np.zeros_like(beta_is)
    for item_index in xrange(num_items):
        # [the set of users] reviewd item i
        Ui = Rius[item_index].keys()
        Ui_size = len(Ui)
        # sums
        sum_Rui = np.sum(Rius[item_index].values())
        sum_alpha = Ui_size * alpha
        sum_beta_u = np.sum(beta_us[Ui])
        # write result
        new_beta_is[item_index] = float(sum_Rui - sum_alpha - sum_beta_u) / (lam + Ui_size)
    return new_beta_is

def train_and_eval(max_iter,
                   lam, alpha, beta_us, beta_is,
                   train_ratio_array, valid_ratio_array, valid_data,
                   print_step = False):

    # build Mapping of Ruis and Rius
    Ruis = defaultdict(dict)
    Rius = defaultdict(dict)

    # Notes:
    # Iu = Ruis[user_index].keys() # [the set of items] reviewed by user u
    # Ui = Ruis[item_index].keys() # [the set of users] reviewed item i

    for t in train_ratio_array:
        user_index = t[0]
        item_index = t[1]
        ratio = t[2]
        Ruis[user_index][item_index] = ratio
        Rius[item_index][user_index] = ratio

    # train on this dataset
    for i in xrange(max_iter):
        alpha = alpha_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        if print_step:
            cost = get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
            valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is,
                                      train_ratio_array, valid_ratio_array)
            valid_mae = get_valid_mae(valid_data, alpha, beta_us, beta_is)
            print(i, alpha, np.mean(beta_us), np.mean(beta_is), cost, valid_mse, valid_mae)

    cost = get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
    valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is, train_ratio_array, valid_ratio_array)

    return(cost, valid_mse, alpha, beta_us, beta_is)

def predict_helpful(d, alpha, beta_us, beta_is):
    user_id = d['reviewerID']
    item_id = d['itemID']
    outof = float(d['helpful']['outOf'])
    ratio = alpha + beta_us[user_id_map_index[user_id]] + beta_is[item_id_map_index[item_id]]
    ratio = min(5.0, ratio)
    ratio = max(0.0, ratio)
    helpful_predict = ratio * outof
    
    return helpful_predict

In [34]:
# fit linear model: ratio(u, i) = alpha + beta_u + beta_i

# parameters
max_iter = 30
lam = 1.0
alpha = 0.7704
beta_us = np.random.normal(0, 0.5, (num_users,))
beta_is = np.random.normal(0, 0.5, (num_items,))

cost, valid_mse, alpha, beta_us, beta_is = train_and_eval(max_iter, 
                                                          lam, alpha, beta_us, beta_is, 
                                                          train_ratio_array, valid_ratio_array, valid_data,
                                                          print_step=True)

0 0.745353848972 -0.0145881555102 0.00875687477617 41840.9328846 0.112306128071 0.774115423878
1 0.746048669228 -0.0127770609623 0.00956923584893 34552.2232366 0.0939125888091 0.685657955272
2 0.746807946832 -0.0131928454579 0.00886756892515 34361.9194486 0.0934033256027 0.682451768427
3 0.747511550192 -0.0135522725803 0.00778264668251 34333.7012457 0.0933302383421 0.681402556252
4 0.748129069649 -0.0136255763937 0.00660757437299 34321.3906151 0.0932982836038 0.680710114535
5 0.748653352185 -0.0134611448797 0.00544412285244 34314.1388249 0.0932792186206 0.680201546818
6 0.74908531986 -0.0131324403296 0.00433532949666 34309.4183305 0.093266674903 0.679814341853
7 0.749429309563 -0.0126979240306 0.00330104253613 34306.1816021 0.093257941424 0.679507871883
8 0.749691233054 -0.0121990507425 0.00234994071764 34303.8831614 0.0932515763703 0.679259004656
9 0.749877690718 -0.0116643334393 0.00148461141377 34302.2056658 0.0932467447065 0.679053844586
10 0.749995488232 -0.0111132631518 0.0007041

In [38]:
get_valid_mae(valid_data, alpha, beta_us, beta_is)

0.6779338574730378