In [None]:
# my version of the algorithm used in the baseline

from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os
from __future__ import print_function

# Directory holding the CSE255 data files, under the current user's home.
# The trailing slash is kept so file names can be appended by concatenation.
data_root = "".join([os.path.expanduser("~"), '/data/CSE255/'])

In [None]:
# Load the pickled data set and print how many seconds the load took.
# NOTE(security): unpickling can execute arbitrary code, so only load an
# all_data.pickle that you generated yourself or otherwise trust.
start_time = time.time()
# `with` closes the file handle as soon as loading finishes (or fails)
with open(data_root + "all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
print(time.time() - start_time)

In [None]:
# Hold-out split: the leading `train_size` records are used for training and
# the trailing `valid_size` records for validation.
all_size = len(all_data)
train_size, valid_size = 900000, 100000
# train_size = all_size  (alternative: train on every record)
train_data = all_data[:train_size]
valid_data = all_data[(all_size - valid_size):]

In [None]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    """Mean absolute error between actual and predicted helpful counts.

    `helpfuls` is cast to float, the absolute differences to
    `helpfuls_predict` are summed, and the sum is divided by the length of
    the first axis of `helpfuls`.
    """
    targets = helpfuls.astype(float)
    abs_errors = np.fabs(helpfuls_predict - targets)
    n_samples = helpfuls.shape[0]
    return np.sum(abs_errors) / n_samples

def get_valid_mae(valid_data, alpha, beta_us, beta_is):
    """Mean absolute error of the predicted helpful counts over `valid_data`.

    For every review the true `nHelpful` count is compared with the value
    returned by `predict_helpful` for the given model parameters; the
    errors are aggregated by `get_mae`.
    """
    targets = []
    predictions = []
    for review in valid_data:
        targets.append(float(review['helpful']['nHelpful']))
        predictions.append(predict_helpful(review, alpha, beta_us, beta_is))
    return get_mae(np.array(targets), np.array(predictions))

In [None]:
# pre-process 0: build id <-> index infrastructure

# every distinct item / user id that occurs in the data, in sorted order
item_ids = sorted({d['itemID'] for d in all_data})
user_ids = sorted({d['reviewerID'] for d in all_data})

# number of distinct items and users
num_items, num_users = len(item_ids), len(user_ids)

# lookups in both directions; an id's index is its position in the sorted list
item_id_map_index = {item_id: index for index, item_id in enumerate(item_ids)}
item_index_map_id = dict(enumerate(item_ids))

user_id_map_index = {user_id: index for index, user_id in enumerate(user_ids)}
user_index_map_id = dict(enumerate(user_ids))

In [None]:
# pre-process 1: build train_ratio_array, valid_ratio_array, all_ratio_array

def get_ratio(d):
    """Return the helpful ratio of review `d`: nHelpful divided by outOf.

    Both counts are converted to float first. Raises ZeroDivisionError when
    `outOf` is zero, so callers must filter such reviews beforehand.
    """
    votes = d['helpful']
    return float(votes['nHelpful']) / float(votes['outOf'])

def _build_ratio_array(data):
    """Convert review records into an (n, 3) float array.

    Each row is [user_index, item_index, ratio], where the indices come from
    `user_id_map_index` / `item_id_map_index` and the ratio from `get_ratio`.
    Reviews whose `outOf` is zero are skipped because their ratio is
    undefined. The result always has three columns, even when no review
    qualifies, so column slicing such as `array[:, 0]` keeps working.
    """
    rows = []
    for d in data:
        # ids are looked up for every record, so an unknown id fails loudly
        user_index = user_id_map_index[d['reviewerID']]
        item_index = item_id_map_index[d['itemID']]
        if float(d['helpful']['outOf']) != 0:
            rows.append([user_index, item_index, get_ratio(d)])
    return np.array(rows, dtype=float).reshape(-1, 3)

# arrays of [user_index, item_index, ratio] for each data split
train_ratio_array = _build_ratio_array(train_data)
valid_ratio_array = _build_ratio_array(valid_data)
all_ratio_array = _build_ratio_array(all_data)

In [None]:
# utility and update functions

def get_valid_mse(lam, alpha, beta_us, beta_is, ratio_array, valid_ratio_array):
    """Mean squared error of the bias model on `valid_ratio_array`.

    Each row of `valid_ratio_array` is [user_index, item_index, ratio]; the
    prediction for a row is alpha + beta_us[user] + beta_is[item].
    `lam` and `ratio_array` are accepted but not used.
    """
    user_idx = valid_ratio_array[:, 0].astype(int)
    item_idx = valid_ratio_array[:, 1].astype(int)
    targets = valid_ratio_array[:, 2].astype(float)
    residuals = (alpha + beta_us[user_idx] + beta_is[item_idx]) - targets
    n_rows = valid_ratio_array.shape[0]
    return (1. / n_rows) * np.sum(residuals ** 2.0)

def get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    """Regularised training objective of the bias model.

    Returns the sum of squared prediction errors over `train_ratio_array`
    plus `lam` times the squared L2 norms of `beta_us` and `beta_is`.
    `Ruis` and `Rius` are accepted but not used.
    """
    user_idx = train_ratio_array[:, 0].astype(int)
    item_idx = train_ratio_array[:, 1].astype(int)
    targets = train_ratio_array[:, 2].astype(float)
    residuals = (alpha + beta_us[user_idx] + beta_is[item_idx]) - targets
    squared_error = np.sum(residuals ** 2.)
    penalty = lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))
    return squared_error + penalty

def alpha_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    """Closed-form update of the global offset with both bias vectors fixed.

    Returns the mean over all training rows of
    (ratio - beta_us[user] - beta_is[item]). The current `alpha`, `lam`,
    `Ruis` and `Rius` are accepted but not used.
    """
    user_idx = train_ratio_array[:, 0].astype(int)
    item_idx = train_ratio_array[:, 1].astype(int)
    # fancy indexing repeats a bias once per training row that references it
    total = (np.sum(train_ratio_array[:, 2])
             - np.sum(beta_us[user_idx])
             - np.sum(beta_is[item_idx]))
    return total / train_ratio_array.shape[0]

def beta_us_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    """Closed-form update of every user bias with alpha and beta_is fixed.

    For a user u who reviewed the item set I_u:

        beta_u = sum_{i in I_u} (R_ui - alpha - beta_i) / (lam + |I_u|)

    Parameters:
        lam: regularisation strength.
        alpha: global offset.
        beta_us: current user biases; only its shape and dtype are used.
            One new bias is computed per entry.
        beta_is: current item biases, indexed by item index.
        train_ratio_array: accepted but not used.
        Ruis: mapping user_index -> {item_index: ratio}. Keys may be ints or
            integer-valued floats; they are cast to int before indexing.
        Rius: accepted but not used.

    Returns:
        A new array shaped like `beta_us`; the input is not modified. Users
        with no reviews get 0.
    """
    new_beta_us = np.zeros_like(beta_us)
    for user_index in range(beta_us.shape[0]):
        # .get() avoids inserting an empty entry when Ruis is a defaultdict
        reviews = Ruis.get(user_index, {})
        Iu_size = len(reviews)
        denominator = lam + Iu_size
        if denominator == 0:
            # no reviews and no regularisation: the bias is undetermined, keep 0
            continue
        # NumPy only accepts integer index arrays, so cast the item keys
        item_indices = np.array([int(item_index) for item_index in reviews], dtype=int)
        # sums
        sum_Rui = np.sum(list(reviews.values()))
        sum_alpha = Iu_size * alpha
        sum_beta_i = np.sum(beta_is[item_indices])
        # write result
        new_beta_us[user_index] = float(sum_Rui - sum_alpha - sum_beta_i) / denominator
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius):
    """Closed-form update of every item bias with alpha and beta_us fixed.

    For an item i reviewed by the user set U_i:

        beta_i = sum_{u in U_i} (R_ui - alpha - beta_u) / (lam + |U_i|)

    Parameters:
        lam: regularisation strength.
        alpha: global offset.
        beta_us: current user biases, indexed by user index.
        beta_is: current item biases; only its shape and dtype are used.
            One new bias is computed per entry.
        train_ratio_array: accepted but not used.
        Ruis: accepted but not used.
        Rius: mapping item_index -> {user_index: ratio}. Keys may be ints or
            integer-valued floats; they are cast to int before indexing.

    Returns:
        A new array shaped like `beta_is`; the input is not modified. Items
        with no reviews get 0.
    """
    new_beta_is = np.zeros_like(beta_is)
    for item_index in range(beta_is.shape[0]):
        # .get() avoids inserting an empty entry when Rius is a defaultdict
        reviews = Rius.get(item_index, {})
        Ui_size = len(reviews)
        denominator = lam + Ui_size
        if denominator == 0:
            # no reviews and no regularisation: the bias is undetermined, keep 0
            continue
        # NumPy only accepts integer index arrays, so cast the user keys
        user_indices = np.array([int(user_index) for user_index in reviews], dtype=int)
        # sums
        sum_Rui = np.sum(list(reviews.values()))
        sum_alpha = Ui_size * alpha
        sum_beta_u = np.sum(beta_us[user_indices])
        # write result
        new_beta_is[item_index] = float(sum_Rui - sum_alpha - sum_beta_u) / denominator
    return new_beta_is

def train_and_eval(max_iter,
                   lam, alpha, beta_us, beta_is,
                   train_ratio_array, valid_ratio_array, valid_data,
                   print_step = False):
    """Fit alpha, beta_us and beta_is by alternating closed-form updates.

    Each iteration updates alpha, then the user biases, then the item
    biases, printing the validation MAE after every individual update.

    Parameters:
        max_iter: number of update rounds.
        lam: regularisation strength passed to the update and cost functions.
        alpha, beta_us, beta_is: initial model parameters.
        train_ratio_array: rows of [user_index, item_index, ratio] to fit on.
        valid_ratio_array: rows of the same layout used for validation MSE.
        valid_data: raw review records used for validation MAE.
        print_step: when true, also print a per-iteration summary line
            (iteration, alpha, mean biases, cost, valid MSE, valid MAE).

    Returns:
        (cost, valid_mse, alpha, beta_us, beta_is) after the last iteration.
    """
    # print init valid mae
    print('init valid mae', get_valid_mae(valid_data, alpha, beta_us, beta_is))

    # build Mapping of Ruis and Rius
    Ruis = defaultdict(dict)
    Rius = defaultdict(dict)

    # Notes:
    # Iu = Ruis[user_index].keys() # [the set of items] reviewed by user u
    # Ui = Rius[item_index].keys() # [the set of users] who reviewed item i

    for row in train_ratio_array:
        # rows come from a float array; cast the indices so the keys are
        # plain ints that can later be used to index the bias arrays
        user_index = int(row[0])
        item_index = int(row[1])
        ratio = row[2]
        Ruis[user_index][item_index] = ratio
        Rius[item_index][user_index] = ratio

    # train on this dataset
    for iteration in range(max_iter):
        alpha = alpha_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        print('alpha valid mae', get_valid_mae(valid_data, alpha, beta_us, beta_is))
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        print('beta_us valid mae', get_valid_mae(valid_data, alpha, beta_us, beta_is))
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
        print('beta_is valid mae', get_valid_mae(valid_data, alpha, beta_us, beta_is))
        if print_step:
            cost = get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
            valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is,
                                      train_ratio_array, valid_ratio_array)
            valid_mae = get_valid_mae(valid_data, alpha, beta_us, beta_is)
            print(iteration, alpha, np.mean(beta_us), np.mean(beta_is), cost, valid_mse, valid_mae)

    cost = get_cost(lam, alpha, beta_us, beta_is, train_ratio_array, Ruis, Rius)
    valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is, train_ratio_array, valid_ratio_array)

    return(cost, valid_mse, alpha, beta_us, beta_is)

def predict_helpful(d, alpha, beta_us, beta_is):
    """Predict the number of helpful votes for review `d`.

    The predicted helpful ratio is alpha + beta_u + beta_i for the review's
    user and item, clamped to [0, 1], and then multiplied by the review's
    `outOf` vote count. The clamp keeps the prediction between 0 and
    `outOf`, since a review cannot receive more helpful votes than votes.

    Parameters:
        d: review record with 'reviewerID', 'itemID' and d['helpful']['outOf'].
        alpha: global offset.
        beta_us: user biases indexed via `user_id_map_index`.
        beta_is: item biases indexed via `item_id_map_index`.

    Returns:
        The predicted helpful-vote count as a float.
    """
    user_id = d['reviewerID']
    item_id = d['itemID']
    outof = float(d['helpful']['outOf'])
    ratio = alpha + beta_us[user_id_map_index[user_id]] + beta_is[item_id_map_index[item_id]]
    # a ratio is a fraction of votes, so it must lie in [0, 1]
    ratio = min(1.0, ratio)
    ratio = max(0.0, ratio)
    helpful_predict = ratio * outof

    return helpful_predict

In [None]:
# fit linear model: ratio(u, i) = alpha + beta_u + beta_i

# parameters
max_iter = 30
lam = 1.0
alpha = 0.7704  # initial global offset; NOTE(review): presumably the overall helpful ratio - confirm
# Seeded generator so that "Restart & Run All" reproduces the same initial
# biases, and therefore the same printed training trace.
init_rng = np.random.RandomState(0)
beta_us = init_rng.normal(0, 0.5, (num_users,))
beta_is = init_rng.normal(0, 0.5, (num_items,))

# fit the model; alpha, beta_us and beta_is are replaced by the fitted values
cost, valid_mse, alpha, beta_us, beta_is = train_and_eval(max_iter,
                                                          lam, alpha, beta_us, beta_is,
                                                          train_ratio_array, valid_ratio_array, valid_data,
                                                          print_step=True)

In [None]:
# simulate alpha, beta_us, beta_is by hand

# get global average: total helpful votes over total votes in the training data
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data])
train_outofs =  np.array([d['helpful']['outOf'] for d in train_data])
train_avg_ratio = np.sum(train_helpfuls) / np.sum(train_outofs.astype(float))

# get average for a user: accumulate each user's vote totals
users_outof = dict()
users_helpful = dict()

for d in train_data:
    user_id = d['reviewerID']
    users_outof[user_id] = users_outof.get(user_id, 0.0) + float(d['helpful']['outOf'])
    users_helpful[user_id] = users_helpful.get(user_id, 0.0) + float(d['helpful']['nHelpful'])

users_ratio = dict()
for user_id in users_outof:
    if users_outof[user_id] != 0:
        users_ratio[user_id] = users_helpful[user_id] / users_outof[user_id]
    else:
        # a user with no votes falls back to the global average ratio; store
        # it in users_ratio so the vote totals in users_outof stay intact
        users_ratio[user_id] = train_avg_ratio

# simulate!
alpha = train_avg_ratio
beta_us = np.zeros((num_users,))
beta_is = np.zeros((num_items,))

# a user's bias is the gap between their own ratio and the global average
for user_id, ratio_value in users_ratio.items():
    beta_us[user_id_map_index[user_id]] = ratio_value - alpha

# MAE on the validation data, then on the training data
print(get_valid_mae(valid_data, alpha, beta_us, beta_is))
print(get_valid_mae(train_data, alpha, beta_us, beta_is))

In [None]:
# Warm start: run the iterative updates from the current alpha, beta_us and
# beta_is instead of a fresh initialisation.
# Doesn't quite work, since the objective is not the same.
max_iter, lam = 30, 1.0

cost, valid_mse, alpha, beta_us, beta_is = train_and_eval(
    max_iter,
    lam, alpha, beta_us, beta_is,
    train_ratio_array, valid_ratio_array, valid_data,
    print_step=True,
)