In [3]:
# idea:
# - every user, every item, and global have an average
# - after averaging statistics, these averages are fixed
# - fit a regression model on these averages against the original training data to get the coefficients
# - thus it must be better than the provided baseline

In [19]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os

# Directory holding the pickled data sets, located under the current user's home
# directory. The trailing slash matters: file names are appended with plain `+`.
data_root = os.path.expanduser("~") + '/data/CSE255/'

import cvxopt as co
from l1 import l1

%matplotlib inline
import matplotlib.pyplot as plt

In [79]:
# Load the full pickled data set and report how long the load took (seconds).
start_time = time.time()
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(data_root + "all_data.pickle", "rb") as f:
    all_data = pickle.load(f)
print(time.time() - start_time)

20.086466074


In [80]:
all_size = len(all_data)
train_size = 900000
# train_size = all_size # uncomment this to produce test
valid_size = 100000

# Reviews whose outOf exceeds this threshold are treated as outliers and dropped.
outlier_outof = 4500

# Training set: the first `train_size` reviews, outliers removed.
# Filtering the slice (instead of popping by index) does not assume the slice
# really holds `train_size` items, so a smaller data set no longer raises IndexError.
train_data = [d for d in all_data[:train_size]
              if d['helpful']['outOf'] <= outlier_outof]

# Validation set: the last `valid_size` reviews, outliers removed.
# The start index is clamped at 0 so that a data set smaller than `valid_size`
# yields the whole data set rather than an accidental negative-index slice.
valid_data = [d for d in all_data[max(all_size - valid_size, 0):]
              if d['helpful']['outOf'] <= outlier_outof]

In [81]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    """Return the mean absolute error between true and predicted values.

    helpfuls         -- numpy array of true values (cast to float before use)
    helpfuls_predict -- predictions, broadcastable against `helpfuls`

    The summed absolute error is divided by the length of the first axis
    of `helpfuls`.
    """
    n_samples = helpfuls.shape[0]
    residuals = helpfuls_predict - helpfuls.astype(float)
    return np.fabs(residuals).sum() / n_samples

def get_valid_mae(valid_data, train_ratio_list):
    """Return the mean absolute error of `predict_helpful` over `valid_data`.

    valid_data       -- sequence of review dicts carrying d['helpful']['nHelpful']
    train_ratio_list -- ratio table handed through to `predict_helpful`
    """
    # True nHelpful counts, as floats.
    actual = np.array([float(review['helpful']['nHelpful']) for review in valid_data])
    # Model predictions for the same reviews, in the same order.
    predicted = np.array([predict_helpful(review, train_ratio_list)
                          for review in valid_data])
    return get_mae(actual, predicted)

In [82]:
# gather statistics about every outOf value
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data])
train_outofs = np.array([d['helpful']['outOf'] for d in train_data])

# Sorted distinct outOf values, plus -- for every review -- the position of its
# outOf value inside that sorted array (used as a group id below).
train_unique_outofs, outof_group = np.unique(train_outofs, return_inverse=True)

# Mean nHelpful per distinct outOf value: per-group sum divided by per-group count.
# A single bincount pass replaces one full-length boolean mask per distinct value.
train_unique_outofs_mean_helpfuls = (np.bincount(outof_group, weights=train_helpfuls)
                                     / np.bincount(outof_group))

# Helpfulness ratio = mean nHelpful / outOf. The ratio is undefined for outOf == 0
# and is pinned to 0 there. Entries are selected by value instead of assuming that
# outOf == 0 is always the first distinct value, so the smallest outOf keeps its
# real ratio even when the data holds no outOf == 0 review.
has_votes = train_unique_outofs > 0
train_unique_outofs_ratio = np.zeros(len(train_unique_outofs))
train_unique_outofs_ratio[has_votes] = (train_unique_outofs_mean_helpfuls[has_votes]
                                        / train_unique_outofs[has_votes])
# plt.plot(train_unique_outofs, train_unique_outofs_ratio)
# plt.plot(train_unique_outofs[:600], train_unique_outofs_ratio[:600])

In [156]:
# build a train ratio dict

# set the cut-off: any outOf larger than cut_off uses the global ratio
cut_off = 20
# convert both per-outOf statistics into plain Python lists
train_unique_outofs, train_unique_outofs_ratio = map(
    list, (train_unique_outofs, train_unique_outofs_ratio))

def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Grid-search the ratio `alpha` that minimises get_mae(helpfuls, outofs * alpha).

    helpfuls     -- numpy array of true nHelpful values
    outofs       -- numpy array of the matching outOf values
    search_range -- arguments forwarded to np.arange to build the candidate grid

    Returns the candidate with the smallest error (the first one on ties).
    """
    candidates = np.arange(*search_range)
    # Evaluate the error of every candidate ratio on the given data.
    candidate_errors = np.empty(len(candidates))
    for position, ratio in enumerate(candidates):
        candidate_errors[position] = get_mae(helpfuls, outofs * ratio)
    return candidates[np.argmin(candidate_errors)]

# training set global: a single ratio fitted on the reviews whose outOf exceeds cut_off
above_cut_off = train_outofs > cut_off  # computed once, reused for both arrays
train_avg_ratio = linear_search_ratio(train_helpfuls[above_cut_off],
                                      train_outofs[above_cut_off],
                                      search_range=(0.3, 1.0, 0.0001))
print('optimal helpfulness ratio', train_avg_ratio)

# Map each distinct training outOf value to its ratio. A dict lookup replaces a
# linear `list.index` scan and lets an absent value fall back gracefully.
ratio_by_outof = dict((int(outof), ratio)
                      for outof, ratio in zip(train_unique_outofs, train_unique_outofs_ratio))

# Ratio table indexed by outOf (entries 0 .. ratio_list_size - 1):
#   outOf <= cut_off -> the ratio observed for that exact outOf value, or the global
#                       ratio when that value never occurs in the training data
#                       (previously this case raised ValueError)
#   outOf >  cut_off -> the global ratio
ratio_list_size = 6000
train_ratio_list = [ratio_by_outof.get(i, train_avg_ratio) if i <= cut_off else train_avg_ratio
                    for i in range(ratio_list_size)]

optimal helpfulness ratio 0.8811


In [158]:
def predict_helpful(d, train_ratio_list):
    """Predict nHelpful for one review as ratio(outOf) * outOf.

    d                -- review dict carrying d['helpful']['outOf']
    train_ratio_list -- ratio table indexed by outOf

    An outOf at or beyond the end of the table reuses the table's last entry
    instead of raising IndexError. Returns the predicted nHelpful as a float.
    """
    outof = float(d['helpful']['outOf'])
    # Clamp the lookup to the last table entry so unseen, very large outOf values
    # (e.g. in unfiltered test data) still get a prediction.
    index = min(int(outof), len(train_ratio_list) - 1)
    return train_ratio_list[index] * outof

In [159]:
# mean absolute error of the ratio-table predictions on the validation set
print(get_valid_mae(valid_data=valid_data, train_ratio_list=train_ratio_list))

0.648657128046


In [160]:
import pickle

# Persist the ratio table, then read it back.
# Pickle files must be opened in binary mode: text mode fails on Python 3 and can
# corrupt the data on platforms that translate newlines.
with open('train_ratio_list.pickle', 'wb') as f:
    pickle.dump(train_ratio_list, f)

with open('train_ratio_list.pickle', 'rb') as f:
    train_ratio_list = pickle.load(f)

In [17]:
# ############ produce test ############

# # load helpful_data.json
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))

# # on test set
# test_helpfuls_predict = [predict_helpful(d, train_ratio_list) for d in test_data]

# # load 'pairs_Helpful.txt'
# # get header_str and user_item_outofs
# with open('pairs_Helpful.txt') as f:
#     # read and strip lines
#     lines = [l.strip() for l in f.readlines()]
#     # strip out the header
#     header_str = lines.pop(0)
#     # get a list of user_item_ids
#     user_item_outofs = [l.split('-') for l in lines]
#     user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]
    
# # make sure `data.json` and `pairs_Helpful.txt` the same order
# for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
#     assert d['reviewerID'] == user_id
#     assert d['itemID'] == item_id
#     assert d['helpful']['outOf'] == outof
    
# # write to output file
# f = open('predictions_Helpful.txt', 'w')
# print(header_str, file=f)
# for (user_id, item_id, outof), test_helpful_predict in zip(user_item_outofs, test_helpfuls_predict):
#     print('%s-%s-%s,%s' % (user_id, item_id, int(outof), test_helpful_predict), file=f)
# f.close()