In [2]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os

# Directory that holds the course data files, under the user's home directory.
# The trailing slash is required: the load cell builds the file path by plain
# string concatenation (data_root + "all_data.pickle").
data_root = os.path.expanduser("~") + '/data/CSE255/'
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# Load the pickled data set and print how many seconds the load took.
# NOTE(review): unpickling can execute arbitrary code, so only point this at
# a pickle file you created yourself or otherwise trust.
start_time = time.time()
# The context manager closes the file handle as soon as unpickling finishes,
# instead of leaving it open until the garbage collector gets to it.
with open(data_root + "all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
print(time.time() - start_time)

17.0889589787


In [4]:
# Positional hold-out split: the first `train_size` records are used for
# training and everything after them is kept for validation.
train_size = 900000
all_size = len(all_data)
train_data, valid_data = all_data[:train_size], all_data[train_size:]

In [5]:
# Print the first training record to see which fields a single entry carries.
datum = train_data[0]
print(datum)

{'itemID': 'I572782694', 'rating': 5.0, 'helpful': {'nHelpful': 0, 'outOf': 0}, 'reviewText': 'favorite of the series...May not have been as steamy as some of the others...but the characters, their depth, and believability were amazing.  wanted to curl up with Devlin and make it all better(wink wink). an amazing series...found Laura Kate when I stumbled onto Hearts in Darkness(one of my all time faves)...this series ranks up there with my Kresley Cole and Gena Showalter favorites.', 'reviewerID': 'U243261361', 'summary': 'Loved it', 'unixReviewTime': 1399075200, 'category': [['Books']], 'reviewTime': '05 3, 2014'}


In [56]:
def get_mae(helpfuls, helpfuls_predict):
    """Return the mean absolute error between predictions and true counts.

    Parameters
    ----------
    helpfuls : numpy.ndarray
        True values; cast to float before the comparison.
    helpfuls_predict : numpy.ndarray
        Predicted values, broadcastable against `helpfuls`.

    Returns
    -------
    float
        Sum of absolute differences divided by the length of the first
        axis of `helpfuls`.
    """
    actual = helpfuls.astype(float)
    abs_errors = np.fabs(helpfuls_predict - actual)
    n_samples = helpfuls.shape[0]
    return np.sum(abs_errors) / n_samples

In [58]:
# linear search best ratio
def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Grid-search the ratio that best maps `outofs` onto `helpfuls`.

    Every candidate ratio produced by ``np.arange(*search_range)`` is scored
    by the mean absolute error of ``outofs * ratio`` against `helpfuls`
    (via `get_mae`); the candidate with the smallest error is returned.
    On ties the first (smallest) candidate wins, as with ``np.argmin``.

    Parameters
    ----------
    helpfuls : numpy.ndarray
        True helpful counts.
    outofs : numpy.ndarray
        Total vote counts, aligned with `helpfuls`.
    search_range : tuple
        Arguments forwarded to ``np.arange`` (start, stop, step).

    Returns
    -------
    The best-scoring element of the candidate array.
    """
    candidates = np.arange(*search_range)
    best_index = np.argmin(
        [get_mae(helpfuls, outofs * candidate) for candidate in candidates]
    )
    return candidates[best_index]

# Global baseline: a single helpfulness ratio shared by every review.

# Fit the ratio on the training split.
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data])
train_outofs = np.array([d['helpful']['outOf'] for d in train_data])
train_avg_ratio = linear_search_ratio(train_helpfuls, train_outofs, search_range=(0.3, 1.0, 0.001))
print('optimal helpfulness ratio', train_avg_ratio)

# Mean absolute error of that ratio on the validation split.
valid_helpfuls = np.array([d['helpful']['nHelpful'] for d in valid_data])
valid_outofs = np.array([d['helpful']['outOf'] for d in valid_data])
# Predict with the ratio fitted above and score it with get_mae. Both names
# are defined in the cells above, so this cell runs on a fresh kernel and
# does not rely on variables left over from earlier interactive sessions.
valid_helpfuls_predict = valid_outofs * train_avg_ratio
print('valid mean_abs_error', get_mae(valid_helpfuls, valid_helpfuls_predict))

optimal helpfulness ratio 0.857
valid mean_abs_error 0.6626407


In [53]:
# Per-user and per-item helpfulness ratios.
#
# Both sections follow the same recipe, so the shared logic lives in two
# small helpers: group the vote counts by an id field, then fit one ratio
# per group, falling back to the global ratio when a group gives no usable fit.

# Candidate ratios searched for each group: 0.0 up to 1.0 in steps of 0.01.
GROUP_SEARCH_RANGE = (0.0, 1.01, 0.01)
# A fitted ratio below this value is discarded in favour of the fallback.
MIN_GROUP_RATIO = 0.1


def _group_votes(data, id_key):
    """Collect the vote counts of every record under its `id_key` value.

    Returns a pair of defaultdicts ``(outofs, helpfuls)`` mapping each id to
    the list of its records' 'outOf' and 'nHelpful' counts, stored as floats
    and in the order the records appear in `data`.
    """
    outofs = defaultdict(list)
    helpfuls = defaultdict(list)
    for d in data:
        group_id = d[id_key]
        outofs[group_id].append(float(d['helpful']['outOf']))
        helpfuls[group_id].append(float(d['helpful']['nHelpful']))
    return outofs, helpfuls


def _fit_group_ratios(group_outofs, group_helpfuls, fallback_ratio,
                      search_range=GROUP_SEARCH_RANGE,
                      min_ratio=MIN_GROUP_RATIO):
    """Fit one helpfulness ratio per group and return them as a dict.

    A group whose 'outOf' counts sum to zero has nothing to fit and receives
    `fallback_ratio`. Otherwise the ratio comes from `linear_search_ratio`;
    if that result is below `min_ratio` it is replaced by `fallback_ratio`.
    """
    ratios = dict()
    for group_id in group_outofs:
        if np.sum(group_outofs[group_id]) != 0:
            ratio = linear_search_ratio(np.array(group_helpfuls[group_id]),
                                        np.array(group_outofs[group_id]),
                                        search_range=search_range)
            if ratio < min_ratio:
                ratio = fallback_ratio
        else:
            ratio = fallback_ratio
        ratios[group_id] = ratio
    return ratios


# user ratio
users_outofs, users_helpfuls = _group_votes(train_data, 'reviewerID')
users_ratio = _fit_group_ratios(users_outofs, users_helpfuls, train_avg_ratio)

print('finish users ratio')

# item ratio
items_outofs, items_helpfuls = _group_votes(train_data, 'itemID')
items_ratio = _fit_group_ratios(items_outofs, items_helpfuls, train_avg_ratio)

print('finish items ratio')

In [59]:
# Display the fitted per-item ratios; as the cell's last expression the whole
# dict is rendered, which is long for a large item catalogue.
items_ratio

{'I149027464': 0.25,
 'I592950666': 0.92000000000000004,
 'I234759324': 0.5,
 'I451038723': 0.90000000000000002,
 'I149027465': 0.79000000000000004,
 'I757892241': 1.0,
 'I690027210': 1.0,
 'I051878450': 0.93000000000000005,
 'I848686579': 1.0,
 'I623631179': 0.87,
 'I368977740': 0.89000000000000001,
 'I734960315': 1.0,
 'I650332729': 0.83000000000000007,
 'I755935643': 0.91000000000000003,
 'I685431180': 0.87,
 'I236546080': 1.0,
 'I543680709': 0.91000000000000003,
 'I315824957': 0.66000000000000003,
 'I594028873': 0.87,
 'I980382150': 0.88,
 'I353035619': 1.0,
 'I404930015': 0.79000000000000004,
 'I700288839': 0.42999999999999999,
 'I019094800': 0.64000000000000001,
 'I038790941': 0.27000000000000002,
 'I403451902': 0.75,
 'I042595433': 0.91000000000000003,
 'I352467587': 1.0,
 'I332526546': 1.0,
 'I620971945': 1.0,
 'I770874890': 1.0,
 'I680942557': 0.80000000000000004,
 'I920638060': 0.83999999999999997,
 'I781832362': 0.93000000000000005,
 'I994226269': 0.85700000000000043,
 'I994

1882