In [1]:
# my version of the algorithm used in the baseline
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os

data_root = os.path.expanduser("~") + '/data/CSE255/'

In [2]:
start_time = time.time()
all_data = pickle.load(open(data_root + "all_data.pickle", "rb"))
print(time.time() - start_time)

43.0540509224


In [3]:
all_size = len(all_data)
train_size = 900000
# train_size = all_size
valid_size = 100000
train_data = all_data[:train_size]
valid_data = all_data[all_size - valid_size:]

In [4]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    return np.sum(np.fabs(helpfuls_predict - helpfuls.astype(float))) / helpfuls.shape[0]

def get_valid_mae(valid_data):
    helpfuls = np.array([float(d['helpful']['nHelpful']) for d in valid_data])
    helpfuls_predict = np.array([predict_helpful(d, train_avg_ratio, users_ratio, items_ratio) for d in valid_data])
    return get_mae(helpfuls, helpfuls_predict)

In [5]:
# get global average
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data])
train_outofs =  np.array([d['helpful']['outOf'] for d in train_data])
train_avg_ratio = np.sum(train_helpfuls) / np.sum(train_outofs.astype(float))
print('avg helpfulness ratio', train_avg_ratio)

# get average for a user
users_outof = dict()
users_helpful = dict()

for d in train_data:
    user_id = d['reviewerID']
    users_outof[user_id] = users_outof.get(user_id, 0.0) + float(d['helpful']['outOf'])
    users_helpful[user_id] = users_helpful.get(user_id, 0.0) + float(d['helpful']['nHelpful'])
    
users_ratio = dict()
for user_id in users_outof:
    if users_outof[user_id] != 0:
        users_ratio[user_id] = users_helpful[user_id] / users_outof[user_id]
    else:
        users_outof[user_id] = train_avg_ratio
        
# get average for a item
items_outof = dict()
items_helpful = dict()

for d in train_data:
    item_id = d['itemID']
    items_outof[item_id] = items_outof.get(item_id, 0.0) + float(d['helpful']['outOf'])
    items_helpful[item_id] = items_helpful.get(item_id, 0.0) + float(d['helpful']['nHelpful'])

items_ratio = dict()
for item_id in items_outof:
    if items_outof[item_id] != 0:
        items_ratio[item_id] = items_helpful[item_id] / items_outof[item_id]
    else:
        items_outof[item_id] = train_avg_ratio

avg helpfulness ratio 0.770464430813


In [6]:
def predict_helpful(d, train_avg_ratio, users_ratio, items_ratio):
    user_id = d['reviewerID']
    item_id = d['itemID']
    outof = float(d['helpful']['outOf'])
    if user_id in users_ratio:
        helpful_predict = users_ratio[user_id] * outof
    else:
        helpful_predict = train_avg_ratio * outof
    return helpful_predict

# def predict_helpful(d, train_avg_ratio, users_ratio, items_ratio):
#     user_id = d['reviewerID']
#     item_id = d['itemID']
#     outof = float(d['helpful']['outOf'])
#     if item_id in items_ratio:
#         helpful_predict = items_ratio[item_id] * outof
#     else:
#         helpful_predict = train_avg_ratio * outof
#     return helpful_predict

In [7]:
print(get_valid_mae(valid_data))

0.630080214054


In [8]:
def get_occur_stats(valid_data, users_ratio, items_ratio):
    stats = dict()
    stats['in_user_in_item'] = 0
    stats['in_user_not_in_item'] = 0
    stats['not_in_user_in_item'] = 0
    stats['not_in_user_not_in_item'] = 0
    
    for d in valid_data:
        user_id = d['reviewerID']
        item_id = d['itemID']

        if (user_id in users_ratio) and (item_id in items_ratio):
            stats['in_user_in_item'] += 1
        elif (user_id in users_ratio) and (item_id not in items_ratio):
            stats['in_user_not_in_item'] += 1
        elif (user_id not in users_ratio) and (item_id in items_ratio):
            stats['not_in_user_in_item'] += 1
        else:
            stats['not_in_user_not_in_item']
    return stats

print(get_occur_stats(valid_data, users_ratio, items_ratio))

{'in_user_not_in_item': 906, 'not_in_user_not_in_item': 0, 'not_in_user_in_item': 780, 'in_user_in_item': 98302}


In [None]:
# ############ produce test ############

# # load helpful_data.json
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))

# # on test set
# test_helpfuls_predict = [predict_helpful(d) for d in test_data]

# # load 'pairs_Helpful.txt'
# # get header_str and user_item_outofs
# with open('pairs_Helpful.txt') as f:
#     # read and strip lines
#     lines = [l.strip() for l in f.readlines()]
#     # stirip out the headers
#     header_str = lines.pop(0)
#     # get a list of user_item_ids
#     user_item_outofs = [l.split('-') for l in lines]
#     user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]
    
# # make sure `data.json` and `pairs_Helpful.txt` the same order
# for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
#     assert d['reviewerID'] == user_id
#     assert d['itemID'] == item_id
#     assert d['helpful']['outOf'] == outof
    
# # write to output file
# f = open('predictions_Helpful.txt', 'w')
# print(header_str, file=f)
# for (user_id, item_id, outof), test_helpful_predict in zip(user_item_outofs, test_helpfuls_predict):
#     print('%s-%s-%s,%s' % (user_id, item_id, int(outof), test_helpful_predict), file=f)
# f.close()