In [1]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from pprint import pprint
import os
import datetime

def get_mae(helpfuls, helpfuls_predict):
    """Return the mean absolute error between predictions and true values.

    Args:
        helpfuls: array-like of true values (list, tuple or ndarray).
        helpfuls_predict: array-like of predicted values, broadcastable
            against ``helpfuls``.

    Returns:
        The mean of ``|helpfuls_predict - helpfuls|`` as a float.
    """
    # np.asarray accepts plain Python sequences as well as ndarrays, so
    # callers are not forced to build arrays (and cast them) beforehand.
    actual = np.asarray(helpfuls, dtype=float)
    predicted = np.asarray(helpfuls_predict, dtype=float)
    return np.mean(np.fabs(predicted - actual))

In [2]:
# Load the two pickled review collections and report how long it took.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
start_time = time.time()
# `with` closes each handle as soon as its load finishes instead of leaving
# the open file objects for the garbage collector.
with open("all_data.pickle", "rb") as all_data_file:
    all_data = pickle.load(all_data_file)
with open("helpful_data.pickle", "rb") as test_data_file:
    test_data = pickle.load(test_data_file)
print(time.time() - start_time)

19.6515119076


In [3]:
# global feature
global_feature = dict()

# ratio a: global average, i.e. the sum of all 'nHelpful' counts divided by
# the sum of all 'outOf' counts in all_data
train_helpfuls = np.asarray(
    [d['helpful']['nHelpful'] for d in all_data], dtype=float)
train_outofs = np.asarray(
    [d['helpful']['outOf'] for d in all_data], dtype=float)
global_ratio_a = train_helpfuls.sum() / train_outofs.sum()

# ratio b: average of individual ratios, restricted to the entries whose
# 'outOf' is non-zero so that no division by zero takes place
has_votes = train_outofs != 0
pure_helpfuls = train_helpfuls[has_votes]
pure_outofs = train_outofs[has_votes]
global_ratio_b = (pure_helpfuls / pure_outofs).mean()

# ratio c: linear search best ratio
def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Scan candidate ratios and return the one with the lowest MAE.

    Each candidate ``alpha`` is scored with ``get_mae(helpfuls,
    outofs * alpha)``.

    Args:
        helpfuls: array of true values handed to ``get_mae``.
        outofs: array that is scaled by each candidate ratio.
        search_range: arguments unpacked into ``np.arange`` to generate the
            candidates.

    Returns:
        The candidate with the smallest error (``np.argmin`` picks the first
        one on ties).
    """
    candidates = np.arange(*search_range)
    errors = []
    for candidate in candidates:
        errors.append(get_mae(helpfuls, outofs * candidate))
    best_index = np.argmin(errors)
    return candidates[best_index]

global_ratio_c = linear_search_ratio(
    train_helpfuls, train_outofs, search_range=(0.3, 1.0, 0.001))

# Concatenate the two collections once; every statistic below is computed
# over both of them, and rebuilding the combined list for each one is wasted
# work on a large dataset.
combined_data = all_data + test_data

# avg review length and summary length (in whitespace-separated words)
avg_review_length = np.mean(
    [float(len(d['reviewText'].split())) for d in combined_data])
avg_summary_length = np.mean(
    [float(len(d['summary'].split())) for d in combined_data])

global_feature['global_ratio_a'] = global_ratio_a
global_feature['global_ratio_b'] = global_ratio_b
global_feature['global_ratio_c'] = global_ratio_c
global_feature['avg_review_length'] = avg_review_length
global_feature['avg_summary_length'] = avg_summary_length

# add global_feature['level_cats'] to global_feature:
# level_cats[k] is the sorted list of distinct values found at position k of
# any category path. Only the first num_cat_levels positions are kept;
# anything deeper in a path is ignored.
num_cat_levels = 7
raw_cats = [d['category'] for d in combined_data]

# One pass over the data fills all levels at once instead of rescanning
# every category path once per level.
level_sets = [set() for _ in range(num_cat_levels)]
for cat_list_list in raw_cats:
    for cat_list in cat_list_list:
        for level, cat in enumerate(cat_list[:num_cat_levels]):
            level_sets[level].add(cat)
# convert set to sorted list so the ordering is deterministic
level_cats = [sorted(level_set) for level_set in level_sets]
global_feature['level_cats'] = level_cats

# global average rating
ratings = [d['rating'] for d in combined_data]
global_feature['avg_rating'] = float(np.mean(ratings))

pprint(global_feature)

{'avg_rating': 4.218305714285714,
 'avg_review_length': 204.28218857142858,
 'avg_summary_length': 4.6970666666666663,
 'global_ratio_a': 0.76998328875346433,
 'global_ratio_b': 0.73843176371280272,
 'global_ratio_c': 0.85700000000000043,
 'level_cats': [['Books', 'Kindle Store', 'Toys & Games'],
                ['Arts & Photography',
                 'Biographies & Memoirs',
                 'Business & Money',
                 "Children's Books",
                 'Christian Books & Bibles',
                 'Comics & Graphic Novels',
                 'Computers & Technology',
                 'Cookbooks, Food & Wine',
                 'Crafts, Hobbies & Home',
                 'Education & Teaching',
                 'Games',
                 'Gay & Lesbian',
                 'Health, Fitness & Dieting',
                 'History',
                 'Humor & Entertainment',
                 'Kindle Short Reads',
                 'Kindle Singles',
                 'Kindle Worlds',
    

In [4]:
# make sure we've seen all users, items in all_data:
# print every reviewerID / itemID that occurs in test_data but never in
# all_data. Nothing is printed when all_data covers every test id.
all_data_user_ids = set([d['reviewerID'] for d in all_data])
# The test-side set has to come from test_data; building it from all_data
# compares the set with itself and can never report a missing id.
test_data_user_ids = set([d['reviewerID'] for d in test_data])

for user_id in sorted(test_data_user_ids - all_data_user_ids):
    print(user_id)

all_data_item_ids = set([d['itemID'] for d in all_data])
test_data_item_ids = set([d['itemID'] for d in test_data])

for item_id in sorted(test_data_item_ids - all_data_item_ids):
    print(item_id)

In [5]:
# user feature
# Builds users_feature[reviewerID] -> dict with the keys 'ratio_a',
# 'ratio_b', 'num_reviews', 'review_times', 'avg_review_length',
# 'avg_summary_length' and 'avg_rating'.

# Concatenate once: everything except the helpfulness ratios is computed
# over both collections.
combined_data = all_data + test_data

user_ids = list(set([d['reviewerID'] for d in combined_data]))
users_feature = dict()
for user_id in user_ids:
    users_feature[user_id] = dict()

# 1. compute ratios (the 'helpful' counts are read from all_data only)
users_outofs = defaultdict(list)
users_helpfuls = defaultdict(list)

for d in all_data:
    user_id = d['reviewerID']
    users_helpfuls[user_id].append(float(d['helpful']['nHelpful']))
    users_outofs[user_id].append(float(d['helpful']['outOf']))

# Every user must appear in all_data; otherwise the loop below would leave
# that user without 'ratio_a' / 'ratio_b'.
assert(len(users_outofs) == len(user_ids))

for user_id in users_outofs:
    total_outof = np.sum(users_outofs[user_id])
    if total_outof != 0:
        # ratio_a: total nHelpful over total outOf for this user
        users_feature[user_id]['ratio_a'] = np.sum(
            users_helpfuls[user_id]) / total_outof

        # ratio_b: mean of the per-review ratios
        helpfuls = np.array(users_helpfuls[user_id])
        outofs = np.array(users_outofs[user_id])
        # remove zero outofs
        helpfuls = helpfuls[outofs != 0]
        outofs = outofs[outofs != 0]
        # ratios
        ratios = helpfuls / outofs
        users_feature[user_id]['ratio_b'] = np.mean(ratios)
    else:
        # no votes at all for this user: fall back to the global ratios
        users_feature[user_id]['ratio_a'] = global_feature['global_ratio_a']
        users_feature[user_id]['ratio_b'] = global_feature['global_ratio_b']

assert(len(users_feature) == len(user_ids))

# Gather the raw per-user values for features 2-5 in a single pass over the
# combined data.
users_num_review = defaultdict(float)
users_review_times = defaultdict(list)
users_review_lengths = defaultdict(list)
users_summary_lengths = defaultdict(list)
users_ratings = defaultdict(list)
for d in combined_data:
    user_id = d['reviewerID']
    users_num_review[user_id] += 1.0
    users_review_times[user_id].append(d['unixReviewTime'])
    users_review_lengths[user_id].append(float(len(d['reviewText'].split())))
    users_summary_lengths[user_id].append(float(len(d['summary'].split())))
    users_ratings[user_id].append(float(d['rating']))
assert(len(users_ratings) == len(user_ids))

for user_id in users_feature:
    # 2. number of reviews by the user
    users_feature[user_id]['num_reviews'] = users_num_review[user_id]

    # 3. time line ratio and time spot ratio (store all review_times for
    #    usage). Sort once here rather than after every single append.
    users_review_times[user_id].sort()
    users_feature[user_id]['review_times'] = users_review_times[user_id]

    # 4. average review length and summary length by the user
    if users_feature[user_id]['num_reviews'] == 0:
        users_feature[user_id]['avg_review_length'] = global_feature[
            'avg_review_length']
        users_feature[user_id]['avg_summary_length'] = global_feature[
            'avg_summary_length']
    else:
        assert len(users_review_lengths[user_id]) > 0
        assert len(users_summary_lengths[user_id]) > 0
        users_feature[user_id]['avg_review_length'] = np.mean(
            users_review_lengths[user_id])
        users_feature[user_id]['avg_summary_length'] = np.mean(
            users_summary_lengths[user_id])

    # 5. average rating by the user
    assert((len(users_ratings[user_id]) > 0))
    users_feature[user_id]['avg_rating'] = np.mean(users_ratings[user_id])

In [6]:
# Spot check: recompute one user's review count and average text lengths
# straight from the raw records, then print the stored feature dict so the
# two can be compared by eye.
spot_check_user_id = 'U805524026'
collect = []
for d in all_data + test_data:
    if d['reviewerID'] == spot_check_user_id:
        collect.append(d)
print(len(collect))

review_word_counts = [float(len(d['reviewText'].split())) for d in collect]
summary_word_counts = [float(len(d['summary'].split())) for d in collect]
avg_review_lengths = np.mean(review_word_counts)
avg_summary_lengths = np.mean(summary_word_counts)
print(avg_review_lengths, avg_summary_lengths)
print(users_feature[spot_check_user_id])

247
229.720647773 3.44534412955
{'num_reviews': 247.0, 'avg_rating': 4.42914979757085, 'avg_review_length': 229.72064777327935, 'ratio_b': 0.69224987624812861, 'ratio_a': 0.76000000000000001, 'review_times': [1311897600, 1314662400, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1319673600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1323561600, 1324598400, 1325376000, 1325808000, 1328054400, 1328054400, 1328054400, 1330128000, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1331424000, 1331596800, 1332115200, 1332201600, 1332201600, 1332288000, 1333238400, 1334361600, 1334620800, 1334707200, 1334793600, 1335139200, 1335225600, 1335225600, 1335398400, 1336089600, 1336089600, 1336521600, 1336521600, 1337040000, 1337299200, 1338422400, 1338422400, 1338854400, 1340668800, 1340668800, 1340928000, 1340928000, 1340928000, 1340928000, 1340928000, 1340928000, 1343692800,

In [7]:
# item feature
# Builds items_feature[itemID] -> dict with the keys 'ratio_a', 'ratio_b',
# 'num_reviews', 'review_times', 'avg_review_length', 'avg_summary_length'
# and 'avg_rating'.

# Concatenate once: everything except the helpfulness ratios is computed
# over both collections.
combined_data = all_data + test_data

item_ids = list(set([d['itemID'] for d in combined_data]))
items_feature = dict()
for item_id in item_ids:
    items_feature[item_id] = dict()

# 1. compute ratios (the 'helpful' counts are read from all_data only)
items_outofs = defaultdict(list)
items_helpfuls = defaultdict(list)

for d in all_data:
    item_id = d['itemID']
    items_helpfuls[item_id].append(float(d['helpful']['nHelpful']))
    items_outofs[item_id].append(float(d['helpful']['outOf']))

# Every item must appear in all_data; otherwise the loop below would leave
# that item without 'ratio_a' / 'ratio_b'.
assert(len(items_outofs) == len(item_ids))

for item_id in items_outofs:
    total_outof = np.sum(items_outofs[item_id])
    if total_outof != 0:
        # ratio_a: total nHelpful over total outOf for this item
        items_feature[item_id]['ratio_a'] = np.sum(
            items_helpfuls[item_id]) / total_outof

        # ratio_b: mean of the per-review ratios
        helpfuls = np.array(items_helpfuls[item_id])
        outofs = np.array(items_outofs[item_id])
        # remove zero outofs
        helpfuls = helpfuls[outofs != 0]
        outofs = outofs[outofs != 0]
        # ratios
        ratios = helpfuls / outofs
        items_feature[item_id]['ratio_b'] = np.mean(ratios)
    else:
        # no votes at all for this item: fall back to the global ratios
        items_feature[item_id]['ratio_a'] = global_feature['global_ratio_a']
        items_feature[item_id]['ratio_b'] = global_feature['global_ratio_b']

assert(len(items_feature) == len(item_ids))

# Gather the raw per-item values for features 2-5 in a single pass over the
# combined data.
items_num_review = defaultdict(float)
items_review_times = defaultdict(list)
items_review_lengths = defaultdict(list)
items_summary_lengths = defaultdict(list)
items_ratings = defaultdict(list)
for d in combined_data:
    item_id = d['itemID']
    items_num_review[item_id] += 1.0
    items_review_times[item_id].append(d['unixReviewTime'])
    items_review_lengths[item_id].append(float(len(d['reviewText'].split())))
    items_summary_lengths[item_id].append(float(len(d['summary'].split())))
    items_ratings[item_id].append(float(d['rating']))
assert(len(items_ratings) == len(item_ids))

for item_id in items_feature:
    # 2. number of reviews to the item
    items_feature[item_id]['num_reviews'] = items_num_review[item_id]

    # 3. time line ratio and time spot ratio (store all review_times for
    #    usage). Sort once here rather than after every single append.
    items_review_times[item_id].sort()
    items_feature[item_id]['review_times'] = items_review_times[item_id]

    # 4. average review length and summary length for the item
    if items_feature[item_id]['num_reviews'] == 0:
        items_feature[item_id]['avg_review_length'] = global_feature[
            'avg_review_length']
        items_feature[item_id]['avg_summary_length'] = global_feature[
            'avg_summary_length']
    else:
        assert len(items_review_lengths[item_id]) > 0
        assert len(items_summary_lengths[item_id]) > 0
        items_feature[item_id]['avg_review_length'] = np.mean(
            items_review_lengths[item_id])
        items_feature[item_id]['avg_summary_length'] = np.mean(
            items_summary_lengths[item_id])

    # 5. average rating for the item
    assert((len(items_ratings[item_id]) > 0))
    items_feature[item_id]['avg_rating'] = np.mean(items_ratings[item_id])

In [8]:
# Spot check: show the computed feature dict of one sample item.
spot_check_item_id = 'I063511736'
print(items_feature[spot_check_item_id])

{'num_reviews': 24.0, 'avg_rating': 4.041666666666667, 'avg_review_length': 91.166666666666671, 'ratio_b': 0.875, 'ratio_a': 0.75, 'review_times': [1336521600, 1359158400, 1359676800, 1359676800, 1359676800, 1359763200, 1359936000, 1361491200, 1361577600, 1362009600, 1362355200, 1363219200, 1363737600, 1365811200, 1368576000, 1371945600, 1376956800, 1377820800, 1389744000, 1392854400, 1395273600, 1396828800, 1398643200, 1402963200], 'avg_summary_length': 3.25}


In [9]:
# Persist the three feature tables as one pickled tuple.
# The `with` block guarantees the file is flushed and closed before it is
# read back, rather than relying on the interpreter to collect the
# anonymous file object in time.
with open("global_users_items_feature.feature", "wb") as feature_file:
    pickle.dump((global_feature, users_feature, items_feature),
                feature_file,
                protocol=pickle.HIGHEST_PROTOCOL)

# Read the file straight back to confirm it round-trips.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open("global_users_items_feature.feature", "rb") as feature_file:
    global_feature, users_feature, items_feature = pickle.load(feature_file)