In [None]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from pprint import pprint
import os
import datetime

In [None]:
# Load the two pickled datasets (`all_data`, `test_data`) and print how many
# seconds the loading took.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
start_time = time.time()
# `with` closes each file handle as soon as it has been read instead of
# leaving it open until the garbage collector gets to it.
with open("all_data.pickle", "rb") as all_data_file:
    all_data = pickle.load(all_data_file)
with open("helpful_data.pickle", "rb") as test_data_file:
    test_data = pickle.load(test_data_file)
print(time.time() - start_time)

In [None]:
# remove the outlier
# A record is dropped when its helpful['outOf'] exceeds MAX_OUT_OF, or when
# helpful['outOf'] is smaller than helpful['nHelpful'] (an inconsistent pair).
MAX_OUT_OF = 3000
# Single-pass filter. Slice assignment mutates the existing list object, so
# any other reference to `all_data` sees the filtered records too, and the
# list is left untouched if a record is malformed and raises part-way.
all_data[:] = [
    d for d in all_data
    if not (d['helpful']['outOf'] > MAX_OUT_OF
            or d['helpful']['outOf'] < d['helpful']['nHelpful'])
]

In [None]:
# global feature
global_feature = dict()

# ratio a: global average
# i.e. (sum of every nHelpful) / (sum of every outOf) over train_data.
# NOTE(review): `train_data` is not assigned in the cells visible above --
# confirm it is defined before this cell on a fresh kernel.
train_helpfuls = np.array(
    [rec['helpful']['nHelpful'] for rec in train_data], dtype=float)
train_outofs = np.array(
    [rec['helpful']['outOf'] for rec in train_data], dtype=float)
global_ratio_a = train_helpfuls.sum() / train_outofs.sum()

# ratio b: average of individual ratios
# Records whose outOf is zero are left out so the per-record division is
# always defined.
nonzero_outof_mask = train_outofs != 0
pure_helpfuls = train_helpfuls[nonzero_outof_mask]
pure_outofs = train_outofs[nonzero_outof_mask]
global_ratio_b = np.mean(pure_helpfuls / pure_outofs)

# ratio c: linear search best ratio
def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Grid-search the multiplier alpha that makes `outofs * alpha` fit `helpfuls` best.

    Every candidate produced by ``np.arange(*search_range)`` is scored with
    ``get_mae(helpfuls, outofs * alpha)`` and the candidate with the smallest
    score is returned (the first one when several tie, as ``np.argmin`` does).

    `get_mae` is not defined in this block -- presumably a mean-absolute-error
    helper defined elsewhere in the notebook; it must exist before the call.

    Args:
        helpfuls: target values, passed unchanged to `get_mae`.
        outofs: values that get scaled by each candidate; must support
            multiplication by a scalar (e.g. a numpy array).
        search_range: arguments forwarded to ``np.arange`` (start, stop, step).

    Returns:
        The element of the candidate grid with the lowest `get_mae` score.
    """
    candidates = np.arange(*search_range)
    scores = []
    for candidate in candidates:
        scores.append(get_mae(helpfuls, outofs * candidate))
    best_index = np.argmin(scores)
    return candidates[best_index]

# Best single multiplier found by linear_search_ratio on the training
# arrays, scanning 0.3 up to (not including) 1.0 in steps of 0.001.
global_ratio_c = linear_search_ratio(
    train_helpfuls, train_outofs, search_range=(0.3, 1.0, 0.001))

# avg review length and summary length
# "Length" is the number of whitespace-separated tokens; the mean is taken
# over every record of all_data followed by every record of test_data.
review_token_counts = []
summary_token_counts = []
for rec in all_data + test_data:
    review_token_counts.append(float(len(rec['reviewText'].split())))
    summary_token_counts.append(float(len(rec['summary'].split())))
avg_review_length = np.mean(review_token_counts)
avg_summary_length = np.mean(summary_token_counts)

# Publish every global statistic under its own key, then display the dict.
global_feature.update({
    'global_ratio_a': global_ratio_a,
    'global_ratio_b': global_ratio_b,
    'global_ratio_c': global_ratio_c,
    'avg_review_length': avg_review_length,
    'avg_summary_length': avg_summary_length,
})
pprint(global_feature)

In [None]:
# user feature
# One (initially empty) feature dict per distinct reviewerID found in
# all_data or test_data.
user_ids = list(set(rec['reviewerID'] for rec in all_data + test_data))
users_feature = dict((uid, dict()) for uid in user_ids)

# 1. compute ratios
# Per-user lists of nHelpful / outOf values, gathered from train_data only.
users_outofs = defaultdict(list)
users_helpfuls = defaultdict(list)

for rec in train_data:
    reviewer = rec['reviewerID']
    users_helpfuls[reviewer].append(float(rec['helpful']['nHelpful']))
    users_outofs[reviewer].append(float(rec['helpful']['outOf']))

# ratio_a: sum(nHelpful) / sum(outOf) for the user.
# ratio_b: mean of nHelpful / outOf over the user's records with outOf != 0.
# When the user's outOf values sum to zero, both fall back to the matching
# global ratio.
# NOTE(review): only users that occur in train_data receive these two keys;
# confirm downstream code copes with users that lack them.
for reviewer, outof_values in users_outofs.items():
    feature = users_feature[reviewer]
    if np.sum(outof_values) != 0:
        helpful_arr = np.array(users_helpfuls[reviewer])
        outof_arr = np.array(outof_values)
        feature['ratio_a'] = np.sum(helpful_arr) / np.sum(outof_arr)
        # remove zero outofs before dividing record by record
        voted = outof_arr != 0
        feature['ratio_b'] = np.mean(helpful_arr[voted] / outof_arr[voted])
    else:
        feature['ratio_a'] = global_feature['global_ratio_a']
        feature['ratio_b'] = global_feature['global_ratio_b']

# 2. number of reviews by the user
# Counts are kept as floats and cover all_data followed by test_data.
users_num_review = defaultdict(float)
for rec in all_data + test_data:
    users_num_review[rec['reviewerID']] += 1.0
for uid, feature in users_feature.items():
    # A user never counted reads back as 0.0 through the defaultdict.
    feature['num_reviews'] = users_num_review[uid]

# 3. time line ratio and time spot ratio (store all review_times for usage)
# Collect each user's `unixReviewTime` values over all_data + test_data.
users_review_times = defaultdict(list)
for d in all_data + test_data:
    users_review_times[d['reviewerID']].append(d['unixReviewTime'])
# Sort every list once after collection; re-sorting after each append gives
# the same final ordering but repeats the sort for every single record.
for review_times in users_review_times.values():
    review_times.sort()
for user_id in users_feature:
    # Stores the ascending list itself (not a copy) on the user's features.
    users_feature[user_id]['review_times'] = users_review_times[user_id]

# 4. average review length and summary length by the user
# Lengths are whitespace-separated token counts of `reviewText` / `summary`,
# grouped by reviewerID over all_data + test_data.
users_review_lengths = defaultdict(list)
users_summary_lengths = defaultdict(list)
for rec in all_data + test_data:
    reviewer = rec['reviewerID']
    users_review_lengths[reviewer].append(
        float(len(rec['reviewText'].split())))
    users_summary_lengths[reviewer].append(
        float(len(rec['summary'].split())))

for reviewer, feature in users_feature.items():
    if feature['num_reviews'] != 0:
        # A counted user must have at least one length recorded.
        review_lengths = users_review_lengths[reviewer]
        assert len(review_lengths) > 0
        summary_lengths = users_summary_lengths[reviewer]
        assert len(summary_lengths) > 0
        feature['avg_review_length'] = np.mean(review_lengths)
        feature['avg_summary_length'] = np.mean(summary_lengths)
    else:
        # Nothing counted for this user: use the global averages instead.
        feature['avg_review_length'] = global_feature['avg_review_length']
        feature['avg_summary_length'] = global_feature['avg_summary_length']

In [None]:
# print(users_feature['U805524026'])
# Spot check for a single reviewer: recompute the mean token counts straight
# from the records and print them.
# NOTE(review): this scans train_data + test_data, whereas the per-user
# averages are built from all_data + test_data -- confirm the two are meant
# to be comparable.
collect = []
for rec in train_data + test_data:
    if rec['reviewerID'] == 'U805524026':
        collect.append(rec)
print(len(collect))

review_token_counts = [float(len(rec['reviewText'].split())) for rec in collect]
summary_token_counts = [float(len(rec['summary'].split())) for rec in collect]
avg_review_lengths = np.mean(review_token_counts)
avg_summary_lengths = np.mean(summary_token_counts)
print(avg_review_lengths, avg_summary_lengths)

In [None]:
# item feature
# One (initially empty) feature dict per distinct itemID found in all_data
# or test_data.
item_ids = list(set(rec['itemID'] for rec in all_data + test_data))
items_feature = dict((iid, dict()) for iid in item_ids)

# 1. compute ratios
# Per-item lists of nHelpful / outOf values, gathered from train_data only.
items_outofs = defaultdict(list)
items_helpfuls = defaultdict(list)

for rec in train_data:
    item = rec['itemID']
    items_helpfuls[item].append(float(rec['helpful']['nHelpful']))
    items_outofs[item].append(float(rec['helpful']['outOf']))

# ratio_a: sum(nHelpful) / sum(outOf) for the item.
# ratio_b: mean of nHelpful / outOf over the item's records with outOf != 0.
# When the item's outOf values sum to zero, both fall back to the matching
# global ratio.
# NOTE(review): only items that occur in train_data receive these two keys;
# confirm downstream code copes with items that lack them.
for item, outof_values in items_outofs.items():
    feature = items_feature[item]
    if np.sum(outof_values) != 0:
        helpful_arr = np.array(items_helpfuls[item])
        outof_arr = np.array(outof_values)
        feature['ratio_a'] = np.sum(helpful_arr) / np.sum(outof_arr)
        # remove zero outofs before dividing record by record
        voted = outof_arr != 0
        feature['ratio_b'] = np.mean(helpful_arr[voted] / outof_arr[voted])
    else:
        feature['ratio_a'] = global_feature['global_ratio_a']
        feature['ratio_b'] = global_feature['global_ratio_b']

# 2. number of reviews to the item
# Counts are kept as floats and cover all_data followed by test_data.
items_num_review = defaultdict(float)
for rec in all_data + test_data:
    items_num_review[rec['itemID']] += 1.0
for iid, feature in items_feature.items():
    # An item never counted reads back as 0.0 through the defaultdict.
    feature['num_reviews'] = items_num_review[iid]

# 3. time line ratio and time spot ratio (store all review_times for usage)
# Collect each item's `unixReviewTime` values over all_data + test_data.
items_review_times = defaultdict(list)
for d in all_data + test_data:
    items_review_times[d['itemID']].append(d['unixReviewTime'])
# Sort every list once after collection; re-sorting after each append gives
# the same final ordering but repeats the sort for every single record.
for review_times in items_review_times.values():
    review_times.sort()
for item_id in items_feature:
    # Stores the ascending list itself (not a copy) on the item's features.
    items_feature[item_id]['review_times'] = items_review_times[item_id]

# 4. average review length and summary length for the item
# Lengths are whitespace-separated token counts of `reviewText` / `summary`,
# grouped by itemID over all_data + test_data.
items_review_lengths = defaultdict(list)
items_summary_lengths = defaultdict(list)
for d in all_data + test_data:
    item_id = d['itemID']
    items_review_lengths[item_id].append(float(len(d['reviewText'].split())))
    items_summary_lengths[item_id].append(float(len(d['summary'].split())))

for item_id in items_feature:
    if items_feature[item_id]['num_reviews'] == 0:
        # Nothing counted for this item: use the global averages instead.
        items_feature[item_id]['avg_review_length'] = global_feature[
            'avg_review_length']
        items_feature[item_id]['avg_summary_length'] = global_feature[
            'avg_summary_length']
    else:
        # A counted item must have at least one length recorded.
        assert len(items_review_lengths[item_id]) > 0
        assert len(items_summary_lengths[item_id]) > 0
        items_feature[item_id]['avg_review_length'] = np.mean(
            items_review_lengths[item_id])
        # The closing parenthesis of this np.mean(...) call was missing,
        # which made the whole cell a SyntaxError.
        items_feature[item_id]['avg_summary_length'] = np.mean(
            items_summary_lengths[item_id])

In [None]:
# Spot check: display the feature dict stored for one hard-coded item ID.
# NOTE(review): raises KeyError if 'I063511736' is not a key of items_feature.
print(items_feature['I063511736'])

In [None]:
# pickle.dump((global_feature, users_feature, items_feature), 
#              open("global_users_items_feature.feature", "wb"), 
#              protocol = pickle.HIGHEST_PROTOCOL)
# global_feature, users_feature, items_feature = pickle.load(open("global_users_items_feature.feature", "rb"))