In [1]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from pprint import pprint
import os

# data
# Directory that holds the pickled datasets. The trailing slash matters:
# the loading cell builds file paths by plain string concatenation
# (data_root + "<file name>").
data_root = os.path.expanduser("~") + '/data/CSE255/'

In [2]:
# load all_data and test_data
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
start_time = time.time()
# `with` closes each file handle as soon as its pickle has been read.
with open(data_root + "all_data.pickle", "rb") as f:
    all_data = pickle.load(f)
with open(data_root + "helpful_data.pickle", "rb") as f:
    test_data = pickle.load(f)
print(time.time() - start_time)
# set train data to all data
# (this binds a second name to the same list, it does not copy it: in-place
# changes made through train_data are also visible through all_data)
train_data = all_data

17.8979709148


In [3]:
# remove the outlier
# A review is dropped when its vote count is above MAX_OUT_OF or when it
# reports more helpful votes than total votes (an inconsistent label).
MAX_OUT_OF = 3000


def _is_outlier(d):
    """Return True if the review's helpful counts mark it for removal."""
    helpful = d['helpful']
    return helpful['outOf'] > MAX_OUT_OF or helpful['outOf'] < helpful['nHelpful']


# Filter in a single linear pass instead of calling list.pop(i) once per
# outlier. Slice assignment mutates the existing list object, so any other name
# bound to the same list (all_data) sees the same filtering, exactly as it did
# with the in-place pops.
train_data[:] = [d for d in train_data if not _is_outlier(d)]

In [4]:
# Display one record to show the keys each review dict carries.
train_data[1200]

{'category': [['Books']],
 'helpful': {'nHelpful': 3, 'outOf': 6},
 'itemID': 'I063511736',
 'rating': 4.0,
 'reviewText': "where she belongs by Cindy Procter-Kingjess is back home to help her ailing mother. She's comfortable with friends in town and her quiet life.Adam love and works in Destiny Falls.Their lives have once again crossed and they renew their friendship and it grows as time goes on.Jess only planned to stay a month then she would go back to her job in the city but the lure of the logging community and people who live in the area have special appeal to her.",
 'reviewTime': '05 9, 2012',
 'reviewerID': 'U805524026',
 'summary': 'great',
 'unixReviewTime': 1336521600}

In [5]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    """Return the mean absolute error between true and predicted helpful counts.

    Args:
        helpfuls: array-like of ground-truth nHelpful values.
        helpfuls_predict: array-like of predicted values, one per entry of
            ``helpfuls``.

    Returns:
        The summed absolute difference divided by the length of the first
        axis of ``helpfuls``.
    """
    # np.asarray accepts plain Python lists as well as ndarrays, so callers
    # no longer need to convert before calling.
    truth = np.asarray(helpfuls, dtype=float)
    predicted = np.asarray(helpfuls_predict, dtype=float)
    return np.sum(np.fabs(predicted - truth)) / truth.shape[0]

In [6]:
# global feature
global_feature = {}

# ratio a: pooled ratio, i.e. all helpful votes divided by all votes cast
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data], dtype=float)
train_outofs = np.array([d['helpful']['outOf'] for d in train_data], dtype=float)
global_ratio_a = train_helpfuls.sum() / train_outofs.sum()

# ratio b: mean of the per-review ratios; reviews that received no votes are
# left out because their ratio is undefined (0 / 0)
has_votes = train_outofs != 0
pure_helpfuls = train_helpfuls[has_votes]
pure_outofs = train_outofs[has_votes]
global_ratio_b = (pure_helpfuls / pure_outofs).mean()

# ratio c: linear search best ratio
def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Grid-search the constant ratio that minimises MAE of ``outofs * ratio``.

    Args:
        helpfuls: array of ground-truth nHelpful values.
        outofs: array of vote totals, one per entry of ``helpfuls``.
        search_range: arguments forwarded to ``np.arange`` to build the
            candidate ratios.

    Returns:
        The candidate with the smallest MAE (the first one if several tie).
    """
    candidates = np.arange(*search_range)
    maes = []
    for candidate in candidates:
        maes.append(get_mae(helpfuls, outofs * candidate))
    best_index = np.argmin(maes)
    return candidates[best_index]

global_ratio_c = linear_search_ratio(
    train_helpfuls, train_outofs, search_range=(0.3, 1.0, 0.001))

# avg review length and summary length, counted in whitespace-separated words
# over the training and test reviews together
review_word_counts = []
summary_word_counts = []
for d in train_data + test_data:
    review_word_counts.append(float(len(d['reviewText'].split())))
    summary_word_counts.append(float(len(d['summary'].split())))
avg_review_length = np.mean(review_word_counts)
avg_summary_length = np.mean(summary_word_counts)

global_feature.update({
    'global_ratio_a': global_ratio_a,
    'global_ratio_b': global_ratio_b,
    'global_ratio_c': global_ratio_c,
    'avg_review_length': avg_review_length,
    'avg_summary_length': avg_summary_length,
})
pprint(global_feature)

{'avg_review_length': 204.27815053300557,
 'avg_summary_length': 4.6970453976184512,
 'global_ratio_a': 0.76806459007747785,
 'global_ratio_b': 0.73842172208868673,
 'global_ratio_c': 0.85500000000000043}


In [20]:
# user feature
# One feature dict per reviewer that appears in the training or the test set.
combined_data = train_data + test_data
user_ids = list(set([d['reviewerID'] for d in combined_data]))
users_feature = dict()
for user_id in user_ids:
    # Seed both ratios with the global values. Reviewers that occur only in
    # test_data have no labelled votes; without a default their dict would lack
    # 'ratio_a' / 'ratio_b' and get_feature_user would raise KeyError on them.
    users_feature[user_id] = {
        'ratio_a': global_feature['global_ratio_a'],
        'ratio_b': global_feature['global_ratio_b'],
    }

# 1. compute ratios (helpfulness labels are read from train_data only)
users_outofs = defaultdict(list)
users_helpfuls = defaultdict(list)

for d in train_data:
    user_id = d['reviewerID']
    users_helpfuls[user_id].append(float(d['helpful']['nHelpful']))
    users_outofs[user_id].append(float(d['helpful']['outOf']))

for user_id in users_outofs:
    helpfuls = np.array(users_helpfuls[user_id])
    outofs = np.array(users_outofs[user_id])
    if np.sum(outofs) == 0:
        # none of this reviewer's reviews received a vote: keep the global fallback
        continue
    # ratio_a: pooled ratio over all of the reviewer's training reviews
    users_feature[user_id]['ratio_a'] = np.sum(helpfuls) / np.sum(outofs)
    # ratio_b: mean of per-review ratios, skipping reviews with zero votes
    voted = outofs != 0
    users_feature[user_id]['ratio_b'] = np.mean(helpfuls[voted] / outofs[voted])

# 2-4. a single pass over every review collects, per reviewer, the review
# count, the review timestamps and the review / summary word counts
users_num_review = defaultdict(float)
users_review_times = defaultdict(list)
users_review_lengths = defaultdict(list)
users_summary_lengths = defaultdict(list)
for d in combined_data:
    user_id = d['reviewerID']
    users_num_review[user_id] += 1.0
    users_review_times[user_id].append(d['unixReviewTime'])
    users_review_lengths[user_id].append(float(len(d['reviewText'].split())))
    users_summary_lengths[user_id].append(float(len(d['summary'].split())))

for user_id in users_feature:
    user_feature = users_feature[user_id]

    # 2. number of reviews by the user
    user_feature['num_reviews'] = users_num_review[user_id]

    # 3. time line ratio and time spot ratio (store all review_times for usage)
    # Sorted once per reviewer here instead of re-sorting after every append.
    users_review_times[user_id].sort()
    user_feature['review_times'] = users_review_times[user_id]
    # np.searchsorted(users_feature['U805524026']['review_times'], 1387324800) / float(len(users_feature['U805524026']['review_times']))

    # 4. average review length and summary length by the user
    if user_feature['num_reviews'] == 0:
        # Defensive only: user_ids is built from the same records, so every
        # reviewer here has at least one review.
        user_feature['avg_review_length'] = global_feature['avg_review_length']
        user_feature['avg_summary_length'] = global_feature['avg_summary_length']
    else:
        assert len(users_review_lengths[user_id]) > 0
        assert len(users_summary_lengths[user_id]) > 0
        user_feature['avg_review_length'] = np.mean(users_review_lengths[user_id])
        user_feature['avg_summary_length'] = np.mean(users_summary_lengths[user_id])

print(users_feature['U805524026'])

{'num_reviews': 247.0, 'avg_review_length': 229.72064777327935, 'ratio_b': 0.69224987624812895, 'ratio_a': 0.76000000000000001, 'review_times': [1311897600, 1314662400, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1317513600, 1319673600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1322697600, 1323561600, 1324598400, 1325376000, 1325808000, 1328054400, 1328054400, 1328054400, 1330128000, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1330473600, 1331424000, 1331596800, 1332115200, 1332201600, 1332201600, 1332288000, 1333238400, 1334361600, 1334620800, 1334707200, 1334793600, 1335139200, 1335225600, 1335225600, 1335398400, 1336089600, 1336089600, 1336521600, 1336521600, 1337040000, 1337299200, 1338422400, 1338422400, 1338854400, 1340668800, 1340668800, 1340928000, 1340928000, 1340928000, 1340928000, 1340928000, 1340928000, 1343692800, 1343692800, 1343692800, 1343692800, 1343865600, 1343865600, 134

In [26]:
# Recompute one reviewer's review count and average lengths straight from the
# raw records, for comparison with print(users_feature['U805524026']).
collect = []
for d in train_data + test_data:
    if d['reviewerID'] == 'U805524026':
        collect.append(d)
print(len(collect))

review_word_counts = [float(len(d['reviewText'].split())) for d in collect]
summary_word_counts = [float(len(d['summary'].split())) for d in collect]
avg_review_lengths = np.mean(review_word_counts)
avg_summary_lengths = np.mean(summary_word_counts)
print(avg_review_lengths, avg_summary_lengths)

247
229.720647773 3.44534412955


In [21]:
# item feature
# One feature dict per item that appears in the training or the test set.
combined_data = train_data + test_data
item_ids = list(set([d['itemID'] for d in combined_data]))
items_feature = dict()
for item_id in item_ids:
    # Seed both ratios with the global values. Items that occur only in
    # test_data have no labelled votes; without a default their dict would lack
    # 'ratio_a' / 'ratio_b' and get_feature_item would raise KeyError on them.
    items_feature[item_id] = {
        'ratio_a': global_feature['global_ratio_a'],
        'ratio_b': global_feature['global_ratio_b'],
    }

# 1. compute ratios (helpfulness labels are read from train_data only)
items_outofs = defaultdict(list)
items_helpfuls = defaultdict(list)

for d in train_data:
    item_id = d['itemID']
    items_helpfuls[item_id].append(float(d['helpful']['nHelpful']))
    items_outofs[item_id].append(float(d['helpful']['outOf']))

for item_id in items_outofs:
    helpfuls = np.array(items_helpfuls[item_id])
    outofs = np.array(items_outofs[item_id])
    if np.sum(outofs) == 0:
        # none of this item's reviews received a vote: keep the global fallback
        continue
    # ratio_a: pooled ratio over all of the item's training reviews
    items_feature[item_id]['ratio_a'] = np.sum(helpfuls) / np.sum(outofs)
    # ratio_b: mean of per-review ratios, skipping reviews with zero votes
    voted = outofs != 0
    items_feature[item_id]['ratio_b'] = np.mean(helpfuls[voted] / outofs[voted])

# 2-4. a single pass over every review collects, per item, the review count,
# the review timestamps and the review / summary word counts
items_num_review = defaultdict(float)
items_review_times = defaultdict(list)
items_review_lengths = defaultdict(list)
items_summary_lengths = defaultdict(list)
for d in combined_data:
    item_id = d['itemID']
    items_num_review[item_id] += 1.0
    items_review_times[item_id].append(d['unixReviewTime'])
    items_review_lengths[item_id].append(float(len(d['reviewText'].split())))
    items_summary_lengths[item_id].append(float(len(d['summary'].split())))

for item_id in items_feature:
    item_feature = items_feature[item_id]

    # 2. number of reviews by the item
    item_feature['num_reviews'] = items_num_review[item_id]

    # 3. time line ratio and time spot ratio (store all review_times for usage)
    # Sorted once per item here instead of re-sorting after every append.
    items_review_times[item_id].sort()
    item_feature['review_times'] = items_review_times[item_id]

    # 4. average review length and summary length for the item
    if item_feature['num_reviews'] == 0:
        # Defensive only: item_ids is built from the same records, so every
        # item here has at least one review.
        item_feature['avg_review_length'] = global_feature['avg_review_length']
        item_feature['avg_summary_length'] = global_feature['avg_summary_length']
    else:
        assert len(items_review_lengths[item_id]) > 0
        assert len(items_summary_lengths[item_id]) > 0
        item_feature['avg_review_length'] = np.mean(items_review_lengths[item_id])
        item_feature['avg_summary_length'] = np.mean(items_summary_lengths[item_id])

print(items_feature['I063511736'])

{'num_reviews': 24.0, 'avg_review_length': 91.166666666666671, 'ratio_b': 0.875, 'ratio_a': 0.75, 'review_times': [1336521600, 1359158400, 1359676800, 1359676800, 1359676800, 1359763200, 1359936000, 1361491200, 1361577600, 1362009600, 1362355200, 1363219200, 1363737600, 1365811200, 1368576000, 1371945600, 1376956800, 1377820800, 1389744000, 1392854400, 1395273600, 1396828800, 1398643200, 1402963200], 'avg_summary_length': 3.25}


In [None]:
# Persist the three feature tables as one pickled tuple. The `with` block
# guarantees the file is flushed and closed even if pickling raises.
with open("global_users_items_feature.feature", "wb") as f:
    pickle.dump((global_feature, users_feature, items_feature),
                f,
                protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# pre-computed features
# Pickle streams are binary, so the file is opened with 'rb' (the same mode
# the dataset pickles are loaded with); text mode can corrupt the stream on
# some platforms.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(os.path.join(data_root, 'style_dict.feature'), 'rb') as f:
    style_dict = pickle.load(f)
    # style_dict['U243261361']['I572782694']
    # {'avg_word_len': 4.857142857142857,
    #  'capital_count': 11.0,
    #  'capital_ratio': 0.028205128205128206,
    #  'dotdotdot_count': 4.0,
    #  'exclam_count': 0.0,
    #  'exclam_exclam_count': 0.0,
    #  'num_chars': 369.0,
    #  'num_sentences': 3.0,
    #  'num_unique_words': 50,
    #  'num_words': 63.0,
    #  'num_words_summary': 2,
    #  'punctuation_count': 21.0,
    #  'punctuation_ratio': 0.05384615384615385,
    #  'question_count': 0.0,
    #  'redability': 16.65714285714285}

In [None]:
# feature engineering
def get_feature_time(d):
    """Return the review date as ``[year, month, day]`` floats.

    Args:
        d: review record; only ``d['unixReviewTime']`` (seconds since the
            epoch) is read.

    Returns:
        A three-element list of floats: calendar year, month (1-12) and day
        of the month (1-31), all taken in UTC.
    """
    # Decode in UTC, not local time: the sample record pairs unixReviewTime
    # 1336521600 (midnight UTC) with reviewTime '05 9, 2012', so a local-time
    # conversion reports the previous day on any machine west of UTC.
    # time.gmtime comes from the already-imported `time` module, so no
    # additional import is needed.
    date = time.gmtime(d['unixReviewTime'])
    return [float(date.tm_year), float(date.tm_mon), float(date.tm_mday)]

def get_feature_style(d):
    """Return the writing-style features stored for one review.

    The values are read from ``style_dict[reviewerID][itemID]``.

    NOTE(review): the words-per-sentence entry divides by ``num_sentences``
    and will raise if that value is 0 -- confirm style_dict never stores 0.
    """
    reviewer, item = d['reviewerID'], d['itemID']
    style = style_dict[reviewer][item]

    return [
        style['num_words'],
        style['num_words_summary'],
        style['redability'],
        style['avg_word_len'],
        # average number of words per sentence
        style['num_words'] / style['num_sentences'],
        style['num_unique_words'],
        # "!!" occurrences and question marks are pooled into one count
        style['exclam_exclam_count'] + style['question_count'],
        style['dotdotdot_count'],
        style['capital_ratio'],
    ]

def get_time_spot_ratio(times, spot):
    """Return the relative position of ``spot`` within ``times``.

    The position is the left-most insertion index reported by
    ``np.searchsorted`` divided by ``len(times)``, so ``times`` is expected to
    be in ascending order. An empty ``times`` yields 0.0.
    """
    count = len(times)
    if count == 0:
        return 0.
    position = np.searchsorted(np.asarray(times), spot)
    return float(position) / float(count)

def get_feature_user(d):
    """Return the reviewer-level features for the review ``d``.

    Order: ratio_a, ratio_b, num_reviews, avg_review_length,
    avg_summary_length, then the position of this review's timestamp within
    the reviewer's stored review_times (see get_time_spot_ratio).
    """
    reviewer = d['reviewerID']
    posted_at = d['unixReviewTime']
    profile = users_feature[reviewer]

    stored_keys = ('ratio_a', 'ratio_b', 'num_reviews',
                   'avg_review_length', 'avg_summary_length')
    feature = [profile[key] for key in stored_keys]
    feature.append(get_time_spot_ratio(profile['review_times'], posted_at))
    return feature

def get_feature_item(d):
    """Return the item-level features for the review ``d``.

    Order: ratio_a, ratio_b, num_reviews, avg_review_length,
    avg_summary_length, then the position of this review's timestamp within
    the item's stored review_times (see get_time_spot_ratio).
    """
    item = d['itemID']
    posted_at = d['unixReviewTime']
    profile = items_feature[item]

    stored_keys = ('ratio_a', 'ratio_b', 'num_reviews',
                   'avg_review_length', 'avg_summary_length')
    feature = [profile[key] for key in stored_keys]
    feature.append(get_time_spot_ratio(profile['review_times'], posted_at))
    return feature
    
def get_feature(d):
    """Assemble the full feature vector for one review record.

    Layout, in order: a constant 1.0 offset, the get_feature_user(d) values,
    the get_feature_item(d) values, ``d['rating']`` as a float, the
    get_feature_style(d) values and the get_feature_time(d) values.

    Returns:
        A flat list of feature values.
    """
    # offset
    feature = [1.0]

    # user
    feature.extend(get_feature_user(d))
    # item
    feature.extend(get_feature_item(d))

    # rating
    feature.append(float(d['rating']))
    # styles
    feature.extend(get_feature_style(d))
    # time
    feature.extend(get_feature_time(d))

    return feature

In [None]:
# get [feature, label] from single datum
def get_feature_and_ratio_label(d, users_ratio, items_ratio):
    """Return ``(feature, ratio_label)`` for a single review record.

    Args:
        d: review record with ``d['helpful']['nHelpful']`` and
            ``d['helpful']['outOf']``.
        users_ratio: unused here; kept so existing callers keep working.
        items_ratio: unused here; kept so existing callers keep working.

    Returns:
        A tuple of the get_feature(d) list and nHelpful / outOf as a float.

    Raises:
        ValueError: if ``outOf`` is 0, because the ratio is undefined.
    """
    # check valid
    outof = float(d['helpful']['outOf'])
    if outof == 0:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError('out of cannot be 0 for ratio')

    # get feature and ratio
    feature = get_feature(d)
    ratio_label = float(d['helpful']['nHelpful']) / outof
    return (feature, ratio_label)

# build [feature, label] list from entire dataset
def make_average_regression_dataset(train_data, users_ratio, items_ratio):
    """Build the regression matrices from every review that received votes.

    Records whose ``outOf`` is 0 are skipped because they have no ratio label.

    Returns:
        A tuple ``(features, labels)`` of numpy arrays, one row / entry per
        retained review.
    """
    voted = [d for d in train_data if float(d['helpful']['outOf']) != 0]
    pairs = [get_feature_and_ratio_label(d, users_ratio, items_ratio) for d in voted]
    features = [feature for feature, _ in pairs]
    labels = [label for _, label in pairs]
    return (np.array(features), np.array(labels))

# make one prediction
def predict_helpful(d, ratio_predictor, train_avg_ratio, users_ratio, items_ratio):
    """Predict nHelpful for one review as ``ratio * outOf``.

    The ratio comes from ``ratio_predictor(get_feature(d))[0]`` when both the
    reviewer and the item have an entry in users_ratio / items_ratio, from the
    reviewer's or the item's stored ratio when only one of them is known, and
    from ``train_avg_ratio`` when neither is.
    """
    reviewer = d['reviewerID']
    item = d['itemID']
    outof = float(d['helpful']['outOf'])

    known_user = reviewer in users_ratio
    known_item = item in items_ratio

    if known_user and known_item:
        # the predictor returns an array-like; its first entry is the ratio
        ratio = ratio_predictor(get_feature(d))[0]
    elif known_user:
        ratio = users_ratio[reviewer]
    elif known_item:
        ratio = items_ratio[item]
    else:
        ratio = train_avg_ratio
    return ratio * outof

# make predictions and get mae on a dataset
def get_valid_mae(valid_data, ratio_predictor, train_avg_ratio, users_ratio, items_ratio):
    """Return the MAE of predict_helpful over every record in ``valid_data``."""
    # ground-truth nHelpful
    truth = np.array([float(record['helpful']['nHelpful']) for record in valid_data])
    # predicted nHelpful, one prediction per record
    predictions = []
    for record in valid_data:
        predictions.append(
            predict_helpful(record, ratio_predictor, train_avg_ratio, users_ratio, items_ratio))
    return get_mae(truth, np.array(predictions))

In [None]:
# build dataset
# train_xs, train_ys = make_average_regression_dataset(train_data, users_ratio, items_ratio)
# valid_xs, valid_ys = make_average_regression_dataset(valid_data, users_ratio, items_ratio)
# NOTE(review): users_ratio and items_ratio are not assigned in any cell visible
# in this notebook, so this line raises NameError on a fresh kernel unless they
# are defined elsewhere -- confirm where they come from.
# all_data is the same list object as train_data, so the outlier removal done on
# train_data applies here too.
all_xs, all_ys = make_average_regression_dataset(all_data, users_ratio, items_ratio)

In [None]:
# # 2-norm linear regression problem
# class RegressorTwoNorm():
#     def __init__(self):
#         self.theta = None
#         self.resitudals = None
#         self.residuals = None
#         self.s = None
        
#     def fit(self, xs, ys):
#         self.theta, self.residuals, self.rank, self.s = np.linalg.lstsq(xs, ys)
    
#     def __call__(self, x):
#         return np.array(np.dot(x, self.theta)).reshape((-1,))

# regressor_two_norm = RegressorTwoNorm()
# regressor_two_norm.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_two_norm, train_avg_ratio, users_ratio, items_ratio))

In [None]:
# from sklearn.linear_model import LinearRegression
# regressor_linear = LinearRegression()
# regressor_linear.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_linear.predict, train_avg_ratio, users_ratio, items_ratio))

In [None]:
# # 1-norm predictor
# class RegressorOneNorm():
#     def __init__(self):
#         pass
    
#     def fit(self, xs, ys):
#         self.P = co.matrix(xs)
#         self.q = co.matrix(ys.reshape((ys.shape[0], 1)))
#         self.u = l1(self.P, self.q)
#         self.theta = np.array(self.u).reshape((-1,))
    
#     def __call__(self, x):
#         return np.array(np.dot(x, self.theta)).reshape((-1,))

# regressor_one_norm = RegressorOneNorm()
# regressor_one_norm.fit(train_xs, train_ys)

In [None]:
# print(get_valid_mae(valid_data, regressor_one_norm, train_avg_ratio, users_ratio, items_ratio))

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# regressor_rf = RandomForestRegressor()
# regressor_rf.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_rf.predict, train_avg_ratio, users_ratio, items_ratio))

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor
# regressor_gb = GradientBoostingRegressor(learning_rate=0.001, n_estimators=1000, max_depth=6, loss='lad')
# regressor_gb.fit(train_xs[:5000], train_ys[:5000])

# print(get_valid_mae(valid_data, regressor_gb.predict, train_avg_ratio, users_ratio, items_ratio))

In [None]:
# ############ produce test ############

# # load helpful_data.json
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))

# # on test set
# test_helpfuls_predict = [predict_helpful(d, regressor_gb.predict, train_avg_ratio, users_ratio, items_ratio) for d in test_data]

# # load 'pairs_Helpful.txt'
# # get header_str and user_item_outofs
# with open('pairs_Helpful.txt') as f:
#     # read and strip lines
#     lines = [l.strip() for l in f.readlines()]
#     # strip out the header
#     header_str = lines.pop(0)
#     # get a list of user_item_ids
#     user_item_outofs = [l.split('-') for l in lines]
#     user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]
    
# # make sure `data.json` and `pairs_Helpful.txt` are in the same order
# for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
#     assert d['reviewerID'] == user_id
#     assert d['itemID'] == item_id
#     assert d['helpful']['outOf'] == outof
    
# # write to output file
# f = open('predictions_Helpful.txt', 'w')
# print(header_str, file=f)
# for (user_id, item_id, outof), test_helpful_predict in zip(user_item_outofs, test_helpfuls_predict):
#     print('%s-%s-%s,%s' % (user_id, item_id, int(outof), test_helpful_predict), file=f)
# f.close()

In [None]:
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))
# outofs = sorted([d['helpful']['outOf'] for d in test_data])
# outofs[-10:]