In [1]:
# idea:
# - every user, every item, and the whole training set each have an average helpfulness ratio
# - once these averages are computed, they are held fixed
# - fit a regression model from these averages to the original training data to get the coefficients
# - the result should therefore be better than the provided baseline

In [2]:
from __future__ import print_function

import cPickle as pickle
import datetime
import os
import time
from collections import defaultdict

import numpy as np
import scipy as sp

data_root = os.path.expanduser("~") + '/data/CSE255/'

import cvxopt as co
from l1 import l1

In [3]:
# load the pickled review records and report how long the load took (seconds)
# NOTE: pickle.load can execute arbitrary code; only load files you trust
start_time = time.time()
# the context manager closes the file handle as soon as the load finishes
with open(data_root + "all_data.pickle", "rb") as f:
    all_data = pickle.load(f)
print(time.time() - start_time)

16.8827280998


In [4]:
# slice sizes: training records come from the front of all_data,
# validation records from the back
all_size = len(all_data)
train_size = 900000
# train_size = all_size # uncomment this to produce test
valid_size = 100000

valid_start = all_size - valid_size
train_data = all_data[:train_size]
valid_data = all_data[valid_start:]

In [5]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    """Mean absolute error between true and predicted helpful counts.

    helpfuls: numpy array of true values (cast to float before subtracting).
    helpfuls_predict: numpy array of predictions, same length as `helpfuls`.
    Returns the sum of absolute errors divided by the number of true values.
    """
    abs_errors = np.fabs(helpfuls_predict - helpfuls.astype(float))
    n_samples = helpfuls.shape[0]
    return np.sum(abs_errors) / n_samples

def get_valid_mae(valid_data):
    """Mean absolute error of predict_helpful over `valid_data`.

    valid_data: sequence of review dicts carrying d['helpful']['nHelpful'].
    The model state (theta, train_avg_ratio, users_ratio, items_ratio) is
    read from module-level variables, so those cells must have run first.
    """
    true_counts = np.array([float(rec['helpful']['nHelpful']) for rec in valid_data])
    predicted_counts = np.array([
        predict_helpful(rec, theta, train_avg_ratio, users_ratio, items_ratio)
        for rec in valid_data
    ])
    return get_mae(true_counts, predicted_counts)

In [6]:
def _vote_ratios(records, id_key, fallback_ratio):
    """Aggregate helpful / total votes per ID and turn them into ratios.

    records: iterable of review dicts, each with a 'helpful' dict holding
        'nHelpful' and 'outOf', plus the field named by `id_key`.
    id_key: review field to group by ('reviewerID' or 'itemID').
    fallback_ratio: ratio assigned to an ID whose reviews received no votes.

    Returns (outof, helpful, ratio): three dicts keyed by ID holding the
    summed 'outOf', the summed 'nHelpful', and helpful / outof.
    """
    outof = dict()
    helpful = dict()
    for rec in records:
        key = rec[id_key]
        outof[key] = outof.get(key, 0.0) + float(rec['helpful']['outOf'])
        helpful[key] = helpful.get(key, 0.0) + float(rec['helpful']['nHelpful'])

    ratio = dict()
    for key in outof:
        if outof[key] != 0:
            ratio[key] = helpful[key] / outof[key]
        else:
            # no votes at all for this ID: fall back instead of dividing by zero
            ratio[key] = fallback_ratio
    return outof, helpful, ratio


# get global average
train_helpfuls = np.array([d['helpful']['nHelpful'] for d in train_data])
train_outofs = np.array([d['helpful']['outOf'] for d in train_data])
train_avg_ratio = np.sum(train_helpfuls) / np.sum(train_outofs.astype(float))
print('avg helpfulness ratio', train_avg_ratio)

# get average for a user
users_outof, users_helpful, users_ratio = _vote_ratios(
    train_data, 'reviewerID', train_avg_ratio)

# get average for an item
items_outof, items_helpful, items_ratio = _vote_ratios(
    train_data, 'itemID', train_avg_ratio)

avg helpfulness ratio 0.770464430813


In [7]:
# load the two objects stored in betas.pickle
# binary mode is required for pickle data and matches how all_data.pickle is opened
# NOTE: pickle.load can execute arbitrary code; only load files you trust
# NOTE(review): beta_us / beta_is are not referenced by any other cell shown here
with open('betas.pickle', 'rb') as f:
    beta_us, beta_is = pickle.load(f)

# get date time statistics
def get_y_m_d(d):
    """Return the review date of `d` as an (year, month, day) tuple of ints.

    d: review dict with a 'unixReviewTime' field (a POSIX timestamp).
    The timestamp is converted with datetime.fromtimestamp, so the date is
    expressed in the local timezone of the machine running the notebook.
    """
    review_time = datetime.datetime.fromtimestamp(d['unixReviewTime'])
    return (review_time.year, review_time.month, review_time.day)

def get_time_feature(d, include_year=False, include_day=False):
    """One-hot encode the review date of `d`.

    d: review dict, passed straight to get_y_m_d.
    include_year: also emit the year one-hot (years clamped to 1996..2014).
    include_day: also emit the day-of-month one-hot.

    Each one-hot drops its last column, so the last category (2014,
    December, day 31) is the all-zero reference level. With the default
    flags only the 11 month columns are returned; with both flags set the
    layout is year columns + month columns + day columns.
    """
    first_year, last_year = 1996, 2014

    def one_hot_drop_last(index, size):
        # 1 at `index`, then drop the final column (reference category)
        encoded = [0] * size
        encoded[index] = 1
        return encoded[:-1]

    year, month, day = get_y_m_d(d)
    # clamp out-of-range years onto the first / last encoded year
    year = max(min(year, last_year), first_year)

    feature = []
    if include_year:
        feature += one_hot_drop_last(year - first_year, last_year - first_year + 1)
    feature += one_hot_drop_last(month - 1, 12)
    if include_day:
        feature += one_hot_drop_last(day - 1, 31)
    return feature

def get_feature(d):
    """Build the regression feature vector for review `d`.

    The vector is a constant 1.0 followed by (value, value squared) pairs
    for: the reviewer's ratio, the item's ratio, the whitespace-separated
    word count of the review text, and the rating.

    The ratios are looked up in the module-level users_ratio / items_ratio
    dicts, so both IDs must be present there (KeyError otherwise).
    """
    user_ratio = users_ratio[d['reviewerID']]
    item_ratio = items_ratio[d['itemID']]
    word_count = len(d['reviewText'].split())
    rating = d['rating']
    return [1.0,
            user_ratio, user_ratio ** 2.,
            item_ratio, item_ratio ** 2.,
            word_count, word_count ** 2.,
            rating, rating ** 2.]

# fit a linear regressor to the train ratio
def get_feature_and_ratio_label(d, users_ratio, items_ratio):
    """Return (feature, ratio_label) for one voted-on review.

    d: review dict with d['helpful']['nHelpful'] and d['helpful']['outOf'].
    users_ratio, items_ratio: kept for interface compatibility; they are
        not used here because get_feature(d) is called with `d` only.

    Returns the get_feature(d) vector and the label nHelpful / outOf.
    Raises ValueError when outOf is 0, since the ratio is undefined.
    """
    outof = float(d['helpful']['outOf'])
    if outof == 0:
        # a bare string cannot be raised; use a real exception type
        raise ValueError('out of cannot be 0 for ratio')

    feature = get_feature(d)
    ratio_label = float(d['helpful']['nHelpful']) / outof

    return (feature, ratio_label)

def make_average_regression_dataset(train_data, users_ratio, items_ratio):
    """Turn reviews into a (features, labels) pair of numpy arrays.

    Reviews whose 'outOf' is 0 are skipped because their ratio is
    undefined. Every remaining review contributes one feature row and one
    ratio label via get_feature_and_ratio_label.
    """
    voted = (d for d in train_data if float(d['helpful']['outOf']) != 0)
    pairs = [get_feature_and_ratio_label(d, users_ratio, items_ratio) for d in voted]

    features = [feature for feature, _ in pairs]
    labels = [label for _, label in pairs]
    return (np.array(features), np.array(labels))

In [8]:
# build dataset
# train_xs: one get_feature row per training review that received at least one vote
# train_ys: the matching nHelpful / outOf ratios
train_xs, train_ys = make_average_regression_dataset(train_data, users_ratio, items_ratio)

In [None]:
# solve 2-norm minimize problem
# least-squares fit: theta minimizes ||train_xs . theta - train_ys||_2
# NOTE: the 1-norm cell that follows assigns theta again, so whichever cell runs last wins
theta, residuals, rank, s = np.linalg.lstsq(train_xs, train_ys)

In [None]:
# solve 1-norm minimize problem
# presumably l1(P, q) returns the u minimizing ||P*u - q||_1 (helper from the local `l1` module) — TODO confirm
P = co.matrix(train_xs)  # numpy design matrix -> cvxopt matrix
q = co.matrix(train_ys.reshape((train_ys.shape[0], 1)))  # labels as a column vector
u = l1(P,q)
theta = np.array(u).reshape((-1,))  # flatten the solver result back to a 1-D numpy array
print(theta)

     pcost       dcost       gap    pres   dres   k/t
 0:  1.1415e+05  2.0235e+04  9e+04  1e-17  2e-05  1e+00
 1:  1.1415e+05  2.0235e+04  9e+04  2e-16  2e-05  1e+00
 2:  1.1415e+05  2.0235e+04  9e+04  2e-16  2e-05  1e+00
 3:  1.1415e+05  2.0235e+04  9e+04  2e-16  2e-05  1e+00
 4:  1.1415e+05  2.0235e+04  9e+04  2e-16  2e-05  1e+00
 5:  1.1415e+05  2.0235e+04  9e+04  2e-16  2e-05  1e+00
 6:  1.1415e+05  2.0237e+04  9e+04  3e-16  2e-05  1e+00
 7:  1.1416e+05  2.0241e+04  9e+04  3e-16  2e-05  1e+00
 8:  1.1416e+05  2.0246e+04  9e+04  3e-16  2e-05  1e+00
 9:  1.1419e+05  2.0277e+04  9e+04  3e-16  2e-05  1e+00
10:  1.1424e+05  2.0334e+04  9e+04  3e-16  2e-05  1e+00
11:  1.1457e+05  2.0685e+04  9e+04  3e-16  2e-05  1e+00
12:  1.1550e+05  2.1725e+04  9e+04  3e-16  2e-05  1e+00
13:  1.1812e+05  2.5368e+04  9e+04  4e-16  2e-05  9e-01
14:  1.2022e+05  2.8776e+04  9e+04  3e-16  2e-05  9e-01
15:  1.2346e+05  4.0717e+04  8e+04  4e-16  1e-05  7e-01
16:  1.2475e+05  5.3776e+04  7e+04  3e-16  1e-05  

In [None]:
# # using all data
# theta = np.array([ -1.53413948e+00,   1.77200914e+00,  -9.14233327e-01,   2.58803493e+00,
#                    -1.17773635e+00,  -1.10778809e-06,  -8.47219802e-09,   1.16800069e-01,
#                    -1.09893330e-02])

# # using train data
# theta = np.array([ -1.44115432e+00,   1.74586291e+00,  -8.87805851e-01,   2.41169942e+00,
#                    -1.07400193e+00,  -1.48174098e-06,  -9.10511120e-09,   1.08660865e-01,
#                    -1.00238718e-02])

In [None]:
def get_occur_stats(valid_data, users_ratio, items_ratio):
    """Count reviews by whether their user and item are known.

    valid_data: iterable of review dicts with 'reviewerID' and 'itemID'.
    users_ratio, items_ratio: dicts whose keys are the known user / item IDs.

    Returns a dict with the four counters 'in_user_in_item',
    'in_user_not_in_item', 'not_in_user_in_item' and
    'not_in_user_not_in_item'; they sum to the number of reviews.
    """
    stats = {
        'in_user_in_item': 0,
        'in_user_not_in_item': 0,
        'not_in_user_in_item': 0,
        'not_in_user_not_in_item': 0,
    }

    for d in valid_data:
        known_user = d['reviewerID'] in users_ratio
        known_item = d['itemID'] in items_ratio

        if known_user and known_item:
            stats['in_user_in_item'] += 1
        elif known_user:
            stats['in_user_not_in_item'] += 1
        elif known_item:
            stats['not_in_user_in_item'] += 1
        else:
            # this branch previously read the counter without incrementing it
            stats['not_in_user_not_in_item'] += 1
    return stats

In [None]:
def predict_helpful(d, theta, train_avg_ratio, users_ratio, items_ratio):
    """Predict the number of helpful votes for review `d`.

    d: review dict with 'reviewerID', 'itemID' and d['helpful']['outOf'].
    theta: regression coefficients matching the get_feature vector.
    train_avg_ratio: global helpful ratio, used when neither ID is known.
    users_ratio, items_ratio: per-user / per-item helpful ratios.

    The helpful ratio comes from the regression when both the user and the
    item are known, from the one known ratio when only one is, and from the
    global average otherwise. Returns ratio * outOf.
    """
    user_id = d['reviewerID']
    item_id = d['itemID']
    outof = float(d['helpful']['outOf'])

    known_user = user_id in users_ratio
    known_item = item_id in items_ratio

    if known_user and known_item:
        # get_feature takes only `d`; it reads the module-level ratio dicts
        ratio = np.dot(get_feature(d), theta)
        # a linear model can leave [0, 1] but a helpful ratio cannot, so
        # clamp; otherwise the prediction could be negative or exceed outOf
        ratio = min(max(ratio, 0.0), 1.0)
    elif known_user:
        ratio = users_ratio[user_id]
    elif known_item:
        ratio = items_ratio[item_id]
    else:
        ratio = train_avg_ratio
    return ratio * outof

In [None]:
# mean absolute error, in helpful-vote counts, of the predictions on valid_data
print(get_valid_mae(valid_data))

In [None]:
# count validation reviews by whether their user / item has an entry in the training ratio dicts
print(get_occur_stats(valid_data, users_ratio, items_ratio))

In [None]:
# ############ produce test ############

# # load helpful_data.json
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))

# # on test set
# test_helpfuls_predict = [predict_helpful(d, theta, train_avg_ratio, users_ratio, items_ratio) for d in test_data]

# # load 'pairs_Helpful.txt'
# # get header_str and user_item_outofs
# with open('pairs_Helpful.txt') as f:
#     # read and strip lines
#     lines = [l.strip() for l in f.readlines()]
#     # strip out the header line
#     header_str = lines.pop(0)
#     # get a list of user_item_ids
#     user_item_outofs = [l.split('-') for l in lines]
#     user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]
    
# # make sure `data.json` and `pairs_Helpful.txt` the same order
# for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
#     assert d['reviewerID'] == user_id
#     assert d['itemID'] == item_id
#     assert d['helpful']['outOf'] == outof
    
# # write to output file
# f = open('predictions_Helpful.txt', 'w')
# print(header_str, file=f)
# for (user_id, item_id, outof), test_helpful_predict in zip(user_item_outofs, test_helpfuls_predict):
#     print('%s-%s-%s,%s' % (user_id, item_id, int(outof), test_helpful_predict), file=f)
# f.close()