In [14]:
from __future__ import print_function

from collections import defaultdict
import cPickle as pickle
import datetime
import os
import time

import numpy as np
import scipy as sp

# data
# Root directory of the dataset files, located under the current user's home
# directory. The trailing slash is part of the value, so both `data_root + name`
# and `os.path.join(data_root, name)` resolve to the same file.
data_root = os.path.expanduser("~") + '/data/CSE255/'

# l1-norm
import cvxopt as co
from l1 import l1

# # natural language processing
# import nltk
# import nltk.data
# import string
# sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# stemmer = nltk.stem.porter.PorterStemmer()
# punctuation = set(string.punctuation)

In [2]:
# load all_data
# The file is opened with a context manager so the handle is closed as soon as
# unpickling finishes (the previous bare open() was never closed).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
start_time = time.time()
with open(os.path.join(data_root, "all_data.pickle"), "rb") as f:
    all_data = pickle.load(f)
# elapsed wall-clock seconds for the load
print(time.time() - start_time)

18.9671149254


In [3]:
# Carve the training and validation partitions out of all_data.
# Partition sizes:
all_size = len(all_data)
train_size = 900000
# train_size = all_size # uncomment this to produce test
valid_size = 100000

# training reviews: the leading train_size entries
train_data = all_data[:train_size]

# validation reviews: the trailing valid_size entries
valid_data = all_data[all_size - valid_size:]

In [4]:
# remove the outliers: drop training reviews whose total vote count is extreme
# The list is rebuilt in a single pass instead of popping by index:
#   * it no longer indexes up to train_size, so it cannot raise IndexError when
#     train_data holds fewer than train_size reviews;
#   * it avoids the quadratic cost of repeated list.pop(i);
#   * re-running the cell is harmless.
max_outof = 5000
train_data = [d for d in train_data if d['helpful']['outOf'] <= max_outof]

In [5]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    """Return the mean absolute error between true and predicted values.

    helpfuls: numpy array of ground-truth values (cast to float here).
    helpfuls_predict: predicted values, subtracted element-wise from the truth.
    The summed absolute error is divided by the length of the first axis of
    `helpfuls`.
    """
    truth = helpfuls.astype(float)
    abs_errors = np.fabs(helpfuls_predict - truth)
    n_samples = helpfuls.shape[0]
    return np.sum(abs_errors) / n_samples

In [6]:
# get global average
# Pooled helpfulness ratio over the training set:
# (sum of helpful votes) / (sum of all votes).
train_helpfuls = np.array([review['helpful']['nHelpful'] for review in train_data])
train_outofs = np.array([review['helpful']['outOf'] for review in train_data])
total_helpful = np.sum(train_helpfuls)
total_outof = np.sum(train_outofs.astype(float))
train_avg_ratio = total_helpful / total_outof
print('avg helpfulness ratio', train_avg_ratio)

# linear search best ratio
def linear_search_ratio(helpfuls, outofs, search_range=(0.3, 1.0, 0.001)):
    """Grid-search the single ratio alpha that minimises MAE of `outofs * alpha`.

    helpfuls: numpy array of true helpful-vote counts.
    outofs: numpy array of total vote counts, scaled by each candidate alpha.
    search_range: arguments forwarded to np.arange to build the candidates.
    Returns the candidate with the lowest get_mae (the first one on ties).
    """
    candidate_alphas = np.arange(*search_range)
    maes = []
    for candidate in candidate_alphas:
        maes.append(get_mae(helpfuls, outofs * candidate))
    best_index = np.argmin(maes)
    return candidate_alphas[best_index]

# training set global
# train_helpfuls / train_outofs were already built from train_data at the top
# of this cell, so they are reused here instead of being rebuilt with two more
# passes over the training set.
# NOTE: this overwrites train_avg_ratio (previously the pooled average) with
# the MAE-optimal ratio; everything below uses the optimal value.
train_avg_ratio = linear_search_ratio(train_helpfuls, train_outofs, search_range=(0.3, 1.0, 0.001))
print('optimal helpfulness ratio', train_avg_ratio)

# get average helpfulness ratio per user and per item
def _helpful_ratio_by(data, id_key, default_ratio):
    """Aggregate helpfulness votes of `data` grouped by d[id_key].

    data: iterable of review dicts with d['helpful']['outOf'] and
        d['helpful']['nHelpful'].
    id_key: review field to group by ('reviewerID' or 'itemID').
    default_ratio: ratio assigned to groups whose total outOf is 0.
    Returns (outof_totals, helpful_totals, ratios), three plain dicts keyed by
    the group id.
    """
    outof_totals = dict()
    helpful_totals = dict()
    for d in data:
        key = d[id_key]
        outof_totals[key] = outof_totals.get(key, 0.0) + float(d['helpful']['outOf'])
        helpful_totals[key] = helpful_totals.get(key, 0.0) + float(d['helpful']['nHelpful'])

    ratios = dict()
    for key in outof_totals:
        if outof_totals[key] != 0:
            ratios[key] = helpful_totals[key] / outof_totals[key]
        else:
            # no votes at all for this group: fall back to the global ratio
            ratios[key] = default_ratio
    return outof_totals, helpful_totals, ratios

# get average for a user
users_outof, users_helpful, users_ratio = _helpful_ratio_by(train_data, 'reviewerID', train_avg_ratio)

# get average for a item
items_outof, items_helpful, items_ratio = _helpful_ratio_by(train_data, 'itemID', train_avg_ratio)

avg helpfulness ratio 0.768819898316
optimal helpfulness ratio 0.856


In [7]:
# pre-computed features
# Every pickle is opened in binary mode ('rb'): pickle data is a byte stream,
# and text mode can corrupt it on platforms that translate line endings.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
with open('betas.pickle', 'rb') as f:
    # the pickle is unpacked into exactly two objects
    beta_us, beta_is = pickle.load(f)
    
with open('train_ratio_list.pickle', 'rb') as f:
    train_ratio_list = pickle.load(f)
    
with open(os.path.join(data_root, 'num_unique_word.feature'), 'rb') as f:
    num_unique_word_dict = pickle.load(f)
    
with open(os.path.join(data_root, 'style_dict.feature'), 'rb') as f:
    style_dict = pickle.load(f)
    # Indexed as style_dict[reviewerID][itemID]; example entry:
    # style_dict['U243261361']['I572782694']
    # {'avg_word_len': 4.857142857142857,
    #  'capital_count': 11.0,
    #  'capital_ratio': 0.028205128205128206,
    #  'dotdotdot_count': 4.0,
    #  'exclam_count': 0.0,
    #  'exclam_exclam_count': 0.0,
    #  'num_chars': 369.0,
    #  'num_sentences': 3.0,
    #  'num_unique_words': 50,
    #  'num_words': 63.0,
    #  'num_words_summary': 2,
    #  'punctuation_count': 21.0,
    #  'punctuation_ratio': 0.05384615384615385,
    #  'question_count': 0.0,
    #  'redability': 16.65714285714285}

In [8]:
# feature engineering
# get date time statistics
def get_y_m_d(d):
    """Return the review date of `d` as a (year, month, day) tuple of floats.

    d: review dict; d['unixReviewTime'] is a Unix timestamp in seconds.
    The timestamp is converted with datetime.fromtimestamp, i.e. in the local
    timezone of the machine running the notebook.
    """
    # Read the date fields directly instead of formatting to a string and
    # parsing it back; this also avoids rebinding the parameter `d`.
    review_date = datetime.datetime.fromtimestamp(d['unixReviewTime'])
    return (float(review_date.year), float(review_date.month), float(review_date.day))

def get_feature_time(d):
    """Return the time features of review `d` as the list [year, month, day].

    The values come straight from get_y_m_d and are used as raw numbers.
    An earlier draft one-hot encoded year (clamped to 1996-2014), month and
    day, dropping the last slot of each; that variant is not in use.
    """
    year, month, day = get_y_m_d(d)
    return [year, month, day]

def get_num_uique_word(d):
    """Count the distinct stemmed words in d["reviewText"].

    Each whitespace-separated token is lower-cased, stripped of characters
    found in `punctuation`, and stemmed with `stemmer.stem` before counting.

    NOTE(review): `punctuation` and `stemmer` are read as module-level globals;
    their definitions in the import cell are currently commented out, so
    calling this function raises NameError until they are restored.
    """
    distinct_stems = set()
    for token in d["reviewText"].split():
        cleaned = "".join(ch for ch in token.lower() if ch not in punctuation)
        distinct_stems.add(stemmer.stem(cleaned))
    return len(distinct_stems)

def get_feature(d):
    """Build the feature vector (a flat list) for one review dict `d`.

    Layout, in order:
      [1.0 offset, user ratio, item ratio, rating,
       num_words, redability, exclam_exclam_count + question_count,
       year, month, day]

    Reads the module-level users_ratio, items_ratio and style_dict; a missing
    reviewer or item id raises KeyError.
    """
    reviewer = d['reviewerID']
    item = d['itemID']

    # constant offset term followed by the per-user / per-item average ratios
    offset_part = [1.0]
    average_part = [users_ratio[reviewer], items_ratio[item]]
    # star rating of the review
    rating_part = [float(d['rating'])]
    # writing-style statistics for this (reviewer, item) pair
    style = style_dict[reviewer][item]
    style_part = [
        style['num_words'],
        style['redability'],
        style['exclam_exclam_count'] + style['question_count'],
    ]
    # review date
    time_part = list(get_feature_time(d))

    return offset_part + average_part + rating_part + style_part + time_part

In [9]:
# get [feature, label] from single datum
def get_feature_and_ratio_label(d, users_ratio, items_ratio):
    """Return (feature, ratio_label) for one review dict `d`.

    d: review dict with d['helpful']['outOf'] and d['helpful']['nHelpful'].
    users_ratio, items_ratio: unused here (get_feature reads the module-level
        dicts); kept so existing callers keep working.
    Returns: (feature list from get_feature(d), nHelpful / outOf as float).
    Raises: ValueError if outOf is 0, since the ratio is undefined.
    """
    helpful = d['helpful']
    outof = float(helpful['outOf'])
    if outof == 0:
        # Raise a real exception: raising a bare string is itself a TypeError.
        raise ValueError('out of cannot be 0 for ratio')

    # get feature and ratio
    feature = get_feature(d)
    ratio_label = float(helpful['nHelpful']) / outof
    return (feature, ratio_label)

# build [feature, label] list from entire dataset
def make_average_regression_dataset(train_data, users_ratio, items_ratio):
    """Turn a list of review dicts into (features, labels) numpy arrays.

    Reviews whose outOf is 0 are skipped because their ratio is undefined.
    Each remaining review contributes one row from
    get_feature_and_ratio_label.
    """
    voted_reviews = (d for d in train_data if float(d['helpful']['outOf']) != 0)
    pairs = [get_feature_and_ratio_label(d, users_ratio, items_ratio) for d in voted_reviews]
    features = [feature for feature, _ in pairs]
    labels = [label for _, label in pairs]
    return (np.array(features), np.array(labels))

# make one prediction
def predict_helpful(d, ratio_predictor, train_avg_ratio, users_ratio, items_ratio):
    """Predict the number of helpful votes for one review dict `d`.

    ratio_predictor: callable applied to get_feature(d); element 0 of its
        result is used as the ratio. Only called when both the reviewer and
        the item are known.
    train_avg_ratio: fallback ratio when neither reviewer nor item is known.
    users_ratio / items_ratio: per-reviewer and per-item ratios, used alone
        when only one of the two ids is known.
    Returns: the chosen ratio multiplied by d['helpful']['outOf'].
    """
    reviewer = d['reviewerID']
    item = d['itemID']
    total_votes = float(d['helpful']['outOf'])

    knows_user = reviewer in users_ratio
    knows_item = item in items_ratio

    if knows_user and knows_item:
        # full model; the predictor returns an array-like, take its first entry
        ratio = ratio_predictor(get_feature(d))[0]
    elif knows_user:
        ratio = users_ratio[reviewer]
    elif knows_item:
        ratio = items_ratio[item]
    else:
        ratio = train_avg_ratio
    return ratio * total_votes

# make predictions and get mae on a dataset
def get_valid_mae(valid_data, ratio_predictor, train_avg_ratio, users_ratio, items_ratio):
    """Return the MAE of predicted vs. true helpful-vote counts on `valid_data`.

    Every review is scored with predict_helpful (same arguments as passed
    here) and compared against its d['helpful']['nHelpful'] via get_mae.
    """
    # ground-truth helpful counts
    truth = np.array([float(review['helpful']['nHelpful']) for review in valid_data])
    # predicted helpful counts, one per review
    predictions = []
    for review in valid_data:
        predictions.append(
            predict_helpful(review, ratio_predictor, train_avg_ratio, users_ratio, items_ratio))
    return get_mae(truth, np.array(predictions))

In [10]:
# build dataset
# Features and labels are built from every review in all_data; reviews with
# outOf == 0 are skipped inside make_average_regression_dataset.
# NOTE(review): users_ratio / items_ratio are computed from train_data only and
# get_feature indexes them directly, so a reviewer or item that appears only
# outside train_data would raise KeyError here. Confirm train_size was set to
# all_size for this run.
# NOTE(review): train_xs / valid_xs, used by the commented-out model cells, are
# not defined while the next two lines stay commented out.
# train_xs, train_ys = make_average_regression_dataset(train_data, users_ratio, items_ratio)
# valid_xs, valid_ys = make_average_regression_dataset(valid_data, users_ratio, items_ratio)
all_xs, all_ys = make_average_regression_dataset(all_data, users_ratio, items_ratio)

In [11]:
# # 2-norm linear regression problem
# class RegressorTwoNorm():
#     def __init__(self):
#         self.theta = None
#         self.resitudals = None
#         self.residuals = None
#         self.s = None
        
#     def fit(self, xs, ys):
#         self.theta, self.residuals, self.rank, self.s = np.linalg.lstsq(xs, ys)
    
#     def __call__(self, x):
#         return np.array(np.dot(x, self.theta)).reshape((-1,))

# regressor_two_norm = RegressorTwoNorm()
# regressor_two_norm.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_two_norm, train_avg_ratio, users_ratio, items_ratio))

In [12]:
# from sklearn.linear_model import LinearRegression
# regressor_linear = LinearRegression()
# regressor_linear.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_linear.predict, train_avg_ratio, users_ratio, items_ratio))

In [13]:
# # 1-norm predictor
# class RegressorOneNorm():
#     def __init__(self):
#         pass
    
#     def fit(self, xs, ys):
#         self.P = co.matrix(xs)
#         self.q = co.matrix(ys.reshape((ys.shape[0], 1)))
#         self.u = l1(self.P, self.q)
#         self.theta = np.array(self.u).reshape((-1,))
    
#     def __call__(self, x):
#         return np.array(np.dot(x, self.theta)).reshape((-1,))

# regressor_one_norm = RegressorOneNorm()
# regressor_one_norm.fit(train_xs, train_ys)

In [14]:
# print(get_valid_mae(valid_data, regressor_one_norm, train_avg_ratio, users_ratio, items_ratio))

In [15]:
# from sklearn.ensemble import RandomForestRegressor
# regressor_rf = RandomForestRegressor()
# regressor_rf.fit(train_xs, train_ys)
# print(get_valid_mae(valid_data, regressor_rf.predict, train_avg_ratio, users_ratio, items_ratio))

In [15]:
# from sklearn.ensemble import GradientBoostingRegressor
# regressor_gb = GradientBoostingRegressor(learning_rate=0.001, n_estimators=1000, max_depth=6, loss='lad')
# regressor_gb.fit(train_xs[:5000], train_ys[:5000])

# print(get_valid_mae(valid_data, regressor_gb.predict, train_avg_ratio, users_ratio, items_ratio))

In [17]:
# ############ produce test ############

# # load helpful_data.json
# test_data = pickle.load(open(data_root + "helpful_data.pickle", "rb"))

# # on test set
# test_helpfuls_predict = [predict_helpful(d, regressor_gb.predict, train_avg_ratio, users_ratio, items_ratio) for d in test_data]

# # load 'pairs_Helpful.txt'
# # get header_str and user_item_outofs
# with open('pairs_Helpful.txt') as f:
#     # read and strip lines
#     lines = [l.strip() for l in f.readlines()]
#     # strip out the header line
#     header_str = lines.pop(0)
#     # get a list of user_item_ids
#     user_item_outofs = [l.split('-') for l in lines]
#     user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]
    
# # make sure `data.json` and `pairs_Helpful.txt` are in the same order
# for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
#     assert d['reviewerID'] == user_id
#     assert d['itemID'] == item_id
#     assert d['helpful']['outOf'] == outof
    
# # write to output file
# f = open('predictions_Helpful.txt', 'w')
# print(header_str, file=f)
# for (user_id, item_id, outof), test_helpful_predict in zip(user_item_outofs, test_helpfuls_predict):
#     print('%s-%s-%s,%s' % (user_id, item_id, int(outof), test_helpful_predict), file=f)
# f.close()