In [8]:
from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from pprint import pprint
import os
import datetime

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV


# load all_data and test_data
start_time = time.time()
all_data = pickle.load(open('all_data.pickle', 'rb'))
print('data loading time:', time.time() - start_time)

# remove the outlier
for i in reversed(range(len(all_data))):
    d = all_data[i]
    if d['helpful']['outOf'] > 3000:
        all_data.pop(i)
    elif d['helpful']['outOf'] < d['helpful']['nHelpful']:
        all_data.pop(i)

data loading time: 21.3262369633


In [12]:
# utility functions
def get_mae(helpfuls, helpfuls_predict):
    return np.mean(np.fabs(helpfuls_predict - helpfuls.astype(float)))

# load pre computed features
global_feature, users_feature, items_feature = pickle.load(
    open('global_users_items_feature.feature', 'rb'))
style_dict = pickle.load(open('style_dict.feature', 'rb'))

# feature engineering
def get_feature_time(d):
    unix_time = d['unixReviewTime']
    y, m, d = datetime.datetime.fromtimestamp(
        unix_time).strftime('%Y-%m-%d').split('-')
    y = float(y)
    m = float(m)
    d = float(d)
    return [y, m, d]

def get_feature_style(d):
    # load from style dict
    user_id = d['reviewerID']
    item_id = d['itemID']
    s = style_dict[user_id][item_id]

    feature = [s['num_words'],
               s['num_words_summary'],
               s['redability'],
               s['avg_word_len'],
               s['num_words'] /
               s['num_sentences'] if s['num_sentences'] != 0.0 else 0.0,
               s['num_unique_words'],
               s['exclam_exclam_count'] + s['question_count'],
               s['dotdotdot_count'],
               s['capital_ratio']
               ]
    return feature

def get_time_spot_ratio(times, spot):
    # return the array index ratio to insert spot
    if len(times) == 0:
        return 0.
    index = np.searchsorted(np.array(times), spot)
    return float(index) / float(len(times))

def get_feature_user(d):
    user_id = d['reviewerID']
    unix_time = d['unixReviewTime']

    s = users_feature[user_id]
    feature = [s['ratio_a'],
               s['ratio_b'],
               s['num_reviews'],
               s['avg_review_length'],
               s['avg_summary_length'],
               get_time_spot_ratio(s['review_times'], unix_time)
               ]
    return feature

def get_feature_item(d):
    item_id = d['itemID']
    unix_time = d['unixReviewTime']

    s = items_feature[item_id]
    feature = [s['ratio_a'],
               s['ratio_b'],
               s['num_reviews'],
               s['avg_review_length'],
               s['avg_summary_length'],
               get_time_spot_ratio(s['review_times'], unix_time)
               ]
    return feature

def get_feature(d):
    user_id = d['reviewerID']
    item_id = d['itemID']
    unix_time = d['unixReviewTime']

    # offset
    feature = [1.0]

    # user
    feature += get_feature_user(d)
    # item
    feature += get_feature_item(d)

    # outof
    feature += [float(d['helpful']['outOf'])]
    # rating
    feature += [float(d['rating'])]
    # styles
    feature += get_feature_style(d)
    # time
    feature += get_feature_time(d)

    return feature

# get [feature, label] from single datum
def get_feature_label_weight(d, total_outof_weights):
    # check valid
    outof = float(d['helpful']['outOf'])
    assert outof != 0.

    # feature
    feature = get_feature(d)
    # label
    ratio_label = float(d['helpful']['nHelpful']) / \
        float(d['helpful']['outOf'])
    # weight
    weight = float(d['helpful']['outOf']) / total_outof_weights

    return (feature, ratio_label, weight)

# build [feature, label] list from entire dataset
def make_dataset(train_data):
    features = []
    labels = []
    weights = []

    train_outofs = np.array([d['helpful']['outOf']
                             for d in train_data]).astype(float)
    total_outof_weights = np.sum(train_outofs)

    for d in train_data:
        if float(d['helpful']['outOf']) == 0:
            continue
        feature, label, weight = get_feature_label_weight(
            d, total_outof_weights)
        features.append(feature)
        labels.append(label)
        weights.append(weight)

    return (np.array(features), np.array(labels), np.array(weights))

# make one prediction
def predict_helpful(d, ratio_predictor):
    # ratio_predictor[func]: y = ratio_predictor(get_feature(d))

    user_id = d['reviewerID']
    item_id = d['itemID']
    outof = float(d['helpful']['outOf'])

    if (user_id in users_feature) and (item_id in items_feature):
        predict = ratio_predictor(np.array(get_feature(d)).reshape((1, -1)))
        ratio = predict[0]  # np.ndarray
    elif (user_id in users_feature) and (item_id not in items_feature):
        ratio = users_feature[user_id]['ratio_b']
    elif (user_id not in users_feature) and (item_id in items_feature):
        ratio = items_ratio[item_id]['ratio_b']
    else:
        ratio = global_feature['global_ratio_b']
    return ratio * outof

# make predictions and get mae on a dataset
def get_valid_mae(valid_data, ratio_predictor):
    print(len(valid_data))
    # ground truth nhelpful
    helpfuls = np.array([float(d['helpful']['nHelpful']) for d in valid_data])
    # predited nhelpful
    helpfuls_predict = np.array(
        [predict_helpful(d, ratio_predictor) for d in valid_data])
    print(len(helpfuls), len(helpfuls_predict))
    # return mae
    return get_mae(helpfuls, helpfuls_predict)

In [2]:
# build dataset
# all_xs, all_ys, all_weights = make_dataset(all_data)
pickle.dump((all_xs, all_ys, all_weights), 
            open("all_xs_all_ys_all_weights.pickle", "wb"),
            protocol=pickle.HIGHEST_PROTOCOL)
all_xs, all_ys, all_weights = pickle.load(open("all_xs_all_ys_all_weights.pickle", "rb"))
print('dataset prepared')

dataset prepared


In [5]:
# call gradient boosting
print('start fitting regressor')
regressor = GradientBoostingRegressor(learning_rate=0.001,
                                      n_estimators=1000,
                                      max_depth=6,
                                      loss='lad',
                                      verbose=1)
regressor.fit(all_xs[:30000], all_ys[:30000])

start fitting regressor
      Iter       Train Loss   Remaining Time 
         1           0.2595            3.59m
         2           0.2593            3.48m
         3           0.2592            3.42m
         4           0.2591            3.40m
         5           0.2589            3.38m
         6           0.2588            3.36m
         7           0.2587            3.35m
         8           0.2585            3.34m
         9           0.2584            3.33m
        10           0.2583            3.33m
        20           0.2569            3.28m
        30           0.2556            3.24m
        40           0.2543            3.20m
        50           0.2530            3.17m
        60           0.2517            3.13m
        70           0.2505            3.09m
        80           0.2492            3.06m
        90           0.2480            3.02m
       100           0.2467            2.99m
       200           0.2352            2.66m
       300           0.2249   

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='lad', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=1,
             warm_start=False)

In [16]:
regressor.train_score_

array([ 0.25945485,  0.25932139,  0.25918807,  0.25905488,  0.25892183,
        0.25878893,  0.25865602,  0.25852199,  0.25838809,  0.25825433,
        0.25811963,  0.25798506,  0.25785063,  0.25771634,  0.25758221,
        0.25744733,  0.25731258,  0.25717797,  0.25704349,  0.25690925,
        0.25677354,  0.25663798,  0.25650257,  0.25636729,  0.25623214,
        0.25610106,  0.25597016,  0.2558394 ,  0.25570879,  0.25557843,
        0.25544827,  0.25531832,  0.25518477,  0.25505523,  0.25492584,
        0.2547966 ,  0.25466749,  0.25453851,  0.25440967,  0.2542809 ,
        0.25415227,  0.2540238 ,  0.25389548,  0.2537673 ,  0.25363931,
        0.25350749,  0.25337655,  0.25324588,  0.25311533,  0.25298494,
        0.2528547 ,  0.25272459,  0.25259472,  0.25246617,  0.25233774,
        0.25220951,  0.25208143,  0.25195349,  0.25182576,  0.25170044,
        0.25157526,  0.25145021,  0.2513253 ,  0.25120052,  0.25107586,
        0.25095132,  0.25082695,  0.25070277,  0.25057871,  0.25

In [18]:
# set grid search param
param_grid = {'learning_rate': [0.001],
              'max_depth': [4],
              'min_samples_leaf': [9],
              'max_features': [0.5],
              'subsample': [0.15]
              }

# init regressor
regressor = GradientBoostingRegressor(n_estimators=3000,
                                      subsample=0.15,
                                      loss='lad',
                                      verbose=1)

# grid search
grid_searcher = GridSearchCV(regressor, param_grid, verbose=1, n_jobs=36)
grid_searcher.fit(all_xs[:3000], all_ys[:3000])

# print best params
opt_params = grid_searcher.best_params_
print(opt_params)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2594           0.0001           22.71s
         2           0.2721           0.0001           20.78s
         3           0.2560           0.0001           19.98s
         4           0.2617           0.0001           19.60s
         5           0.2419           0.0001           19.24s
         6           0.2652           0.0001           19.10s
         7           0.2617           0.0001           18.94s
         8           0.2515           0.0001           18.82s      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2603           0.0001            4.03m         1           0.2424           0.0001            4.37m         1           0.2492           0.0000            4.69m


        

[Parallel(n_jobs=36)]: Done   3 out of   3 | elapsed:   14.1s finished


In [20]:
opt_regressor = GradientBoostingRegressor(n_estimators=3000, loss='lad', verbose=1,
                                          learning_rate    = opt_params['learning_rate'],
                                          max_depth        = opt_params['max_depth'],
                                          min_samples_leaf = opt_params['min_samples_leaf'],
                                          max_features     = opt_params['max_features'],
                                          subsample        = opt_params['subsample']
                                          )
opt_regressor.fit(all_xs[:3000], all_ys[:3000])

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.2720           0.0001           22.67s
         2           0.2563           0.0001           21.62s
         3           0.2434           0.0001           20.89s
         4           0.2508           0.0001           20.54s
         5           0.2444           0.0001           20.39s
         6           0.2494           0.0001           20.19s
         7           0.2637           0.0001           20.01s
         8           0.2623           0.0001           19.90s
         9           0.2647           0.0001           19.93s
        10           0.2458           0.0001           19.76s
        20           0.2513           0.0001           19.00s
        30           0.2695           0.0001           18.56s
        40           0.2419           0.0001           18.25s
        50           0.2714           0.0001           18.03s
        60           0.2498           0.0001           17.86s
       

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.001,
             loss='lad', max_depth=4, max_features=0.5,
             max_leaf_nodes=None, min_samples_leaf=9, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=3000,
             presort='auto', random_state=None, subsample=0.15, verbose=1,
             warm_start=False)

In [26]:
pickle.dump(opt_regressor, 
            open("opt_regressor_%s_%s_%s_%s_%s.pickle" % (3000,
                                                          opt_params['max_depth'],
                                                          opt_params['min_samples_leaf'],
                                                          opt_params['max_features'],
                                                          opt_params['subsample']),
                 "wb"),
            protocol=pickle.HIGHEST_PROTOCOL)

dataset prepared


In [27]:
opt_regressor = pickle.load(open('opt_regressor_3000_4_9_0.5_0.15.pickle', 'rb'))

In [28]:
########## Produce Test ##########

# load helpful_data.json
test_data = pickle.load(open('helpful_data.pickle', 'rb'))

# on test set
test_helpfuls_predict = [
    predict_helpful(d, opt_regressor.predict) for d in test_data]

# load 'pairs_Helpful.txt'
# get header_str and user_item_outofs
with open('pairs_Helpful.txt') as f:
    # read and strip lines
    lines = [l.strip() for l in f.readlines()]
    # stirip out the headers
    header_str = lines.pop(0)
    # get a list of user_item_ids
    user_item_outofs = [l.split('-') for l in lines]
    user_item_outofs = [[d[0], d[1], float(d[2])] for d in user_item_outofs]

# make sure `data.json` and `pairs_Helpful.txt` the same order
for (user_id, item_id, outof), d in zip(user_item_outofs, test_data):
    assert d['reviewerID'] == user_id
    assert d['itemID'] == item_id
    assert d['helpful']['outOf'] == outof

# write to output file
f = open('predictions_Helpful.txt', 'w')
print(header_str, file=f)
for (user_id, item_id, outof), helpful_predict in zip(user_item_outofs,
                                                      test_helpfuls_predict):
    print('%s-%s-%s,%s' %
          (user_id, item_id, int(outof), round(helpful_predict)), file=f)
f.close()


print('total elapsed time:', time.time() - start_time)

total elapsed time: 1635.42340994
