In [None]:
# compute and dump all style data for all_data and test_data

from __future__ import print_function
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os

import cvxopt as co
from l1 import l1

# natural language processing
import nltk
import nltk.data
import string
# Module-level NLP helpers shared by the feature-extraction cells below.
# English Punkt model loaded through nltk.data.load; its tokenize() method is
# used to count the sentences of a review.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Porter stemmer instance; in the visible code it is only referenced by the
# commented-out unique-word cell.
stemmer = nltk.stem.porter.PorterStemmer()
# Characters of string.punctuation as a set, for fast membership tests.
punctuation = set(string.punctuation)

In [None]:
# Load both pickled review sets and print how long the load took (seconds).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by a trusted source.
start_time = time.time()
# Use context managers so the file handles are closed deterministically.
with open("all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
with open("helpful_data.pickle", "rb") as data_file:
    test_data = pickle.load(data_file)
# `+` concatenation: presumably both objects are lists of review dicts
# (they are iterated and indexed by 'reviewerID' / 'itemID' further down).
all_and_test_data = all_data + test_data
print(time.time() - start_time)

In [None]:
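# # number of unique words in review text
# # (cell kept disabled: it was used to generate "num_unique_word.feature",
# #  which is loaded again further down; re-enable it to rebuild that file)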
# num_unique_word = defaultdict(dict)
# punctuation = set(string.punctuation)

# def get_num_unique_word(d):
#     wordCount = defaultdict(int)
#     for w in d["reviewText"].split():
#         w = "".join([c for c in w.lower() if c not in punctuation])
#         w = stemmer.stem(w)
#         wordCount[w] += 1
#     return len(wordCount)

# for idx, d in enumerate(all_and_test_data):
#     if idx % 10000 == 0:
#         print("%d of %d" % (idx, len(all_and_test_data)))
#     user_id = d['reviewerID']
#     item_id = d['itemID']
#     num_unique_word[user_id][item_id] = get_num_unique_word(d)
    
# # dump to pickle
# pickle.dump(num_unique_word, open("num_unique_word.feature", "wb"), 
#             protocol = pickle.HIGHEST_PROTOCOL)

# # read from pickle
# num_unique_word = pickle.load(open("num_unique_word.feature", "rb"))

In [None]:
# writing style features
def get_feature_style(datum):
    """Compute surface-level writing-style features for a single review.

    The function reads three module-level names at call time:
    ``punctuation`` (collection of punctuation characters),
    ``sentence_tokenizer`` (object exposing ``tokenize(text)``) and
    ``num_unique_word`` (nested mapping ``user_id -> item_id -> count``).

    Args:
        datum: mapping with the keys 'reviewerID', 'itemID', 'summary'
            and 'reviewText'.

    Returns:
        dict mapping feature name to a numeric value. Keys:
        'punctuation_count', 'punctuation_ratio', 'capital_count',
        'capital_ratio', 'avg_word_len', 'question_count', 'exclam_count',
        'exclam_exclam_count', 'dotdotdot_count', 'num_unique_words',
        'num_words_summary', 'num_chars', 'num_words', 'num_sentences' and
        'redability' (the misspelled key is kept on purpose so existing
        consumers of the dumped features keep working).

    Raises:
        KeyError: if ``datum`` lacks one of the required keys, or if
            ``num_unique_word`` has no entry for this user/item pair.
    """
    style = dict()

    # basic info
    user_id = datum['reviewerID']
    item_id = datum['itemID']
    review_summary = datum['summary']
    review_text = datum['reviewText']

    review_text_len = float(len(review_text))

    # Strip punctuation once; the stripped text yields the punctuation count,
    # the word list and the character count.
    review_text_pure = "".join([c for c in review_text if c not in punctuation])
    pure_words = review_text_pure.split()

    # num_chars counts every non-punctuation character (whitespace included).
    num_chars = float(len(review_text_pure))
    num_words = float(len(pure_words))

    # punctuation count / ratio (every character is either punctuation or not)
    punctuation_count = review_text_len - num_chars

    # capital letters (counted per character, not per word)
    capital_count = float(sum(1 for c in review_text if c.isupper()))

    if review_text_len != 0:
        punctuation_ratio = punctuation_count / review_text_len
        capital_ratio = capital_count / review_text_len
    else:
        punctuation_ratio = 0.
        capital_ratio = 0.
    style['punctuation_count'] = punctuation_count
    style['punctuation_ratio'] = punctuation_ratio
    style['capital_count'] = capital_count
    style['capital_ratio'] = capital_ratio

    # averaged word length
    # Guard on the word list itself: a non-empty review that contains only
    # punctuation and/or whitespace has no words and used to raise
    # ZeroDivisionError here.
    if pure_words:
        avg_word_len = sum(len(word) for word in pure_words) / num_words
    else:
        avg_word_len = 0.
    style['avg_word_len'] = avg_word_len

    # occurrences of '?', '!', '!!' and '...'
    # (str.count counts non-overlapping matches, so "!!!" holds one "!!")
    style['question_count'] = float(review_text.count("?"))
    style['exclam_count'] = float(review_text.count("!"))
    style['exclam_exclam_count'] = float(review_text.count("!!"))
    style['dotdotdot_count'] = float(review_text.count("..."))

    # lengths and readability
    style['num_unique_words'] = num_unique_word[user_id][item_id]
    style['num_words_summary'] = len(review_summary.split())

    num_sentences = float(len(sentence_tokenizer.tokenize(review_text)))
    if num_words == 0 or num_sentences == 0:
        # No words or no sentences: the score is undefined, fall back to 0.
        readability = 0.
    else:
        # The coefficients match the Automated Readability Index formula;
        # note that num_chars here also includes whitespace characters.
        readability = (4.71 * (num_chars / num_words)
                       + 0.5 * (num_words / num_sentences)
                       - 21.43)
    style['num_chars'] = num_chars
    style['num_words'] = num_words
    style['num_sentences'] = num_sentences
    style['redability'] = readability

    return style

# extract style feature
# num_unique_word is read by get_feature_style() as a module-level name, so it
# has to be loaded before the loop below runs.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load feature files that were produced by a trusted source.
with open("num_unique_word.feature", "rb") as feature_file:
    num_unique_word = pickle.load(feature_file)

# style_dict[user_id][item_id] -> feature dict returned by get_feature_style
style_dict = defaultdict(dict)

for idx, d in enumerate(all_and_test_data):
    # progress report every 10000 reviews
    if idx % 10000 == 0:
        print("%d of %d" % (idx, len(all_and_test_data)))
    user_id = d['reviewerID']
    item_id = d['itemID']
    style_dict[user_id][item_id] = get_feature_style(d)

# dump to pickle (the context manager closes the file, flushing it to disk,
# before it is read back below)
with open("style_dict.feature", "wb") as feature_file:
    pickle.dump(style_dict, feature_file, protocol=pickle.HIGHEST_PROTOCOL)

# read from pickle
# (the original line had a stray `d` before the file name, which is a syntax
# error and prevented the whole cell from running)
with open("style_dict.feature", "rb") as feature_file:
    style_dict = pickle.load(feature_file)