In [1]:
from neon.backends import gen_backend
from neon.data import DataIterator, Text, load_text
from neon.initializers import Uniform, GlorotUniform
from neon.layers import GeneralizedCost, LSTM, Affine, Dropout, LookupTable, RecurrentSum
from neon.models import Model
from neon.optimizers import Adagrad
from neon.transforms import Logistic, Tanh, Softmax, CrossEntropyMulti, Accuracy
from neon.callbacks.callbacks import Callbacks
from neon.util.argparser import NeonArgparser

from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
import os
# Directory prefix for every pickle read or written by this notebook. The trailing
# slash is required: later cells build paths by plain string concatenation.
data_root = "".join((os.path.expanduser("~"), '/data/CSE255/'))

from nltk.stem.porter import PorterStemmer

%matplotlib inline
import matplotlib.pyplot as plt

import string
# Porter stemmer instance (the only visible use is a commented-out preprocessing step).
stemmer = PorterStemmer()
# Individual punctuation characters, held in a set for membership tests.
punctuation = set(ch for ch in string.punctuation)

import random

In [2]:
# Load the two pickled datasets and report how long loading took (seconds).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
start_time = time.time()
# `with` closes each file as soon as it has been read instead of leaking the handle
with open(data_root + "all_data.pickle", "rb") as pickle_file:
    all_data = pickle.load(pickle_file)
with open(data_root + "helpful_data.pickle", "rb") as pickle_file:
    test_data = pickle.load(pickle_file)
all_and_test_data = all_data + test_data
print(time.time() - start_time)

19.1929328442


In [3]:
# train_data is the first `train_size` records of all_data;
# valid_data is the last `valid_size` records.
all_size = len(all_data)
train_size, valid_size = 900000, 100000
# train_size = all_size
train_data = all_data[:train_size]
valid_data = all_data[all_size - valid_size:]

In [4]:
# load dictionary (word -> index); `with` closes the file once it has been read
with open(data_root + "imdb.dict.pkl", "rb") as dict_file:
    imdb_word_to_idx = pickle.load(dict_file)

# reverse mapping (index -> word)
imdb_idx_to_word = dict((idx, word) for word, idx in imdb_word_to_idx.iteritems())

In [5]:
# convert data to word indices
# Accumulators filled by the loop further down: one entry per kept review.
features, labels = [], []

# set the dataset!
taget_data = all_data

def data_to_word_index(d, include_summary=True, random_crop=False):
    """Convert one review record into an array of vocabulary indices.

    The text is split on whitespace. Tokens that are exactly one punctuation
    character are dropped; the remaining tokens are lower-cased, stripped of
    punctuation characters, and kept only if present in `imdb_word_to_idx`.

    Args:
        d: record providing a 'reviewText' string and, when `include_summary`
            is true, a 'summary' string.
        include_summary: if true, the summary tokens precede the review tokens.
        random_crop: if true, drop a random-length prefix of the kept tokens.
            The cut point is drawn from [0, len/2] when at least 150 tokens
            were kept, otherwise from [0, len/3].

    Returns:
        numpy integer array with the index of each kept token (may be empty).
    """
    # gather raw tokens, summary first when requested
    fields = ('summary', 'reviewText') if include_summary else ('reviewText',)
    raw_tokens = []
    for field in fields:
        raw_tokens.extend(d[field].split())

    # clean and filter in a single pass
    kept_words = []
    for token in raw_tokens:
        if token in punctuation:
            continue
        cleaned = "".join(ch for ch in token.lower() if ch not in punctuation)
        # cleaned = stemmer.stem(cleaned)
        if cleaned in imdb_word_to_idx:
            kept_words.append(cleaned)

    if random_crop:
        # longer reviews may lose up to half of their tokens, shorter ones up to a third
        divisor = 2 if len(kept_words) >= 150 else 3
        start_index = random.randint(0, int(len(kept_words) / divisor))
        kept_words = kept_words[start_index:]

    # look up the vocabulary index of every surviving word
    return np.array([imdb_word_to_idx[word] for word in kept_words]).astype(int)

def data_to_label(d, threshold=0.499):
    """Return the binary helpfulness label of one review record.

    Args:
        d: record whose d['helpful'] holds 'nHelpful' and 'outOf' values
            convertible to float.
        threshold: minimum nHelpful / outOf ratio that yields label 1.
            Defaults to 0.499, the value previously hard-coded here.

    Returns:
        1 if nHelpful / outOf >= threshold, otherwise 0.

    Raises:
        ZeroDivisionError: if 'outOf' is zero; callers are expected to skip
            such records before calling this function.
    """
    votes = d['helpful']
    ratio = float(votes['nHelpful']) / float(votes['outOf'])
    return 1 if ratio >= threshold else 0

# Build one (feature, label) sample for every record whose 'outOf' count is non-zero.
for idx, d in enumerate(taget_data):
    # progress report every 10000 records
    if not idx % 10000:
        print("%d of %d" % (idx, len(taget_data)))

    # a zero 'outOf' would make the label ratio undefined, so those records are left out
    if float(d['helpful']['outOf']) != 0:
        features.append(data_to_word_index(d, include_summary=True))
        labels.append(data_to_label(d))

print("done adding samples")

0 of 1000000
10000 of 1000000
20000 of 1000000
30000 of 1000000
40000 of 1000000
50000 of 1000000
60000 of 1000000
70000 of 1000000
80000 of 1000000
90000 of 1000000
100000 of 1000000
110000 of 1000000
120000 of 1000000
130000 of 1000000
140000 of 1000000
150000 of 1000000
160000 of 1000000
170000 of 1000000
180000 of 1000000
190000 of 1000000
200000 of 1000000
210000 of 1000000
220000 of 1000000
230000 of 1000000
240000 of 1000000
250000 of 1000000
260000 of 1000000
270000 of 1000000
280000 of 1000000
290000 of 1000000
300000 of 1000000
310000 of 1000000
320000 of 1000000
330000 of 1000000
340000 of 1000000
350000 of 1000000
360000 of 1000000
370000 of 1000000
380000 of 1000000
390000 of 1000000
400000 of 1000000
410000 of 1000000
420000 of 1000000
430000 of 1000000
440000 of 1000000
450000 of 1000000
460000 of 1000000
470000 of 1000000
480000 of 1000000
490000 of 1000000
500000 of 1000000
510000 of 1000000
520000 of 1000000
530000 of 1000000
540000 of 1000000
550000 of 1000000
560000

In [6]:
# Compensate for the class imbalance: append randomly cropped copies of
# negative (label 0) records until both labels occur equally often.
random.seed(0)  # fixed seed so the added samples are reproducible
positive_count = labels.count(1)
negative_count = labels.count(0)
assert positive_count + negative_count == len(labels)

# number of extra negative samples still needed
bias_count = positive_count - negative_count

# The rejection-sampling loop below only terminates if a negative record can
# actually be drawn; fail loudly instead of spinning forever.
if bias_count > 0 and negative_count == 0:
    raise ValueError("cannot balance labels: there are no negative samples to draw from")

while bias_count > 0:
    d = random.choice(taget_data)

    # reject records with zero 'outOf' (no label) and positive records
    if float(d['helpful']['outOf']) == 0 or data_to_label(d) == 1:
        continue

    features.append(data_to_word_index(d, random_crop=True))
    labels.append(data_to_label(d))

    bias_count -= 1
    if bias_count % 10000 == 0:
        print("ramaining: %d" % (bias_count))

# sanity check: both labels must now be equally frequent
positive_count = labels.count(1)
negative_count = labels.count(0)
assert positive_count == negative_count

print("done fixing bias")

ramaining: 350000
ramaining: 340000
ramaining: 330000
ramaining: 320000
ramaining: 310000
ramaining: 300000
ramaining: 290000
ramaining: 280000
ramaining: 270000
ramaining: 260000
ramaining: 250000
ramaining: 240000
ramaining: 230000
ramaining: 220000
ramaining: 210000
ramaining: 200000
ramaining: 190000
ramaining: 180000
ramaining: 170000
ramaining: 160000
ramaining: 150000
ramaining: 140000
ramaining: 130000
ramaining: 120000
ramaining: 110000
ramaining: 100000
ramaining: 90000
ramaining: 80000
ramaining: 70000
ramaining: 60000
ramaining: 50000
ramaining: 40000
ramaining: 30000
ramaining: 20000
ramaining: 10000
ramaining: 0
done fixing bias


In [7]:
# dump the (features, labels) pair; `with` makes sure the file is flushed and closed
with open(data_root + "train_valid_text_index_in_binary_label.pickle", "wb") as out_file:
    pickle.dump((features, labels), out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# shuffle and then dump
random.seed(0)  # fixed seed so the shuffled order is reproducible

# Materialize the pairs as a list: random.shuffle needs a mutable sequence,
# which a bare zip(...) is not wherever zip returns an iterator.
combined = list(zip(features, labels))
random.shuffle(combined)

# Write the shuffled order back in place (both lists keep their identity).
# Skipped for empty input, where zip(*combined) yields nothing to unpack.
if combined:
    features[:], labels[:] = zip(*combined)

# `with` makes sure the file is flushed and closed
with open(data_root + "train_valid_text_index_in_binary_label_shuffled.pickle", "wb") as out_file:
    pickle.dump((features, labels), out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# (features, labels) = pickle.load(open(data_root + "train_text_data.pickle", "rb"))
# pickle.dump((features[:100000], ratios[:100000]), 
#             open(data_root + "train_text_data_100000.pickle", "wb"), 
#             protocol = pickle.HIGHEST_PROTOCOL)