In [None]:
from __future__ import print_function
import numpy as np
import pandas as pd
import time
import yelp_util
np.random.seed(1337)
from keras.preprocessing import sequence
from keras.utils.np_utils import accuracy
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.initializations import normal, identity
import cPickle as pkl

In [None]:
# Load the pre-pickled Yelp review table (later cells read its 'stars' and 'text' columns).
# NOTE: unpickling can execute arbitrary code -- only load a pickle you produced yourself.
review_pickle_path = 'data/yelp_academic_dataset_review.pickle'
yelp_review = pd.read_pickle(review_pickle_path)

In [None]:
# Binary sentiment split: reviews below 3 stars form the negative class and
# 5-star reviews the positive class; everything else is left out.
is_negative = yelp_review['stars'] < 3
is_positive = yelp_review['stars'] == 5
neg_reviews = yelp_review[is_negative]
pos_reviews = yelp_review[is_positive]
print(len(neg_reviews), len(pos_reviews))

# Raw review texts as plain lists: all negatives first, then all positives.
neg_sents = list(neg_reviews['text'])
pos_sents = list(pos_reviews['text'])
sents = neg_sents + pos_sents
print(len(sents))

In [None]:
# Build the vocabulary from the raw review texts. The helper returns a pair,
# unpacked here as `wdict` and `toksents`.
# NOTE(review): presumably `wdict` maps words to ids and `toksents` holds the
# tokenized reviews in the same order as `sents` -- confirm in yelp_util.
vocab_and_tokens = yelp_util.preprocessing.create_vocab(sents)
wdict, toksents = vocab_and_tokens

In [None]:
# Persist the vocabulary object (wdict) to disk.
# The context manager guarantees the file is closed even if pickling raises,
# which the manual open()/close() pair did not.
with open('yelp_posneg_review.dict.pkl', 'wb') as f:
    pkl.dump(wdict, f, pkl.HIGHEST_PROTOCOL)  # equivalent to -1: newest protocol available

In [None]:
# Turn the tokenized reviews into word id sequences using the vocabulary.
to_word_ids = yelp_util.preprocessing.word2id
wdseqs = to_word_ids(toksents, wdict)

In [None]:
# Persist the word id sequences for the whole (negative + positive) dataset.
# The context manager guarantees the file is closed even if pickling raises,
# which the manual open()/close() pair did not.
with open('yelp_posneg_review.whole_dataset.pkl', 'wb') as f:
    pkl.dump(wdseqs, f, pkl.HIGHEST_PROTOCOL)  # equivalent to -1: newest protocol available

In [None]:
# Labels for neg and pos reviews: 0 = negative, 1 = positive.
# `sents` was built as neg_sents + pos_sents, so the first len(neg_sents)
# entries are the negative reviews. Deriving the boundary from the data
# (instead of a hard-coded count) keeps the labels correct if the dataset
# or the star filter changes.
# NOTE(review): assumes create_vocab/word2id preserve the order and number
# of reviews -- the assertion below catches a length mismatch.
n_negative = len(neg_sents)
assert len(wdseqs) == len(sents), \
    "wdseqs is not aligned with sents; cannot derive labels by position"
labels = np.ones(len(wdseqs), dtype=np.int32)
labels[:n_negative] = 0

In [None]:
# Split the id sequences and their labels into train and test sets.
# NOTE(review): nb_words presumably caps the vocabulary size (it matches the
# max_features value used for the embedding layer) -- confirm in yelp_util.
train_split, test_split = yelp_util.preprocessing.load_yelp_review(
    wdseqs, labels, nb_words=300000)
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# ---- Hyper-parameters ----
# max_features: max vocab (input dimension of the embedding layer)
# maxlen:       length every sequence is padded/truncated to
# batch_size:   mini-batch size for training and prediction
max_features, maxlen, batch_size = 300000, 100, 128

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# ---- Pad every review to a fixed length and convert labels to arrays ----
print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train, y_test = np.array(y_train), np.array(y_test)


def _irnn_layer(**layer_kwargs):
    """Build one 64-unit ReLU SimpleRNN for the bidirectional model.

    Input weights use `normal` with scale=0.001 and recurrent weights use
    `identity` with scale=1.0. Any extra keyword arguments (e.g.
    go_backwards=True) are forwarded to SimpleRNN unchanged.
    """
    return SimpleRNN(output_dim=64,
                     init=lambda shape: normal(shape, scale=0.001),
                     inner_init=lambda shape: identity(shape, scale=1.0),
                     activation='relu',
                     **layer_kwargs)


# ---- Model: embedding -> (forward RNN, backward RNN) -> dropout -> sigmoid ----
print('Building Bidirectional IRNN')
model = Graph()
model.add_input(name='input', input_shape=(maxlen,), dtype=int)
model.add_node(Embedding(max_features, 128, input_length=maxlen),
               name='embedding', input='input')
# Two RNNs read the same embedding, one in each direction.
model.add_node(_irnn_layer(), name='forward', input='embedding')
model.add_node(_irnn_layer(go_backwards=True), name='backward', input='embedding')

# The dropout node takes both RNN outputs as its inputs.
model.add_node(Dropout(0.5), name='dropout', inputs=['forward', 'backward'])
model.add_node(Dense(1, activation='sigmoid'), name='sigmoid', input='dropout')
model.add_output(name='output', input='sigmoid')

model.compile('rmsprop', {'output': 'binary_crossentropy'})

# ---- Train ----
print('Train...')
model.fit({'input': X_train, 'output': y_train},
          batch_size=batch_size,
          nb_epoch=3)

# ---- Evaluate: round the sigmoid outputs to 0/1 and compare with y_test ----
test_outputs = model.predict({'input': X_test}, batch_size=batch_size)
test_predictions = np.round(np.array(test_outputs['output']))
acc = accuracy(y_test, test_predictions)
print('Test accuracy:', acc)
# ~96% accuracy after 3 epochs

In [None]:
# Save the trained model's weights to an HDF5 file.
weights_path = 'bdirnn_model_weights.h5'
model.save_weights(weights_path)