In [4]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import os

In [14]:
def read_files(filetype):
    path = 'aclImdb'
    file_list = []
    positive_path = os.path.join(path, filetype, 'pos')
    for f in os.listdir(positive_path):
        file_list.append(os.path.join(positive_path,f))
    negative_path = os.path.join(path, filetype, 'neg')
    for f in os.listdir(negative_path):
        file_list.append(os.path.join(negative_path,f))
    print('read {} files: {}'.format(filetype, len(file_list)))
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    print(file_list[:5])
    for f in file_list:
        with open(f, encoding='utf-8', mode='r') as fp:
            all_texts.append(' '.join(fp.readlines()))
    return all_labels, all_texts

In [15]:
train_label, train_text = read_files('train')

read train files: 25000
['aclImdb/train/pos/8335_8.txt', 'aclImdb/train/pos/5954_7.txt', 'aclImdb/train/pos/5687_7.txt', 'aclImdb/train/pos/9006_7.txt', 'aclImdb/train/pos/3376_9.txt']


In [16]:
train_text[0]

"While the sparkling chemistry between Ryan and Robbins alone is reason enough to see this movie, the supporting cast (including Matthau, Fry, Shalub, Durning and the hilarious trio of Jacobi, Saks and Maher) is an additional plus. Matthau shines as Einstein, Fry is perfect as Ryan's clinical fiancé, and Shalub's line about Einstein's gonads is, as has been noted, one of the highlights of the film. The speech that Robbins delivers at his first appearance in public is sheer poetry. Kudos to the writers for handling this froth with wit and levity. I also thought that Keene Curtis was wonderful as Eisenhower. This might be considered something of a chick movie, but I think everyone will get a kick out of it. Eight very solid points."

In [17]:
test_label, test_text = read_files('test')

read test files: 25000
['aclImdb/test/pos/2183_7.txt', 'aclImdb/test/pos/11149_8.txt', 'aclImdb/test/pos/11188_8.txt', 'aclImdb/test/pos/2589_10.txt', 'aclImdb/test/pos/2256_9.txt']


In [18]:
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

In [19]:
print(token.document_count)

25000


In [21]:
print(token.word_index)



In [22]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [23]:
x_train_seq[0]

[134,
 1,
 1171,
 197,
 2,
 581,
 6,
 279,
 192,
 5,
 64,
 11,
 17,
 1,
 693,
 174,
 583,
 2,
 1,
 639,
 4,
 2,
 6,
 32,
 931,
 14,
 6,
 400,
 14,
 2,
 344,
 41,
 6,
 14,
 44,
 74,
 28,
 4,
 1,
 4,
 1,
 19,
 1,
 12,
 1542,
 30,
 24,
 83,
 1264,
 8,
 1067,
 6,
 5,
 1,
 924,
 15,
 11,
 16,
 2,
 10,
 79,
 194,
 12,
 13,
 386,
 14,
 11,
 235,
 27,
 1189,
 139,
 4,
 3,
 17,
 18,
 10,
 101,
 313,
 77,
 76,
 3,
 1965,
 43,
 4,
 9,
 52,
 1153,
 753]

In [24]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

In [29]:
len(x_train_seq[3])
len(x_train[3])

100

In [51]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM

In [52]:
model = Sequential()

In [32]:
model.add(Embedding(output_dim=32,
                   input_dim=2000,
                   input_length=100))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


RNN model

In [44]:
model.add(Embedding(output_dim=32,
                   input_dim=2000,
                   input_length=100))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


LSTM model

In [53]:
model.add(Embedding(output_dim=32,
                   input_dim=2000,
                   input_length=100))
model.add(Dropout(0.35))
model.add(LSTM(units=32))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 81,025
Trainable params: 81,025
Non-trainable params: 0
_________________________________________________________________


In [54]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam',
             metrics=['accuracy'])

In [55]:
train_history = model.fit(x_train, train_label, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 18s - loss: 0.5027 - acc: 0.7453 - val_loss: 0.5853 - val_acc: 0.7038
Epoch 2/10
 - 17s - loss: 0.3323 - acc: 0.8573 - val_loss: 0.5208 - val_acc: 0.7724
Epoch 3/10
 - 17s - loss: 0.3020 - acc: 0.8729 - val_loss: 0.3214 - val_acc: 0.8454
Epoch 4/10
 - 17s - loss: 0.2858 - acc: 0.8816 - val_loss: 0.5579 - val_acc: 0.7252
Epoch 5/10
 - 17s - loss: 0.2684 - acc: 0.8889 - val_loss: 0.4394 - val_acc: 0.7992
Epoch 6/10
 - 17s - loss: 0.2557 - acc: 0.8940 - val_loss: 0.4818 - val_acc: 0.7768
Epoch 7/10
 - 17s - loss: 0.2501 - acc: 0.8982 - val_loss: 0.7564 - val_acc: 0.7140
Epoch 8/10
 - 17s - loss: 0.2374 - acc: 0.9006 - val_loss: 0.5542 - val_acc: 0.7902
Epoch 9/10
 - 17s - loss: 0.2356 - acc: 0.9027 - val_loss: 0.6355 - val_acc: 0.7878
Epoch 10/10
 - 18s - loss: 0.2251 - acc: 0.9071 - val_loss: 0.4868 - val_acc: 0.8204


In [56]:
scores = model.evaluate(x_test, test_label, verbose=1)



In [57]:
scores

[0.42003280653476716, 0.83912]

In [58]:
predict = model.predict_classes(x_test)

In [50]:
predict[:10]

array([[1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1]], dtype=int32)

In [39]:
input_text = """
I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.
"""

In [40]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100)
    predict_result = model.predict_classes(pad_input_seq)
    return predict_result[0][0]

In [41]:
print(predict_review(input_text))

1
