In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]
    
    positive_path=path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
        
    negative_path=path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]
    
    print('read',filetype, 'files:',len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

Using TensorFlow backend.


In [2]:
y_train,train_text=read_files("train")

read train files: 25000


In [3]:
y_test,test_text=read_files('test')

read test files: 25000


In [4]:
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=380)

In [32]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

In [33]:
model = Sequential()

In [34]:
model.add(Embedding(output_dim=32,
                   input_dim=3800,
                   input_length=380))
model.add(Dropout(0.35))

In [35]:
model.add(SimpleRNN(units=16))

In [36]:
model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.35))

In [37]:
model.add(Dense(units=1,activation='sigmoid'))

In [38]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_6 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
simple_rnn_10 (SimpleRNN)    (None, 16)                784       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 126,993
Trainable params: 126,993
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [43]:
train_history = model.fit(x_train, y_train, batch_size=100,
                         epochs=10,verbose=2,
                         validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 24s - loss: 0.6569 - acc: 0.6247 - val_loss: 0.9459 - val_acc: 0.0424
Epoch 2/10
 - 23s - loss: 0.5589 - acc: 0.7072 - val_loss: 0.7941 - val_acc: 0.5332
Epoch 3/10
 - 29s - loss: 0.4059 - acc: 0.8188 - val_loss: 0.5367 - val_acc: 0.7622
Epoch 4/10
 - 26s - loss: 0.2859 - acc: 0.8818 - val_loss: 0.4121 - val_acc: 0.8352
Epoch 5/10
 - 27s - loss: 0.2109 - acc: 0.9183 - val_loss: 0.6407 - val_acc: 0.7692
Epoch 6/10
 - 27s - loss: 0.1894 - acc: 0.9250 - val_loss: 0.7748 - val_acc: 0.7702
Epoch 7/10
 - 26s - loss: 0.1359 - acc: 0.9489 - val_loss: 0.8086 - val_acc: 0.7472
Epoch 8/10
 - 26s - loss: 0.1011 - acc: 0.9616 - val_loss: 0.8123 - val_acc: 0.7724
Epoch 9/10
 - 25s - loss: 0.1007 - acc: 0.9619 - val_loss: 0.8082 - val_acc: 0.7672
Epoch 10/10
 - 23s - loss: 0.0962 - acc: 0.9643 - val_loss: 0.8119 - val_acc: 0.7954


In [44]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.82952000000000004