In [1]:
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import os
import re

Using TensorFlow backend.


In [2]:
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [3]:
def read_file(filetype):
    path = 'data/aclImdb/'
    file_list = []
    
    positive_path = path + filetype + '/pos/'
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]

    negative_path = path + filetype + '/neg/'
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]

    print('read', filetype, 'files:', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts += [rm_tags(' '.join(file_input.readlines()))]
            
    return all_labels, all_texts    

In [4]:
y_train, train_text = read_file('train')

read train files: 25000


In [5]:
y_test, test_text = read_file('test')

read test files: 25000


In [6]:
token = Tokenizer(num_words=3800)

In [7]:
token.fit_on_texts(train_text)

In [8]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [9]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test = sequence.pad_sequences(x_test_seq, maxlen=380)

In [10]:
model = Sequential()

In [11]:
model.add(Embedding(output_dim=32, input_dim=3800, input_length=380))
model.add(Dropout(0.2))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
model.add(LSTM(units=32))

In [13]:
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))

In [14]:
model.add(Dense(units=1, activation='sigmoid'))

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               8448      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 138,625
Trainable params: 138,625
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
train_history = model.fit(x_train, y_train, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 63s - loss: 0.5075 - acc: 0.7413 - val_loss: 0.3955 - val_acc: 0.8000
Epoch 2/10
 - 61s - loss: 0.2796 - acc: 0.8863 - val_loss: 0.3920 - val_acc: 0.8184
Epoch 3/10
 - 60s - loss: 0.2313 - acc: 0.9092 - val_loss: 0.5280 - val_acc: 0.8116
Epoch 4/10
 - 62s - loss: 0.2092 - acc: 0.9193 - val_loss: 0.5882 - val_acc: 0.7628
Epoch 5/10
 - 60s - loss: 0.1904 - acc: 0.9270 - val_loss: 0.3586 - val_acc: 0.8392
Epoch 6/10
 - 61s - loss: 0.1714 - acc: 0.9339 - val_loss: 0.5127 - val_acc: 0.8222
Epoch 7/10
 - 60s - loss: 0.1577 - acc: 0.9415 - val_loss: 0.6399 - val_acc: 0.7588
Epoch 8/10
 - 61s - loss: 0.1448 - acc: 0.9449 - val_loss: 0.2766 - val_acc: 0.8838
Epoch 9/10
 - 61s - loss: 0.1484 - acc: 0.9448 - val_loss: 0.4346 - val_acc: 0.8386
Epoch 10/10
 - 60s - loss: 0.1165 - acc: 0.9572 - val_loss: 0.4699 - val_acc: 0.8272


In [18]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.85224

In [19]:
predict = model.predict_classes(x_test)

In [20]:
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [21]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [22]:
sentiment_dict = {1: '正面的', 0: '負面的'}
def display_test_sentiment(i):
    print(test_text[i])
    print('label真實值：', sentiment_dict[y_test[i]], '預測結果：', sentiment_dict[predict_classes[i]])

In [23]:
display_test_sentiment(3)

I saw this film in a sneak preview, and it is delightful. The cinematography is unusually creative, the acting is good, and the story is fabulous. If this movie does not do well, it won't be because it doesn't deserve to. Before this film, I didn't realize how charming Shia Lebouf could be. He does a marvelous, self-contained, job as the lead. There's something incredibly sweet about him, and it makes the movie even better. The other actors do a good job as well, and the film contains moments of really high suspense, more than one might expect from a movie about golf. Sports movies are a dime a dozen, but this one stands out. This is one I'd recommend to anyone.
label真實值： 正面的 預測結果： 正面的


In [24]:
display_test_sentiment(12502)

First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by 

In [25]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=380)
    predict_result = model.predict_classes(pad_input_seq)
    print(sentiment_dict[predict_result[0][0]])

In [26]:
predict_review('''
Where do I start. This adaptation of Disney's 1991 Beauty and the Beast was an utter disappointment. Emma Watson as Belle was extremely unconvincing from the start to the end. She had the same expressions as the actress from Twilight. The animators did a terrible job with the Beast. He looked fake and lifeless. They could have used special makeup to create the beast similar to the Grinch where we get to see Jim Carrey's expressions. The side character animations were poorly executed. Overall I felt the film was rushed as there was lack of compassion and chemistry between the characters. There was a lot of CGI and green screen which could have been replaced by normal acting, because then why make an animated version of an animated film? This is by far the worst remake of an animated classic.
''')

負面的


In [27]:
predict_review('''
As a fan of the original cartoon, I also really enjoyed this remake. The songs are as wonderful as ever and the cgi effects really add to the film. The acting seemed very strong to me and the casting is pretty good. I'd definitely recommend seeing this. Will be a classic for future generations.
''')

正面的
